# Titanic: Machine Learning from the disaster

## Problema
### Background
- O Titanic foi um famoso navio britânico que afundou no ano de 1912 após atingir um iceberg.
- Haviam 2224 pessoas a bordo, 1500 morreram no desastre

### Problemática
- Através dos dados fornecidos prever quais passageiros sobreviveriam no desastre.

## Enviando solução
- Enviar arquivo csv com 418 registros, mais o cabeçalho
- Arquivo deve conter somente 2 colunas
 - PassengerId (Qualquer ordem)
 - Survived (1 para sobrevivente, 0 para vítima)

# Imports

In [1]:
#####################################################
# Locais 
from libs.fastai.imports import *
from libs.fastai.structured import *
from libs import ml_helper

#####################################################
# Libs instaladas na venv do anaconda
# --> Pré-processamento de dados
from sklearn.preprocessing import LabelEncoder, StandardScaler

# --> Análise de testes
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, log_loss

# --> Modelos
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import svm



# Variáveis globais

In [2]:
PATH = os.path.abspath(os.getcwd()) + '\\data\\'

In [3]:
df_raw_train = pd.read_csv(f'{PATH}\\train.csv', low_memory = False)
df_raw_test  = pd.read_csv(f'{PATH}\\test.csv', low_memory = False)

# Funções

In [4]:
def display_all(df):
    with pd.option_context('display.max_rows', 1000, 'display.max_columns', 1000):
        IPython.display.display(df)

# Dados

In [5]:
df_raw_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
df_raw_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [7]:
def fill_na_with_mean(_df, _column):
    valores_column = pd.unique(_df[_column])
    valor_medio = round(np.nanmean(valores_column), 0)
    _df[_column] = _df[_column].fillna(valor_medio)

In [8]:
fill_na_with_mean(df_raw_train, 'Age')
fill_na_with_mean(df_raw_test, 'Age')


# Pre-Processando os dados

In [9]:
def pre_processor(df):
    
    
    Embarked = set(df.loc[df["Embarked"].notnull()]["Embarked"])
    Embarked_to_number = {ni: indi for indi, ni in enumerate(set(Embarked))}
    df['Embarked'] = df['Embarked'].map(Embarked_to_number)
    
    df['Age'].fillna(df['Age'].median(), inplace = True)
    df['Embarked'].fillna(df['Embarked'].median(), inplace = True)
    df['Fare'].fillna(df['Fare'].median(), inplace = True)
    
    df['Title'] = df['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
    title_names = (df['Title'].value_counts() < 10)
    df['Title'] = df['Title'].apply(lambda x: 'Misc' if title_names.loc[x] == True else x)


    labelencoder = LabelEncoder()
    df.Sex = labelencoder.fit_transform(df.Sex)
    
    is_sector_ticket_number = lambda x: x[1] if len(x) > 1 else ''
    lambda_ticket_number = lambda x: x[0] if x[0].isdigit() else is_sector_ticket_number(x)    
    df.Ticket_Number = [lambda_ticket_number(x) for x in df.Ticket.str.split(' ')]

    df = df.drop(['Ticket', 'Name', 'Cabin'], axis = 1)
    
    train_cats(df)
    
    
    
    return pd.get_dummies(df)
   # return pd.get_dummies(df, prefix=['PClass', 'Sex', 'SibSP', 'Parch', 'Ticket'], columns= )

In [10]:
df_raw_train = pre_processor(df_raw_train)
df_raw_test = pre_processor(df_raw_test)



In [11]:
# Salvando training set para arquivo feather
os.makedirs('tmp', exist_ok =True)
df_raw_train.to_feather('tmp/titanic_train')

# Salvando training set para arquivo feather
df_raw_test.to_feather('tmp/titanic_test')

# Primeiros testes do modelo

In [12]:
df_train = pd.read_feather('tmp/titanic_train')
df_test = pd.read_feather('tmp/titanic_test')

In [13]:
x_train, y_train, nas_train = proc_df(df_train, 'Survived')

In [14]:
m = RandomForestClassifier(n_estimators = 100, max_depth = 5, n_jobs=-1)
m.fit(x_train, y_train)
m.score(x_train, y_train)
predictions = m.predict(df_test)

In [15]:
print(m.score(x_train, y_train))

0.8428731762065096


# Avaliando dentre algorítmos de classificação, qual é o que possui o melhor indicador para focalizar esforços
## O procedimento a seguir foi retirado de um notebook do kaggle de jeffd23
link: *https://www.kaggle.com/jeffd23/10-classifier-showdown-in-scikit-learn*

In [16]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size = 0.3, random_state =1)

In [17]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(x_train, y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(x_test)
    acc = accuracy_score(y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    
    train_predictions = clf.predict_proba(x_test)
    ll = log_loss(y_test, train_predictions)
    print("Log Loss: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
    
print("="*30)

KNeighborsClassifier
****Results****
Accuracy: 58.9552%
Log Loss: 6.182371454696778
SVC
****Results****
Accuracy: 57.0896%
Log Loss: 0.6342387462617194
NuSVC
****Results****
Accuracy: 76.1194%
Log Loss: 0.5360770542400297
DecisionTreeClassifier
****Results****
Accuracy: 73.8806%
Log Loss: 9.021322192700552
RandomForestClassifier
****Results****
Accuracy: 78.3582%
Log Loss: 0.6156812398007528
AdaBoostClassifier
****Results****
Accuracy: 77.2388%
Log Loss: 0.6783896580309434
GradientBoostingClassifier
****Results****
Accuracy: 77.2388%
Log Loss: 0.500825945161359
GaussianNB
****Results****
Accuracy: 77.2388%
Log Loss: 1.4089901646498368
LinearDiscriminantAnalysis
****Results****
Accuracy: 79.1045%
Log Loss: 0.5525765549702472
QuadraticDiscriminantAnalysis
****Results****
Accuracy: 55.2239%
Log Loss: 12.904542190486456




# Otimizando o processo de teste
## O procedimento a seguir foi retirado do notebook do kaggle de westen30
link: *https://www.kaggle.com/westen30/titanic-data-v2*

### Avaliando o melhor resultado possível alterando os hyperparemeters de RandomForestClassifier

In [18]:
steps = [('scaler', StandardScaler())]
pipe = Pipeline([('scaler', StandardScaler()), ('classifier', RandomForestClassifier())])

# Criando o grid de parametros

param_grid = [
    {'classifier': [RandomForestClassifier(random_state=1)],
     'classifier__n_estimators' : [50, 100, 200, 500],
     'classifier__max_features' : ['auto', 'sqrt', 'log2'],
     # 'classifier__min_samples_leaf': [0.04, 0.06, 0.08],
     'classifier__max_depth': [3,4,5,6,7],
     'classifier__oob_score': [True, False],
     'classifier__criterion': ['gini', 'entropy']}
]



# Criando o grid search object

clf = GridSearchCV(pipe, param_grid = param_grid, cv = 10, verbose=True, n_jobs = -1, scoring= 'accuracy')

# Treinando os dados

best_clf = clf.fit(x_train, y_train)


Fitting 10 folds for each of 240 candidates, totalling 2400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   53.3s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 2400 out of 2400 | elapsed:  5.2min finished


In [19]:
best_prediction = best_clf.predict(x_test)
print(best_clf.score(x_test, y_test))
print(best_clf.best_estimator_)
print(best_clf.best_params_)
print(best_clf.best_score_)

0.7723880597014925
Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('classifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=7, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=True, random_state=1,
                                        verbose=0, warm_start=False))],
         verbose=False)
{'

### Avaliando o melhor resultado possível alterando os hyperparameters de GradientBoostingClassifier

In [20]:
steps = [('scaler', StandardScaler())]
pipe_grad_boosting = Pipeline([('scaler', StandardScaler()), ('classifier', GradientBoostingClassifier())])

# Criando o grid de parametros

param_grid_grad_boosting = [
    {'classifier': [GradientBoostingClassifier(random_state=2)],
     'classifier__learning_rate': [1.0, 0.5, 0.1],
     'classifier__n_estimators' : [50, 100, 200, 500],
     'classifier__subsample': [1.0, 0.5, 0.1],
     'classifier__criterion': ['friedman_mse', 'mse', 'mae'],
     'classifier__max_depth': [3,4,5,6,7],
     'classifier__max_features' : ['auto', 'sqrt', 'log2'],
    }
]



# Criando o grid search object

clf_grad_boosting = GridSearchCV(pipe, param_grid = param_grid_grad_boosting, cv = 10, verbose=True, n_jobs = -1, scoring= 'accuracy')

# Treinando os dados

best_clf_grad_boostin = clf_grad_boosting.fit(x_train, y_train)


Fitting 10 folds for each of 1620 candidates, totalling 16200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 248 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 498 tasks      | elapsed:   26.2s
[Parallel(n_jobs=-1)]: Done 848 tasks      | elapsed:   48.2s
[Parallel(n_jobs=-1)]: Done 1298 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1848 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2498 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 3248 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 4098 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 5048 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 6098 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 7248 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 8498 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 9848 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done 11298 tasks      |

In [21]:
best_clf_grad_boostin

best_prediction_grad_boosting = best_clf_grad_boostin.predict(x_test)
print(best_clf_grad_boostin.score(x_test, y_test))
print(best_clf_grad_boostin.best_estimator_)
print(best_clf_grad_boostin.best_params_)
print(best_clf_grad_boostin.best_score_)

0.7611940298507462
Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('classifier',
                 GradientBoostingClassifier(ccp_alpha=0.0, criterion='mae',
                                            init=None, learning_rate=0.1,
                                            loss='deviance', max_depth=5,
                                            max_features='auto',
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=50,
                                            n_iter_no_change=None,


In [22]:
def predict_best_clf(clf, _name):
        clf_pred = clf.predict(df_test)
        output_clf = pd.DataFrame({'PassengerId' : df_test.PassengerId, 'Survived' : clf_pred})
        output_clf.to_csv(f'{_name}.csv', index=False)  

In [24]:
predict_best_clf(best_clf,'submit_rand_forest')
predict_best_clf(best_clf_grad_boostin,'submit_grad_boost')