In [3]:
import pandas as pd
import numpy as np
import random as rnd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

In [4]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
combine = [train_df, test_df]
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
print(train_df.columns.values)

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


In [6]:
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
test_df.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [8]:
for dataset in combine:
    # Fill missing values for Age
    dataset['Age'].fillna(dataset['Age'].mean(), inplace=True)
    dataset['Fare'].fillna(dataset['Fare'].mean(), inplace=True)
    dataset['Family_count'] = dataset['SibSp'] + dataset['Parch']
    dataset.drop(['SibSp', 'Parch'], axis=1, inplace=True)
    
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,Family_count
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,373450,8.05,,S,0


In [9]:
train_df.drop(['PassengerId'], axis=1, inplace=True)

In [10]:
# Clean categorical variables
# Create an indicator cariable in place of cabin
for dataset in combine:
    dataset['Cabin_ind'] = np.where(dataset['Cabin'].isnull(), 0, 1)
    # Convert Sex to a numeric variable
    gender_num = {'male': 0, 'female': 1}
    dataset['Sex'] = dataset['Sex'].map(gender_num)
    # Drop irrelevant/repetitive variables(Cabin, Embarked, Name, Ticket)
    dataset.drop(['Cabin', 'Embarked', 'Name', 'Ticket'], axis=1, inplace=True)

In [11]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Family_count,Cabin_ind
0,0,3,0,22.0,7.25,1,0
1,1,1,1,38.0,71.2833,1,1
2,1,3,1,26.0,7.925,0,0
3,1,1,1,35.0,53.1,1,1
4,0,3,0,35.0,8.05,0,0


In [12]:
# split train data into train and validation?
# features = train_df.drop('Survived', axis=1)
# labels = train_df['Survived']

# X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.4, random_state=42)

# Not gonna use

In [13]:
X_train = train_df.drop('Survived', axis=1)
Y_train = train_df['Survived']
X_test  = test_df.drop("PassengerId", axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

((891, 6), (891,), (418, 6))

In [14]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))
    
    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std, 3), params))

In [15]:
lr = LogisticRegression()
parameters = {
    'penalty': ['l1', 'l2'],
    'tol': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'random_state': [0, 1, 10, 100, 1000, None],
}

cv = GridSearchCV(lr, parameters, cv=5)
cv.fit(X_train, Y_train.values.ravel())

# print_results(cv)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'tol': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'random_state': [0, 1, 10, 100, 1000, None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [16]:
lr1 = LogisticRegression(C=100, penalty='l1', random_state=None, tol=1)
lr1.fit(X_train, Y_train.values.ravel())

lr2 = LogisticRegression(C=1000, penalty='l1', random_state=1000, tol=0.01)
lr2.fit(X_train, Y_train.values.ravel())

lr3 = LogisticRegression(C=10, penalty='l1', random_state=0, tol=0.001)
lr3.fit(X_train, Y_train.values.ravel())

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=0, solver='warn',
          tol=0.001, verbose=0, warm_start=False)

In [17]:
for mdl in [lr1, lr2, lr3]:
    y_pred = mdl.predict(X_train)
    accuracy = round(accuracy_score(Y_train, y_pred), 3)
    precision = round(precision_score(Y_train, y_pred), 3)
    recall = round(recall_score(Y_train, y_pred), 3)
    print('C: {} / penalty: {} / random_state: {} / tol: {} -- A: {} / P: {} / R: {}'. format(mdl.C, mdl.penalty, mdl.random_state, mdl.tol, accuracy, precision, recall))

C: 100 / penalty: l1 / random_state: None / tol: 1 -- A: 0.807 / P: 0.81 / R: 0.649
C: 1000 / penalty: l1 / random_state: 1000 / tol: 0.01 -- A: 0.802 / P: 0.755 / R: 0.719
C: 10 / penalty: l1 / random_state: 0 / tol: 0.001 -- A: 0.805 / P: 0.755 / R: 0.728


In [18]:
rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5, 50, 100],
    'max_depth': [2, 10, 20, None]
}

cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(X_train, Y_train.values.ravel())

print_results(cv)

BEST PARAMS: {'max_depth': 10, 'n_estimators': 50}

0.78 (+/-0.052) for {'max_depth': 2, 'n_estimators': 5}
0.772 (+/-0.04) for {'max_depth': 2, 'n_estimators': 50}
0.793 (+/-0.036) for {'max_depth': 2, 'n_estimators': 100}
0.827 (+/-0.021) for {'max_depth': 10, 'n_estimators': 5}
0.837 (+/-0.025) for {'max_depth': 10, 'n_estimators': 50}
0.836 (+/-0.034) for {'max_depth': 10, 'n_estimators': 100}
0.807 (+/-0.028) for {'max_depth': 20, 'n_estimators': 5}
0.824 (+/-0.037) for {'max_depth': 20, 'n_estimators': 50}
0.819 (+/-0.031) for {'max_depth': 20, 'n_estimators': 100}
0.808 (+/-0.024) for {'max_depth': None, 'n_estimators': 5}
0.816 (+/-0.036) for {'max_depth': None, 'n_estimators': 50}
0.825 (+/-0.031) for {'max_depth': None, 'n_estimators': 100}


In [19]:
rf1 = RandomForestClassifier(n_estimators=100, max_depth=10)
rf1.fit(X_train, Y_train.values.ravel())

rf2 = RandomForestClassifier(n_estimators=100, max_depth=20)
rf2.fit(X_train, Y_train.values.ravel())

rf3 = RandomForestClassifier(n_estimators=5, max_depth=10)
rf3.fit(X_train, Y_train.values.ravel())

rf4 = RandomForestClassifier()
rf4.fit(X_train, Y_train.values.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [20]:
for mdl in [rf1, rf2, rf3, rf4]:
    y_pred = mdl.predict(X_train)
    accuracy = round(accuracy_score(Y_train, y_pred), 3)
    precision = round(precision_score(Y_train, y_pred), 3)
    recall = round(recall_score(Y_train, y_pred), 3)
    print('MAX DEPTH: {} / # OF EST: {} -- A: {} / P: {} / R: {}'. format(mdl.max_depth, mdl.n_estimators, accuracy, precision, recall))

MAX DEPTH: 10 / # OF EST: 100 -- A: 0.933 / P: 0.946 / R: 0.874
MAX DEPTH: 20 / # OF EST: 100 -- A: 0.987 / P: 0.994 / R: 0.971
MAX DEPTH: 10 / # OF EST: 5 -- A: 0.914 / P: 0.932 / R: 0.836
MAX DEPTH: None / # OF EST: 10 -- A: 0.978 / P: 0.985 / R: 0.956


In [21]:
svc = SVC()
parameters = {
    'tol': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'random_state': [0, 1, 10, 100, 1000, None],
}

cv = GridSearchCV(svc, parameters, cv=5)
cv.fit(X_train, Y_train.values.ravel())

# print_results(cv)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'tol': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'random_state': [0, 1, 10, 100, 1000, None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [22]:
svc1 = SVC(C=10, random_state=0, tol=1)
svc1.fit(X_train, Y_train.values.ravel())

svc2 = SVC(C=10, random_state=0, tol=0.1)
svc2.fit(X_train, Y_train.values.ravel())

svc3 = SVC(C=10, random_state=0, tol=0.01)
svc3.fit(X_train, Y_train.values.ravel())

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=0,
  shrinking=True, tol=0.01, verbose=False)

In [23]:
for mdl in [svc1, svc2, svc3]:
    y_pred = mdl.predict(X_train)
    accuracy = round(accuracy_score(Y_train, y_pred), 3)
    precision = round(precision_score(Y_train, y_pred), 3)
    recall = round(recall_score(Y_train, y_pred), 3)
    print('C: {} / random_state: {} / tol: {} -- A: {} / P: {} / R: {}'. format(mdl.C, mdl.random_state, mdl.tol, accuracy, precision, recall))

C: 10 / random_state: 0 / tol: 1 -- A: 0.945 / P: 0.951 / R: 0.904
C: 10 / random_state: 0 / tol: 0.1 -- A: 0.941 / P: 0.942 / R: 0.901
C: 10 / random_state: 0 / tol: 0.01 -- A: 0.941 / P: 0.942 / R: 0.901


In [62]:
y_pred = svc1.predict(X_test)
rf2.score(X_train, Y_train)
acc = round(svc1.score(X_train, Y_train) * 100, 2)
acc

94.5

In [63]:
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": y_pred
    })
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,0
4,896,0


In [64]:
submission.to_csv('submission.csv', index=False)

In [50]:
# !pip install kaggle
!kaggle competitions submit -c titanic -f submission.csv -m "tutorial submition random forest"

Successfully submitted to Titanic: Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|██████████| 3.18k/3.18k [00:00<00:00, 14.9kB/s]
