In [1]:
import pandas as pd
import numpy as np

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
data_train = pd.read_csv('data/train.csv')
data_test = pd.read_csv('data/test.csv')

print(data_train.head())
print(data_test.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [3]:
print(data_train.info())
print(data_test.info())
print(data_train.describe())
print(data_train.describe(include=['O']))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null 

** Drop Ticket and Cabin **

In [4]:
#help(data_train.drop)
print('Before Shape : Train - ', data_train.shape)
print('Before Shape : Test - ', data_test.shape)

data_test.drop(['Ticket','Cabin'], axis=1, inplace=True)
data_train.drop(['Ticket','Cabin'], axis=1, inplace=True)

print('After Shape : Train - ', data_train.shape)
print('After Shape : Test - ', data_test.shape)


Before Shape : Train -  (891, 12)
Before Shape : Test -  (418, 11)
After Shape : Train -  (891, 10)
After Shape : Test -  (418, 9)


** Change Sex Label to numeric **

In [5]:
#help(data_test.replace)
data_train.replace({'Sex': {'male' : 0, 'female' : 1}}, inplace=True)
data_test.replace({'Sex': {'male' : 0, 'female' : 1}}, inplace=True)
print(data_train.head())
print(data_test.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    0  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.0      1      0   
2                             Heikkinen, Miss. Laina    1  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.0      1      0   
4                           Allen, Mr. William Henry    0  35.0      0      0   

      Fare Embarked  
0   7.2500        S  
1  71.2833        C  
2   7.9250        S  
3  53.1000        S  
4   8.0500        S  
   PassengerId  Pclass                                          Name  Sex  \
0          892       3                              Kelly, Mr. James    0   
1         

** Handling Name **

In [6]:
help(data_train.Name.str.extract)
data_train['Title'] = data_train.Name.str.extract(' ([A-Za-z]+)\.', expand=True)
data_test['Title'] = data_test.Name.str.extract(' ([A-Za-z]+)\.', expand=True)

print(data_train.head())
print(data_test.head())



Help on method extract in module pandas.core.strings:

extract(pat, flags=0, expand=None) method of pandas.core.strings.StringMethods instance
    For each subject string in the Series, extract groups from the
    first match of regular expression pat.
    
    .. versionadded:: 0.13.0
    
    Parameters
    ----------
    pat : string
        Regular expression pattern with capturing groups
    flags : int, default 0 (no flags)
        re module flags, e.g. re.IGNORECASE
    
    .. versionadded:: 0.18.0
    expand : bool, default False
        * If True, return DataFrame.
        * If False, return Series/Index/DataFrame.
    
    Returns
    -------
    DataFrame with one row for each subject string, and one column for
    each group. Any capture group names in regular expression pat will
    be used for column names; otherwise capture group numbers will be
    used. The dtype of each result column is always object, even when
    no match is found. If expand=False and pat has only 

In [7]:
#help(pd.crosstab)
pd.crosstab(data_train['Title'], data_train['Sex'])

Sex,0,1
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,1,0
Col,2,0
Countess,0,1
Don,1,0
Dr,6,1
Jonkheer,1,0
Lady,0,1
Major,2,0
Master,40,0
Miss,0,182


In [8]:
data_train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

combine = [data_train, data_test]

for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

data_train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.347826


In [9]:
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

data_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,C,3
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,S,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,S,3
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,S,1


In [10]:
print(data_train.shape)
print(data_test.shape)
data_train = data_train.drop(['PassengerId', 'Name'], axis=1)
data_test = data_test.drop(['Name'], axis=1)
print(data_train.shape)
print(data_test.shape)


(891, 11)
(418, 10)
(891, 9)
(418, 9)


In [11]:
data_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.0,1,0,7.25,S,1
1,1,1,1,38.0,1,0,71.2833,C,3
2,1,3,1,26.0,0,0,7.925,S,2
3,1,1,1,35.0,1,0,53.1,S,3
4,0,3,0,35.0,0,0,8.05,S,1


** Handle Age **

In [12]:
combine = [data_train, data_test]
guess_ages = np.zeros((2,3))


In [13]:
for dataset in combine:
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == i) & \
                                  (dataset['Pclass'] == j+1)]['Age'].dropna()

            # age_mean = guess_df.mean()
            # age_std = guess_df.std()
            # age_guess = rnd.uniform(age_mean - age_std, age_mean + age_std)

            age_guess = guess_df.median()

            # Convert random age float to nearest .5 age
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
            
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\
                    'Age'] = guess_ages[i,j]

    dataset['Age'] = dataset['Age'].astype(int)

data_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22,1,0,7.25,S,1
1,1,1,1,38,1,0,71.2833,C,3
2,1,3,1,26,0,0,7.925,S,2
3,1,1,1,35,1,0,53.1,S,3
4,0,3,0,35,0,0,8.05,S,1


In [14]:
#print(data_train.describe())
#print(data_test.describe())
data_train['AgeBand'] = pd.cut(data_train['Age'], 5)
data_train[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

Unnamed: 0,AgeBand,Survived
0,"(-0.08, 16]",0.55
1,"(16, 32]",0.337374
2,"(32, 48]",0.412037
3,"(48, 64]",0.434783
4,"(64, 80]",0.090909


In [15]:
for dataset in combine:    
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
data_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,AgeBand
0,0,3,0,1,1,0,7.25,S,1,"(16, 32]"
1,1,1,1,2,1,0,71.2833,C,3,"(32, 48]"
2,1,3,1,1,0,0,7.925,S,2,"(16, 32]"
3,1,1,1,2,1,0,53.1,S,3,"(32, 48]"
4,0,3,0,2,0,0,8.05,S,1,"(32, 48]"


In [16]:
data_train = data_train.drop(['AgeBand'], axis=1)
combine = [data_train, data_test]
data_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,1,1,0,7.25,S,1
1,1,1,1,2,1,0,71.2833,C,3
2,1,3,1,1,0,0,7.925,S,2
3,1,1,1,2,1,0,53.1,S,3
4,0,3,0,2,0,0,8.05,S,1


** Handle Sibsp & Parch **

In [17]:
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

data_train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [18]:
for dataset in combine:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

data_train[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()

Unnamed: 0,IsAlone,Survived
0,0,0.50565
1,1,0.303538


In [19]:
print(data_train.head())
data_train = data_train.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
data_test = data_test.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
combine = [data_train, data_test]

data_train.head()

   Survived  Pclass  Sex  Age  SibSp  Parch     Fare Embarked  Title  \
0         0       3    0    1      1      0   7.2500        S      1   
1         1       1    1    2      1      0  71.2833        C      3   
2         1       3    1    1      0      0   7.9250        S      2   
3         1       1    1    2      1      0  53.1000        S      3   
4         0       3    0    2      0      0   8.0500        S      1   

   FamilySize  IsAlone  
0           2        0  
1           2        0  
2           1        1  
3           2        0  
4           1        1  


Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone
0,0,3,0,1,7.25,S,1,0
1,1,1,1,2,71.2833,C,3,0
2,1,3,1,1,7.925,S,2,1
3,1,1,1,2,53.1,S,3,0
4,0,3,0,2,8.05,S,1,1


** Artifical Feature **

In [20]:
for dataset in combine:
    dataset['Age*Class'] = dataset.Age * dataset.Pclass

data_train.loc[:, ['Age*Class', 'Age', 'Pclass']].head(10)

Unnamed: 0,Age*Class,Age,Pclass
0,3,1,3
1,2,2,1
2,3,1,3
3,2,2,1
4,6,2,3
5,3,1,3
6,3,3,1
7,0,0,3
8,3,1,3
9,0,0,2


** Handle Embarked ** 

In [21]:
data_train.describe(include=['O'])

Unnamed: 0,Embarked
count,889
unique,3
top,S
freq,644


In [22]:
#data_train.Embarked.dropna().mode()[0]
freq_port = data_train.Embarked.dropna().mode()[0]


In [23]:
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)
    
data_train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


In [24]:
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

data_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone,Age*Class
0,0,3,0,1,7.25,0,1,0,3
1,1,1,1,2,71.2833,1,3,0,2
2,1,3,1,1,7.925,0,2,1,3
3,1,1,1,2,53.1,0,3,0,2
4,0,3,0,2,8.05,0,1,1,6


In [25]:
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Sex            418 non-null int64
Age            418 non-null int32
Fare           417 non-null float64
Embarked       418 non-null int32
Title          418 non-null int64
IsAlone        418 non-null int64
Age*Class      418 non-null int64
dtypes: float64(1), int32(2), int64(6)
memory usage: 26.2 KB


In [26]:
data_test['Fare'].fillna(data_test['Fare'].dropna().median(), inplace=True)
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Sex            418 non-null int64
Age            418 non-null int32
Fare           418 non-null float64
Embarked       418 non-null int32
Title          418 non-null int64
IsAlone        418 non-null int64
Age*Class      418 non-null int64
dtypes: float64(1), int32(2), int64(6)
memory usage: 26.2 KB


In [27]:
data_train['FareBand'] = pd.qcut(data_train['Fare'], 4)
data_train[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)

Unnamed: 0,FareBand,Survived
0,"[0, 7.91]",0.197309
1,"(7.91, 14.454]",0.303571
2,"(14.454, 31]",0.454955
3,"(31, 512.329]",0.581081


In [28]:
combine = [data_train, data_test]
for dataset in combine:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

data_train = data_train.drop(['FareBand'], axis=1)
combine = [data_train, data_test]
    
data_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone,Age*Class
0,0,3,0,1,0,0,1,0,3
1,1,1,1,2,3,1,3,0,2
2,1,3,1,1,1,0,2,1,3
3,1,1,1,2,3,0,3,0,2
4,0,3,0,2,1,0,1,1,6


In [29]:
print(data_train.head())
print(data_test.head())


   Survived  Pclass  Sex  Age  Fare  Embarked  Title  IsAlone  Age*Class
0         0       3    0    1     0         0      1        0          3
1         1       1    1    2     3         1      3        0          2
2         1       3    1    1     1         0      2        1          3
3         1       1    1    2     3         0      3        0          2
4         0       3    0    2     1         0      1        1          6
   PassengerId  Pclass  Sex  Age  Fare  Embarked  Title  IsAlone  Age*Class
0          892       3    0    2     0         2      1        1          6
1          893       3    1    2     0         0      3        0          6
2          894       2    0    3     1         2      1        1          6
3          895       3    0    1     1         0      1        1          3
4          896       3    1    1     1         0      3        0          3


** Model Training **

In [30]:
X_train = data_train.drop("Survived", axis=1)
Y_train = data_train["Survived"]
X_test  = data_test.drop("PassengerId", axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

((891, 8), (891,), (418, 8))

In [31]:
# Support Vector Machines

svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc

83.840000000000003

In [32]:
submission = pd.DataFrame({
        "PassengerId": data_test["PassengerId"],
        "Survived": Y_pred
    })
submission.to_csv('output/submission.csv', index=False)

** Cross Validation **

In [33]:
from sklearn.grid_search import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV




** 1 - Logistic Regression **

In [34]:
param_grid = dict(C=[0.001, 0.01, 0.1, 1, 10, 100, 1000])
print(param_grid)

logreg = LogisticRegression()

# instantiate the grid
grid = GridSearchCV(logreg, param_grid, cv=10, scoring='accuracy')

# fit the grid with data
grid.fit(X_train, Y_train)

print(grid.grid_scores_)

# examine the best model
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

acc_log = grid.best_score_

Y_pred = grid.predict(X_test)

submission = pd.DataFrame({
        "PassengerId": data_test["PassengerId"],
        "Survived": Y_pred
    })
submission.to_csv('output/submission_log.csv', index=False)

{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
[mean: 0.72054, std: 0.04977, params: {'C': 0.001}, mean: 0.76992, std: 0.06255, params: {'C': 0.01}, mean: 0.80696, std: 0.03483, params: {'C': 0.1}, mean: 0.80359, std: 0.03140, params: {'C': 1}, mean: 0.80247, std: 0.02810, params: {'C': 10}, mean: 0.80247, std: 0.02810, params: {'C': 100}, mean: 0.80247, std: 0.02810, params: {'C': 1000}]
0.8069584736251403
{'C': 0.1}
LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


** 2 - SVM **

In [35]:
help(np.logspace)

C_range = np.logspace(-2, 2, 5)
print(C_range)
gamma_range = np.logspace(-9, 3, 13)
print(gamma_range)


Help on function logspace in module numpy.core.function_base:

logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None)
    Return numbers spaced evenly on a log scale.
    
    In linear space, the sequence starts at ``base ** start``
    (`base` to the power of `start`) and ends with ``base ** stop``
    (see `endpoint` below).
    
    Parameters
    ----------
    start : float
        ``base ** start`` is the starting value of the sequence.
    stop : float
        ``base ** stop`` is the final value of the sequence, unless `endpoint`
        is False.  In that case, ``num + 1`` values are spaced over the
        interval in log-space, of which all but the last (a sequence of
        length ``num``) are returned.
    num : integer, optional
        Number of samples to generate.  Default is 50.
    endpoint : boolean, optional
        If true, `stop` is the last sample. Otherwise, it is not included.
        Default is True.
    base : float, optional
        The base o

In [36]:
#C_range = np.logspace(-2, 10, 13)
C_range = np.logspace(-3, 3, 7)

print(C_range)
gamma_range = np.logspace(-3, 3, 7)
print(gamma_range)

degree_range = [3,4,5]

param_grid1 = dict(gamma=gamma_range, C=C_range)

param_grid2 = dict(gamma=gamma_range, C=C_range, degree=degree_range)


svc1 = SVC()
svc2 = SVC(kernel='poly', verbose=True)

# instantiate the grid
grid1 = GridSearchCV(svc1, param_grid1, cv=10, scoring='accuracy')

# fit the grid with data
grid1.fit(X_train, Y_train)

print(grid1.grid_scores_)

# examine the best model
print(grid1.best_score_)
print(grid1.best_params_)
print(grid1.best_estimator_)

'''
# instantiate the grid
grid2 = RandomizedSearchCV(svc2, param_grid2, cv=10, scoring='accuracy', n_iter=5)

# fit the grid with data
grid2.fit(X_train, Y_train)

print(grid2.grid_scores_)

# examine the best model
print(grid2.best_score_)
print(grid2.best_params_)
print(grid2.best_estimator_)

if grid1.best_score_ >= grid2.best_score_ :
    acc_svc = grid1.best_score_
    grid = grid1
else :
    acc_svc = grid2.best_score_
    grid = grid2

'''
Y_pred = grid1.predict(X_test)

submission = pd.DataFrame({
        "PassengerId": data_test["PassengerId"],
        "Survived": Y_pred
    })
submission.to_csv('output/submission_svc.csv', index=False)



[  1.00000000e-03   1.00000000e-02   1.00000000e-01   1.00000000e+00
   1.00000000e+01   1.00000000e+02   1.00000000e+03]
[  1.00000000e-03   1.00000000e-02   1.00000000e-01   1.00000000e+00
   1.00000000e+01   1.00000000e+02   1.00000000e+03]
[mean: 0.61616, std: 0.00284, params: {'C': 0.001, 'gamma': 0.001}, mean: 0.61616, std: 0.00284, params: {'C': 0.001, 'gamma': 0.01}, mean: 0.61616, std: 0.00284, params: {'C': 0.001, 'gamma': 0.10000000000000001}, mean: 0.61616, std: 0.00284, params: {'C': 0.001, 'gamma': 1.0}, mean: 0.61616, std: 0.00284, params: {'C': 0.001, 'gamma': 10.0}, mean: 0.61616, std: 0.00284, params: {'C': 0.001, 'gamma': 100.0}, mean: 0.61616, std: 0.00284, params: {'C': 0.001, 'gamma': 1000.0}, mean: 0.61616, std: 0.00284, params: {'C': 0.01, 'gamma': 0.001}, mean: 0.61616, std: 0.00284, params: {'C': 0.01, 'gamma': 0.01}, mean: 0.61616, std: 0.00284, params: {'C': 0.01, 'gamma': 0.10000000000000001}, mean: 0.61616, std: 0.00284, params: {'C': 0.01, 'gamma': 1.0}, 

** 3 - KNN ** 

In [42]:
x = list(range(2,21))
print(x)

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


In [44]:
param_grid = dict(n_neighbors = list(range(2,30)))
print(param_grid)

knn = KNeighborsClassifier()

# instantiate the grid
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')

# fit the grid with data
grid.fit(X_train, Y_train)

print(grid.grid_scores_)

# examine the best model
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

acc_knn = grid.best_score_

Y_pred = grid.predict(X_test)

submission = pd.DataFrame({
        "PassengerId": data_test["PassengerId"],
        "Survived": Y_pred
    })
submission.to_csv('output/submission_knn.csv', index=False)

{'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]}
[mean: 0.77778, std: 0.04803, params: {'n_neighbors': 2}, mean: 0.78788, std: 0.07599, params: {'n_neighbors': 3}, mean: 0.78676, std: 0.04385, params: {'n_neighbors': 4}, mean: 0.79574, std: 0.04821, params: {'n_neighbors': 5}, mean: 0.78676, std: 0.04080, params: {'n_neighbors': 6}, mean: 0.78900, std: 0.04793, params: {'n_neighbors': 7}, mean: 0.78676, std: 0.05792, params: {'n_neighbors': 8}, mean: 0.80022, std: 0.05725, params: {'n_neighbors': 9}, mean: 0.80584, std: 0.04366, params: {'n_neighbors': 10}, mean: 0.80135, std: 0.04037, params: {'n_neighbors': 11}, mean: 0.80471, std: 0.04335, params: {'n_neighbors': 12}, mean: 0.79798, std: 0.04747, params: {'n_neighbors': 13}, mean: 0.80359, std: 0.04464, params: {'n_neighbors': 14}, mean: 0.80920, std: 0.04966, params: {'n_neighbors': 15}, mean: 0.80247, std: 0.04378, params: {'n_neighbors': 16}, mean: 0.80808, 

** 4 - Random Forest **

In [48]:
n_estimators_range = [10,20,30,40,50,75,100,150,200,250,300,400,500,600,700,800,900,1000]
min_samples_split_range = [2,3,4]
criterion_range = ['gini', 'entropy']
min_samples_leaf_range = [1,2]

param_grid = dict(n_estimators = n_estimators_range, criterion = criterion_range, min_samples_split = min_samples_split_range, min_samples_leaf = min_samples_leaf_range)
print(param_grid)

rand_forest = RandomForestClassifier()

# instantiate the grid
grid = RandomizedSearchCV(rand_forest, param_grid, cv=10, scoring='accuracy', n_iter=50)

# fit the grid with data
grid.fit(X_train, Y_train)

print(grid.grid_scores_)

# examine the best model
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

acc_rand_forest = grid.best_score_

Y_pred = grid.predict(X_test)

submission = pd.DataFrame({
        "PassengerId": data_test["PassengerId"],
        "Survived": Y_pred
    })
submission.to_csv('output/submission_rand_forest.csv', index=False)

{'n_estimators': [10, 20, 30, 40, 50, 75, 100, 150, 200, 250, 300, 400, 500, 600, 700, 800, 900, 1000], 'criterion': ['gini', 'entropy'], 'min_samples_split': [2, 3, 4], 'min_samples_leaf': [1, 2]}
[mean: 0.81257, std: 0.03313, params: {'n_estimators': 10, 'min_samples_split': 4, 'min_samples_leaf': 1, 'criterion': 'gini'}, mean: 0.81594, std: 0.04241, params: {'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 2, 'criterion': 'entropy'}, mean: 0.81818, std: 0.04232, params: {'n_estimators': 300, 'min_samples_split': 4, 'min_samples_leaf': 1, 'criterion': 'entropy'}, mean: 0.81257, std: 0.04205, params: {'n_estimators': 20, 'min_samples_split': 3, 'min_samples_leaf': 2, 'criterion': 'entropy'}, mean: 0.81594, std: 0.05349, params: {'n_estimators': 75, 'min_samples_split': 3, 'min_samples_leaf': 1, 'criterion': 'gini'}, mean: 0.80359, std: 0.03806, params: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'entropy'}, mean: 0.82043, std: 0.0

** RESULT **

In [50]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest'],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_rand_forest]})
models.sort_values(by='Score', ascending=False)



Unnamed: 0,Model,Score
0,Support Vector Machines,83.84
3,Random Forest,0.824916
1,KNN,0.809203
2,Logistic Regression,0.806958


** Ensembeling **

(C:\Users\VIKAS\Anaconda3) D:\Samsung-SISC\Courses\Kaggle\titanic>python Kaggle-Ensemble-Guide-master/correlations.py output\submission_knn.csv output\submission_log.csv
Finding correlation between: output\submission_knn.csv and output\submission_log.csv
Column to be measured: Survived
Pearson's correlation score: 0.73194
Kendall's correlation score: 0.73194
Spearman's correlation score: 0.73194

(C:\Users\VIKAS\Anaconda3) D:\Samsung-SISC\Courses\Kaggle\titanic>python Kaggle-Ensemble-Guide-master/correlations.py output\submission_knn.csv output\submission_rand_forest.csv
Finding correlation between: output\submission_knn.csv and output\submission_rand_forest.csv
Column to be measured: Survived
Pearson's correlation score: 0.80360
Kendall's correlation score: 0.80360
Spearman's correlation score: 0.80360

(C:\Users\VIKAS\Anaconda3) D:\Samsung-SISC\Courses\Kaggle\titanic>python Kaggle-Ensemble-Guide-master/correlations.py output\submission_knn.csv output\submission_svc.csv
Finding correlation between: output\submission_knn.csv and output\submission_svc.csv
Column to be measured: Survived
Pearson's correlation score: 0.81368
Kendall's correlation score: 0.81368
Spearman's correlation score: 0.81368

(C:\Users\VIKAS\Anaconda3) D:\Samsung-SISC\Courses\Kaggle\titanic>python Kaggle-Ensemble-Guide-master/correlations.py output\submission_log.csv output\submission_svc.csv
Finding correlation between: output\submission_log.csv and output\submission_svc.csv
Column to be measured: Survived
Pearson's correlation score: 0.79052
Kendall's correlation score: 0.79052
Spearman's correlation score: 0.79052

(C:\Users\VIKAS\Anaconda3) D:\Samsung-SISC\Courses\Kaggle\titanic>python Kaggle-Ensemble-Guide-master/correlations.py output\submission_log.csv output\submission_svc.csv
Finding correlation between: output\submission_log.csv and output\submission_svc.csv
Column to be measured: Survived
Pearson's correlation score: 0.79052
Kendall's correlation score: 0.79052
Spearman's correlation score: 0.79052

(C:\Users\VIKAS\Anaconda3) D:\Samsung-SISC\Courses\Kaggle\titanic>python Kaggle-Ensemble-Guide-master/correlations.py output\submission_log.csv output\submission_rand_forest.csv
Finding correlation between: output\submission_log.csv and output\submission_rand_forest.csv
Column to be measured: Survived
Pearson's correlation score: 0.75126
Kendall's correlation score: 0.75126
Spearman's correlation score: 0.75126

(C:\Users\VIKAS\Anaconda3) D:\Samsung-SISC\Courses\Kaggle\titanic>python Kaggle-Ensemble-Guide-master/correlations.py output\submission_svc.csv output\submission_rand_forest.csv
Finding correlation between: output\submission_svc.csv and output\submission_rand_forest.csv
Column to be measured: Survived
Pearson's correlation score: 0.81930
Kendall's correlation score: 0.81930
Spearman's correlation score: 0.81930
