In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

In [2]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
predictions = pd.read_csv('gender_submission.csv')

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
data = [train_data,test_data]

# Preprocessing

In [6]:
for dataset in data:
    dataset.info()
    print(".-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         

#### Dropping Unusable

In [7]:
for dataset in data:
    dataset.drop(['PassengerId','Cabin','Ticket','Fare'],axis=1,inplace=True)

#### Preprocessing Names

In [8]:
for dataset in data:
    initials = []
    for i in list(dataset["Name"].str.split(", ")):
        initials.append(i[1][:4])
    dataset["Name"] = initials

In [9]:
data[0].Name.value_counts()

Mr.     517
Miss    182
Mrs.    125
Mast     40
Dr.       7
Rev.      6
Majo      2
Mlle      2
Col.      2
Lady      1
Mme.      1
Ms.       1
Jonk      1
Sir.      1
the       1
Capt      1
Don.      1
Name: Name, dtype: int64

In [10]:
data[1].Name.value_counts()

Mr.     240
Miss     78
Mrs.     72
Mast     21
Rev.      2
Col.      2
Dr.       1
Dona      1
Ms.       1
Name: Name, dtype: int64

In [11]:
for dataset in data:
    dataset['Name'] = dataset['Name'].replace(['Majo','the ','Dr. '], 'Rare')
    dataset['Name'] = dataset['Name'].replace(['Mlle','Mme.','Ms. '],'Miss')
    dataset['Name'] = dataset['Name'].replace(['Mme.','Lady','Dona'],'Mrs.')
    dataset['Name'] = dataset['Name'].replace(['Capt','Sir.','Don.','Col.','Jonk','Rev.',],'Mr. ')

In [12]:
pd.crosstab(data[0].Name, data[0].Sex)

Sex,female,male
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Mast,0,40
Miss,186,0
Mr.,0,529
Mrs.,126,0
Rare,2,8


In [13]:
title = {'Mast': 4, 'Miss': 3, 'Mr. ': 2,'Mrs.':1, 'Rare': 0}
for dataset in data:
    dataset['Name'] = dataset['Name'].map( title ).astype(int)

#### Preprocessing Embarked

In [14]:
data[0][['Embarked','Survived']].groupby(['Embarked']).mean()

Unnamed: 0_level_0,Survived
Embarked,Unnamed: 1_level_1
C,0.553571
Q,0.38961
S,0.336957


In [15]:
Embarked = {'S': 2,'Q':1, 'C': 0}
for dataset in data:
    dataset['Embarked'].fillna("S",inplace=True)
    dataset['Embarked'] = dataset['Embarked'].map( Embarked ).astype(int)

#### Preprocessing Age

In [16]:
train_data_ = train_data.transpose()
avn = train_data[['Name','Age']].groupby(['Name']).mean().transpose()
avn

Name,0,1,2,3,4
Age,42.444444,36.009174,32.815854,21.86,4.574167


In [17]:
for dataset in data:
    dataset_ = dataset.transpose()
    for i in range(len(dataset)):
        if(not dataset_[i]['Age'] > 0):
            dataset_[i]['Age'] = avn[dataset_[i]['Name']]['Age']
    dataset_ = dataset_.transpose()
    new_Age = dataset_['Age'].values
    dataset['Age'] = np.float64(new_Age)
    dataset.info()
    print(".-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Embarked    891 non-null int64
dtypes: float64(1), int64(6), object(1)
memory usage: 55.8+ KB
.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Name        418 non-null int64
Sex         418 non-null object
Age         418 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Embarked    418 non-null int64
dtypes: float64(1), int64(5), object(1)
memory usage: 22.9+ KB
.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-


In [18]:
for dataset in data:
    dataset.loc[dataset['Age'] <= 18, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 18) & (dataset['Age'] <= 35), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 35) & (dataset['Age'] <= 64), 'Age'] = 2
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 3

#### Preprocessing Sex

In [19]:
for dataset in data:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

#### Preprocessing Parch and SibSp

In [20]:
for dataset in data:
    dataset['FamM'] = dataset['SibSp'] + dataset['Parch']
    dataset['Alone'] = 0
    dataset.loc[dataset['FamM'] == 0, 'Alone'] = 1
    dataset.drop(['SibSp','Parch','FamM'],axis=1,inplace=True)

#  Correlation

In [21]:
data[0].corr()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Embarked,Alone
Survived,1.0,-0.338481,0.064779,0.543351,-0.046674,-0.167675,-0.203367
Pclass,-0.338481,1.0,0.149374,-0.1319,-0.316668,0.162098,0.135207
Name,0.064779,0.149374,1.0,0.043178,-0.474037,-0.019736,-0.013292
Sex,0.543351,-0.1319,0.043178,1.0,-0.050193,-0.108262,-0.303646
Age,-0.046674,-0.316668,-0.474037,-0.050193,1.0,-0.017504,0.136814
Embarked,-0.167675,0.162098,-0.019736,-0.108262,-0.017504,1.0,0.063532
Alone,-0.203367,0.135207,-0.013292,-0.303646,0.136814,0.063532,1.0


# Trainning Model

In [22]:
X_train = data[0].drop("Survived", axis=1)
Y_train = train_data["Survived"]
X_test = data[1]

In [23]:
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X_train, Y_train, test_size=0.33, random_state=42)

In [24]:
classifier = DecisionTreeClassifier(criterion="entropy",)
classifier.fit(X_train, Y_train)
#print("Test Accuracy:",classifier.score(X_te, y_te) * 100)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [25]:
classifier1 = RandomForestClassifier(n_estimators=1)
classifier1.fit(X_train, Y_train)
#print("Test Accuracy:",classifier1.score(X_te, y_te) * 100)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
classifier2 = SVC(C = 6000, gamma='scale')
classifier2.fit(X_train, Y_train)
#print("Test Accuracy:",classifier2.score(X_te, y_te) * 100)

SVC(C=6000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [27]:
classifier3 = LogisticRegression(max_iter=1000, C=100, solver='lbfgs')
classifier3.fit(X_train, Y_train)
#print("Test Accuracy:",classifier3.score(X_te, y_te) * 100)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [28]:
classifier4 = xgb.XGBClassifier(max_depth=10,
                           min_child_weight=1,
                           learning_rate=0.1,
                           n_estimators=100,
                           silent=True,
                           objective='binary:logistic',
                           gamma=0,
                           max_delta_step=0,
                           subsample=1,
                           colsample_bytree=1,
                           colsample_bylevel=1,
                           reg_alpha=0,
                           reg_lambda=0,
                           scale_pos_weight=1,
                           seed=1,
                           missing=None)
classifier4.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=0, scale_pos_weight=1, seed=1, silent=True,
       subsample=1)

In [29]:
classifier5 = KNeighborsClassifier()
classifier5.fit(X_train, Y_train)
#print("Test Accuracy:",classifier3.score(X_te, y_te) * 100)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [30]:
classifier6 = MLPClassifier(hidden_layer_sizes=[14,25],max_iter=2000)
classifier6.fit(X_train, Y_train)
#print("Test Accuracy:",classifier3.score(X_te, y_te) * 100)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=[14, 25], learning_rate='constant',
       learning_rate_init=0.001, max_iter=2000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [31]:
y = classifier.predict(X_train)
y1 = classifier1.predict(X_train)
y2 = classifier2.predict(X_train)
y3 = classifier3.predict(X_train)
y4 = classifier4.predict(X_train)
y5 = classifier5.predict(X_train)
y6 = classifier6.predict(X_train)

In [32]:
Xy = np.vstack((y,y1,y2,y3,y4,y5,y6)).T

In [33]:
from sklearn.ensemble import GradientBoostingClassifier
classifier7 = GradientBoostingClassifier()
classifier7.fit(Xy, Y_train)
print("Test Accuracy:",classifier7.score(Xy, Y_train) * 100)

Test Accuracy: 83.83838383838383


# Predicting

In [34]:
Y_pred = classifier.predict(X_test)
Y_pred1 = classifier1.predict(X_test)
Y_pred2 = classifier2.predict(X_test)
Y_pred3 = classifier3.predict(X_test)
Y_pred4 = classifier4.predict(X_test)
Y_pred5 = classifier5.predict(X_test)
Y_pred6 = classifier6.predict(X_test)

In [35]:
Xy_pred = np.vstack((Y_pred,Y_pred1,Y_pred2,Y_pred3,Y_pred4,Y_pred5,Y_pred6)).T

In [36]:
Y_pred_final = classifier7.predict(Xy_pred)

In [37]:
predictions['Survived'] = Y_pred_final

In [38]:
predictions.to_csv("predictions.csv",index = False)