In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier

In [2]:
Train_Base = pd.read_csv('Titanic-train.csv')
Test_Base = pd.read_csv('Titanic-test.csv')

In [3]:
print('top 5 rows:')
Train_Base.head()

top 5 rows:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
Train_Base.shape

(891, 12)

In [5]:
Train_Base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
Train_Base.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
def valueCounts(variable):
    print('Value counts of {}:\n{}\n'.format(variable,Train_Base[variable].value_counts()))


valueCounts("Survived")
valueCounts("Pclass")
valueCounts("Sex")
valueCounts("Embarked")

Value counts of Survived:
0    549
1    342
Name: Survived, dtype: int64

Value counts of Pclass:
3    491
1    216
2    184
Name: Pclass, dtype: int64

Value counts of Sex:
male      577
female    314
Name: Sex, dtype: int64

Value counts of Embarked:
S    644
C    168
Q     77
Name: Embarked, dtype: int64



In [8]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder())])

numeric_features = ["Age", "SibSp", "Parch", "Fare"]
categorical_features = ["Pclass", "Sex", "Embarked"]


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [9]:
y_train = Train_Base["Survived"]
X_train = preprocessor.fit_transform(Train_Base)

print( X_train.shape)

(891, 12)


In [10]:
svm_clf1 = SVC(C=5,kernel='rbf',probability = True)
svm_scores = cross_val_score(svm_clf1, X_train, y_train, cv=10)
svm_clf1.fit(X_train,y_train)
print('SVM score:', svm_scores.mean())

Rf_clf1 = RandomForestClassifier(bootstrap= False, max_depth= 10,  max_features='sqrt',min_samples_leaf=2, min_samples_split=5, n_estimators=100,random_state=42)
Rf_scores = cross_val_score(Rf_clf1, X_train, y_train, cv=10)
Rf_clf1.fit(X_train,y_train)
print('RF score:',Rf_scores.mean())

Lr_clf1 = LogisticRegression()
Lr_scores = cross_val_score(Lr_clf1, X_train, y_train, cv=10)
Lr_clf1.fit(X_train,y_train)
print('Lr score:',Lr_scores.mean())

vot_hard = VotingClassifier(estimators = [('svm',svm_clf1),('Lr',Lr_clf1),('rf',Rf_clf1)], voting ='hard')
Vh_scores = cross_val_score(vot_hard, X_train, y_train, cv=10)
vot_hard.fit(X_train, y_train) 
print('VH score:',Vh_scores.mean())

SVM score: 0.826067415730337
RF score: 0.8339700374531835
Lr score: 0.7991260923845193
VH score: 0.8283270911360798


In [11]:
Pass_ID_test = Test_Base['PassengerId'].to_numpy()
print( Pass_ID_test.shape)

X_test = preprocessor.fit_transform(Test_Base)
print( X_test.shape)

(418,)
(418, 12)


In [12]:
y_predict_svm = svm_clf1.predict(X_test)
Predicted_DF = pd.DataFrame({'PassengerId':Pass_ID_test,  'Survived':y_predict_svm})
Predicted_DF.to_csv('Survival_Prediction_svm.csv', index = False)

In [13]:
y_predict_rf = Rf_clf1.predict(X_test)
Predicted_Rf_DF = pd.DataFrame({'PassengerId':Pass_ID_test,  'Survived':y_predict_rf})
Predicted_Rf_DF.to_csv('Survival_Prediction_rf.csv', index = False)

In [14]:
y_predict_vh = vot_hard.predict(X_test) 

In [16]:
param_grid = {'C': [1,5,10,20],  'kernel':['linear', 'poly', 'rbf', 'sigmoid']}

grid_search = GridSearchCV(SVC(), param_grid, cv=10, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 153 out of 160 | elapsed:    8.7s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    8.8s finished


{'C': 5, 'kernel': 'rbf'}

In [17]:
q=Train_Base[['Pclass','Survived']].groupby('Pclass', as_index=False).mean()

print(q)
print(Train_Base.shape)

   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363
(891, 12)


In [18]:
param_grid_rf ={'bootstrap': [True, False],  'max_depth': [10, 20, 30],
                'max_features': ['auto', 'sqrt'],'min_samples_leaf': [1, 2, 4],'min_samples_split': [2, 5, 10],
                'n_estimators': [100,200,300]}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=10, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 10 folds for each of 324 candidates, totalling 3240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 3240 out of 3240 | elapsed:  9.4min finished


{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'n_estimators': 200}

In [20]:
forest_clf = RandomForestClassifier(n_estimators=200,bootstrap=True,max_depth= 10, max_features= 'auto', min_samples_leaf= 4, min_samples_split= 5, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

0.8328339575530587