In [127]:
# Analysing the Titanic dataset without any help

In [153]:
import pandas as pd
import numpy as np

In [154]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
print(train_df.columns, test_df.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object') Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [155]:
# drop the name as they are not required right now
train_y_data = train_df.Survived
passenger_ids_test = test_df.PassengerId

In [156]:
train_df['Relatives'] = train_df['SibSp'] + train_df['Parch']
test_df['Relatives'] = test_df['SibSp'] + test_df['Parch']

In [157]:
# Change sex to binary
train_df.Sex = train_df.Sex.replace({"male" : 0, "female" : 1})
test_df.Sex = test_df.Sex.replace({"male" : 0, "female" : 1})

In [158]:
train_df = train_df.drop(["Name", "PassengerId", "Cabin", "Ticket", "Survived", "SibSp", "Parch"], axis=1)
test_df = test_df.drop(["Name", "PassengerId", "Cabin", "Ticket", "SibSp", "Parch"], axis=1)

In [159]:
train_df

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Relatives
0,3,0,22.0,7.2500,S,1
1,1,1,38.0,71.2833,C,1
2,3,1,26.0,7.9250,S,0
3,1,1,35.0,53.1000,S,1
4,3,0,35.0,8.0500,S,0
...,...,...,...,...,...,...
886,2,0,27.0,13.0000,S,0
887,1,1,19.0,30.0000,S,0
888,3,1,,23.4500,S,3
889,1,0,26.0,30.0000,C,0


In [162]:
categorical_attributes = ["Embarked", "Pclass"]
numerical_attributes = ["Age", "Fare", "Relatives"]
# Leave sex out since it has binary

In [163]:
# Find total categories that will exist after encoding
for i in categorical_attributes:
    print(train_df[i].unique())

['S' 'C' 'Q' nan]
[3 1 2]


In [164]:
# Check for empty attributes
train_df.isnull().any()

Pclass       False
Sex          False
Age           True
Fare         False
Embarked      True
Relatives    False
dtype: bool

In [165]:
test_df.isnull().any()

Pclass       False
Sex          False
Age           True
Fare          True
Embarked     False
Relatives    False
dtype: bool

In [166]:
# Categorical attribute Embarked has empty cells
# Numerical attribute Age has empty cells

In [167]:
# Create a pipeline for numerical_attributes
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
numerical_pipeline = Pipeline([
    ("Imputer", SimpleImputer(strategy="median")),
    ("Scaler", StandardScaler())
])

In [168]:
# Create a pipeline for categorical_attributes
from sklearn.preprocessing import OneHotEncoder

categorical_pipeline = Pipeline([
    ("Imputer", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ("Encoder", OneHotEncoder(categories="auto"))
])

In [169]:
numerical_df = numerical_pipeline.fit_transform(train_df[numerical_attributes])
# train_df[numerical_attributes]
numerical_df.shape

(891, 3)

In [170]:
categorical_df = categorical_pipeline.fit_transform(train_df[categorical_attributes])
categorical_df = categorical_df.todense()
categorical_df.shape

(891, 6)

In [171]:
num_df = pd.DataFrame(columns=numerical_attributes, data=numerical_df)
num_df

Unnamed: 0,Age,Fare,Relatives
0,-0.565736,-0.502445,0.059160
1,0.663861,0.786845,0.059160
2,-0.258337,-0.488854,-0.560975
3,0.433312,0.420730,0.059160
4,0.433312,-0.486337,-0.560975
...,...,...,...
886,-0.181487,-0.386671,-0.560975
887,-0.796286,-0.044381,-0.560975
888,-0.104637,-0.176263,1.299429
889,-0.258337,-0.044381,-0.560975


In [172]:
# Get the categories after onehotencoding is done
ohe = OneHotEncoder(categories="auto")
si = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputed_df = pd.DataFrame(columns=categorical_attributes, data = si.fit_transform(train_df[categorical_attributes]))
imputed_df
cat_ohe_df = ohe.fit_transform(imputed_df)
np.array(ohe.categories_)

array([['C', 'Q', 'S'],
       [1, 2, 3]], dtype=object)

In [173]:
ohe_categorical_columns = []
for i in ohe.categories_:
    for j in i:
        ohe_categorical_columns.append(j)
ohe_categorical_columns

['C', 'Q', 'S', 1, 2, 3]

In [174]:
cat_ohe_df.shape

(891, 6)

In [175]:
cat_df = pd.DataFrame(data = categorical_df, columns=ohe_categorical_columns)
cat_df

Unnamed: 0,C,Q,S,1,2,3
0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,1.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...
886,0.0,0.0,1.0,0.0,1.0,0.0
887,0.0,0.0,1.0,1.0,0.0,0.0
888,0.0,0.0,1.0,0.0,0.0,1.0
889,1.0,0.0,0.0,1.0,0.0,0.0


In [177]:
overall_trainig_dataframe = pd.concat([cat_df, num_df], axis=1)
# Add the sex feature as well to training dataframe 
overall_trainig_dataframe['Sex'] = train_df.Sex
overall_trainig_dataframe

Unnamed: 0,C,Q,S,1,2,3,Age,Fare,Relatives,Sex
0,0.0,0.0,1.0,0.0,0.0,1.0,-0.565736,-0.502445,0.059160,0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.663861,0.786845,0.059160,1
2,0.0,0.0,1.0,0.0,0.0,1.0,-0.258337,-0.488854,-0.560975,1
3,0.0,0.0,1.0,1.0,0.0,0.0,0.433312,0.420730,0.059160,1
4,0.0,0.0,1.0,0.0,0.0,1.0,0.433312,-0.486337,-0.560975,0
...,...,...,...,...,...,...,...,...,...,...
886,0.0,0.0,1.0,0.0,1.0,0.0,-0.181487,-0.386671,-0.560975,0
887,0.0,0.0,1.0,1.0,0.0,0.0,-0.796286,-0.044381,-0.560975,1
888,0.0,0.0,1.0,0.0,0.0,1.0,-0.104637,-0.176263,1.299429,1
889,1.0,0.0,0.0,1.0,0.0,0.0,-0.258337,-0.044381,-0.560975,0


In [151]:
# Create a similar data for testing

In [180]:
test_df

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Relatives
0,3,0,34.5,7.8292,Q,0
1,3,1,47.0,7.0000,S,1
2,2,0,62.0,9.6875,Q,0
3,3,0,27.0,8.6625,S,0
4,3,1,22.0,12.2875,S,2
...,...,...,...,...,...,...
413,3,0,,8.0500,S,0
414,1,1,39.0,108.9000,C,0
415,3,0,38.5,7.2500,S,0
416,3,0,,8.0500,S,0


In [181]:
test_num_df = pd.DataFrame(data = numerical_pipeline.fit_transform(test_df[numerical_attributes]), columns=numerical_attributes)
test_num_df

Unnamed: 0,Age,Fare,Relatives
0,0.386231,-0.497413,-0.553443
1,1.371370,-0.512278,0.105643
2,2.553537,-0.464100,-0.553443
3,-0.204852,-0.482475,-0.553443
4,-0.598908,-0.417492,0.764728
...,...,...,...
413,-0.204852,-0.493455,-0.553443
414,0.740881,1.314435,-0.553443
415,0.701476,-0.507796,-0.553443
416,-0.204852,-0.493455,-0.553443


In [182]:
test_cat_df = pd.DataFrame(data = categorical_pipeline.fit_transform(test_df[categorical_attributes]).todense(), columns=ohe_categorical_columns)
test_cat_df

Unnamed: 0,C,Q,S,1,2,3
0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...
413,0.0,0.0,1.0,0.0,0.0,1.0
414,1.0,0.0,0.0,1.0,0.0,0.0
415,0.0,0.0,1.0,0.0,0.0,1.0
416,0.0,0.0,1.0,0.0,0.0,1.0


In [184]:
# Aggregate both
overall_testing_dataframe = pd.concat([test_num_df, test_cat_df], axis=1)
overall_testing_dataframe['Sex'] = test_df.Sex
overall_testing_dataframe

Unnamed: 0,Age,Fare,Relatives,C,Q,S,1,2,3,Sex
0,0.386231,-0.497413,-0.553443,0.0,1.0,0.0,0.0,0.0,1.0,0
1,1.371370,-0.512278,0.105643,0.0,0.0,1.0,0.0,0.0,1.0,1
2,2.553537,-0.464100,-0.553443,0.0,1.0,0.0,0.0,1.0,0.0,0
3,-0.204852,-0.482475,-0.553443,0.0,0.0,1.0,0.0,0.0,1.0,0
4,-0.598908,-0.417492,0.764728,0.0,0.0,1.0,0.0,0.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...
413,-0.204852,-0.493455,-0.553443,0.0,0.0,1.0,0.0,0.0,1.0,0
414,0.740881,1.314435,-0.553443,1.0,0.0,0.0,1.0,0.0,0.0,1
415,0.701476,-0.507796,-0.553443,0.0,0.0,1.0,0.0,0.0,1.0,0
416,-0.204852,-0.493455,-0.553443,0.0,0.0,1.0,0.0,0.0,1.0,0


# Train different models and test

In [185]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(overall_trainig_dataframe, train_y_data, random_state=42)

# 1. LogisticRegression

In [186]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)
lr.score(x_test, y_test)

0.7982062780269058

# 2. DecisionTreeClassifier

In [187]:
from sklearn.tree import DecisionTreeClassifier
dtf = DecisionTreeClassifier()
dtf.fit(x_train, y_train)
dtf.score(x_test, y_test)

0.7623318385650224

# 3.RandomForest

In [188]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc.score(x_test, y_test)

0.7982062780269058

# SVC

In [207]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train, y_train)
svc.score(x_test, y_test)

0.8026905829596412

# Upload the best 2 model result on kaggle

In [201]:
# 1. RandomForestClassifier

In [202]:
# Train using the complete data and predict on test data

In [203]:
rfc2 = RandomForestClassifier()
rfc2.fit(overall_trainig_dataframe, train_y_data)
overall_testing_dataframe

Unnamed: 0,Age,Fare,Relatives,C,Q,S,1,2,3,Sex
0,0.386231,-0.497413,-0.553443,0.0,1.0,0.0,0.0,0.0,1.0,0
1,1.371370,-0.512278,0.105643,0.0,0.0,1.0,0.0,0.0,1.0,1
2,2.553537,-0.464100,-0.553443,0.0,1.0,0.0,0.0,1.0,0.0,0
3,-0.204852,-0.482475,-0.553443,0.0,0.0,1.0,0.0,0.0,1.0,0
4,-0.598908,-0.417492,0.764728,0.0,0.0,1.0,0.0,0.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...
413,-0.204852,-0.493455,-0.553443,0.0,0.0,1.0,0.0,0.0,1.0,0
414,0.740881,1.314435,-0.553443,1.0,0.0,0.0,1.0,0.0,0.0,1
415,0.701476,-0.507796,-0.553443,0.0,0.0,1.0,0.0,0.0,1.0,0
416,-0.204852,-0.493455,-0.553443,0.0,0.0,1.0,0.0,0.0,1.0,0


In [204]:
# Create a combined dataframe for passengerID and survivability
rfc_submission_df = pd.concat([passenger_ids, pd.DataFrame(data=rfc2.predict(overall_testing_dataframe), columns=['Survived'])] ,axis=1)
rfc_submission_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [205]:
rfc_submission_df.to_csv("submission.csv", index=False)

In [68]:
# 2. Test using SVC Classifier

In [208]:
# from sklearn
svc = SVC()

In [209]:
svc.fit(overall_trainig_dataframe, train_y_data)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [210]:
yhat = svc.predict(overall_testing_dataframe)

In [211]:
svc_submission_df = pd.concat([passenger_ids, pd.DataFrame(yhat, columns=['Survived'])], axis=1)

In [212]:
svc_submission_df.to_csv("submission.csv", index=False)

In [103]:
# 3. Train using LogisticRegresssion

In [104]:
lrm = LogisticRegression()
lrm.fit(overall_trainig_dataframe, train_y_data)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [105]:
yhat = lrm.predict(overall_testing_dataframe)

In [107]:
lrm_submission_df = pd.concat([passenger_ids, pd.DataFrame(yhat, columns=['Survived'])], axis=1)

In [108]:
lrm_submission_df.to_csv("submission.csv", index=False)