In [108]:
# Analysing the Titanic dataset without any help

In [109]:
import pandas as pd
import numpy as np

In [146]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
print(train_df.columns, test_df.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object') Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [147]:
# drop the name as they are not required right now
train_y = train_df.Survived
passenger_ids = test_df.PassengerId
train_df = train_df.drop(["Name", "PassengerId", "Cabin", "Ticket", "Survived"], axis=1)
test_df = test_df.drop(["Name", "PassengerId", "Cabin", "Ticket"], axis=1)

In [112]:
train_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [113]:
categorical_attributes = ["Embarked", "Sex", "Pclass"]
numerical_attributes = ["Age", "SibSp", "Parch", "Fare"]

In [114]:
# Find total categories that will exist after encoding
for i in categorical_attributes:
    print(train_df[i].unique())

['S' 'C' 'Q' nan]
['male' 'female']
[3 1 2]


In [115]:
# Check for empty attributes
train_df.isnull().any()

Pclass      False
Sex         False
Age          True
SibSp       False
Parch       False
Fare        False
Embarked     True
dtype: bool

In [116]:
# Categorical attribute Embarked has empty cells
# Numerical attribute Age has empty cells

In [117]:
# Create a pipeline for numerical_attributes
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
numerical_pipeline = Pipeline([
    ("Imputer", SimpleImputer(strategy="mean")),
    ("Scaler", StandardScaler())
])

In [118]:
# Create a pipeline for categorical_attributes
from sklearn.preprocessing import OneHotEncoder

categorical_pipeline = Pipeline([
    ("Imputer", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ("Encoder", OneHotEncoder(categories="auto"))
])

In [119]:
numerical_df = numerical_pipeline.fit_transform(train_df[numerical_attributes])
# train_df[numerical_attributes]
numerical_df.shape

(891, 4)

In [120]:
categorical_df = categorical_pipeline.fit_transform(train_df[categorical_attributes])
categorical_df = categorical_df.todense()
categorical_df.shape

(891, 8)

In [121]:
num_df = pd.DataFrame(columns=numerical_attributes, data=numerical_df)
num_df

Unnamed: 0,Age,SibSp,Parch,Fare
0,-0.592481,0.432793,-0.473674,-0.502445
1,0.638789,0.432793,-0.473674,0.786845
2,-0.284663,-0.474545,-0.473674,-0.488854
3,0.407926,0.432793,-0.473674,0.420730
4,0.407926,-0.474545,-0.473674,-0.486337
...,...,...,...,...
886,-0.207709,-0.474545,-0.473674,-0.386671
887,-0.823344,-0.474545,-0.473674,-0.044381
888,0.000000,0.432793,2.008933,-0.176263
889,-0.284663,-0.474545,-0.473674,-0.044381


In [122]:
# Get the categories after onehotencoding is done
ohe = OneHotEncoder(categories="auto")
si = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputed_df = pd.DataFrame(columns=categorical_attributes, data = si.fit_transform(train_df[categorical_attributes]))
imputed_df
cat_ohe_df = ohe.fit_transform(imputed_df)
np.array(ohe.categories_)

array([array(['C', 'Q', 'S'], dtype=object),
       array(['female', 'male'], dtype=object),
       array([1, 2, 3], dtype=object)], dtype=object)

In [123]:
ohe_categorical_columns = []
for i in ohe.categories_:
    for j in i:
        ohe_categorical_columns.append(j)
ohe_categorical_columns

['C', 'Q', 'S', 'female', 'male', 1, 2, 3]

In [124]:
cat_ohe_df.shape

(891, 8)

In [125]:
cat_df = pd.DataFrame(data = categorical_df, columns=ohe_categorical_columns)
cat_df

Unnamed: 0,C,Q,S,female,male,1,2,3
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
886,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
887,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
888,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
889,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [126]:
overall_trainig_dataframe = pd.concat([cat_df, num_df], axis=1)
overall_trainig_dataframe

Unnamed: 0,C,Q,S,female,male,1,2,3,Age,SibSp,Parch,Fare
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,-0.592481,0.432793,-0.473674,-0.502445
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.638789,0.432793,-0.473674,0.786845
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,-0.284663,-0.474545,-0.473674,-0.488854
3,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.407926,0.432793,-0.473674,0.420730
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.407926,-0.474545,-0.473674,-0.486337
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,-0.207709,-0.474545,-0.473674,-0.386671
887,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,-0.823344,-0.474545,-0.473674,-0.044381
888,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.000000,0.432793,2.008933,-0.176263
889,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,-0.284663,-0.474545,-0.473674,-0.044381


In [127]:
# Create a similar data for testing

In [129]:
test_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0000,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S
...,...,...,...,...,...,...,...
413,3,male,,0,0,8.0500,S
414,1,female,39.0,0,0,108.9000,C
415,3,male,38.5,0,0,7.2500,S
416,3,male,,0,0,8.0500,S


In [159]:
test_num_df = pd.DataFrame(data = numerical_pipeline.fit_transform(test_df[numerical_attributes]), columns=numerical_attributes)
test_num_df

Unnamed: 0,Age,SibSp,Parch,Fare
0,0.334993,-0.499470,-0.400248,-0.498407
1,1.325530,0.616992,-0.400248,-0.513274
2,2.514175,-0.499470,-0.400248,-0.465088
3,-0.259330,-0.499470,-0.400248,-0.483466
4,-0.655545,0.616992,0.619896,-0.418471
...,...,...,...,...
413,0.000000,-0.499470,-0.400248,-0.494448
414,0.691586,-0.499470,-0.400248,1.313753
415,0.651965,-0.499470,-0.400248,-0.508792
416,0.000000,-0.499470,-0.400248,-0.494448


In [160]:
test_cat_df = pd.DataFrame(data = categorical_pipeline.fit_transform(test_df[categorical_attributes]).todense(), columns=ohe_categorical_columns)
test_cat_df

Unnamed: 0,C,Q,S,female,male,1,2,3
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
413,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
414,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
415,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
416,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [161]:
# Aggregate both
overall_testing_dataframe = pd.concat([test_num_df, test_cat_df], axis=1)
overall_testing_dataframe

Unnamed: 0,Age,SibSp,Parch,Fare,C,Q,S,female,male,1,2,3
0,0.334993,-0.499470,-0.400248,-0.498407,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.325530,0.616992,-0.400248,-0.513274,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,2.514175,-0.499470,-0.400248,-0.465088,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,-0.259330,-0.499470,-0.400248,-0.483466,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,-0.655545,0.616992,0.619896,-0.418471,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
413,0.000000,-0.499470,-0.400248,-0.494448,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
414,0.691586,-0.499470,-0.400248,1.313753,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
415,0.651965,-0.499470,-0.400248,-0.508792,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
416,0.000000,-0.499470,-0.400248,-0.494448,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


# Train different models and test

In [176]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(overall_trainig_dataframe, train_y, random_state=42)

# 1. LogisticRegression

In [177]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)
lr.score(x_test, y_test)

0.8026905829596412

# 2. DecisionTreeClassifier

In [178]:
from sklearn.tree import DecisionTreeClassifier
dtf = DecisionTreeClassifier()
dtf.fit(x_train, y_train)
dtf.score(x_test, y_test)

0.7488789237668162

# 3.RandomForest

In [179]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc.score(x_test, y_test)

0.7982062780269058

# SVC

In [180]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train, y_train)
svc.score(x_test, y_test)

0.820627802690583

# Upload the best 2 model result on kaggle

In [187]:
# 1. RandomForestClassifier

In [188]:
# Train using the complete data and predict on test data

In [195]:
rfc2 = RandomForestClassifier()
rfc2.fit(overall_trainig_dataframe, train_y)
overall_testing_dataframe

Unnamed: 0,Age,SibSp,Parch,Fare,C,Q,S,female,male,1,2,3
0,0.334993,-0.499470,-0.400248,-0.498407,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.325530,0.616992,-0.400248,-0.513274,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,2.514175,-0.499470,-0.400248,-0.465088,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,-0.259330,-0.499470,-0.400248,-0.483466,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,-0.655545,0.616992,0.619896,-0.418471,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
413,0.000000,-0.499470,-0.400248,-0.494448,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
414,0.691586,-0.499470,-0.400248,1.313753,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
415,0.651965,-0.499470,-0.400248,-0.508792,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
416,0.000000,-0.499470,-0.400248,-0.494448,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [201]:
# Create a combined dataframe for passengerID and survivability
rfc_submission_df = pd.concat([passenger_ids, pd.DataFrame(data=rfc2.predict(overall_testing_dataframe), columns=['Survived'])] ,axis=1)
rfc_submission_df

Unnamed: 0,PassengerId,Survived
0,892,1
1,893,1
2,894,1
3,895,1
4,896,1
...,...,...
413,1305,1
414,1306,0
415,1307,1
416,1308,1


In [202]:
rfc_submission_df.to_csv("submission.csv", index=False)