### Datasets were acquired from the Kaggle Titanic competition: https://www.kaggle.com/c/titanic

In [1]:
import warnings
# Suppress every warning for the rest of the session. No category is given, so
# this also hides deprecation warnings raised by the imported libraries.
warnings.filterwarnings('ignore')

##### Import datasets

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'darkgrid' style to subsequent plots.
sns.set_style('darkgrid')
# IPython magic: use the inline matplotlib backend so figures render in the cell output.
%matplotlib inline

# Read the train/test CSVs, using PassengerId as the row index.
df_train = pd.read_csv('train.csv', index_col=['PassengerId'])
X_test = pd.read_csv('test.csv', index_col=['PassengerId'])

# Separate the target column from the training features.
y_train = df_train['Survived'].copy()
X_train = df_train.drop('Survived', axis=1).copy()

# Remember which rows belong to the test set, then stack train on top of test
# so the feature engineering below is applied to both at once.
test_index = X_test.index
train_test = pd.concat([X_train, X_test], axis=0)

In [3]:
# Normalise every column label to lower case.
train_test = train_test.rename(str.lower, axis='columns')

In [4]:
# Preview the first five rows of the combined frame.
train_test.head(n=5)

Unnamed: 0_level_0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Exploratory Data Analysis and Feature Engineering

In [5]:
# The title is the first run of letters that is immediately followed by a period
# in the passenger's name; expand=False yields a Series for the single group.
train_test['title'] = train_test['name'].str.extract(r'([a-zA-Z]+)\.', expand=False)

In [6]:
# Count passengers per (title, sex) pair, ordered by title.
pd.crosstab(index=train_test['title'], columns=train_test['sex']).sort_values(by='title')

sex,female,male
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,4
Countess,1,0
Don,0,1
Dona,1,0
Dr,1,7
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,61


In [7]:
# Convert underrepresented titles to 'Other' and fold spelling variants
# (Mlle/Ms -> Miss, Mme -> Mrs) into their common form, in a single mapping.
train_test['title'] = train_test['title'].replace({
    **dict.fromkeys(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
         'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
        'Other',
    ),
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Mrs',
})

In [8]:
# Number of relatives aboard: the 'sibsp' count plus the 'parch' count.
train_test['relatives'] = train_test['sibsp'].add(train_test['parch'])

In [9]:
# Preview the first five rows, now including 'title' and 'relatives'.
train_test.head(n=5)

Unnamed: 0_level_0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,title,relatives
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,1
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,1
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,0
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,1
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,0


In [10]:
# Impute age
# Missing ages are drawn uniformly from [mean - std, mean + std] of the observed
# ages; the statistics are taken over the combined train + test rows.
# A dedicated, seeded generator makes the imputed values (and every result that
# depends on them) identical on each Restart & Run All.
age_rng = np.random.default_rng(42)
age_missing = train_test['age'].isna()  # compute the mask once, before any fill
mean_age = train_test['age'].mean()
std_age = train_test['age'].std()
random_age_filled = age_rng.uniform(mean_age - std_age, mean_age + std_age, size=int(age_missing.sum()))
train_test.loc[age_missing, 'age'] = random_age_filled

#### Cleaning data

In [11]:
def remove_name(df):
    """Return a new frame without the columns that are no longer needed.

    Drops 'name', 'sibsp', 'parch', 'ticket' and 'cabin'. The input frame is
    left untouched; pandas raises ``KeyError`` if any of the columns is absent.
    """
    unused_columns = ['name', 'sibsp', 'parch', 'ticket', 'cabin']
    return df.drop(labels=unused_columns, axis='columns')

def imp(df):
    """Fill missing 'fare' and 'embarked' values in place and return ``df``.

    'fare' gaps get the column median; 'embarked' gaps get the most frequent
    value of the column.

    ``Series.mode()`` returns a Series labelled 0..k-1. Passing that Series to
    ``fillna`` aligns it on the frame's index, so rows whose label is not in the
    mode's index stay missing. The scalar first mode is used instead so every
    gap is filled regardless of how the frame is indexed.
    """
    fill_values = {'fare': df['fare'].median()}

    embarked_modes = df['embarked'].mode()
    # mode() is empty when the column has no non-null values; leave it as is then.
    if not embarked_modes.empty:
        fill_values['embarked'] = embarked_modes.iloc[0]

    df.fillna(value=fill_values, inplace=True)
    return df

def encode_cat(df):
    """One-hot encode every non-numeric column of ``df``.

    The first level of each encoded column is dropped (``drop_first=True``);
    numeric columns are passed through unchanged. Returns a new frame.
    """
    non_numeric = df.select_dtypes(exclude='number').columns.tolist()
    return pd.get_dummies(df, columns=non_numeric, drop_first=True)

def drop_dup(df):
    """Return ``df`` without fully duplicated rows, keeping the first occurrence."""
    return df.drop_duplicates(keep='first')

In [12]:
# Cleaning pipeline, innermost call first: drop unused columns, fill the
# remaining gaps, then one-hot encode the non-numeric columns.
train_test = encode_cat(imp(remove_name(train_test)))

In [13]:
# Split the engineered frame back into its test and train parts.
# The training rows are taken by dropping the test labels rather than by a set
# difference: set iteration order is not guaranteed, and X_train must keep the
# original row order because y_train is paired with it by position later on.
X_test = train_test.loc[test_index]
X_train = train_test.drop(index=test_index)

In [14]:
from sklearn.model_selection import train_test_split

# Hold out a validation set, stratified on the target so both parts keep the
# same class balance; random_state fixes the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    random_state=1,
)

In [15]:
# NOTE: the recorded output of this cell is a ModuleNotFoundError; lazypredict
# has to be installed in the kernel's environment before this cell can run.
import lazypredict
from lazypredict.Supervised import LazyClassifier
# verbose=0 / ignore_warnings=True presumably keep per-model logging quiet;
# confirm against the lazypredict documentation.
clf = LazyClassifier(verbose=0,ignore_warnings=True)
# Fit on the training split and evaluate on the validation split. fit returns
# two objects (presumably a score table and per-model predictions; TODO confirm).
models, predictions = clf.fit(X_train, X_val, y_train, y_val)
# Display the first returned object as the cell output.
models

ModuleNotFoundError: No module named 'lazypredict'