In [None]:
import os

import joblib
import klib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

: 

In [None]:
# Apply seaborn's default styling to every figure drawn below.
sns.set()

: 

In [None]:
# Download the Titanic training CSV and relabel its twelve columns with the
# names used throughout the rest of the notebook.
url = 'https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv'
titanic = pd.read_csv(url)
titanic.columns = [
    'PassengerId',
    'Survived',
    'Pclass',
    'Name',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Ticket',
    'Fare',
    'Cabin',
    'Embarked',
]

: 

In [None]:
# Show the loaded DataFrame as the cell's rich output.
titanic

: 

In [None]:
# Summary statistics of the full dataset (pandas `describe`).
titanic.describe()

: 

In [None]:
def data_split(df, test_size=0.33, random_state=42):
    """Split ``df`` into a training subset and a test subset.

    Thin wrapper around ``sklearn.model_selection.train_test_split``.

    Parameters
    ----------
    df
        The data to split; forwarded unchanged to ``train_test_split``.
    test_size
        Forwarded to ``train_test_split``. Defaults to 0.33, the value that
        was previously hard-coded.
    random_state
        Seed forwarded to ``train_test_split`` so the split is repeatable.
        Defaults to 42, the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(train_set, test_set)`` exactly as returned by ``train_test_split``.
    """
    train_set, test_set = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    return train_set, test_set

: 

In [None]:
# Split once, then take an independent copy of each part so that columns added
# to the training frame later do not touch the object returned by the split.
titanic_train, titanic_test = (part.copy() for part in data_split(titanic))

: 

In [None]:
# Column dtypes and non-null counts of the training split.
titanic_train.info()

: 

In [None]:
# Number of missing values per column in the training split.
titanic_train.isnull().sum()

: 

In [None]:
# (rows, columns) of the training split.
titanic_train.shape

: 

In [None]:
# Column labels of the training split.
titanic_train.columns

: 

In [None]:
# Visualise the missing values of the training split with klib
# (the layout of the figure is decided by klib).
klib.missingval_plot(titanic_train)

: 

In [None]:
# Histograms of the training split's columns, 50 bins each, on a 20x15 figure.
titanic_train.hist(bins=50, figsize=(20,15))
plt.show()

: 

In [None]:
# Grid of pairwise plots between the variables of the training split.
sns.pairplot(titanic_train)

: 

In [None]:
# Column labels once more, as a reference for the per-feature analysis below.
titanic_train.columns

: 

In [None]:
# titanic_train.drop(labels=['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
# titanic_train

: 

In [None]:
# Survival statistics per sex on the training split.
# Only the 'Survived' column is aggregated: count() gives the passengers per
# sex and sum() gives the survivors. Selecting the column *before* aggregating
# avoids summing/counting every other column (names, tickets, ...) and does not
# depend on how pandas treats non-numeric columns in groupby().sum().
survived_by_sex = titanic_train.groupby('Sex')['Survived']
passengers_per_sex = survived_by_sex.count()
survivors_per_sex = survived_by_sex.sum()

total_males = passengers_per_sex['male']
total_females = passengers_per_sex['female']

print('Total Number of Males : ', total_males)
print('Total Number of Females : ', total_females)

total_males_survived = survivors_per_sex['male']
total_females_survived = survivors_per_sex['female']

print('Total Number of Males Survived : ', total_males_survived)
print('Total Number of Females Survived : ', total_females_survived)

# Share of survivors within each sex, expressed as a percentage.
percent_of_survived_males = (total_males_survived / total_males) * 100
percent_of_survived_females = (total_females_survived / total_females) * 100

print('Percentage of Survived Males : {:.2f}%'.format(percent_of_survived_males))
print('Percentage of Survived Females : {:.2f}%'.format(percent_of_survived_females))

: 

In [None]:
# Bar chart of the mean of 'Survived' for each sex.
sex_survival_grid = sns.catplot(kind='bar', data=titanic_train, x='Sex', y='Survived', hue='Sex')
sex_survival_grid.set(title='% of Male & Female Survivals')
plt.show()

: 

In [None]:
# Survival statistics per passenger class (1, 2, 3) on the training split.
# As for the per-sex figures, only the 'Survived' column is aggregated, once,
# instead of aggregating the whole frame six times and discarding the rest.
survived_by_class = titanic_train.groupby('Pclass')['Survived']
passengers_per_class = survived_by_class.count()
survivors_per_class = survived_by_class.sum()

total_class_1_passenger = passengers_per_class[1]
total_class_2_passenger = passengers_per_class[2]
total_class_3_passenger = passengers_per_class[3]

print('Total Number of Class 1 Passenger : ', total_class_1_passenger)
print('Total Number of Class 2 Passenger : ', total_class_2_passenger)
print('Total Number of Class 3 Passenger : ', total_class_3_passenger)

total_class1_survived = survivors_per_class[1]
total_class2_survived = survivors_per_class[2]
total_class3_survived = survivors_per_class[3]

print('Total Number of Class 1 Passenger Survived : ', total_class1_survived)
print('Total Number of Class 2 Passenger Survived : ', total_class2_survived)
print('Total Number of Class 3 Passenger Survived : ', total_class3_survived)

# Share of survivors within each class, expressed as a percentage.
percent_of_class1_survived = (total_class1_survived / total_class_1_passenger) * 100
percent_of_class2_survived = (total_class2_survived / total_class_2_passenger) * 100
percent_of_class3_survived = (total_class3_survived / total_class_3_passenger) * 100

print('Percentage of Class 1 Passenger Survived : {:.2f}%'.format(percent_of_class1_survived))
print('Percentage of Class 2 Passenger Survived : {:.2f}%'.format(percent_of_class2_survived))
print('Percentage of Class 3 Passenger Survived : {:.2f}%'.format(percent_of_class3_survived))

: 

In [None]:
# Two panels side by side: mean of 'Survived' per passenger class (left) and
# the same figure broken down further by sex (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 8))
overall_ax, by_sex_ax = axes

sns.barplot(x='Pclass', y='Survived', data=titanic_train, ax=overall_ax)
overall_ax.set(title='% of Passenger class Survivals')

sns.barplot(x='Pclass', y='Survived', hue='Sex', data=titanic_train, ax=by_sex_ax)
by_sex_ax.set(title='% of Passenger class Survivals')

plt.show()

: 

In [None]:
# Column labels of the training split (reference for the age analysis below).
titanic_train.columns

: 

In [None]:
# dtype of the 'Age' column, checked before it is binned in the next cell.
titanic_train['Age'].dtype

: 

In [None]:
# Bucket passengers by age. pd.cut assigns every 'Age' value to the interval
# between two consecutive edges; the result is stored as a new column on the
# training frame, which is then displayed.
age_bin_edges = [0, 18, 30, 60, 100, np.inf]
titanic_train['age_category'] = pd.cut(titanic_train['Age'], bins=age_bin_edges)
titanic_train

: 

In [None]:
# Left: how many passengers fall into each age bucket.
# Right: mean of 'Survived' per age bucket, split by sex.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 8))
count_ax, survival_ax = axes

sns.countplot(data=titanic_train, x='age_category', ax=count_ax)
count_ax.set(title='Age Category of People onboard')

sns.barplot(data=titanic_train, x='age_category', y='Survived', hue='Sex', ax=survival_ax)
survival_ax.set(title='% of Age Category that Survived')

plt.show()

: 

In [None]:
# Passenger counts per 'SibSp' value (left) and per 'Parch' value (right).
# countplot counts *all* passengers in the training split, not only survivors,
# so the titles describe counts of passengers rather than survivals.
fig, axes = plt.subplots(1, 2, figsize=(18, 8))

sns.countplot(x='SibSp', data=titanic_train, ax=axes[0]).set(title='Number of Passengers by Siblings/Spouses aboard')
sns.countplot(x='Parch', data=titanic_train, ax=axes[1]).set(title='Number of Passengers by Parents/Children aboard')

# Render the figure explicitly, like the other plotting cells, instead of
# leaving the return value of .set() as the cell output.
plt.show()


: 

In [None]:
# Correlation matrix of the training split as computed by klib.
# NOTE(review): at this point the frame also holds the derived 'age_category' column.
klib.corr_mat(data=titanic_train)

: 

In [None]:
# Graphical version of the correlation analysis, drawn by klib.
klib.corr_plot(data=titanic_train)

: 

In [None]:
# Distinct passenger-class values in the full (unsplit) dataset.
titanic.Pclass.unique()

: 

In [None]:
# First rows of the training split.
titanic_train.head()

: 

In [None]:
# dtype of every column in the training split.
titanic_train.dtypes

: 

In [None]:
# Column labels of the full dataset, from which the model features are picked.
titanic.columns

: 

In [None]:
# Columns fed to the model as inputs.
features_name = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# Column to predict. Kept as a one-element list, so indexing a DataFrame with
# it yields a single-column DataFrame rather than a Series.
target_name =  ['Survived']

: 

In [None]:
# Training inputs: the feature columns of the training split.
X_train = titanic_train[features_name]
# Training target: a single-column DataFrame, because target_name is a list.
y_train = titanic_train[target_name]

: 

In [None]:
# Feature columns that will be one-hot encoded. Note that the count-like
# columns 'SibSp' and 'Parch' are deliberately listed here, not as numerical.
categorical_features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
# Feature columns that will be imputed and standard-scaled.
numerical_features = ['Age', 'Fare']

: 

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

: 

In [None]:
# Numerical columns: fill missing values with the column median, then
# standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaling', StandardScaler()),
])

# Categorical columns: one-hot encode; categories not seen while fitting are
# ignored at transform time instead of raising.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')

# Route each group of columns to its own transformer and concatenate the results.
full_transformer = ColumnTransformer(transformers=[
    ('numerical_trans', numerical_transformer, numerical_features),
    ('encoder', categorical_encoder, categorical_features),
])

: 

In [None]:
# Missing values per feature column before preprocessing.
X_train.isna().sum()

: 

In [None]:
# Show the raw (untransformed) training features.
X_train

: 

In [None]:
# Fit the preprocessing on the training features and transform them in one go;
# the fitted full_transformer is reused later for the test data.
prepared_data = full_transformer.fit_transform(X_train)
prepared_data

: 

In [None]:
# (rows, columns) of the transformed training matrix.
# .shape is available on both SciPy sparse matrices and NumPy arrays, whereas
# .toarray() exists only on sparse matrices: ColumnTransformer returns a dense
# array whenever the output density reaches its sparse_threshold, in which case
# the previous .toarray() call would raise AttributeError. Reading .shape
# directly also avoids densifying the matrix just to measure it.
prepared_data.shape

: 

In [None]:
# Baseline decision tree with default hyper-parameters.
# random_state is pinned (same seed as the train/test split) so that the fitted
# tree, the cross-validation scores and the grid search built on this estimator
# are reproducible from one run of the notebook to the next.
tree_clf = DecisionTreeClassifier(random_state=42)

: 

In [None]:
# Fit the baseline tree on the transformed training features.
tree_clf.fit(X=prepared_data, y=y_train)

: 

In [None]:
# Score of the baseline tree on the very data it was fitted on, as a
# percentage; this is an optimistic figure, not a generalisation estimate.
print('Training Score is {:.2f}%'.format(tree_clf.score(prepared_data, y_train) * 100))

: 

In [None]:
# 5-fold cross-validation scores of the baseline tree on the training data,
# computed in parallel on all available cores (n_jobs=-1).
cross_val_score(estimator=tree_clf, X=prepared_data, y=y_train, cv=5, n_jobs=-1)

: 

In [None]:
# Hyper-parameter search space for the decision tree: every combination of the
# values below is a candidate.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(2, 40),          # 2 .. 39
    min_samples_split=range(2, 10),  # 2 .. 9
    min_samples_leaf=range(1, 10),   # 1 .. 9
    splitter=['best', 'random'],
)

: 

In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation, run in
# parallel on all available cores (n_jobs=-1).
grid_search = GridSearchCV(estimator=tree_clf, param_grid=param_grid, cv=5, n_jobs=-1)

: 

In [None]:
# Run the search. Cost: 2 * 38 * 8 * 9 * 2 = 10,944 parameter combinations,
# each evaluated on 5 folds = 54,720 tree fits, so this cell is slow.
grid_search.fit(prepared_data, y_train)

: 

In [None]:
# The estimator built from the best parameter combination found by the search.
grid_search.best_estimator_

: 

In [None]:
# GridSearchCV was created with its default refit=True, so best_estimator_ has
# already been re-fitted on the whole training set with the winning parameters.
# Fitting it again here was redundant work and, for an unseeded tree, could
# even replace the model with a differently-grown one.
best_tree = grid_search.best_estimator_
best_tree

: 

In [None]:
# Score of the tuned tree on the training data it was fitted on.
best_tree.score(prepared_data, y_train)

: 

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

: 

In [None]:
# Predictions of the tuned tree on the training data (the name y_pred is
# reused further down for the test-set predictions).
y_pred = best_tree.predict(prepared_data)

: 

In [None]:
# Confusion matrix of the training-set predictions.
confusion_matrix(y_true=y_train, y_pred=y_pred)

: 

In [None]:
# Per-class classification metrics of the training-set predictions.
print(classification_report(y_true=y_train, y_pred=y_pred))

: 

### Testing with the Test Data

In [None]:
# Transform the held-out features with the transformer fitted on the training
# data (transform only, no re-fitting).
prepared_test_data = full_transformer.transform(titanic_test[features_name])

: 

In [None]:
# Predictions of the tuned tree on the held-out data. This overwrites the
# training-set predictions previously stored under the same name.
y_pred = best_tree.predict(prepared_test_data)
y_pred

: 

In [None]:
# True labels of the held-out data, as a Series.
y_test = titanic_test['Survived']

: 

In [None]:
# Per-class classification metrics on the held-out data.
print(classification_report(y_true=y_test, y_pred=y_pred))

: 

In [None]:
# Confusion matrix on the held-out data.
confusion_matrix(y_true=y_test, y_pred=y_pred)

: 

In [None]:
import joblib

: 

In [None]:
# Persist the fitted preprocessing and the tuned tree so they can be reloaded
# for inference. joblib.dump does not create missing parent directories, so
# make sure 'model/' exists first (no error if it is already there).
os.makedirs('model', exist_ok=True)
joblib.dump(value=full_transformer, filename='model/full_transformer.pickle')
joblib.dump(value=best_tree, filename='model/tree_clf.pickle')

: 

In [None]:
# IPython shell escape: list the working directory (presumably to check that
# the 'model' directory is present). Requires an `ls` command on the host.
!ls

: 

In [None]:
# Load the sample file to score; the path is relative to the working directory.
df = pd.read_csv('Sample File/test.csv')

: 

In [None]:
# Same feature list as the one used for training, repeated here (presumably so
# the inference cells can be run without the training section - TODO confirm).
features_name = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

: 

In [None]:
# Show the sample file as loaded.
df

: 

In [None]:
# Keep only the model's feature columns. This rebinds df, so the other columns
# of the sample file are no longer reachable through this name.
df = df[features_name]
df

: 

In [None]:
# Reload the fitted preprocessing saved earlier in this notebook.
# joblib.load unpickles the file, so only load artefacts from a trusted source.
full_transformer2 = joblib.load('model/full_transformer.pickle')

: 

In [None]:
# Apply the reloaded preprocessing to the sample features (transform only).
prepared_data2 = full_transformer2.transform(df)
prepared_data2

: 

In [None]:
# Reload the tuned tree saved earlier in this notebook (unpickling: trusted
# files only) and predict on the transformed sample data.
tree_clf2 = joblib.load('model/tree_clf.pickle')

tree_clf2.predict(prepared_data2)

: 

In [None]:
# (rows, columns) of the transformed sample data.
prepared_data2.shape

: 

In [None]:
# IPython shell escape: list the working directory once more.
!ls

: 

: 