In [1]:
#!pip install yellowbrick

In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import  f1_score, auc, precision_recall_curve, recall_score , precision_score
import warnings
warnings.filterwarnings('ignore')
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [3]:
df_base_train =pd.read_csv('train.csv')
df_base_test =pd.read_csv('test.csv')

In [4]:
df_base_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [5]:
df_base_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

1 - We will drop cabin as almost 77% of the data is empty 2 - Tickets and Name will be withdrawn as well
3 - Embarked Null will be withdrawn

In [6]:
df_base_train= df_base_train.drop(['Name' , 'Ticket' , 'Cabin'] , axis =1) 
df_base_test = df_base_test.drop(['Name' , 'Ticket' ,  'Cabin'] , axis =1)

In [7]:
df_base_train = df_base_train[df_base_train['Embarked'].notna()]
df_base_test = df_base_test[df_base_test['Embarked'].notna()]

In [8]:
df_base_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         0
dtype: int64

In [9]:
df_base_train['Age']=df_base_train['Age'].fillna(df_base_train['Age'].mean())
df_base_test['Age']=df_base_test['Age'].fillna(df_base_test['Age'].mean())   
df_base_test['Fare']=df_base_test['Fare'].fillna(df_base_test['Fare'].mean())   


In [10]:
df_base_train = pd.get_dummies(df_base_train , columns = ['Sex'])
df_base_train = pd.get_dummies(df_base_train , columns = ['Embarked'])

df_base_test = pd.get_dummies(df_base_test , columns = ['Sex'])
df_base_test = pd.get_dummies(df_base_test , columns = ['Embarked'])

In [11]:
df_base_test.isna().sum()

PassengerId    0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Sex_female     0
Sex_male       0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
dtype: int64

In [12]:
df_base_train

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.000000,1,0,7.2500,0,1,0,0,1
1,2,1,1,38.000000,1,0,71.2833,1,0,1,0,0
2,3,1,3,26.000000,0,0,7.9250,1,0,0,0,1
3,4,1,1,35.000000,1,0,53.1000,1,0,0,0,1
4,5,0,3,35.000000,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.000000,0,0,13.0000,0,1,0,0,1
887,888,1,1,19.000000,0,0,30.0000,1,0,0,0,1
888,889,0,3,29.642093,1,2,23.4500,1,0,0,0,1
889,890,1,1,26.000000,0,0,30.0000,0,1,1,0,0


In [13]:
df_base_test

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892,3,34.50000,0,0,7.8292,0,1,0,1,0
1,893,3,47.00000,1,0,7.0000,1,0,0,0,1
2,894,2,62.00000,0,0,9.6875,0,1,0,1,0
3,895,3,27.00000,0,0,8.6625,0,1,0,0,1
4,896,3,22.00000,1,1,12.2875,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,30.27259,0,0,8.0500,0,1,0,0,1
414,1306,1,39.00000,0,0,108.9000,1,0,1,0,0
415,1307,3,38.50000,0,0,7.2500,0,1,0,0,1
416,1308,3,30.27259,0,0,8.0500,0,1,0,0,1


In [14]:
df_base_train= df_base_train.drop(['PassengerId'] , axis =1) 

### Train and test division

In [15]:
df_train, df_test = train_test_split(df_base_train,
                                         test_size = 0.5 ,
                                         random_state = 0)

In [16]:
df_train['Survived'].value_counts(normalize=True)

0    0.617117
1    0.382883
Name: Survived, dtype: float64

In [17]:
df_test['Survived'].value_counts(normalize=True)

0    0.617978
1    0.382022
Name: Survived, dtype: float64

In [18]:
X_train , y_train = df_train.drop('Survived' , axis=1) , df_train['Survived']
X_test , y_test = df_test.drop('Survived' , axis=1) , df_test['Survived']

### Data pre processing

In [19]:
X_train[['Pclass', 'Age', 'SibSp','Parch','Fare']] = StandardScaler().fit_transform(X_train[['Pclass',
                                                                                             'Age', 
                                                                                             'SibSp',
                                                                                             'Parch',
                                                                                             'Fare']])

In [20]:
over_sampler = RandomOverSampler(random_state=123)

In [21]:
X_train, y_train = over_sampler.fit_resample(X_train, y_train)
print(f"Training target statistics: {Counter(y_train)}")
print(f"Testing target statistics: {Counter(y_test)}")

Training target statistics: Counter({1: 274, 0: 274})
Testing target statistics: Counter({0: 275, 1: 170})


### Running the model -> Decision tree

In [22]:
clf = DecisionTreeClassifier(criterion="entropy")

In [23]:
model = clf.fit(X_train, y_train)

In [24]:
model

DecisionTreeClassifier(criterion='entropy')

In [25]:
allScores = cross_val_score(clf, X_train, y_train , cv=10)
# cross_val_score retorna array com as 10 validações
allScores.mean() # tomamos a média do score

0.8214478114478114

In [26]:
X_train

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,-0.368700,-0.392709,-0.483961,1.689548,-0.360090,1,0,0,0,1
1,-0.368700,-0.605904,0.348902,-0.469184,-0.422575,0,1,0,0,1
2,-0.368700,-0.748034,0.348902,-0.469184,-0.120564,1,0,0,0,1
3,0.843913,0.008248,-0.483961,-0.469184,-0.500682,0,1,0,1,0
4,-1.581312,-0.748034,-0.483961,1.689548,-0.114664,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
543,-1.581312,1.383920,0.348902,-0.469184,0.523640,0,1,1,0,0
544,-1.581312,1.312855,-0.483961,-0.469184,-0.122039,1,0,0,0,1
545,-0.368700,-0.748034,-0.483961,-0.469184,-0.443404,0,1,0,0,1
546,0.843913,0.246878,2.014630,-0.469184,-0.331972,1,0,0,0,1


In [27]:
X_test

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
14,3,14.000000,0,0,7.8542,1,0,0,0,1
159,3,29.642093,8,2,69.5500,0,1,0,0,1
763,1,36.000000,1,2,120.0000,1,0,0,0,1
741,1,36.000000,1,0,78.8500,0,1,0,0,1
483,3,63.000000,0,0,9.5875,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
109,3,29.642093,1,0,24.1500,1,0,0,1,0
214,3,29.642093,1,0,7.7500,0,1,0,1,0
586,2,47.000000,0,0,15.0000,0,1,0,0,1
550,1,17.000000,0,2,110.8833,0,1,1,0,0


In [28]:
clf.feature_importances_

array([0.11662644, 0.24162353, 0.07529825, 0.02348001, 0.24502979,
       0.25231051, 0.        , 0.        , 0.        , 0.04563148])

In [29]:
 y_pred = clf.predict(X_test)
print('f1_score' ,round(f1_score(y_test, y_pred, average="macro"),4))
print('precision_score',round(precision_score(y_test, y_pred, average="macro"),4))
print('recall_score', round(recall_score(y_test, y_pred, average="macro"),4)) 

f1_score 0.5388
precision_score 0.7159
recall_score 0.5748


In [30]:
df_base_test1 = df_base_test.copy()

In [31]:
df_base_test1

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892,3,34.50000,0,0,7.8292,0,1,0,1,0
1,893,3,47.00000,1,0,7.0000,1,0,0,0,1
2,894,2,62.00000,0,0,9.6875,0,1,0,1,0
3,895,3,27.00000,0,0,8.6625,0,1,0,0,1
4,896,3,22.00000,1,1,12.2875,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,30.27259,0,0,8.0500,0,1,0,0,1
414,1306,1,39.00000,0,0,108.9000,1,0,1,0,0
415,1307,3,38.50000,0,0,7.2500,0,1,0,0,1
416,1308,3,30.27259,0,0,8.0500,0,1,0,0,1


In [32]:
df_base_test1 = df_base_test1.drop(['PassengerId'], axis=1)

In [33]:
predictions = model.predict(df_base_test1)

In [34]:
predictions

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [35]:
output = pd.DataFrame({'PassengerId': df_base_test.PassengerId, 
                       'Survived': predictions})

In [36]:
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [37]:
output.to_csv('submission.csv', index=False)