In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

**The Data**

Let's start by reading the titanic_mod.csv file into a pandas DataFrame.

In [2]:
# Read the modified Titanic passenger CSV into a DataFrame.
# NOTE(review): the path looks like a Google Colab Drive mount; adjust it when running elsewhere.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/IST347/Week_7/titanic_mod.csv',
)

In [3]:
# Preview the first rows to check that the file loaded as expected.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**Data Cleaning**

We want to fill in the missing age data instead of simply dropping the rows where the age is missing. One way to do this is imputation: filling each gap with the mean age of all the passengers, which is the approach used below.
A smarter alternative would be to impute the average age within each passenger class. First, though, let's look at the available columns and keep only the ones we need:


In [4]:
# List the available column names before deciding which ones to drop.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
# Remove the columns that are not used as model inputs, leaving
# Survived, Pclass, Sex, Age and Fare.
# The result is assigned back instead of using inplace=True: the original
# frame object is not mutated behind the reader's back, and `columns=`
# states the axis explicitly.
df = df.drop(columns=['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'])

In [6]:
# Preview the frame again to confirm which columns remain after the drop.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [7]:
# Count the missing values in each column.
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Fare          0
dtype: int64

In [8]:
# Impute missing Age values with the overall mean age.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on the `df['Age']` selection is chained assignment,
# which raises a FutureWarning in pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is enabled.
# NOTE: the mean is taken over every row, including rows that later end up
# in the test split.
df['Age'] = df['Age'].fillna(df['Age'].mean())

Great! The Cabin and Embarked columns were already dropped above, so let's just confirm that no missing values remain.

In [9]:
# Re-count missing values per column after the Age imputation.
df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
Fare        0
dtype: int64

**Converting Categorical Features**

We'll need to convert categorical features to dummy variables using pandas! Otherwise our machine learning algorithm won't be able to directly take in those features as inputs.

In [10]:
# Summarise dtypes and non-null counts to spot columns that still need encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   Fare      891 non-null    float64
dtypes: float64(2), int64(2), object(1)
memory usage: 34.9+ KB


In [11]:
# One-hot encode the Sex column. drop_first=True drops the first category,
# leaving a single indicator column.
# dtype=int pins the indicator to 0/1 integers; pandas >= 2.0 would otherwise
# return booleans by default.
sex = pd.get_dummies(df['Sex'], drop_first=True, dtype=int)

In [12]:
# Remove the original text-valued Sex column.
# The result is assigned back rather than mutating the frame with inplace=True.
df = df.drop(columns=['Sex'])

In [13]:
# Append the dummy column(s) held in `sex` to the right-hand side of the frame.
df = pd.concat(objs=[df, sex], axis='columns')

In [14]:
# Preview the frame after the concatenation.
df.head()

Unnamed: 0,Survived,Pclass,Age,Fare,male
0,0,3,22.0,7.25,1
1,1,1,38.0,71.2833,0
2,1,3,26.0,7.925,0
3,1,1,35.0,53.1,0
4,0,3,35.0,8.05,1


In [15]:
# Show the final column names before selecting features and target.
df.columns

Index(['Survived', 'Pclass', 'Age', 'Fare', 'male'], dtype='object')

Great! Our data is ready for our model!

**Building a Naive Bayes model**

Let's start by splitting our data into a training set and test set (there is another test.csv file that you can play around with in case you want to use all this data for training).


In [16]:
# Feature matrix: the four predictor columns.
X = df.loc[:, ['Pclass', 'Age', 'Fare', 'male']]
# Target vector: the Survived column.
y = df.loc[:, 'Survived']

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 25% of the rows for testing; random_state is fixed at 101.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=101,
)

**Training and Predicting**

In [21]:
from sklearn.naive_bayes import GaussianNB

In [22]:
# Create a Gaussian Naive Bayes classifier with its default settings.
model = GaussianNB()

In [23]:
# Fit the classifier on the training split; `clf` holds whatever fit() returns.
# NOTE(review): scikit-learn estimators conventionally return self from fit(),
# so `clf` is presumably the same object as `model`; confirm.
clf = model.fit(X_train, y_train)

In [24]:
# Predict a class label for every row of the test split.
y_pred = clf.predict(X_test)

Let's move on to evaluate our model!

**Evaluation**

In [25]:
from sklearn.metrics import classification_report, confusion_matrix

We can check precision, recall, and F1-score using the classification report!

In [26]:
# Build the text report for the test-set predictions, then print it.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.76      0.81      0.79       127
           1       0.73      0.67      0.70        96

    accuracy                           0.75       223
   macro avg       0.75      0.74      0.74       223
weighted avg       0.75      0.75      0.75       223



In [27]:
# Confusion matrix for the test-set predictions, printed as a plain array.
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)

[[103  24]
 [ 32  64]]


In [28]:
# Predicted class probabilities for every row of the test split.
y_pred_prob = clf.predict_proba(X_test)

In [31]:
# Show the first ten rows of predicted probabilities.
y_pred_prob[:10]

array([[8.07807137e-01, 1.92192863e-01],
       [3.09359704e-11, 1.00000000e+00],
       [6.58907589e-01, 3.41092411e-01],
       [9.54528780e-01, 4.54712198e-02],
       [9.34637838e-01, 6.53621618e-02],
       [9.72961758e-01, 2.70382415e-02],
       [8.02178574e-01, 1.97821426e-01],
       [9.69049838e-01, 3.09501625e-02],
       [5.48751184e-02, 9.45124882e-01],
       [4.68144289e-01, 5.31855711e-01]])

Can k-fold cross-validation give us a more reliable estimate of the model's performance?

In [43]:
from sklearn.model_selection import cross_validate

In [44]:
# Separate, unfitted Gaussian Naive Bayes estimator used for cross-validation.
model_v = GaussianNB()

In [45]:
# 10-fold cross-validation on the training split, scoring accuracy and F1.
scores = cross_validate(
    model_v,
    X_train,
    y_train,
    scoring=['accuracy', 'f1'],
    cv=10,
)

In [46]:
# Tabulate the cross-validation results and print the table.
scores_df = pd.DataFrame(data=scores)
print(scores_df)

   fit_time  score_time  test_accuracy   test_f1
0  0.003645    0.002730       0.835821  0.765957
1  0.003568    0.004314       0.791045  0.708333
2  0.003749    0.001700       0.850746  0.807692
3  0.001731    0.001637       0.656716  0.530612
4  0.001810    0.001619       0.880597  0.833333
5  0.002998    0.002393       0.761194  0.703704
6  0.002840    0.002148       0.805970  0.754717
7  0.001996    0.001657       0.820896  0.769231
8  0.001697    0.001583       0.696970  0.583333
9  0.001725    0.001609       0.772727  0.716981


In [42]:
# Mean test accuracy across the cross-validation folds.
scores_df['test_accuracy'].mean()

0.7872682044323837

In [51]:
# Repeat the cross-validation with 50 folds and tabulate the results.
cv50_scores = cross_validate(model_v, X_train, y_train, scoring=['accuracy', 'f1'], cv=50)
new_df = pd.DataFrame(cv50_scores)

In [53]:
# Mean test accuracy across the 50 folds.
new_df['test_accuracy'].mean()

0.7858241758241761