# Data Mining Kaggle Competition - Tiffany Lee

In [1]:
import pandas as pd
import collections
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Activation

from keras import models
from keras.layers import Dense, Dropout
from keras.utils import to_categorical
from keras.datasets import mnist
from keras.utils.vis_utils import model_to_dot

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from keras.optimizers import SGD

from IPython.display import SVG

import livelossplot
plot_losses = livelossplot.PlotLossesKeras()

%matplotlib inline

# Data Preparation and Exploration

## Loading data

The gender_submission.csv file is a sample submission file where the `Survived` column is hard coded to `0` is the sex of this passenger is male and `1` if this passenger is female.

In [2]:
gender_sub = pd.read_csv('gender_submission.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Data Exploratory Analysis

### Data Dictionary

| Column | Description |
|:----------|:-----------|
PassengerID| ID number to identify each passenger
Survived| Whether the passenger survived or not (0=No, 1=Yes)
Pclass| The class of the ticket the passenger purchased (1=1st, 2=2nd, 3=3rd)
Sex| The passenger’s sex
Age| The passenger’s age in years
SibSp| The number of siblings or spouses the passenger had aboard the Titanic
Parch| The number of parents or children the passenger had aboard the Titanic
Ticket| The passenger’s ticket number
Fare| The fare the passenger paid
Cabin| The passenger’s cabin number
Embarked| The port where the passenger embarked (C=Cherbourg, Q=Queenstown, S=Southampton)

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Check state of dataset
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [5]:
test.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


The test.csv data set does not contain the column `Survived`. The corresponding values are expected to be predicted by a classifier to be implemented. I'm guessing they are part of the submission.

In [6]:
train[['Sex']].value_counts()

Sex   
male      577
female    314
dtype: int64

In [7]:
test[['Sex']].value_counts()

Sex   
male      266
female    152
dtype: int64

It appears that the data sets are moderately imbalanced in terms of gender distribution.

## Data Cleaning

### Checking for Missing Data

In [8]:
train.isnull().mean()

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

In [9]:
test.isnull().mean()

PassengerId    0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.205742
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.002392
Cabin          0.782297
Embarked       0.000000
dtype: float64

### Handling Missing Data

It is usually safer to use the median instead of mean to fill the missing numerical data.  
This is particularly preferred if:  
- The distribution of the data is skewed (not symmetric).  
- You want to guard against outliers.

In [10]:
train[['Embarked']].mode().iloc[0,0]

'S'

In [11]:
def titanic_clean(df):
    df_clean = (df
       .assign(
           Age=lambda x: np.where(x.Age.isnull(), x.Age.median(), x.Age),
           Fare=lambda x: np.where(x.Fare.isnull(), x.Fare.median(), x.Fare),
           Cabin=lambda x: np.where(x.Cabin.isnull(), 'Unavailable', x.Cabin),
           Embarked = lambda x: np.where(
               x.Embarked.isnull(),train[['Embarked']].mode().iloc[0,0], x.Embarked),
       )
#       .drop('Cabin',axis=1)
    )
    return df_clean

In [12]:
train_clean = titanic_clean(train)
train_clean.isnull().mean()

PassengerId    0.0
Survived       0.0
Pclass         0.0
Name           0.0
Sex            0.0
Age            0.0
SibSp          0.0
Parch          0.0
Ticket         0.0
Fare           0.0
Cabin          0.0
Embarked       0.0
dtype: float64

In [13]:
test_clean = titanic_clean(test)
test_clean.isnull().mean()

PassengerId    0.0
Pclass         0.0
Name           0.0
Sex            0.0
Age            0.0
SibSp          0.0
Parch          0.0
Ticket         0.0
Fare           0.0
Cabin          0.0
Embarked       0.0
dtype: float64

In [14]:
train_clean.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

## Feature Engineering

In [15]:
# concatenated is created for EDA purposes e.g. categorical data analysis
concatenated = pd.concat([train_clean, test_clean],ignore_index=True)
concatenated.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unavailable,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Unavailable,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Unavailable,S


In [16]:
concatenated.isnull().sum()

PassengerId      0
Survived       418
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         0
dtype: int64

### Defining and Analyzing Categories

In [17]:
def get_title0(nm):
     return nm.split(",")[1].split(".")[0].strip()
get_title0 = np.vectorize(get_title0)

def get_title(nm):
    VIPs = ('Lady', 'the Countess','Capt', 'Col','Don', 'Dr',
            'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona')
    title = nm.split(",")[1].split(".")[0].strip()
    if title in VIPs:
        title = 'VIP'
    elif title == 'Mlle' or title == 'Ms':
        title = 'Miss'
    elif title == 'Mme':
        title = 'Mrs'
    return title
get_title = np.vectorize(get_title)

(concatenated
 .assign(Title=lambda x: get_title(x.Name))
 [['Title']]
 .value_counts())

Title 
Mr        757
Miss      264
Mrs       198
Master     61
VIP        29
dtype: int64

In [18]:
"""
Youth (<18)
YoungAdult (18 to 35)
Adult (36 to 55)
Senior (56 and up)
"""
def get_age_group(age):
    if age <= 18:
        grp = 'Youth'
    elif age > 18 and age <=35:
        grp = 'Yadult'
    elif age > 35 and age <=55:
        grp = 'Adult'
    else:
        grp = 'Senior'
    return grp
get_age_group = np.vectorize(get_age_group)
(concatenated
 .assign(Age_Group =lambda x: get_age_group(x.Age))
 [['Age_Group']]
 .value_counts())

Age_Group
Yadult       794
Adult        263
Youth        193
Senior        59
dtype: int64

### Adding Categorical Columns

In [19]:
def add_categories(df):
    df1 = (df
          .assign(
              Title=lambda x: get_title(x.Name),
              Age_Group=lambda x: get_age_group(x.Age),
              Deck = lambda x: np.where(x.Cabin.isnull(), 'U', x.Cabin.str[0]),
          ))
    return df1
train_cat = add_categories(train_clean)
test_cat = add_categories(test_clean)
train_cat.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Age_Group,Deck
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unavailable,S,Mr,Yadult,U
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,Adult,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Unavailable,S,Miss,Yadult,U


In [20]:
from pandas.api.types import CategoricalDtype
cats = np.union1d(train_cat[['Deck']], test_cat[['Deck']])
cats

array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'U'], dtype=object)

In [21]:
train_cat['Deck'] = pd.Categorical(train_cat['Deck'], categories = cats, ordered = True)
test_cat['Deck'] = pd.Categorical(test_cat['Deck'], categories = cats, ordered = True)

## Feature Selection

In [22]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [23]:
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 
           'Fare', 'Embarked','Title','Age_Group','Deck']
target = ['Survived']

## Data Splitting

In [24]:
X_train, X_valid, y_train, y_valid = train_test_split(
    train_cat[features], train_cat[target], test_size=0.20, random_state=42)

X_test = test_cat[features]

In [25]:
X_train.head(3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Age_Group,Deck
331,1,male,45.5,0,0,28.5,S,Mr,Adult,C
733,2,male,23.0,0,0,13.0,S,Mr,Yadult,U
382,3,male,32.0,0,0,7.925,S,Mr,Yadult,U


In [26]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 331 to 102
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   Pclass     712 non-null    int64   
 1   Sex        712 non-null    object  
 2   Age        712 non-null    float64 
 3   SibSp      712 non-null    int64   
 4   Parch      712 non-null    int64   
 5   Fare       712 non-null    float64 
 6   Embarked   712 non-null    object  
 7   Title      712 non-null    object  
 8   Age_Group  712 non-null    object  
 9   Deck       712 non-null    category
dtypes: category(1), float64(2), int64(3), object(4)
memory usage: 56.7+ KB


## Categorical Data Encoding

In [27]:
X_train[['Pclass']].value_counts()

Pclass
3         398
1         163
2         151
dtype: int64

In [28]:
X_valid.loc[:,'Pclass'] = X_valid.loc[:,'Pclass'].astype(str)
X_train.loc[:,'Pclass'] = X_train.loc[:,'Pclass'].astype(str)

In [29]:
X_test.loc[:,'Pclass'] = X_test.loc[:,'Pclass'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [30]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 331 to 102
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   Pclass     712 non-null    object  
 1   Sex        712 non-null    object  
 2   Age        712 non-null    float64 
 3   SibSp      712 non-null    int64   
 4   Parch      712 non-null    int64   
 5   Fare       712 non-null    float64 
 6   Embarked   712 non-null    object  
 7   Title      712 non-null    object  
 8   Age_Group  712 non-null    object  
 9   Deck       712 non-null    category
dtypes: category(1), float64(2), int64(2), object(5)
memory usage: 56.7+ KB


In [31]:
enc = LabelEncoder()
def encode_cat(df):
    df1 = (df
           .assign(
               Pclass = enc.fit_transform(df.Pclass),
               Sex = enc.fit_transform(df.Sex),
               Embarked = enc.fit_transform(df.Embarked),
               Title = enc.fit_transform(df.Title),
               Age_Group = enc.fit_transform(df.Age_Group),
               Deck = enc.fit_transform(df.Deck),       
            )
          )
    return df1

X_train = encode_cat(X_train)
X_valid = encode_cat(X_valid)
X_test = encode_cat(X_test)

In [32]:
X_train.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title',
       'Age_Group', 'Deck'],
      dtype='object')

In [33]:
X_test.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title',
       'Age_Group', 'Deck'],
      dtype='object')

## Numerical Data Normalization

In [34]:
scaler_age = StandardScaler()
scaler_age.fit(X_train[['Age']])
scaler_fare = StandardScaler()
scaler_fare.fit(X_train[['Fare']])

StandardScaler()

In [35]:
# standardize Age & Fare values to Z-scores
X_train[['Age']] = scaler_age.transform(X_train[['Age']].values)
X_train[['Fare']] = scaler_fare.transform(X_train[['Fare']].values)

X_valid[['Age']] = scaler_age.transform(X_valid[['Age']].values)
X_valid[['Fare']] = scaler_fare.transform(X_valid[['Fare']].values)

X_test[['Age']] = scaler_age.transform(X_test[['Age']].values)
X_test[['Fare']] = scaler_fare.transform(X_test[['Fare']].values)

In [36]:
X_train.head(3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Age_Group,Deck
331,0,1,1.253641,0,0,-0.078684,2,2,0,2
733,1,1,-0.477284,0,0,-0.377145,2,2,2,8
382,2,1,0.215086,0,0,-0.474867,2,2,2,8


In [37]:
X_train[['Age','Fare']].describe()

Unnamed: 0,Age,Fare
count,712.0,712.0
mean,1.746418e-17,5.363999e-17
std,1.000703,1.000703
min,-2.214363,-0.6274674
25%,-0.5542135,-0.474867
50%,-0.09263364,-0.3491435
75%,0.4458762,-0.04017244
max,3.907725,9.237724


# Building the Model

In [38]:
import pandas as pd
from sklearn.datasets import make_classification, make_regression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier

from collections import Counter

In [39]:
clf_rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=700,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)

In [40]:
from xgboost.sklearn import XGBClassifier

clf_xg = XGBClassifier(learning_rate=0.001,n_estimators=2500,
                                max_depth=4, min_child_weight=0,
                                gamma=0, subsample=0.7,
                                colsample_bytree=0.7,
                                scale_pos_weight=1, seed=27,
                                reg_alpha=0.00006)

In [41]:
from sklearn.neural_network import MLPClassifier

clf_mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     max_iter=10000,
                     hidden_layer_sizes=(25,10,5), random_state=42,
                     activation='logistic')

In [42]:
clf_list = [('MLP', clf_mlp),
            ('XGBoost', clf_xg),
            ('Random Forest', clf_rf),
            ('decision tree', DecisionTreeClassifier(random_state=42)),
            ('logistic regression', LogisticRegression(random_state=42)),
            ('knn', KNeighborsClassifier()),
            ('naive bayes classifier', GaussianNB())]

# Training the Model 

In [43]:
for model_tuple in clf_list:
    model = model_tuple[1]
    model.fit(X_train, y_train.iloc[:,0].values)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


# Summarizing the Results

In [44]:
for model_tuple in clf_list:
    model = model_tuple[1]
    y_pred_train = model.predict(X_train)
    y_pred_valid = model.predict(X_valid)
    acc_t = accuracy_score(y_pred_train, y_train)
    acc_v = accuracy_score(y_pred_valid, y_valid)
    print(f"{model_tuple[0]}'s accuracy: {acc_t:.2f} / {acc_v:.2f}")

MLP's accuracy: 0.93 / 0.72
XGBoost's accuracy: 0.88 / 0.81
Random Forest's accuracy: 0.91 / 0.83
decision tree's accuracy: 0.98 / 0.77
logistic regression's accuracy: 0.79 / 0.82
knn's accuracy: 0.85 / 0.81
naive bayes classifier's accuracy: 0.79 / 0.77


In [45]:
voting_clf = VotingClassifier(clf_list, voting='hard')
voting_clf.fit(X_train, y_train.iloc[:,0].values)
y_pred = voting_clf.predict(X_valid)
y_pred_train = voting_clf.predict(X_train)
y_pred_valid = voting_clf.predict(X_valid)
acc_t = accuracy_score(y_pred_train, y_train)
acc_v = accuracy_score(y_pred_valid, y_valid)
print(f"Voting Classifier's accuracy: {acc_t:.2f} / {acc_v:.2f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Voting Classifier's accuracy: 0.90 / 0.82


In [46]:
model = voting_clf

In [47]:
# Compute the accuracy on the testing data set:
print('Accuracy on the training data set')
y_pred_train = model.predict(X_train)
print(accuracy_score(y_train, y_pred_train))
print("Classification Report:")
print(classification_report(y_train, y_pred_train))
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_pred_train))

Accuracy on the training data set
0.8974719101123596
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.95      0.92       444
           1       0.90      0.81      0.86       268

    accuracy                           0.90       712
   macro avg       0.90      0.88      0.89       712
weighted avg       0.90      0.90      0.90       712

Confusion Matrix:
[[421  23]
 [ 50 218]]


In [48]:
print('Accuracy on the testing data set')
y_pred_valid = model.predict(X_valid)
print(accuracy_score(y_valid, y_pred_valid))
print("Classification Report:")
print(classification_report(y_valid, y_pred_valid))
print("Confusion Matrix:")
print(confusion_matrix(y_valid, y_pred_valid))

Accuracy on the testing data set
0.8156424581005587
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.85      0.84       105
           1       0.78      0.77      0.78        74

    accuracy                           0.82       179
   macro avg       0.81      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179

Confusion Matrix:
[[89 16]
 [17 57]]


# Generating the submission files

In [49]:
y_pred_test = model.predict(X_test)
y_pred_test[:10]

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0])

In [50]:
output = pd.DataFrame({ 
    'PassengerId' : test_clean['PassengerId'], 
    'Survived': y_pred_test })

output.head(3)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0


In [51]:
output.to_csv('submission_final.csv', index=False)

In [53]:
!head submission_final.csv

PassengerId,Survived
892,0
893,1
894,0
895,0
896,1
897,0
898,1
899,0
900,1
