In [1]:
%pylab inline
%config InlineBackend.figure_formats = ['retina']

import numpy as np # linear algebra
import pandas as pd # data processing
import seaborn as sns 

sns.set_style('whitegrid')
import matplotlib.pyplot as plt

from numpy.random import choice
from collections import Counter
import itertools

from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
#from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA, KernelPCA
from imblearn.over_sampling import ADASYN
#from sklearn.feature_selection import SelectKBest
#from sklearn.feature_selection import f_classif, chi2, mutual_info_classif

from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix

from imblearn.pipeline import Pipeline
from sklearn.cluster import AgglomerativeClustering
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Render a binary confusion matrix as an annotated seaborn heatmap
def vis_conf_matrix(conf_martix, model_name):
    """Draw a labelled heatmap of a 2x2 confusion matrix.

    Every cell is annotated with three lines: its role ('True Neg',
    'False Pos', 'False Neg', 'True Pos' in row-major order), its count and
    its share of the sum of all cells.

    Parameters
    ----------
    conf_martix : array exposing ``.flatten()``
        Confusion matrix with four entries; the annotations are reshaped to
        2x2, so any other size fails at that reshape.
    model_name : str
        Name inserted into the figure title.

    Returns
    -------
    None
        The heatmap is drawn on a newly created 6x6 inch figure.
    """
    cell_values = conf_martix.flatten()
    cell_shares = cell_values / np.sum(conf_martix)
    cell_roles = ['True Neg', 'False Pos', 'False Neg', 'True Pos']

    # One three-line annotation per cell: role, count, percentage
    annotations = [
        "\n".join((role, "{0:0.0f}".format(count), "{0:.2%}".format(share)))
        for role, count, share in zip(cell_roles, cell_values, cell_shares)
    ]
    annotations = np.asarray(annotations).reshape(2, 2)

    _, ax = plt.subplots(figsize=(6, 6))
    # fmt='' because the annotations are already formatted strings
    sns.heatmap(conf_martix, annot=annotations, fmt='',
                annot_kws={"size": 20, "weight": "bold"}, cmap='Blues', ax=ax)

    tick_names = ['False', 'True']
    ax.set_title('Confusion Matrix for {}'.format(model_name), fontsize=15)
    ax.set_xticklabels(tick_names, fontsize=10)
    ax.set_yticklabels(tick_names, fontsize=10)
    ax.set_xlabel('Prediction', fontsize=15)
    ax.set_ylabel('Ground Truth', fontsize=15)

In [3]:
# Fit agglomerative clustering and return the data with the cluster labels appended as a column
def agg_cluster(data, n_clusters, linkage = 'ward'):
    """Append agglomerative-clustering labels to ``data`` as an extra column.

    Parameters
    ----------
    data : 2-D array-like
        Samples to cluster (rows) by features (columns).
    n_clusters : int
        Number of clusters to fit. A value <= 0 disables clustering.
    linkage : str, default 'ward'
        Linkage criterion passed to ``AgglomerativeClustering``.

    Returns
    -------
    The input ``data`` object itself when ``n_clusters <= 0``; otherwise a
    new NumPy array holding ``data`` plus one trailing column of labels.
    """
    # Guard: clustering switched off, hand the input back untouched
    if n_clusters <= 0:
        return data

    model = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
    cluster_ids = model.fit_predict(data)
    # Turn the 1-D label vector into a single column and stack it to the right of the data
    label_column = cluster_ids.reshape(len(cluster_ids), 1)
    return np.concatenate((np.asanyarray(data), label_column), axis=1)

In [4]:
# Read the train and test CSV files from the working directory
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Print shape and column names of each frame, followed by a separator line
for size_label, cols_label, frame in (
    ("Train data size: ", "Train data: ", train_data),
    ("Test data size: ", "Test data: ", test_data),
):
    print(size_label, frame.shape)
    print(cols_label, frame.columns.tolist())
    print("-" * 40)

# Preview the first rows of the training frame
train_data.head()

Train data size:  (891, 12)
Train data:  ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
----------------------------------------
Test data size:  (418, 11)
Test data:  ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
----------------------------------------


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Print the dtype and non-null count of every column in the training frame
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
# Count the missing values in every column of the raw train and test frames
missing_values_count_train = train_data.isna().sum()
missing_values_count_test = test_data.isna().sum()

print("Train data:\n", missing_values_count_train)
print("-" * 40)
print("Test data:\n", missing_values_count_test)

Train data:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
----------------------------------------
Test data:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [7]:
# Median age of each (sex, passenger class) group in the training data, truncated to an
# integer. Rows follow the order of `sex`, columns are Pclass 1..3. The matrix is used
# further down to fill missing ages in both the train and test frames.
sex = ['male', 'female']
guess_age = np.zeros((2, 3))

for i, sex_label in enumerate(sex):
    for j, pclass in enumerate((1, 2, 3)):
        in_group = (train_data['Sex'] == sex_label) & (train_data['Pclass'] == pclass)
        group_ages = train_data.loc[in_group, 'Age'].dropna()
        guess_age[i, j] = int(group_ages.median())
guess_age

array([[40., 30., 25.],
       [35., 28., 21.]])

In [8]:
# Work on copies of the raw train and test frames; `datasets` references both
# copies so they can be processed together in one loop
train1, test1 = train_data.copy(), test_data.copy()
datasets = [train1, test1]

In [9]:
# Impute NAs in the Embarked column of the train dataset with the most frequent value (mode).
# The filled column is assigned back to train1 instead of calling fillna(inplace=True) on the
# selected column: that chained in-place call is deprecated in pandas 2.x and does not
# modify train1 at all when Copy-on-Write is enabled.
embarked_mode = train1['Embarked'].mode()[0]
train1['Embarked'] = train1['Embarked'].fillna(embarked_mode)
print("Train data Embarked NAs: ", train1.Embarked.isnull().sum())

Train data Embarked NAs:  0


In [10]:
# Find the Pclass of the test-data row whose Fare is missing.
# The element is picked explicitly with .iloc[0]: calling int() directly on a
# one-element Series is deprecated in pandas 2.1+.
NA_pclass = int(test1.loc[test1['Fare'].isnull(), 'Pclass'].iloc[0])
print("Pclass of NA value in the test data: ", NA_pclass)

# Calculate the median Fare of the training rows within that Pclass
# NOTE(review): fare_to_impute is only computed and printed in this cell; nothing here
# writes it into test1 — confirm the imputation is applied elsewhere.
fare_to_impute = train1.loc[train1['Pclass'] == NA_pclass, 'Fare'].median()
print("Median value to impute: ", fare_to_impute)

Pclass of NA value in the test data:  3
Median value to impute:  8.05


In [12]:
# Fill missing ages in both working frames with the guess_age entry of the
# passenger's (sex, Pclass) group
for dataset in datasets:
    # Mask of rows that need a value; the groups below are disjoint, so it can be taken once
    age_missing = dataset['Age'].isnull()
    for i, sex_label in enumerate(sex):
        for j, pclass in enumerate((1, 2, 3)):
            in_group = (dataset['Sex'] == sex_label) & (dataset['Pclass'] == pclass)
            dataset.loc[age_missing & in_group, 'Age'] = guess_age[i, j]

print("Train data Age NAs: ", train1.Age.isnull().sum())
print("Test data Age NAs: ", test1.Age.isnull().sum())

Train data Age NAs:  0
Test data Age NAs:  0


In [13]:
# Recount the missing values per column of both working frames after imputation
print("Train data:\n", train1.isna().sum())
print("-" * 40)
print("Test data:\n", test1.isna().sum())

Train data:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64
----------------------------------------
Test data:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [14]:

# Columns to drop from both frames
to_drop = ['Name', 'Ticket', 'Cabin']
# drop() already returns a new frame, so no separate copy() is needed
train2 = train1.drop(columns=to_drop)
test2 = test1.drop(columns=to_drop)
datasets2 = [train2, test2]

for dataset in datasets2:
    # info() prints its report itself and returns None; wrapping it in print()
    # would add a stray "None" line after each report
    dataset.info()
    print('-'*40)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 62.8+ KB
None
----------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    object 
 3   Age   

In [15]:
# Cast PassengerId to object dtype in both frames
for dataset in datasets2:
    dataset['PassengerId'] = dataset['PassengerId'].astype('object')

# Split the columns of train2 (PassengerId excluded) by dtype.
# The dtypes are taken once from the frame without PassengerId and masked with
# themselves, so the boolean mask and the Series it filters share the same index
# (masking with train2.dtypes relied on implicit re-alignment of a longer mask).
feature_dtypes = train2.drop(columns='PassengerId').dtypes
float_cols = feature_dtypes[feature_dtypes == 'float64'].index.tolist()
int_cols = feature_dtypes[feature_dtypes == 'int64'].index.tolist()
cat_cols = feature_dtypes[feature_dtypes == 'object'].index.tolist()
num_cols = float_cols + int_cols
# A numeric column is treated as binary when it holds exactly two distinct values
binary = [col for col in num_cols if train2[col].nunique(dropna=False) == 2]
num_not_binary = [col for col in num_cols if col not in binary]

print("Numerical variables: ", num_cols)
print("Numerical continuous variables: ", float_cols)
print("Numerical discrete variables: ", int_cols)
print("Numerical and not binary variables: ", num_not_binary)
print("Binary variables: ", binary)
print("Categorical variables: ", cat_cols)

Numerical variables:  ['Age', 'Fare', 'Survived', 'Pclass', 'SibSp', 'Parch']
Numerical continuous variables:  ['Age', 'Fare']
Numerical discrete variables:  ['Survived', 'Pclass', 'SibSp', 'Parch']
Numerical and not binary variables:  ['Age', 'Fare', 'Pclass', 'SibSp', 'Parch']
Binary variables:  ['Survived']
Categorical variables:  ['Sex', 'Embarked']


In [16]:
# Rebuild the dtype-based column lists from train2 (PassengerId excluded).
# NOTE(review): no statement between the previous cell and this one modifies train2,
# so these lists come out the same as the ones built there — confirm whether this
# cell is still needed.
feature_dtypes = train2.drop('PassengerId', axis=1).dtypes
float_cols = list(feature_dtypes[feature_dtypes == 'float64'].index)
int_cols = list(feature_dtypes[feature_dtypes == 'int64'].index)
cat_cols = list(feature_dtypes[feature_dtypes == 'object'].index)
num_cols = float_cols + int_cols

# Numeric columns with exactly two distinct values count as binary
binary = [name for name in num_cols if len(train2[name].unique()) == 2]
num_not_binary = [name for name in num_cols if name not in binary]

print("Numerical variables: ", num_cols)
print("Numerical continuous variables: ", float_cols)
print("Numerical discrete variables: ", int_cols)
print("Numerical and not binary variables: ", num_not_binary)
print("Binary variables: ", binary)
print("Categorical variables: ", cat_cols)

Numerical variables:  ['Age', 'Fare', 'Survived', 'Pclass', 'SibSp', 'Parch']
Numerical continuous variables:  ['Age', 'Fare']
Numerical discrete variables:  ['Survived', 'Pclass', 'SibSp', 'Parch']
Numerical and not binary variables:  ['Age', 'Fare', 'Pclass', 'SibSp', 'Parch']
Binary variables:  ['Survived']
Categorical variables:  ['Sex', 'Embarked']


In [17]:
# Name the target column and derive the feature lists from the dtype-based column lists
target = ["Survived"]

# num_cols and cat_cols are built from different dtypes and therefore do not overlap,
# so the full feature list is the numeric features followed by the categorical ones
num_features = [col for col in num_cols if col not in target]
cat_features = [col for col in cat_cols if col not in target]
features = num_features + cat_features

print("Features: ", features)
print("Categorical features: ", cat_features)
print("Numerical features: ", num_features)


Features:  ['Age', 'Fare', 'Pclass', 'SibSp', 'Parch', 'Sex', 'Embarked']
Categorical features:  ['Sex', 'Embarked']
Numerical features:  ['Age', 'Fare', 'Pclass', 'SibSp', 'Parch']


In [18]:
# Build independent copies of the feature columns and the target column of train2
X = train2.loc[:, features].copy()
y = train2.loc[:, target].copy()  # `target` is a list, so y is a one-column DataFrame
X.head()

Unnamed: 0,Age,Fare,Pclass,SibSp,Parch,Sex,Embarked
0,22.0,7.25,3,1,0,male,S
1,38.0,71.2833,1,1,0,female,C
2,26.0,7.925,3,0,0,female,S
3,35.0,53.1,1,1,0,female,S
4,35.0,8.05,3,0,0,male,S


In [19]:
# Split the labelled data into a training part and a hold-out part (40%),
# stratified on the target so both parts keep the same class proportions
strat_shuf_split = StratifiedShuffleSplit(n_splits=1,
                                          test_size=0.4,
                                          random_state=42)

# split() yields arrays of row positions for the single requested split
train_idx, test_idx = next(strat_shuf_split.split(X, y))

# Select rows by position with .iloc: the indices are positional, so label-based
# .loc is only correct while X and y carry a default 0..n-1 index.
# Explicit copies keep the splits independent of X and y for later in-place edits.
X_train = X.iloc[train_idx].copy()
y_train = y.iloc[train_idx].copy()

X_test  = X.iloc[test_idx].copy()
y_test  = y.iloc[test_idx].copy()

In [20]:
# Min-max scale the numeric columns. The scaler is fitted once on all numeric columns
# of the training split and then applied to the hold-out split. MinMaxScaler scales
# every column independently, so the values match fitting one column at a time, but
# `mm` now keeps the parameters of every column instead of only the last one fitted
# and can be reused on further data.
mm = MinMaxScaler()

X_train[num_features] = mm.fit_transform(X_train[num_features])
X_test[num_features] = mm.transform(X_test[num_features])

# DataFrame.round is used so the result does not depend on which `round` is in scope
X_train.describe().round(3)

Unnamed: 0,Age,Fare,Pclass,SibSp,Parch
count,534.0,534.0,534.0,534.0,534.0
mean,0.368,0.062,0.658,0.055,0.063
std,0.168,0.097,0.417,0.114,0.138
min,0.0,0.0,0.0,0.0,0.0
25%,0.259,0.015,0.5,0.0,0.0
50%,0.334,0.028,1.0,0.0,0.0
75%,0.472,0.059,1.0,0.125,0.0
max,1.0,1.0,1.0,1.0,1.0
