In [1]:
# Import libraries
import pandas as pd
from pandas_profiling import ProfileReport
import sklearn
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, mean_absolute_error
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
import os

In [2]:
train = pd.read_csv("data/input/train.csv")
test = pd.read_csv("data/input/test.csv")
test_entry = pd.read_csv("data/input/test.csv")

In [3]:
# Let us print some of the data to examine what we are dealing with
print(train.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [4]:
# Useful function that saves us a lot of time is an automatic xploratory Data Analysis that gives us an overview of all data
#eda_report = ProfileReport(train)

#eda_report

In [5]:
# Data explanation
# PassengerId - just an ID
# Survived - result (present only in train data)
# Pclass - class of the pessengers (might be valuable)
# Name - through feature engineering we might obtain some of the useful informations
# Sex - male or female (might be valuable)
# Age - float value of passenger age (might be valuable)
# SibSp - passenger has a sibling or spouse (might be valuable)
# Parch - number of parents or childer in the boat (might be valuable)
# Ticket number - through feature engineering we might obtain some of the useful informations
# Fare - price of the ticket (might be useful)
# Cabin - number of the cabin (might be useful)
# Embarked - port where passengers embarked on the ship (might be useful)

In [6]:
# Now we are going to examine what types of variables we have in dataset
train.dtypes
# Object types need to be encoded (transformed into integers)

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [7]:
# First we are going to see where are the NaN values
print(train.isna().sum())
print(test.isna().sum())
# Those are Age, Cabin, Embarked in train data and Age, Cabin and Fare in test data

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [8]:
# Now we need to choose what to do with missing data, some of the options are to populate them or to drop them. 
# Cabin has a huge amount of the missing values so we are goint to drop it, also in this step we will drop PassangerID because we have
# no use of it, also Ticket has a high cardinality
train.drop(["PassengerId", "Cabin", "Ticket"], axis=1,inplace = True)
test.drop(["PassengerId", "Cabin", "Ticket"], axis=1, inplace = True)

# Also we are going to split Name, Lastname and Title, and then drop Name and Lastname because the name and lastname don't seem to have
# any big correspondence with Survived, while Title might have, and Title might help us to populate Age
train[['FirstName', 'TitleLastName']]=train['Name'].str.split(', ', expand=True)
train[['Title', 'LastName']]=train['TitleLastName'].str.split('. ',n=1, expand=True)

test[['FirstName', 'TitleLastName']]=test['Name'].str.split(', ', expand=True)
test[['Title', 'LastName']]=test['TitleLastName'].str.split('. ',n=1, expand=True)

# Now we are going to drop FirstName, TitleLastName and LastName
train.drop(["FirstName","TitleLastName", "LastName", "Name"], axis=1, inplace = True)
test.drop(["FirstName","TitleLastName", "LastName", "Name"], axis=1, inplace = True)

In [9]:
print(train.head())

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked Title
0         0       3    male  22.0      1      0   7.2500        S    Mr
1         1       1  female  38.0      1      0  71.2833        C   Mrs
2         1       3  female  26.0      0      0   7.9250        S  Miss
3         1       1  female  35.0      1      0  53.1000        S   Mrs
4         0       3    male  35.0      0      0   8.0500        S    Mr


In [10]:
# Let us deal with NaN values Embarked (Only two variables for Embarked are Nan and we are going to use Mode to replace it)
train["Embarked"].fillna(train["Embarked"].mode().iloc[0], inplace = True)
# Fare is going to be replaced with mode also because we are missing only one variable
test["Fare"].fillna(test["Fare"].mode().iloc[0], inplace = True)

In [11]:
# Now we are going to encode all categorical values and let us see what values are categorical and how many values do they have

cols = train.select_dtypes(include=['object'])
print(cols)
for col in cols:
    print(test[col].unique())
    print(test[col].value_counts())
# Those are Sex (only two values so we are going to use binary classification), Ticket 


        Sex Embarked Title
0      male        S    Mr
1    female        C   Mrs
2    female        S  Miss
3    female        S   Mrs
4      male        S    Mr
..      ...      ...   ...
886    male        S   Rev
887  female        S  Miss
888  female        S  Miss
889    male        C    Mr
890    male        Q    Mr

[891 rows x 3 columns]
['male' 'female']
male      266
female    152
Name: Sex, dtype: int64
['Q' 'S' 'C']
S    270
C    102
Q     46
Name: Embarked, dtype: int64
['Mr' 'Mrs' 'Miss' 'Master' 'Ms' 'Col' 'Rev' 'Dr' 'Dona']
Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Ms          1
Dr          1
Dona        1
Name: Title, dtype: int64


In [12]:
# Now we will encode categorical values left in our dataset tryng to use encoder that best fits the data
# For Sex and Embarked we can use OneHotEncoder because our categorical variables do not embed order information

# OneHotEncoder
oh = preprocessing.OneHotEncoder()

sex_result_train = oh.fit_transform(train[["Sex"]]).toarray()
sex_result_test = oh.transform(test[["Sex"]]).toarray()

# Appending columns
train[['Female', 'Male']] = pd.DataFrame(sex_result_train, index = train.index)
test[['Female', 'Male']] = pd.DataFrame(sex_result_test, index = test.index)

embarked_result_train = oh.fit_transform(train[["Embarked"]]).toarray()
embarked_result_test = oh.transform(test[["Embarked"]]).toarray()

# Appending columns
train[['Embarked_C', 'Embarked_Q', 'Embarked_S']] = pd.DataFrame(embarked_result_train, index = train.index)
test[['Embarked_C', 'Embarked_Q', 'Embarked_S']] = pd.DataFrame(embarked_result_test, index = test.index)

# For Title we will user OrdinalEncoding

oe = preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value',  unknown_value=np.nan)
# transform data
train[["Title"]] = oe.fit_transform(train[["Title"]])
test[["Title"]] = oe.transform(test[["Title"]])

In [13]:
# Remove redundant columns now that we have encoded them, we don't need Sex and Embarked
train.drop(["Sex", "Embarked"], axis=1,inplace = True)
test.drop(["Sex", "Embarked"], axis=1, inplace = True)

In [14]:
# Now we only need to deal with Age, we have several options:
# Drop entire column, but as we identified Age as a valuable Feature this is not a good option
# Drop only rows with NaN values (this is also not a good option because the dataset is already small)
# Replace Age with median or mean (This is a valid option that we can try)
# Then we can use Title for grouping some rows and replacing Age with median of only that group

# First calculate age mean for all title groups
print(train.groupby(["Title"])["Age"].mean())

train['Age'] = train['Age'].fillna(train.groupby(['Title'])['Age'].transform('mean'))
test['Age'] = test['Age'].fillna(test.groupby(['Title'])['Age'].transform('mean'))

print(train.loc[train['Title'] == 0.0])

Title
0.0     70.000000
1.0     58.000000
2.0     40.000000
3.0     42.000000
4.0     38.000000
5.0     48.000000
6.0     48.500000
7.0      4.574167
8.0     21.773973
9.0     24.000000
10.0    24.000000
11.0    32.368090
12.0    35.898148
13.0    28.000000
14.0    43.166667
15.0    49.000000
16.0    33.000000
Name: Age, dtype: float64
     Survived  Pclass   Age  SibSp  Parch  Fare  Title  Female  Male  \
745         0       1  70.0      1      1  71.0    0.0     0.0   1.0   

     Embarked_C  Embarked_Q  Embarked_S  
745         0.0         0.0         1.0  


In [15]:
# Now we are going to split train and test data
#train.drop(["Embarked_S", "Male"], axis=1,inplace = True)
#test.drop(["Embarked_S", "Male"], axis=1, inplace = True)

y=train["Survived"]
X=train.drop("Survived", axis=1)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state=42)



In [16]:
# Using logistic regression
print(X_train)
logreg = LogisticRegression(max_iter=800)

logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_val)
print("Logistic regression ",accuracy_score(y_pred,y_val), mean_absolute_error(y_pred, y_val))

scores = cross_val_score(logreg, X, y, cv=5)
scores.sort()
accuracy = scores.mean()
print(scores)
print(accuracy)

     Pclass       Age  SibSp  Parch      Fare  Title  Female  Embarked_C  \
331       1  45.50000      0      0   28.5000   11.0     0.0         0.0   
733       2  23.00000      0      0   13.0000   11.0     0.0         0.0   
382       3  32.00000      0      0    7.9250   11.0     0.0         0.0   
704       3  26.00000      1      0    7.8542   11.0     0.0         0.0   
813       3   6.00000      4      2   31.2750    8.0     1.0         0.0   
..      ...       ...    ...    ...       ...    ...     ...         ...   
106       3  21.00000      0      0    7.6500    8.0     1.0         0.0   
270       1  32.36809      0      0   31.0000   11.0     0.0         0.0   
860       3  41.00000      2      0   14.1083   11.0     0.0         0.0   
435       1  14.00000      1      2  120.0000    8.0     1.0         0.0   
102       1  21.00000      0      1   77.2875   11.0     0.0         0.0   

     Embarked_Q  
331         0.0  
733         0.0  
382         0.0  
704         0.0

In [17]:
# There is still NaN data in test
print(test.isna().sum())
test["Age"].fillna(test["Age"].mode().iloc[0], inplace = True)
test["Title"].fillna(test["Title"].mode().iloc[0], inplace = True)

print(test_entry)

Pclass        0
Age           1
SibSp         0
Parch         0
Fare          0
Title         1
Female        0
Embarked_C    0
Embarked_Q    0
dtype: int64
     PassengerId  Pclass                                          Name  \
0            892       3                              Kelly, Mr. James   
1            893       3              Wilkes, Mrs. James (Ellen Needs)   
2            894       2                     Myles, Mr. Thomas Francis   
3            895       3                              Wirz, Mr. Albert   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
..           ...     ...                                           ...   
413         1305       3                            Spector, Mr. Woolf   
414         1306       1                  Oliva y Ocana, Dona. Fermina   
415         1307       3                  Saether, Mr. Simon Sivertsen   
416         1308       3                           Ware, Mr. Frederick   
417         1309       3     

In [18]:
# create csv
print(test)

#submission_data = logreg.predict(test)

#submission_data_frame = pd.DataFrame({
#    "PassengerId": test_entry["PassengerId"].values,
#    "Survived": submission_data
#})
#print(submission_data_frame)
#submission_data_frame.to_csv("data/output/submission_logreg.csv", index=False)


     Pclass        Age  SibSp  Parch      Fare  Title  Female  Embarked_C  \
0         3  34.500000      0      0    7.8292   11.0     0.0         0.0   
1         3  47.000000      1      0    7.0000   12.0     1.0         0.0   
2         2  62.000000      0      0    9.6875   11.0     0.0         0.0   
3         3  27.000000      0      0    8.6625   11.0     0.0         0.0   
4         3  22.000000      1      1   12.2875   12.0     1.0         0.0   
..      ...        ...    ...    ...       ...    ...     ...         ...   
413       3  32.000000      0      0    8.0500   11.0     0.0         0.0   
414       1  39.000000      0      0  108.9000   11.0     1.0         1.0   
415       3  38.500000      0      0    7.2500   11.0     0.0         0.0   
416       3  32.000000      0      0    8.0500   11.0     0.0         0.0   
417       3   7.406471      1      1   22.3583    7.0     0.0         1.0   

     Embarked_Q  
0           1.0  
1           0.0  
2           1.0  
3  

In [19]:
# Test another model XGB

model2 = xgb.XGBClassifier()
model2.fit(X_train,y_train,
             eval_set=[(X_val, y_val)],
             verbose=False)
y2_pred = model2.predict(X_val)
print("xgb classifier ", accuracy_score(y2_pred,y_val), mean_absolute_error(y2_pred, y_val))

scores = cross_val_score(model2, X, y, cv=5)
scores.sort()
accuracy = scores.mean()

print(scores)
print(accuracy)

xgb classifier  0.8156424581005587 0.18435754189944134
[0.78089888 0.79888268 0.81460674 0.83146067 0.85393258]
0.815956311593748


In [20]:
# Test another model TREE

model3 = DecisionTreeClassifier()
model3.fit(X_train,y_train)
y3_pred = model3.predict(X_val)
print("xgb classifier ", accuracy_score(y3_pred,y_val), mean_absolute_error(y3_pred, y_val))

scores = cross_val_score(model3, X, y, cv=5)
scores.sort()
accuracy = scores.mean()

print(scores)
print(accuracy)

xgb classifier  0.8100558659217877 0.18994413407821228
[0.71910112 0.76536313 0.78089888 0.78089888 0.84269663]
0.7777917268219195
