<B>My Titanic Model</B>

Author: Zane Lynch

In [1]:
# Imports

# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
train_df = pd.read_csv("data/train.csv")
test_df  = pd.read_csv("data/test.csv")

In [3]:
train_df.info()
print("----------------------------")
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
----------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare

In [4]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
test_df.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


I'll drop ticket number, fare, and cabin number. Ticket number and cabin number seem arbitrary enough that they shouldn't have too much of an effect of survival. Fare seemed to have no significance based on a linear regression I ran previously using Stata.

In [6]:
train_df = train_df.drop(['Ticket','Fare','Cabin'], axis=1)
test_df  = test_df.drop(['Ticket','Fare','Cabin'], axis=1)

In [7]:
train_df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,S
5,6,0,3,"Moran, Mr. James",male,,0,0,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,C


First I'll address the missing ages. Each passenger's title can be used to provide an accurate estimate of their age. For example, passengers with the title "Mr." will be older than those with the title "Master".

In [8]:
def get_title(name):
    i=0
    current_surname = ''
    while (name[i] != ','):
        current_surname = current_surname + name[i]
        i += 1
        # when we meet the first appearance of ',' we drop out of this while loop
    
    i += 1 # here we skip that first comma.
    i += 1 # here we skip the space-character that follows that first comma. 

    current_title = ''
    while (name[i] != ' '):
        current_title = current_title + name[i]
        i += 1
    return current_title

train_df['Title'] = train_df["Name"].apply(get_title)
test_df['Title'] = test_df["Name"].apply(get_title)

train_df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,S,Mr.
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,C,Mrs.
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,S,Miss.
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,S,Mrs.
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,S,Mr.
5,6,0,3,"Moran, Mr. James",male,,0,0,Q,Mr.
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,S,Mr.
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,S,Master.
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,S,Mrs.
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,C,Mrs.


Now I can get descriptive statistics based on title and age. Passengers can be separated into three groups (youth, woman, man) based on title and gender.

In [9]:
def get_agegroup(passenger):
    title, sex = passenger
    if ((title == 'Miss.') or (title == 'Master.') or (title == 'Mlle.')):
        return 'youth'
    elif (sex == 'female'):
        return 'woman'
    else:
        return 'man'

train_df['AgeGroup'] = train_df[["Title","Sex"]].apply(get_agegroup, axis=1)
test_df['AgeGroup'] = test_df[["Title","Sex"]].apply(get_agegroup, axis=1)

train_df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Embarked,Title,AgeGroup
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,S,Mr.,man
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,C,Mrs.,woman
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,S,Miss.,youth
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,S,Mrs.,woman
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,S,Mr.,man
5,6,0,3,"Moran, Mr. James",male,,0,0,Q,Mr.,man
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,S,Mr.,man
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,S,Master.,youth
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,S,Mrs.,woman
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,C,Mrs.,woman


In [10]:
youthtrain_df = train_df[train_df['AgeGroup']=='youth']
womantrain_df = train_df[train_df['AgeGroup']=='woman']
mantrain_df   = train_df[train_df['AgeGroup']=='man']

youthtest_df = test_df[test_df['AgeGroup']=='youth']
womantest_df = test_df[test_df['AgeGroup']=='woman']
mantest_df   = test_df[test_df['AgeGroup']=='man']

youth_df = pd.concat([youthtrain_df, youthtest_df])
woman_df = pd.concat([womantrain_df, womantest_df])
man_df   = pd.concat([mantrain_df, mantest_df])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # This is added back by InteractiveShellApp.init_path()


In [11]:
average_age_youth = youth_df["Age"].mean()
std_age_youth     = youth_df["Age"].std()
count_nan_age_youthtrain = youthtrain_df["Age"].isnull().sum()
count_nan_age_youthtest  = youthtest_df["Age"].isnull().sum()

rand_y1 = np.random.randint(average_age_youth - std_age_youth, average_age_youth + std_age_youth, size = count_nan_age_youthtrain)
youthtrain_df["Age"][np.isnan(youthtrain_df["Age"])] = rand_y1

rand_y2 = np.random.randint(average_age_youth - std_age_youth, average_age_youth + std_age_youth, size = count_nan_age_youthtest)
youthtest_df["Age"][np.isnan(youthtest_df["Age"])] = rand_y2


average_age_woman = woman_df["Age"].mean()
std_age_woman     = woman_df["Age"].std()
count_nan_age_womantrain = womantrain_df["Age"].isnull().sum()
count_nan_age_womantest  = womantest_df["Age"].isnull().sum()

rand_w1 = np.random.randint(average_age_woman - std_age_woman, average_age_woman + std_age_woman, size = count_nan_age_womantrain)
womantrain_df["Age"][np.isnan(womantrain_df["Age"])] = rand_w1

rand_w2 = np.random.randint(average_age_woman - std_age_woman, average_age_woman + std_age_woman, size = count_nan_age_womantest)
womantest_df["Age"][np.isnan(womantest_df["Age"])] = rand_w2


average_age_man     = man_df["Age"].mean()
std_age_man         = man_df["Age"].std()
count_nan_age_mantrain = mantrain_df["Age"].isnull().sum()
count_nan_age_mantest  = mantest_df["Age"].isnull().sum()

rand_m1 = np.random.randint(average_age_man - std_age_man, average_age_man + std_age_man, size = count_nan_age_mantrain)
mantrain_df["Age"][np.isnan(mantrain_df["Age"])] = rand_m1

rand_m2 = np.random.randint(average_age_man - std_age_man, average_age_man + std_age_man, size = count_nan_age_mantest)
mantest_df["Age"][np.isnan(mantest_df["Age"])] = rand_m2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set 

In [12]:
train_df = pd.concat([youthtrain_df, womantrain_df, mantrain_df])
test_df = pd.concat([youthtest_df, womantest_df, mantest_df])

train_df = train_df.sort_values(by='PassengerId')
test_df = test_df.sort_values(by='PassengerId')

I'll fill any data missing for embarked with Southhampton because it seems to be the most common.

In [13]:
train_df["Embarked"] = train_df["Embarked"].fillna("S")
test_df["Embarked"] = test_df["Embarked"].fillna("S")

For the more discrete variables, I'll convert them into dummy variables.

In [14]:
pclasstrain_df = pd.get_dummies(train_df['Pclass'])
pclasstest_df = pd.get_dummies(test_df['Pclass'])

train_df = train_df.join(pclasstrain_df)
test_df = test_df.join(pclasstest_df)

train_df = train_df.drop(['Pclass', 3], axis=1)
test_df  = test_df.drop(['Pclass', 3], axis=1)

train_df = train_df.rename(columns={1:'PClass1', 2:'PClass2'})
test_df = test_df.rename(columns={1:'PClass1', 2:'PClass2'})

In [15]:
sextrain_df = pd.get_dummies(train_df['Sex'])
sextest_df = pd.get_dummies(test_df['Sex'])

train_df = train_df.join(sextrain_df)
test_df = test_df.join(sextest_df)

train_df = train_df.drop(['Sex', 'male'], axis=1)
test_df  = test_df.drop(['Sex', 'male'], axis=1)

In [16]:
embarkedtrain_df = pd.get_dummies(train_df['Embarked'])
embarkedtest_df = pd.get_dummies(test_df['Embarked'])

train_df = train_df.join(embarkedtrain_df)
test_df = test_df.join(embarkedtest_df)

train_df = train_df.drop(['Embarked', 'S'], axis=1)
test_df  = test_df.drop(['Embarked', 'S'], axis=1)

train_df = train_df.rename(columns={'C':'EmbarkedC', 'Q':'EmbarkedQ'})
test_df = test_df.rename(columns={'C':'EmbarkedC', 'Q':'EmbarkedQ'})

In [17]:
titletrain_df = pd.get_dummies(train_df['Title'])
titletest_df = pd.get_dummies(test_df['Title'])

train_df = train_df.join(titletrain_df)
test_df = test_df.join(titletest_df)

train_df = train_df.drop(['Title'], axis=1)
test_df  = test_df.drop(['Title'], axis=1)

I would also like to investigate age squared because the young and elderly may have been more likely to survive out of courtesy.

In [18]:
agesquaredtrain = train_df['Age']**2
agesquaredtest = test_df['Age']**2

agesquaredtrain_df = agesquaredtrain.to_frame(name='AgeSq')
agesquaredtest_df = agesquaredtest.to_frame(name='AgeSq')

train_df = train_df.join(agesquaredtrain_df)
test_df = test_df.join(agesquaredtest_df)

In [19]:
train_df = train_df.drop(['PassengerId', 'Name', 'AgeGroup', 'Capt.', 'Jonkheer.', 'Lady.', 'Major.', 'Mlle.', 'Mme.', 'Sir.', 'the'], axis=1)
test_df  = test_df.drop(['Name', 'AgeGroup'], axis=1)

In [20]:
train_df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,PClass1,PClass2,female,EmbarkedC,EmbarkedQ,Col.,Don.,Dr.,Master.,Miss.,Mr.,Mrs.,Ms.,Rev.,AgeSq
0,0,22.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,484.0
1,1,38.0,1,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,1444.0
2,1,26.0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,676.0
3,1,35.0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1225.0
4,0,35.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1225.0


In [21]:
test_df.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,PClass1,PClass2,female,EmbarkedC,EmbarkedQ,Col.,Dona.,Dr.,Master.,Miss.,Mr.,Mrs.,Ms.,Rev.,AgeSq
0,892,34.5,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1190.25
1,893,47.0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,2209.0
2,894,62.0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,3844.0
3,895,27.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,729.0
4,896,22.0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,484.0


In [22]:
X_train = train_df.drop("Survived",axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.drop("PassengerId",axis=1).copy()

logreg = LogisticRegression()

logreg.fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)

logreg.score(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8271604938271605

In [23]:
coeff_df = DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Features']
coeff_df["Coefficient Estimate"] = pd.Series(logreg.coef_[0])

coeff_df

Unnamed: 0,Features,Coefficient Estimate
0,Age,-0.043687
1,SibSp,-0.433829
2,Parch,-0.311066
3,PClass1,2.040318
4,PClass2,1.050834
5,female,1.198044
6,EmbarkedC,0.402545
7,EmbarkedQ,0.091342
8,Col.,-0.030355
9,Don.,-0.112294


In [24]:
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })
submission.to_csv('data/titanicfinal.csv', index=False)

Kaggle Score: 0.78468