In [40]:
import numpy as np
import pandas as pd
import re
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import linear_model

We begin by using Pandas to read in the csv file.

**Pandas** is used for easily storing, and preprocessing our data.

Download Data from https://www.kaggle.com/c/titanic/data

In [41]:
df = pd.read_csv("train.csv")
df.sample(7)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
103,104,0,3,"Johansson, Mr. Gustaf Joel",male,33.0,0,0,7540,8.6542,,S
302,303,0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,LINE,0.0,,S
769,770,0,3,"Gronnestad, Mr. Daniel Danielsen",male,32.0,0,0,8471,8.3625,,S
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
433,434,0,3,"Kallio, Mr. Nikolai Erland",male,17.0,0,0,STON/O 2. 3101274,7.125,,S
458,459,1,2,"Toomey, Miss. Ellen",female,50.0,0,0,F.C.C. 13531,10.5,,S
505,506,0,1,"Penasco y Castellana, Mr. Victor de Satode",male,18.0,1,0,PC 17758,108.9,C65,C


From the columns we wish to use, Age and Embarked are missing data. (NaN is our placeholder for missing data)

For this tutorial we will replace all the NaN values in Age with the mean age of the column, and the NaN values in Embarked with the value 'M' to signify a missing embarkment point.

In [42]:
df['Age'] = df['Age'].replace(np.nan, df['Age'].mean(), regex=True)
df['Embarked'] = df['Embarked'].replace(np.nan, "M", regex=True)

**Dealing with Name column** We decided to classify them by their titles i.e. Mr, Miss, Master, etc...

**Using Regex** For each data point in the Name column, we shall extract everything between ',' and '.' hence their titles. We then store it in an array called 'titles' and add that to our DataFrame

In [43]:
names = df['Name']
titles = []

for i in range(len(names)):
	s = names[i]
	title = re.search(', (.*)\.', s)
	title = title.group(1)
	titles.append(title)

df['Titles'] = titles

In [44]:
df.sample(7)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Titles
697,698,1,3,"Mullens, Miss. Katherine ""Katie""",female,29.699118,0,0,35852,7.7333,,Q,Miss
219,220,0,2,"Harris, Mr. Walter",male,30.0,0,0,W/C 14208,10.5,,S,Mr
485,486,0,3,"Lefebre, Miss. Jeannie",female,29.699118,3,1,4133,25.4667,,S,Miss
511,512,0,3,"Webber, Mr. James",male,29.699118,0,0,SOTON/OQ 3101316,8.05,,S,Mr
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S,Miss
803,804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C,Master
358,359,1,3,"McGovern, Miss. Mary",female,29.699118,0,0,330931,7.8792,,Q,Miss


**Dropping Unwanted Columns** Since PassengerID is unique it serves us no value. In addition, we already made the Titles column from the Name column so we can drop that as well. Finally we decided that we should drop Cabin for simplicity.

In [45]:
df = df.drop(['PassengerId'], axis=1)
df = df.drop(['Name'], axis=1)
df = df.drop(['Cabin'], axis=1)

In [46]:
df.sample(7)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Titles
623,0,3,male,21.0,0,0,350029,7.8542,S,Mr
627,1,1,female,21.0,0,0,13502,77.9583,S,Miss
103,0,3,male,33.0,0,0,7540,8.6542,S,Mr
742,1,1,female,21.0,2,2,PC 17608,262.375,C,Miss
840,0,3,male,20.0,0,0,SOTON/O2 3101287,7.925,S,Mr
222,0,3,male,51.0,0,0,21440,8.05,S,Mr
570,1,2,male,62.0,0,0,S.W./PP 752,10.5,S,Mr


**Making values Categorical** This is accomplished by using SkLearns LabelBinarizer to OneHotEncode our values

See: https://www.quora.com/What-is-one-hot-encoding-and-when-is-it-used-in-data-science

We create 4 seperate variables to store the column information of each.

In [47]:
lb1 = preprocessing.LabelBinarizer()
lb2 = preprocessing.LabelBinarizer()
lb3 = preprocessing.LabelBinarizer()
lb4 = preprocessing.LabelBinarizer()

In [48]:
Sex_C = lb1.fit_transform(df.Sex)
Ticket_C = lb2.fit_transform(df.Ticket)
Embarked_C = lb3.fit_transform(df.Embarked)
Titles_C = lb4.fit_transform(df.Titles)

After fitting the columns we take a look at examples of the OneHotEncoding

In [49]:
Sex_C[:10]

array([[1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0]])

In [50]:
Ticket_C[:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [51]:
Embarked_C[:10]

array([[0, 0, 0, 1],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [1, 0, 0, 0]])

In [52]:
Titles_C[:10]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]])

To be able to fit the data into our data frame, we must convert each array to it's own data frame with each column as a seperate feature.

In [53]:
Sex_C = pd.DataFrame(Sex_C, columns=["col_val:S"])

In [54]:
Sex_C.sample(5)

Unnamed: 0,col_val:S
434,1
215,0
567,0
180,0
617,0


For arrays with more than one column we loop over the fitted classes (columns in the array) and make them their own features (columns).

We also drop the a column from each of the new datasets to avoid co-linearity (this was automatically handled for us in the Sex column since it only had two columns)

In [55]:
Ticket_C = pd.DataFrame(Ticket_C, columns=['col_val:' + x for x in lb2.classes_])
Ticket_C = Ticket_C.iloc[:, 0:-1]

In [56]:
Ticket_C.sample(5)

Unnamed: 0,col_val:110152,col_val:110413,col_val:110465,col_val:110564,col_val:110813,col_val:111240,col_val:111320,col_val:111361,col_val:111369,col_val:111426,...,col_val:STON/O2. 3101283,col_val:STON/O2. 3101290,col_val:SW/PP 751,col_val:W./C. 14258,col_val:W./C. 14263,col_val:W./C. 6607,col_val:W./C. 6608,col_val:W./C. 6609,col_val:W.E.P. 5734,col_val:W/C 14208
225,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
660,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
857,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
234,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
Embarked_C = pd.DataFrame(Embarked_C, columns=['col_val:' + x for x in lb3.classes_])
Embarked_C = Embarked_C.iloc[:, 0:-1]

In [58]:
Embarked_C.sample(5)

Unnamed: 0,col_val:C,col_val:M,col_val:Q
671,0,0,0
593,0,0,1
584,1,0,0
61,0,1,0
746,0,0,0


In [59]:
Titles_C = pd.DataFrame(Titles_C, columns=['col_val:' + x for x in lb4.classes_])
Titles_C = Titles_C.iloc[:, 0:-1]

In [60]:
Titles_C.sample(5)

Unnamed: 0,col_val:Capt,col_val:Col,col_val:Don,col_val:Dr,col_val:Jonkheer,col_val:Lady,col_val:Major,col_val:Master,col_val:Miss,col_val:Mlle,col_val:Mme,col_val:Mr,col_val:Mrs,col_val:Mrs. Martin (Elizabeth L,col_val:Ms,col_val:Rev,col_val:Sir
373,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
619,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
769,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
440,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
550,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


Next, we concat all the new DataFrames together into Df_New and add the columns from the original DF to it.

In [61]:
df_new = pd.concat([Sex_C, Ticket_C, Embarked_C, Titles_C], axis = 1)

In [62]:
df_new.sample(5)

Unnamed: 0,col_val:S,col_val:110152,col_val:110413,col_val:110465,col_val:110564,col_val:110813,col_val:111240,col_val:111320,col_val:111361,col_val:111369,...,col_val:Master,col_val:Miss,col_val:Mlle,col_val:Mme,col_val:Mr,col_val:Mrs,col_val:Mrs. Martin (Elizabeth L,col_val:Ms,col_val:Rev,col_val:Sir
861,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
586,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
674,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
100,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
459,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [63]:
df_new['Survived'] = df.Survived
df_new['Pclass'] = df.Pclass
df_new['Age'] = df.Age
df_new['SibSp'] = df.SibSp
df_new['Parch'] = df.Parch
df_new['Fare'] = df.Fare

In [64]:
df_new.sample(5)

Unnamed: 0,col_val:S,col_val:110152,col_val:110413,col_val:110465,col_val:110564,col_val:110813,col_val:111240,col_val:111320,col_val:111361,col_val:111369,...,col_val:Mrs. Martin (Elizabeth L,col_val:Ms,col_val:Rev,col_val:Sir,Survived,Pclass,Age,SibSp,Parch,Fare
245,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,44.0,2,0,90.0
832,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3,29.699118,0,0,7.2292
886,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,2,27.0,0,0,13.0
568,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3,29.699118,0,0,7.2292
165,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,3,9.0,0,2,20.525


We create our X data by dropping Survived and keeping the other columns.

We create our Y data by only keeping the Survived column.

In [65]:
df_x = df_new.drop(['Survived'], axis = 1)
df_y = df_new[['Survived']]

We randomly select and split the x_train, y_train and the x_test, y_test with 15% of the data being used for testing

In [66]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size = 0.15, random_state = 10)

We select SkLearn's logistic regression model and fit the x and y data to it.

We feed the model the test data to generate our predictions

In [67]:
clf = linear_model.LogisticRegression()
clf = clf.fit(x_train, y_train.values.ravel())

a = clf.predict(x_test)

Testing accuracy using SkLearns accuracy score where we feed it the expected values and compare them to our generated values.

In [68]:
accuracy_score(y_test, a)

0.88059701492537312