# Import Modules

In [31]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

# Static Variables

In [2]:
# Paths
# Build the directory prefix with os.path.join so the separator matches the
# host OS; the hard-coded 'Datasets\\' only resolved correctly on Windows.
# Joining with '' keeps the trailing separator, so base_path has the same
# value as before on Windows and string concatenation onto it still works.
base_path = os.path.join('Datasets', '')
train_data = os.path.join(base_path, 'train.csv')
test_data = os.path.join(base_path, 'test.csv')

# Load Data
train_df = pd.read_csv(train_data)
test_df = pd.read_csv(test_data)

In [3]:
# Display the training DataFrame as loaded, before any cleaning.
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Drop columns that won't be useful.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' makes this cell safe to re-run: a second execution no longer
# raises KeyError because the columns have already been removed.
train_df = train_df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')
train_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


# Exploratory Data Analysis / Cleaning

In [5]:
# Sex and Embarked hold string values; replace both with numeric codes.
sex_codes = {'male': 0, 'female': 1}  # males are 0, females are 1
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}  # one integer per Embarked value

# Build both encoded columns first, then write them back onto the frame.
new_sex = train_df['Sex'].replace(sex_codes)
new_embarked = train_df['Embarked'].replace(embarked_codes)
train_df['Sex'] = new_sex
train_df['Embarked'] = new_embarked
train_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.2500,0.0
1,1,1,1,38.0,1,0,71.2833,1.0
2,1,3,1,26.0,0,0,7.9250,0.0
3,1,1,1,35.0,1,0,53.1000,0.0
4,0,3,0,35.0,0,0,8.0500,0.0
...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000,0.0
887,1,1,1,19.0,0,0,30.0000,0.0
888,0,3,1,,1,2,23.4500,0.0
889,1,1,0,26.0,0,0,30.0000,1.0


In [6]:
# Summary statistics for every column of the encoded frame.
train_df.describe()
# Age reports only 714 values out of 891, so it has to be cleaned (over 100 nulls).
# Embarked is missing just 2 values -- not enough to cause concern, but they
# still need to be filled in before we can move on.

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,889.0
mean,0.383838,2.308642,0.352413,29.699118,0.523008,0.381594,32.204208,0.362205
std,0.486592,0.836071,0.47799,14.526497,1.102743,0.806057,49.693429,0.636157
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,20.125,0.0,0.0,7.9104,0.0
50%,0.0,3.0,0.0,28.0,0.0,0.0,14.4542,0.0
75%,1.0,3.0,1.0,38.0,1.0,0.0,31.0,1.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,2.0


In [7]:
# Pairwise correlation between all columns, to look for a column that can
# guide how the missing Embarked values are filled.
train_df.corr()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
Survived,1.0,-0.338481,0.543351,-0.077221,-0.035322,0.081629,0.257307,0.108669
Pclass,-0.338481,1.0,-0.1319,-0.369226,0.083081,0.018443,-0.5495,0.043835
Sex,0.543351,-0.1319,1.0,-0.093254,0.114631,0.245489,0.182333,0.118593
Age,-0.077221,-0.369226,-0.093254,1.0,-0.308247,-0.189119,0.096067,0.012186
SibSp,-0.035322,0.083081,0.114631,-0.308247,1.0,0.414838,0.159651,-0.060606
Parch,0.081629,0.018443,0.245489,-0.189119,0.414838,1.0,0.216225,-0.07932
Fare,0.257307,-0.5495,0.182333,0.096067,0.159651,0.216225,1.0,0.063462
Embarked,0.108669,0.043835,0.118593,0.012186,-0.060606,-0.07932,0.063462,1.0


In [8]:
# Show the two rows with a missing Embarked value; Age is the only column in
# which they differ from each other.
# NOTE(review): the original note here said Age has the most correlation with
# Embarked, but the correlation table above lists it as the weakest (0.012) --
# confirm that Age is a sensible basis for this fill.
train_df[train_df['Embarked'].isnull()]
# Let's find the median age for each Embarked value and fill these with whichever is closest

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
61,1,1,1,38.0,0,0,80.0,
829,1,1,1,62.0,0,0,80.0,


In [9]:
# Median Age for each Embarked value; the Age column is selected before
# aggregating so only that one median is computed.
train_df.groupby('Embarked')['Age'].median()
# Both rows with a missing Embarked (ages 38 and 62) are closest to the median
# for Embarked == 1, so both are being assigned the value of 1.

Embarked
0.0    28.0
1.0    29.0
2.0    27.0
Name: Age, dtype: float64

In [10]:
# Fill the two missing Embarked values with 1.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the column selection: that chained in-place form is deprecated in recent
# pandas and, with Copy-on-Write enabled, leaves train_df unchanged.
train_df['Embarked'] = train_df['Embarked'].fillna(1)

In [11]:
# We'll use a k-nearest-neighbours regressor on the other columns to find the
# most likely age. First, re-check the correlations now that Embarked is filled.
train_df.corr()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
Survived,1.0,-0.338481,0.543351,-0.077221,-0.035322,0.081629,0.257307,0.111203
Pclass,-0.338481,1.0,-0.1319,-0.369226,0.083081,0.018443,-0.5495,0.040138
Sex,0.543351,-0.1319,1.0,-0.093254,0.114631,0.245489,0.182333,0.121266
Age,-0.077221,-0.369226,-0.093254,1.0,-0.308247,-0.189119,0.096067,0.017651
SibSp,-0.035322,0.083081,0.114631,-0.308247,1.0,0.414838,0.159651,-0.061591
Parch,0.081629,0.018443,0.245489,-0.189119,0.414838,1.0,0.216225,-0.080277
Fare,0.257307,-0.5495,0.182333,0.096067,0.159651,0.216225,1.0,0.065492
Embarked,0.111203,0.040138,0.121266,0.017651,-0.061591,-0.080277,0.065492,1.0


In [12]:
# Set up two DataFrames split on Age: rows with a known Age (to train on) and
# rows where Age is null (to predict).
age_is_missing = train_df['Age'].isnull()
no_age_nan = train_df[~age_is_missing]
only_age_nan = train_df[age_is_missing]

# Features and target are taken from the rows whose Age is known.
age_feature_columns = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']
X_array = np.array(no_age_nan[age_feature_columns])
y_array = np.array(no_age_nan['Age'])

In [13]:
# Fit a 5-nearest-neighbours regressor on the rows whose Age is known.
# fit() hands back the estimator itself, so it can be bound to knr directly;
# the bare name on the last line displays the fitted estimator.
knr = KNeighborsRegressor(n_neighbors=5).fit(X_array, y_array)
knr

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [14]:
# Score the regressor on the same rows it was fitted on, so this is an
# in-sample figure and not an estimate of accuracy on unseen rows.
knr.score(X_array,y_array)

0.3739727538572384

In [15]:
# Predict Age for the rows where it is missing.
# The regressor was fitted on a plain NumPy array, so the features are
# converted the same way here; handing a DataFrame to an estimator that was
# fitted without feature names makes scikit-learn emit a warning.
predicted_age = knr.predict(np.array(only_age_nan[['Pclass','Sex','SibSp','Parch','Fare','Embarked']]))
predicted_age

array([47.2  , 29.4  , 23.   , 32.5  , 21.7  , 29.8  , 21.584, 21.1  ,
       24.8  , 27.8  , 30.2  , 29.   , 21.1  , 26.4  , 49.4  , 38.9  ,
       16.2  , 29.8  , 30.2  , 21.1  , 30.2  , 30.2  , 29.8  , 34.2  ,
       32.6  , 30.2  , 47.2  , 17.6  , 20.4  , 34.8  , 26.8  , 37.   ,
       33.8  , 44.8  , 19.2  , 37.   , 32.   , 45.4  , 22.1  , 47.2  ,
       21.1  , 37.   , 47.2  , 29.8  , 21.4  , 25.8  , 19.9  , 22.1  ,
       34.8  , 39.2  , 47.2  , 21.1  , 48.6  , 21.1  , 33.8  , 42.6  ,
       38.9  , 48.6  , 21.1  , 29.6  , 32.6  , 30.2  , 28.   , 37.   ,
       23.2  , 40.4  , 29.8  , 27.2  , 49.4  , 32.5  , 21.7  , 21.7  ,
       29.   , 22.   , 21.1  , 36.4  , 29.8  , 42.1  , 21.4  , 29.8  ,
       40.1  , 33.8  , 27.   , 27.8  , 34.8  , 47.2  , 27.2  , 21.6  ,
       27.   , 30.2  , 41.8  , 47.2  , 30.2  , 33.8  , 42.1  , 34.8  ,
       44.4  , 33.8  , 21.4  , 27.   , 22.4  , 30.3  , 21.1  , 52.6  ,
       30.2  , 32.2  , 32.5  , 24.8  , 30.8  , 24.8  , 14.   , 40.   ,
      

In [16]:
# Now let's fill it in.
# Rows are addressed through the index of only_age_nan (the frame the
# predictions were made from) rather than by re-testing Age for nulls. That
# ties each prediction to its source row and keeps the cell re-runnable: once
# the nulls are gone, a null mask would select 0 rows and no longer match the
# length of predicted_age.
train_df.loc[only_age_nan.index, 'Age'] = predicted_age
train_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.2500,0.0
1,1,1,1,38.0,1,0,71.2833,1.0
2,1,3,1,26.0,0,0,7.9250,0.0
3,1,1,1,35.0,1,0,53.1000,0.0
4,0,3,0,35.0,0,0,8.0500,0.0
...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000,0.0
887,1,1,1,19.0,0,0,30.0000,0.0
888,0,3,1,26.8,1,2,23.4500,0.0
889,1,1,0,26.0,0,0,30.0000,1.0


In [17]:
# Re-check the summary after filling Age and Embarked: every column should
# now report the full count.
train_df.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.352413,30.137192,0.523008,0.381594,32.204208,0.363636
std,0.486592,0.836071,0.47799,13.603728,1.102743,0.806057,49.693429,0.636159
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,21.1,0.0,0.0,7.9104,0.0
50%,0.0,3.0,0.0,29.0,0.0,0.0,14.4542,0.0
75%,1.0,3.0,1.0,38.0,1.0,0.0,31.0,1.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,2.0


In [21]:
# Save file for tableau use.
# os.path.join picks the separator for the host OS; the literal
# 'Datasets\\new_train.csv' only pointed inside the Datasets folder on Windows.
# The resulting path on Windows is unchanged.
train_df.to_csv(os.path.join('Datasets', 'new_train.csv'))
train_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.2500,0.0
1,1,1,1,38.0,1,0,71.2833,1.0
2,1,3,1,26.0,0,0,7.9250,0.0
3,1,1,1,35.0,1,0,53.1000,0.0
4,0,3,0,35.0,0,0,8.0500,0.0
...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000,0.0
887,1,1,1,19.0,0,0,30.0000,0.0
888,0,3,1,26.8,1,2,23.4500,0.0
889,1,1,0,26.0,0,0,30.0000,1.0


In [57]:
# Generate arrays for testing: feature matrix X and the Survived target y.
X = np.array(train_df[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']])
y = np.array(train_df['Survived'])

# Pin the seed so the train/test partition -- and every score computed from
# it -- is the same on each Restart & Run All. The split proportions are left
# at the library default, as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)