In [1]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
import os

In [2]:
#import csv and display first 5 rows
survivors = pd.read_csv('train.csv')
print(survivors.shape)
survivors.head(5)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
#Use describe() to get a quick glance at the structure of the data. 
#NOTE: Columns with text values are omitted
print(survivors.shape)
survivors.describe(include="all")

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
unique,,,,891,2,,,,681,,147,3
top,,,,"Burns, Miss. Elizabeth Margaret",male,,,,CA. 2343,,C23 C25 C27,S
freq,,,,1,577,,,,7,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [4]:
print("# Missing Age: ",survivors["Age"].isna().sum())

# Missing Age:  177


In [5]:
#Alter the data to be more ML friendly

#Turn Text data into categorical data. Ie. "Male" becomes 1 and "Female" becomes 2.  
survivors["Sex"]=survivors["Sex"].astype("category").cat.codes
survivors["Embarked"]=survivors["Embarked"].astype("category").cat.codes
# survivors["Embarked"]=survivors["Embarked"].astype("category").cat.codes

#ML cannot handle Null values or Na values. 
#This fills any NaN in Column "Age" with the average age. 
#Idk if this is appropriate but I read somewhere it's ok.  
#The purpose is to get as much data as possible. 
survivors["Age"]=survivors["Age"].fillna(29.699118)

#THis is a list of bins for "Age". ML does better with discrete data rather than continuou. 
#This turns ages into age brackets
ageBins= (0,5,10,15,20,25,30,35,40,45,50,55,60,65, 70, 75,80,85)
ageBins2= (0,16,55,85) 
ageBins3=(0,5,10,20,30,40,50,60,70,80,90)
survivors['Age'] = pd.cut(survivors['Age'], bins=ageBins2, labels=False)

fareBins= [0,50,100,150,200,250,300,350,400,450,500,550,600, 650,700,750,800,850,900]
fareBins2= [0,100,200,300,400,500,600,700,800,900]
fareBins3= [0,100,500,900]
survivors['Fare'] = pd.cut(survivors['Fare'], bins=fareBins3, labels=False)


In [6]:
#This step drops unused columns from the dataframe
survivors.drop(["Name", "Ticket", "Cabin"], axis=1, inplace=True)

In [7]:
#checks the number of NaN's in column age. There should be 0 since they were filled with 29.xxx.
survivors["Age"].isna().sum()

0

In [8]:
survivors.head()
survivors.Fare.unique()

array([ 0.,  1., nan,  2.])

In [9]:
# Assign the data to X and y

#Drop any remaining Na values. ML cannot handle them. 
survivors.dropna(inplace=True)
#Assign X and y values
#X is our input and y is our output
X = survivors[["Pclass","Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]
y = survivors["Survived"].values.reshape(-1, 1)

#X and y must have same 1st dimension (891)
print(X.shape, y.shape)

(876, 7) (876, 1)


In [10]:
#split the data into training and testing sets
#random state allows us to randomly select data but be able to reuse that same random data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))

657
657
219
219


In [11]:
X_train= X_train.values
y_train= np.stack(y_train, axis=1)
y_train = np.ravel(y_train)

In [12]:
# Support vector machine linear classifier
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [13]:
# Model Accuracy
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.799


In [14]:
target = survivors["Survived"]
target_names = ["negative", "positive"]

In [16]:
# Calculate classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

    negative       0.82      0.86      0.84       134
    positive       0.76      0.71      0.73        85

   micro avg       0.80      0.80      0.80       219
   macro avg       0.79      0.78      0.79       219
weighted avg       0.80      0.80      0.80       219

