In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
#import csv and display first 5 rows
survivors = pd.read_csv('train.csv')
survivors.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
#Use describe() to get a quick glance at the structure of the data. 
#NOTE: Columns with text values are omitted
survivors.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
#Alter the data to be more ML friendly

#THis is a list of bins for "Age". ML does better with discrete data rather than continuou. 
#This turns ages into age brackets
bins= (0,5,10,15,20,25,30,35,40,45,50,55,60,65, 70, 75,80,85) 

#Turn Text data into categorical data. Ie. "Male" becomes 1 and "Female" becomes 2.  
survivors["Sex"]=survivors["Sex"].astype("category").cat.codes
survivors["Embarked"]=survivors["Embarked"].astype("category").cat.codes
survivors["Embarked"]=survivors["Embarked"].astype("category").cat.codes

#ML cannot handle Null values or Na values. 
#This fills any NaN in Column "Age" with the average age. 
#Idk if this is appropriate but I read somewhere it's ok.  
#The purpose is to get as much data as possible. 
survivors["Age"]=survivors["Age"].fillna(29.699118)

In [5]:
#This step drops unused columns from the dataframe
survivors.drop(["Name", "Ticket", "Cabin"], axis=1, inplace=True)

In [6]:
#checks the number of NaN's in column age. There should be 0 since they were filled with 29.xxx.
survivors["Age"].isna().sum()

0

In [7]:
# Assign the data to X and y

#Drop any remaining Na values. ML cannot handle them. 
survivors.dropna(inplace=True)
#Assign X and y values
#X is our input and y is our output
X = survivors[["Pclass","Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]
y = survivors["Survived"].values.reshape(-1, 1)

#X and y must have same 1st dimension (891)
print(X.shape, y.shape)

(891, 7) (891, 1)


In [8]:
#split the data into training and testing sets
#random state allows us to randomly select data but be able to reuse that same random data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))

668
668
223
223


In [9]:
#import the ML model we want to use
from sklearn.linear_model import LogisticRegression
#Instantiate the class LogisticRegression()
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [10]:
#train our chosen model using our data
classifier.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [11]:
#Print our scores
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.7994011976047904
Testing Data Score: 0.8026905829596412


In [14]:
#get a list of our predictions
predictions = classifier.predict(X_test)

#create a more useful report on our model
from sklearn.metrics import classification_report

print(classification_report(y_test, predictions,target_names=["Neg","Pos"]))#Note: I'm not sure if Neg/Pos is correct. Need to verify.

              precision    recall  f1-score   support

         Neg       0.81      0.88      0.85       137
         Pos       0.78      0.67      0.72        86

   micro avg       0.80      0.80      0.80       223
   macro avg       0.80      0.78      0.79       223
weighted avg       0.80      0.80      0.80       223



In [None]:
# predictions = classifier.predict(X_test)
# pd.DataFrame({"Prediction": predictions, "Actual": np.ravel(y_test)})