In [None]:
#Generic DataScience Requirements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns 
import csv

#Transformers
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#Classifiers
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

#Performance Measuring Tools
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [None]:
# Read the training and test CSVs from the working directory.
titanic, titanic_test = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Keep the test PassengerId column aside; it is dropped from the feature
# frame later but is needed when the predictions are written out.
test_pass_id = titanic_test['PassengerId']

In [None]:
titanic.head()

In [None]:
titanic.info()

In [None]:
# Age, Cabin and Embarked all have missing values

In [None]:
titanic.describe()

#### Data Description 
#### Passenger Id- Id of each passenger 
#### Survived - 0= No, 1 = yes
#### Pclass - ticket class 
#### Name - name of passenger 
#### sex - male or female 
#### sibsp - # of siblings/ spouses aboard the titanic 
#### parch - number of parents/children aboard the Titanic 
#### ticket - ticket number 
#### fare - passenger fare
#### cabin- cabin number
#### embarked - port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

#### Let's start by looking at the categorical variables 

In [None]:
titanic['Pclass'].value_counts() # majority of passengers 3rd class

In [None]:
titanic['Embarked'].value_counts()

In [None]:
titanic['Sex'].value_counts()

In [None]:
titanic['Survived'].value_counts()

In [None]:
titanic['Cabin'].value_counts()

In [None]:
titanic['Cabin'].isna().sum()

In [None]:
titanic['Ticket'].value_counts()

In [None]:
titanic['Ticket'].isna().sum()

In [None]:
titanic['Ticket'].describe()

In [None]:
titanic.groupby(['Ticket']).size()

In [None]:
# Bar chart of the median Age for each Pclass, with one bar per Sex.
median_age = sns.catplot(
    data=titanic,
    kind='bar',
    x='Pclass',
    y='Age',
    hue='Sex',
    estimator=np.median,
    palette='pastel',
)
median_age.fig.suptitle('Median age of per class')

In [None]:
# Ticket group size: how many passengers share each ticket number.
#
# The column is added with .assign(), which returns a NEW frame used only for
# inspection. Writing it into `titanic` itself would leave a 'count' column in
# the training data that is never dropped and never added to `titanic_test`,
# so the training matrix would have one more feature than the test matrix.
ticket_group_sizes = titanic.assign(
    count=titanic.groupby('Ticket')['Ticket'].transform('count')
)
ticket_group_sizes.head()

In [None]:
titanic.Ticket.unique() 

#### We see that the median age increases with each higher class. Unsurprisingly, older passengers were in the better classes.


In [None]:
# Bar chart of the median Age by survival outcome, with one bar per Sex.
avg_surv = sns.catplot(
    data=titanic,
    kind='bar',
    x='Survived',
    y='Age',
    hue='Sex',
    estimator=np.median,
    palette='pastel',
)
avg_surv.fig.suptitle('Median age of survived passenagers')

#### The median age of women and men who survived was around 27. Men who did not survive were slightly older, and women who did not survive were around 24 years of age.

In [None]:
# Bar chart of the mean Fare for each Pclass.
avg_fare = sns.catplot(
    data=titanic,
    kind='bar',
    x='Pclass',
    y='Fare',
    estimator=np.mean,
    palette='pastel',
)
avg_fare.fig.suptitle('Averge cost of fare per class')

#### The average cost of a ticket increases by over 50% for first class

In [None]:
# Survival rate (mean of Survived) for each Sex, with one bar per Pclass.
class_surv = sns.catplot(
    data=titanic,
    kind='bar',
    x='Sex',
    y='Survived',
    hue='Pclass',
    palette='pastel',
)
class_surv.fig.suptitle('Sex survived by class')

#### We see that females had a higher survival rate, and class seems to be a factor. The majority of survivors were first-class women.

In [None]:
# Survival rate for each port of embarkation, with one bar per Sex.
embarked_surv = sns.catplot(
    data=titanic,
    kind='bar',
    x='Embarked',
    y='Survived',
    hue='Sex',
    palette='pastel',
)
embarked_surv.fig.suptitle('Survival rate of embarked location')

#### Does embarked location have any impact on survival?  There is not an embarked location that appears to have a higher survival rate

In [None]:
# Survival rate for each port of embarkation, with one bar per Pclass.
embarked_surv = sns.catplot(
    data=titanic,
    kind='bar',
    x='Embarked',
    y='Survived',
    hue='Pclass',
    palette='pastel',
)
embarked_surv.fig.suptitle('Survival rate of embarked location')

In [None]:
# Mean Fare for each survival outcome.
fare_surv = sns.catplot(
    data=titanic,
    kind='bar',
    x='Survived',
    y='Fare',
    palette='pastel',
)
fare_surv.fig.suptitle('Average fare of survived passengers')

#### We see that, on average, passengers who survived spent 45 dollars on their fare

#### Let's look at the relationship between SibSp and Parch with Survival 

In [None]:
sns.pointplot(x='Survived', y='SibSp', data= titanic)

#### Passengers with more siblings or spouses present were less likely to survive

In [None]:
sns.pointplot(x='Survived', y='Parch', data= titanic)

#### The number of parents/children had a positive relationship with survival, suggesting parents and children may have been prioritized. This is the opposite of the relationship we see between survival and SibSp.

In [None]:
# transform ticket class, sex, embark location 
#age, cabin,embarked all missing data deal with missing values
# see how ticket class, sex, ticket fare, age and embarked location predict survival rate 

# 1. Drop columns we don't need
# 2. Deal with missing values
# 3. Hot encode categorical values
# 4. Drop NA
# 5. Standardize values
# 6. Run models 

In [None]:
# Name, Cabin and Ticket are removed from both the training and the test frame.
unused_columns = ['Name', 'Cabin', 'Ticket']
titanic = titanic.drop(columns=unused_columns)
titanic_test = titanic_test.drop(columns=unused_columns)

In [None]:
titanic.head()

In [None]:
#embarked has 2 null values. 
titanic['Embarked'].value_counts()


In [None]:
# Replace missing ports of embarkation with 'S'.
# NOTE(review): 'S' is presumably the most frequent port according to the
# value_counts() above; confirm against its output.
titanic['Embarked'] = titanic['Embarked'].fillna('S')

# Fill the test frame from its OWN column. Assigning the training column here
# would overwrite every test passenger's port with the port of whichever
# training passenger happens to share the same row index.
titanic_test['Embarked'] = titanic_test['Embarked'].fillna('S')

In [None]:
# let's turn categorical values into dummy variables

In [None]:
# One-hot encode the three categorical columns of the training frame.
cat_encoder = OneHotEncoder(categories='auto')
features = cat_encoder.fit_transform(
    titanic[['Pclass', 'Sex', 'Embarked']]
).toarray()

# categories_ is a list holding one array per encoded column, and the arrays
# have different lengths. Calling np.array() on such a ragged list raises on
# current NumPy, so the per-column arrays are concatenated into one flat label
# array instead.
feature_labels = np.concatenate(cat_encoder.categories_)
feature_labels

In [None]:
# Encode the TEST frame with the encoder that was fitted on the training data.
# transform() (not fit_transform()) keeps the column layout identical to the
# training encoding, and the input must be titanic_test: encoding `titanic`
# here would attach training passengers' categories to the test rows.
features_test = cat_encoder.transform(
    titanic_test[['Pclass', 'Sex', 'Embarked']]
).toarray()

# Flat array of category labels, in the same order as the encoded columns.
feature_labels_test = np.concatenate(cat_encoder.categories_)
feature_labels_test

In [None]:
# Wrap the encoded training matrix in a DataFrame with readable column names.
encoded_column_names = ['1', '2', '3', 'female', 'male', 'C', 'Q', 'S']
feats = pd.DataFrame(data=features, columns=encoded_column_names)
feats.head()


In [None]:
# Wrap the encoded test matrix in a DataFrame with readable column names.
encoded_test_column_names = ['1', '2', '3', 'female', 'male', 'C', 'Q', 'S']
feats_test = pd.DataFrame(data=features_test, columns=encoded_test_column_names)
feats_test.head()

In [None]:
# Attach the one-hot columns by index, then remove the identifier and the
# original categorical columns they replace.
titanic = (
    titanic
    .join(feats, how='inner')
    .drop(columns=['PassengerId', 'Pclass', 'Sex', 'Embarked'])
)
titanic.head()

In [None]:
# Attach the one-hot columns by index, then remove the identifier and the
# original categorical columns they replace.
titanic_test = (
    titanic_test
    .join(feats_test, how='inner')
    .drop(columns=['PassengerId', 'Pclass', 'Sex', 'Embarked'])
)
titanic_test.head()

In [None]:
# Replace missing values in the training frame with the median of their column.
imputer = SimpleImputer(strategy='median')
x = imputer.fit(titanic).transform(titanic)

In [None]:
# Impute the test frame with medians learned from the TRAINING data only.
# Re-fitting on the test frame would fill test gaps with test-set statistics,
# so train and test would be preprocessed inconsistently.
# A separate imputer is fitted on just the columns the test frame has, because
# `imputer` above was fitted on the full training frame (including 'Survived'),
# which the test frame does not contain.
feature_cols = titanic_test.columns
feature_imputer = SimpleImputer(strategy='median').fit(titanic[feature_cols])
x_test = feature_imputer.transform(titanic_test)

In [None]:
# Rebuild the training DataFrame from the imputed array, keeping the original
# row index and column names.
titanic = pd.DataFrame(data=x, columns=titanic.columns, index=titanic.index)
titanic.head()

In [None]:
# Rebuild the test DataFrame from the imputed array, keeping the original
# row index and column names.
titanic_test = pd.DataFrame(x_test, index=titanic_test.index, columns=titanic_test.columns)
# Show the frame that was just built (the test frame, not the training frame).
titanic_test.head()

In [None]:
titanic_sur = titanic['Survived']

In [None]:
titanic_pred = titanic.drop('Survived', axis=1)

In [None]:
# Standardize the features.
#
# 1. Keep only the features present in BOTH frames, in the training column
#    order, so the scaler and every model fitted later can be applied to the
#    test set without a feature-count mismatch.
feature_cols = [col for col in titanic_pred.columns if col in titanic_test.columns]
titanic_pred = titanic_pred[feature_cols]

# 2. Learn mean/std from the training data only.
scaler = StandardScaler()
x = scaler.fit_transform(titanic_pred)

# 3. Apply the SAME training statistics to the test set, and store the scaled
#    values back in `titanic_test`, because the prediction cells further down
#    call predict() on `titanic_test` and the models are trained on scaled data.
#    NOTE: this rebinds `titanic_test`, so re-running this cell alone would
#    scale the test frame a second time; re-run from the imputation cells.
x_test = scaler.transform(titanic_test[feature_cols])
titanic_test = pd.DataFrame(x_test, index=titanic_test.index, columns=feature_cols)

In [None]:
# Rebuild the training DataFrame from the scaled array, labelled with the
# predictor frame's index and column names.
titanic = pd.DataFrame(
    data=x,
    columns=titanic_pred.columns,
    index=titanic_pred.index,
)
titanic.head()

In [None]:
# Re-attach the (unscaled) Survived column to the scaled features by index.
titanic = titanic.join(titanic_sur, how='outer')
titanic

Let's do some additional visualization of the cleaned data.

In [None]:
# Distribution of the (standardized) Age column.
# sns.distplot is deprecated; histplot with kde=True and stat='density' draws
# the same density-normalized histogram with a KDE overlay.
age_ax = sns.histplot(titanic['Age'], kde=True, stat='density')
age_ax.set_xlabel('Distribution of Age')

In [None]:
# Pairwise correlations between all columns of the cleaned frame.
corr = titanic.corr()

# Hide the upper triangle (and diagonal): it only mirrors the lower triangle.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(10, 10))

# Two different hues, so negative and positive correlations are told apart.
# diverging_palette(10, 10) used the same hue for both ends of the scale.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# `linewidths` is heatmap's own parameter. Passing the alias `linewidth`
# instead is forwarded to matplotlib alongside heatmap's default `linewidths`,
# and the two aliases conflict.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={'shrink': .5}, ax=ax)
ax.set_title('Correlation between features')

As expected we see some correlations between sex, class and survival. 

# Summary
Overall, we've found several possible predictors of survival, such as the boarding class passengers belonged to, age, sex, and fare price. The parent/child relationship and the sibling/spouse relationship may also play an important role in survival.

We anticipate these features will be able to successfully predict survival of passengers


In [None]:
# let's create our split our data into a test, train df. 
#train_set, test_set = train_test_split(titanic, test_size=0.2,random_state = 72)


# Building the Model

In [None]:
# Separate the label column from the feature matrix.
# No hold-out split is made here: a separate test file is already loaded.
y_train = titanic['Survived']
train_set = titanic.drop(columns='Survived')

In [None]:
# First model: a linear classifier trained with stochastic gradient descent.
# fit() returns the estimator itself, so construction and fitting are chained.
sgd_clf = SGDClassifier(random_state=42).fit(train_set, y_train)
sgd_clf

In [None]:
cross_val_score(sgd_clf, train_set, y_train,cv=3, scoring='accuracy')

In [None]:
y_train_pred = cross_val_predict(sgd_clf, train_set, y_train, cv=3)

In [None]:
confusion_matrix(y_train, y_train_pred)

In [None]:
precision_score(y_train, y_train_pred) # precision = TP / (TP + FP), read from the confusion matrix above

In [None]:
# Recompute precision and recall by hand from the confusion matrix, as a
# cross-check of precision_score / recall_score. The counts are read from the
# matrix itself rather than typed in, so they stay correct when the data or
# the model changes.
# For binary labels, ravel() yields the cells in the order TN, FP, FN, TP.
tn, fp, fn, tp = confusion_matrix(y_train, y_train_pred).ravel()
print(f'precision: {tp / (tp + fp)}')
print(f'recall: {tp / (tp + fn)}')

In [None]:
recall_score(y_train, y_train_pred)

In [None]:
f1_score(y_train, y_train_pred)

In [None]:
# Out-of-fold decision-function scores for the SGD model (3-fold CV).
y_scores = cross_val_predict(
    sgd_clf,
    train_set,
    y_train,
    cv=3,
    method='decision_function',
)

# False-positive rate, true-positive rate and thresholds for the ROC curve.
roc_points = roc_curve(y_train, y_scores)
fpr, tpr, thresholds = roc_points

In [None]:
def plot_roc_curve(fpr,tpr, label=None):
    """Draw a ROC curve on the current matplotlib axes.

    Parameters
    ----------
    fpr : array-like
        False-positive rates (x values).
    tpr : array-like
        True-positive rates (y values), same length as ``fpr``.
    label : str, optional
        Legend label for the curve.

    The dashed black diagonal from (0, 0) to (1, 1) is drawn as a reference
    line. Axis labels are set so the figure can be read on its own.
    """
    plt.plot(fpr,tpr, linewidth=2, label=label)
    plt.plot([0,1],[0,1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

In [None]:
plot_roc_curve(fpr,tpr)
plt.show()

In [None]:
# Area under the ROC curve for the SGD model.
# AUC must be computed from the continuous decision scores (y_scores, the same
# values the ROC curve above was built from). Feeding the hard 0/1 predictions
# collapses the curve to a single operating point and misreports the AUC.
roc_auc_score(y_train, y_scores)

In [None]:
# Now let's try Random Forest
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(train_set, y_train)

# Out-of-fold class probabilities (3-fold CV); one column per class.
y_probas_forest = cross_val_predict(
    forest_clf,
    train_set,
    y_train,
    cv=3,
    method='predict_proba',
)

In [None]:
# Use the second probability column as the score for the ROC curve.
y_scores_forest = y_probas_forest[:, 1]

forest_roc_points = roc_curve(y_train, y_scores_forest)
fpr_forest, tpr_forest, thresholds_forest = forest_roc_points

In [None]:
# Overlay both ROC curves: SGD as a dotted blue line, Random Forest as a solid line.
plt.plot(fpr, tpr, 'b:', label='SGD')
plot_roc_curve(fpr_forest, tpr_forest, label='Random Forest')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Gaussian Naive Bayes model (GaussianNB is imported in the notebook's import cell).
gnb = GaussianNB()
gnb.fit(train_set, y_train)

# Predict the test set once. Both names are kept because they were both
# defined by this cell; the second is an independent copy of the first.
y_pred_gnb = gnb.predict(titanic_test)
gnb_test_results = y_pred_gnb.copy()

In [None]:
cross_val_score(gnb, train_set,y_train, cv=3, scoring='accuracy')

In [None]:
# Out-of-fold predictions for Gaussian Naive Bayes (3-fold CV) and their confusion matrix.
y_train_pred_gnb = cross_val_predict(
    gnb,
    train_set,
    y_train,
    cv=3,
)
confusion_matrix(y_train, y_train_pred_gnb)

In [None]:
# Precision, recall and F1 of the cross-validated Gaussian Naive Bayes predictions.
gnb_precision = precision_score(y_train, y_train_pred_gnb)
gnb_recall = recall_score(y_train, y_train_pred_gnb)
gnb_f1 = f1_score(y_train, y_train_pred_gnb)

print('Gaussion Results')
print(f'Precision:{gnb_precision}')
print(f'Recall:{gnb_recall}')
print(f'F1 Score:{gnb_f1}')

In [None]:
# Predict the test set with the fitted Random Forest and SGD classifiers
forest_pred = forest_clf.predict(titanic_test)
sgd_pred = sgd_clf.predict(titanic_test)

# Save results to CSV

In [None]:
def saveToCSV(predictions, filename='results.csv', passenger_ids=None):
    """Write predictions to a two-column CSV (PassengerId, Survived).

    Parameters
    ----------
    predictions : array-like
        One predicted label per passenger; values are cast to int.
    filename : str, optional
        Output path. Defaults to 'results.csv'.
    passenger_ids : array-like, optional
        Ids matching ``predictions`` row for row. Defaults to the notebook-level
        ``test_pass_id`` captured when the test file was loaded.

    Raises
    ------
    ValueError
        If the number of ids and predictions differ. (Pairing them with zip()
        would silently truncate to the shorter one and write an incomplete file.)
    """
    if passenger_ids is None:
        passenger_ids = test_pass_id

    ids = np.asarray(passenger_ids)
    # np.asarray lets plain lists be passed as well as ndarrays.
    labels = np.asarray(predictions).astype(int)

    if len(ids) != len(labels):
        raise ValueError(
            f'Got {len(labels)} predictions for {len(ids)} passenger ids'
        )

    submission = pd.DataFrame({'PassengerId': ids, 'Survived': labels})
    submission.to_csv(filename, index=False)
    print(f'Saved to {filename}')

# Extras Grid Search

In [None]:
# Let's find which Random Forest Classifier hyper-parameters we can use
forest_clf.get_params().keys()

In [None]:
# Hyper-parameter grid for the Random Forest.
# 'auto' is not listed for max_features: for classifiers it was an alias of
# 'sqrt' (so it only duplicated work) and it is rejected by current
# scikit-learn releases, which makes every fit that uses it fail.
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['sqrt', 'log2', None],
    'max_depth' : [6,7,8,10,12, None],
    'bootstrap': [False,True],
    'criterion' : ['gini', 'entropy']
}

# Exhaustive search with 3-fold CV. n_jobs=-1 runs the candidate fits in
# parallel on all cores; it does not change which parameters are selected.
grid_search = GridSearchCV(forest_clf, param_grid,
    cv=3, return_train_score=True, n_jobs=-1)
grid_search.fit(train_set, y_train)

In [None]:
grid_search.best_params_

In [None]:
#lets find what hyperparameters we can try and fine tune
sgd_clf.get_params().keys()

In [None]:
# Hyper-parameter grid for the SGD classifier.
# - n_jobs is not searched: it only controls parallelism, not the fitted
#   model, so searching it just tripled the number of fits.
# - Loss names follow current scikit-learn: 'log' is now 'log_loss' and
#   'squared_loss' is now 'squared_error' (the old names are rejected by
#   recent releases; the new ones need scikit-learn >= 1.1).
param_grid = { 
    'loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive'],
    'penalty': ['l2','l1','elasticnet']
}

# Exhaustive search with 3-fold CV; candidate fits run in parallel.
grid_search_2 = GridSearchCV(sgd_clf, param_grid,
    cv=3, return_train_score=True, n_jobs=-1)
grid_search_2.fit(train_set, y_train)

In [None]:
grid_search_2.best_params_

In [None]:
# Random Forest with hand-entered hyper-parameters.
# NOTE(review): presumably copied from grid_search.best_params_ above; confirm they still match.
tuned_rfc_params = {
    'random_state': 42,
    'max_features': None,
    'n_estimators': 200,
    'max_depth': 7,
    'criterion': 'entropy',
}
tuned_rfc = RandomForestClassifier(**tuned_rfc_params)
tuned_rfc.fit(train_set, y_train)

In [None]:
# SGD classifier with the selected loss/penalty.
# random_state=42 matches every other model in this notebook; without it the
# fit (and therefore the predictions and CV scores) changes from run to run.
# n_jobs is left at its default: it only affects parallelism, not the model.
tuned_sgd = SGDClassifier(loss='huber', penalty='l2', random_state=42).fit(train_set, y_train)
tuned_sgd_pred = tuned_sgd.predict(titanic_test)

In [None]:
tuned_pred = tuned_rfc.predict(titanic_test)

In [None]:
# Cross-validated evaluation of the tuned SGD model (default number of folds).
# NOTE(review): despite its name, y_score_sgd holds predicted class labels
# (cross_val_predict's default method is 'predict'), not decision scores.
y_score_sgd = cross_val_predict(tuned_sgd,train_set,y_train)
cross_val_score(tuned_sgd,train_set,y_train)

In [None]:
# Cross-validated evaluation of the tuned Random Forest (default number of folds).
# NOTE(review): despite its name, y_score holds predicted class labels
# (cross_val_predict's default method is 'predict'), not probabilities.
y_score = cross_val_predict(tuned_rfc,train_set, y_train)
cross_val_score(tuned_rfc,train_set, y_train)

In [None]:
saveToCSV(tuned_pred)