In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set() #set seaborn as default for plots
import pandas as pd

# Load the train/test CSVs from the local ./data directory
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# keep the test PassengerId column; it is used later to build the submission frames
test_target = test['PassengerId']

# preview the first training row
train.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [None]:
test.head(1)

In [None]:
train.info()

In [None]:
test.info()

In [None]:
# what's missing from train?
train.isnull().sum()

In [None]:
# histograms -- looking at the distribution
def plotHistogram(dataset=train, feature='Age', bins=None):
    """Plot the distribution (histogram + density) of one column.

    Parameters
    ----------
    dataset : DataFrame
        Frame to read from. The default is the `train` object as it is bound
        at the moment this cell is executed.
    feature : str
        Name of the column to plot.
    bins : int or sequence, optional
        Passed straight through to seaborn's `bins` argument.
    """
    # Drop missing values of the plotted column only. Calling dropna() on the
    # whole frame would also discard every row that has a NaN in any *other*
    # column, which skews the plotted distribution.
    values = dataset[feature].dropna()
    sns.distplot(values, hist=True, bins=bins)

In [None]:
plotHistogram(train, 'Age')

In [None]:
plotHistogram(train, 'Fare')

In [None]:
# plot descriptive feature vs target feature
def plotBarGraph(feature, stacked=True):
    """Bar chart of value counts of `feature`, split by the binary target.

    Reads the global `train` frame. One bar group is drawn for rows with
    Survived == 1 and one for rows with Survived == 0.

    Parameters
    ----------
    feature : str
        Column of `train` whose value counts are plotted.
    stacked : bool
        Forwarded to DataFrame.plot.
    """
    # label -> boolean row mask for each outcome of the 0/1 target
    outcome_masks = {
        'Survived': train['Survived'] == 1,
        'Dead': train['Survived'] == 0,
    }
    counts = [train[mask][feature].value_counts() for mask in outcome_masks.values()]
    df = pd.DataFrame(counts)
    df.index = list(outcome_masks)
    df.plot(kind='bar', stacked=stacked, figsize=(10, 5))
    

In [None]:
plotBarGraph('Sex')

In [None]:
plotBarGraph('Pclass', stacked=False)

In [None]:
# 2D KDE plots
# joint kernel-density plot of the Pclass column against the Survived column
sns.jointplot(x=train['Pclass'], y=train['Survived'], kind="kde")

In [None]:
# check feature importance
# Quick exploratory pass: fit a random forest on a crudely encoded copy of
# train and look at which columns it relies on.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
X = train.copy()
y = train['Survived']
X = X.drop('Survived', axis=1)
# sentinel value so the forest can be fit despite missing entries
X = X.fillna(-999)
# label encoding of every object-typed column
for col in train.columns[train.dtypes == 'object']:
    X[col] = X[col].factorize()[0]
# fixed seed so the importances are reproducible between runs
rf = RandomForestClassifier(random_state=0)
rf.fit(X, y)
# plot feature importance -- bars rather than a line, because the features
# are unordered categories and a connecting line suggests a trend
positions = np.arange(X.shape[1])
plt.bar(positions, rf.feature_importances_)
plt.xticks(positions, X.columns.tolist(), rotation=90)
plt.ylabel('importance')
plt.show()

In [None]:
# The list holds references to the two frames (not copies), so the in-place
# column edits done through it in later cells are visible via `train`/`test`.
train_test_data = [train, test]

In [None]:
# Pull the honorific out of Name: the first run of letters that is directly
# followed by a period (e.g. "Braund, Mr. Owen Harris" -> "Mr").
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [None]:
train_test_data[0].head(1)

In [None]:
# label encoding Title, Embarked and Sex
# The categories are collected across BOTH frames (train first) and every
# frame is encoded against that single list. Factorizing each frame on its own
# numbers values by order of first appearance in that frame, so the same
# integer could stand for different categories in train and in test.
for col in ['Title', 'Embarked', 'Sex']:
    categories = pd.concat([dataset[col] for dataset in train_test_data]).dropna().unique()
    for dataset in train_test_data:
        # missing values get code -1, the same convention factorize() uses
        codes = pd.Categorical(dataset[col], categories=categories).codes
        dataset[col] = codes.astype('int64')
train_test_data[0].head(10)

In [None]:
# remove the raw text columns Name and Ticket from both frames (in place)
text_columns = ['Name', 'Ticket']
for frame in train_test_data:
    frame.drop(columns=text_columns, inplace=True)
train_test_data[0].head()

In [None]:
train.info()
# we still have some missing age and cabin info

In [None]:
# filling in missing age with the median age for each title group (Mr, Mrs, etc)

In [None]:
# Impute missing Age with the median Age of the passenger's Title group,
# computed separately inside each frame.
for dataset in train_test_data:
    title_median_age = dataset.groupby('Title')['Age'].transform('median')
    # Assign the result back instead of `dataset['Age'].fillna(..., inplace=True)`:
    # the in-place call operates on an intermediate Series, which pandas warns
    # about and which does not update the frame under copy-on-write.
    dataset['Age'] = dataset['Age'].fillna(title_median_age)

# a Title group whose ages are all missing has a NaN median, so check what is left
train.info()

In [None]:
# facet plot Age groups vs Target
def plotFacetGraph(dataset, feature, target, domain=None):
    """Draw shaded KDE curves of `feature`, one curve per value of `target`.

    Parameters
    ----------
    dataset : DataFrame
        Frame holding both columns.
    feature : str
        Column whose density is drawn on the x axis.
    target : str
        Column used as the hue of the FacetGrid.
    domain : tuple, optional
        x-axis limits. When omitted (or otherwise falsy) the axis runs from
        0 to the maximum of `feature`.
    """
    grid = sns.FacetGrid(dataset, hue=target, aspect=3)
    grid.map(sns.kdeplot, feature, shade=True)
    x_limits = domain if domain else (0, dataset[feature].max())
    grid.set(xlim=x_limits)
    grid.add_legend()
    plt.show()

In [None]:
train.head()

In [None]:
# need to choose a binning scheme for age
plotHistogram(train, 'Age', bins=10)

In [None]:
train['Age'].describe()

In [None]:
# binning looking at the facet graph
# Age is replaced by an ordinal bucket:
#   0: <= 16    1: (16, 34]    2: (34, 43]    3: > 43
for frame in train_test_data:
    # snapshot of the raw ages so each mask is evaluated on un-binned values
    raw_age = frame['Age'].copy()
    is_child = raw_age <= 16
    is_young_adult = (raw_age > 16) & (raw_age <= 34)
    is_adult = (raw_age > 34) & (raw_age <= 43)
    is_senior = raw_age > 43
    frame.loc[is_child, 'Age'] = 0
    frame.loc[is_young_adult, 'Age'] = 1
    frame.loc[is_adult, 'Age'] = 2
    frame.loc[is_senior, 'Age'] = 3

train['Age'].head(10)

In [None]:
# now moving on to embarked
plotBarGraph('Embarked')

In [None]:
train['Embarked'].value_counts()

In [None]:
train.head()

In [None]:
plotFacetGraph(train, 'Fare', 'Survived', (0,200))

In [None]:
# binning
# Fare is replaced by an ordinal bucket:
#   0: <= 17    1: (17, 30]    2: (30, 100]    3: > 100
for dataset in train_test_data:
    # snapshot of the raw fares so each mask is evaluated on un-binned values
    raw_fare = dataset['Fare'].copy()
    cheap = raw_fare <= 17
    moderate = (raw_fare > 17) & (raw_fare <= 30)
    expensive = (raw_fare > 30) & (raw_fare <= 100)
    luxury = raw_fare > 100
    dataset.loc[cheap, 'Fare'] = 0
    dataset.loc[moderate, 'Fare'] = 1
    dataset.loc[expensive, 'Fare'] = 2
    dataset.loc[luxury, 'Fare'] = 3

train.head()

In [None]:
train['Cabin'].value_counts().head()

In [None]:
# keep only the first character of each Cabin string
for frame in train_test_data:
    cabin_letter = frame['Cabin'].str.slice(0, 1)
    frame['Cabin'] = cabin_letter

In [None]:
train.head(20)

In [None]:
train['Cabin'].value_counts()

In [None]:
# lets just transform cabin into labels
# NOTE(review): each frame is factorized on its own, so a given integer is not
# guaranteed to stand for the same cabin letter in train and in test. The
# column is dropped in a later cell, so this encoding is exploratory only.
for dataset in train_test_data:
    dataset['Cabin'] = dataset['Cabin'].factorize()[0]

train['Cabin'].value_counts()

In [None]:
# actually lets just drop cabin, b/c it is missing too much information
for dataset in train_test_data:
    # drop(..., inplace=True) returns None, so the result must not be assigned
    # back to the loop variable; the frame itself is modified in place
    dataset.drop('Cabin', axis=1, inplace=True)

In [None]:
train.info()

In [None]:
train.head()

In [None]:
train.info()

In [None]:
# drop PassengerId from both frames; the test ids needed for the submission
# were saved in `test_target` right after loading
for dataset in train_test_data:
    # drop(..., inplace=True) returns None, so the result must not be assigned
    # back to the loop variable; the frame itself is modified in place
    dataset.drop('PassengerId', axis=1, inplace=True)

In [None]:
train.head()

In [None]:
# Separate the label column from the features.
# `train` is rebound to a new frame here, so train_test_data[0] keeps pointing
# at the old frame that still contains Survived. Re-running this cell fails
# with a KeyError because the column is already gone from `train`.
target = train['Survived']
train = train.drop('Survived', axis=1)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
# pairwise correlation between the remaining feature columns, shown as a heatmap
corr = train.corr()
sns.heatmap(corr)

In [None]:
train.shape

In [None]:
test.shape

In [None]:
target.shape

In [None]:
# Modelling
# Importing Classifier Modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import numpy as np

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# shuffled 10-fold splitter with a fixed seed; reused by every model below
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

# k-nearest neighbours, scored by accuracy on each fold
scoring = 'accuracy'
clf = KNeighborsClassifier(n_neighbors=13)
score = cross_val_score(estimator=clf, X=train, y=target,
                        scoring=scoring, cv=k_fold, n_jobs=1)
print(score)

In [None]:
# kNN Score
round(np.mean(score)*100, 2)

In [None]:
# Decision tree, scored by accuracy with the shared k_fold splitter.
# random_state is fixed so the fold scores are reproducible between runs.
clf = DecisionTreeClassifier(random_state=0)
scoring = 'accuracy'
score = cross_val_score(clf, train, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

In [None]:
# decision tree Score
round(np.mean(score)*100, 2)

In [None]:
# Random forest (13 trees), scored by accuracy with the shared k_fold splitter.
# random_state is fixed so the fold scores are reproducible between runs.
clf = RandomForestClassifier(n_estimators=13, random_state=0)
scoring = 'accuracy'
score = cross_val_score(clf, train, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

In [None]:
# Random Forest Score
round(np.mean(score)*100, 2)

In [None]:
# Gaussian naive Bayes, scored by accuracy with the shared k_fold splitter
scoring = 'accuracy'
clf = GaussianNB()
score = cross_val_score(estimator=clf, X=train, y=target,
                        scoring=scoring, cv=k_fold, n_jobs=1)
print(score)

In [None]:
# Naive Bayes Score
round(np.mean(score)*100, 2)

In [None]:
# SVM
# support vector classifier, scored by accuracy with the shared k_fold splitter
scoring = 'accuracy'
clf = SVC(gamma='scale')
score = cross_val_score(estimator=clf, X=train, y=target,
                        scoring=scoring, cv=k_fold, n_jobs=1)
print(score)

In [None]:
round(np.mean(score)*100,2)


In [None]:
test['Age'].isnull().sum()

In [None]:
test.info()

In [None]:
# Fill the values still missing in test using statistics from the training frame.
# The earlier form passed train.groupby(...).transform(...) to fillna; that
# Series is indexed by *train* rows, so test row i received the value computed
# for the unrelated train row i. Here the per-Title median is looked up through
# each test row's own Title instead.
# NOTE(review): this assumes a Title code means the same title in train and
# test -- confirm against the label-encoding cell.
for col in ("Age", "Fare"):
    title_median = train.groupby("Title")[col].median()
    by_title = test[col].fillna(test["Title"].map(title_median))
    # fall back to the overall train median for titles absent from train;
    # assigned back rather than filled in place (chained inplace fillna is
    # unreliable under copy-on-write)
    test[col] = by_title.fillna(train[col].median())


In [None]:
test.isnull().sum()

In [None]:
# fit the SVC on the full training frame, then label the test frame
clf = SVC(gamma='scale').fit(train, target)

prediction = clf.predict(test)

In [None]:

# pair the saved test ids with the SVC predictions and write them to disk
submission_columns = {"PassengerId": test_target, "Survived": prediction}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)

In [None]:
submission = pd.read_csv('submission.csv')
submission.head()

In [None]:
# lets try xgboost
from xgboost import XGBClassifier

# fit the model on the training data (default hyper-parameters)
model = XGBClassifier()
model.fit(train, target)

# show the fitted estimator's configuration
print(model)

In [None]:
# make predictions for test data
y_pred = model.predict(test)

# Build the submission from the XGBoost predictions. The earlier version put
# the SVC `prediction` array in the frame, wrote the SVC `submission` frame to
# the xgboost file, and then read back 'submission.csv'.
submission2 = pd.DataFrame({
    "PassengerId": test_target,
    "Survived": y_pred
})
submission2.to_csv('submission-xgboost.csv', index=False)
# read the file back to confirm what was written
submission2 = pd.read_csv('submission-xgboost.csv')
submission2.head()