In [None]:
#import libraries

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import model_selection
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score



In [None]:
# Read the training and the test set from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
# Preview the first rows of the training set (head() defaults to five rows).
train.head()

In [None]:
# Preview the first rows of the test set (head() defaults to five rows).
test.head()

In [None]:
# Descriptive statistics of the training set, using describe() with its
# default arguments.

train.describe()

In [None]:
# Remove the columns that are not used further on. PassengerId is dropped
# from the training frame only; the test frame keeps it.
train.drop(columns=['PassengerId', 'Cabin', 'Ticket', 'Name'], inplace=True)
test.drop(columns=['Cabin', 'Ticket', 'Name'], inplace=True)

In [None]:
# Collect both frames in one list so the cleaning and encoding loops can
# process train and test in a single pass. The list holds the frame objects
# themselves, not copies.

datasets = [train, test]

In [None]:
# Replace NaN with values (clean dataset).
#
# Each frame is imputed with its own statistics: the median for Age and Fare,
# the most frequent value for Embarked.
#
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on dataset[col]. That chained in-place form acts
# on an intermediate object: pandas deprecates it and, with Copy-on-Write
# enabled, it leaves the frame unchanged.

for dataset in datasets:
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

In [None]:
# Number of missing values left in each training column.
train.isna().sum()

In [None]:
# Number of missing values left in each test column.
test.isna().sum()

In [None]:
# Convert the categorical text columns to integer codes.
#
# The encoder is fitted once per column on the values of BOTH frames and then
# applied to each frame. Fitting separately per frame would derive the codes
# from whatever categories happen to be present in that frame, so the same
# category could receive different integers in train and in test.

label = LabelEncoder()
for column in ('Sex', 'Embarked'):
    label.fit(pd.concat([dataset[column] for dataset in datasets], ignore_index=True))
    for dataset in datasets:
        dataset[column + '_Code'] = label.transform(dataset[column])


In [None]:
# Preview the training set again (head() defaults to five rows).
train.head()

In [None]:
# Prediction target and the model input columns.
target = ['Survived']
features = ['Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch', 'Age', 'Fare']

# Column selection for the test frame: the same inputs, preceded by PassengerId.
features_for_test = ['PassengerId', *features]


In [None]:
# Create dummy (one-hot) columns from the categorical features.

train_dummy = pd.get_dummies(train[features])
test_dummy = pd.get_dummies(test[features_for_test])

# get_dummies only emits columns for the categories present in the frame it
# is given, so the two frames can end up with different dummy columns. Align
# the test frame to the training layout: dummy columns missing from test are
# added filled with 0, columns unknown to training are dropped, and the
# test-only columns (those in features_for_test but not in features) are kept
# in front.
test_only_columns = [
    col for col in features_for_test
    if col not in features and col in test_dummy.columns
]
test_dummy = test_dummy.reindex(
    columns=test_only_columns + train_dummy.columns.tolist(),
    fill_value=0,
)

train_dummy.head()

In [None]:
# For every feature that is not stored as float64, print the mean of the
# target column per distinct value of that feature.

for feature in features:
    if train[feature].dtype == 'float64':
        continue
    print('Survival Correlation by:', feature)
    grouped_mean = train[[feature, target[0]]].groupby(feature, as_index=False).mean()
    print(grouped_mean)
    print('-'*10, '\n')

In [None]:
#correlation heatmap
def correlation_heatmap(df):
    """Draw an annotated heatmap of the pairwise correlations in ``df``.

    Only numeric and boolean columns take part. Other columns (e.g. text)
    are filtered out before ``corr()`` is called, because pandas 2.0 and
    later no longer drop them silently and raise on values that cannot be
    converted to float.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated with ``DataFrame.corr()``.

    Returns
    -------
    None
        The heatmap is drawn on a newly created 14 x 12 inch figure.
    """
    _, ax = plt.subplots(figsize=(14, 12))
    colormap = sns.diverging_palette(220, 10, as_cmap=True)

    # Restrict to columns corr() can handle on every pandas version.
    numeric_df = df.select_dtypes(include=['number', 'bool'])

    sns.heatmap(
        numeric_df.corr(),
        cmap=colormap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        linewidths=0.1, vmax=1.0, linecolor='white',
        annot_kws={'fontsize': 12}
    )

    plt.title('Pearson Correlation of Features', y=1.05, size=15)

# Draw the correlation heatmap for the training frame.
correlation_heatmap(train)

In [None]:
# Split the training data: 80% to fit the model, the rest held out for
# evaluation. random_state pins the shuffle so that the split, and with it
# the reported accuracies, are the same on every run.
new_features = train_dummy.columns.tolist()
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    train_dummy[new_features],
    train[target],
    train_size=0.8,
    random_state=0,
)

In [None]:
# Random forest with a fixed seed, fitted using two parallel jobs.
clf = RandomForestClassifier(n_jobs=2, random_state=0)
# train_y comes from train[target] and is therefore a single-column frame.
# scikit-learn expects a 1-D target and emits a DataConversionWarning for a
# column vector, so the target is flattened before fitting.
clf.fit(train_x, train_y.values.ravel())


In [None]:
# Predict labels for the held-out part of the split.
predictions = clf.predict(test_x)

In [None]:
# Accuracy on the data the model was fitted on, then on the held-out part.
train_accuracy = accuracy_score(train_y, clf.predict(train_x))
print("Train Accuracy :: ", train_accuracy)
test_accuracy = accuracy_score(test_y, predictions)
print("Test Accuracy  :: ", test_accuracy)

In [None]:
# Predict on the test frame's model-input columns and store the result in a
# new 'Survived' column.
test_dummy['Survived'] = clf.predict(test_dummy[new_features])

In [None]:
# Write the PassengerId and Survived columns to predictions.csv, without the
# row index.
test_dummy.to_csv('predictions.csv', columns=['PassengerId', 'Survived'], index=False)