# C4021 - Project
## Group 2.4
Gerry Kerley
### Dataset used
Titanic passenger survival: 
https://www.kaggle.com/c/titanic/data

## Data Loading & Preprocessing

### Load libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

### Load Titanic dataset

In [None]:
# Read the training and test CSV files (paths are relative to the notebook's
# working directory).
train = pd.read_csv('titanic/train.csv')
test = pd.read_csv('titanic/test.csv')

# Print column dtypes and non-null counts, then display 5 randomly chosen rows.
train.info()
train.sample(5)

## Data Cleaning

### Check for missing data

In [None]:
# Stack train and test so cleaning and feature engineering run once over both.
# ignore_index=True gives every row a unique label; without it the test rows
# reuse the training rows' labels (both frames start at 0), which makes any
# label-based selection or alignment on combined_data ambiguous.
combined_data = pd.concat([train, test], ignore_index=True)

# Plot the boolean null-mask as a heatmap: one column per feature, one row per
# passenger, so columns with many missing values stand out.
sns.heatmap(combined_data.isnull(), yticklabels=False, cbar=False, cmap='viridis')

The `Age` and `Cabin` columns have a lot of missing data.

The cabin number is probably not that important, so `Cabin` is dropped later rather than filled in.

### Fill missing Age data
Check whether the 3 passenger classes have different age distributions

In [None]:
# Box plot of Age for each passenger class, drawn on a 10x8 inch figure.
plt.figure(figsize=(10, 8))
sns.boxplot(x='Pclass',y='Age',data=combined_data,palette='autumn')

1st class tends to have older passengers, so we can use the mean age of each passenger class to fill in the missing ages.

In [None]:
def fill_missing_ages_with_mean_for_pclass(cols, data=None):
    """Return a passenger's age, imputing a missing one from the class mean.

    Parameters
    ----------
    cols : sequence
        Row values in the order (Age, Pclass), e.g. one row of
        ``combined_data[['Age', 'Pclass']]`` as passed by
        ``DataFrame.apply(..., axis=1)``.
    data : pandas.DataFrame, optional
        Frame with 'Age' and 'Pclass' columns from which the class mean is
        computed. Defaults to the notebook-level ``combined_data``.

    Returns
    -------
    The original age when it is present; otherwise the mean 'Age' of the
    rows in ``data`` whose 'Pclass' matches (missing ages are skipped by
    the mean).
    """
    # Unpack by position through iteration rather than cols[0] / cols[1]:
    # integer keys on a string-labelled Series are deprecated in pandas.
    # Extra trailing items are ignored, as positional indexing did.
    age, pclass, *_ = cols

    if not pd.isnull(age):
        return age

    if data is None:
        data = combined_data

    # Any class other than 1 or 2 falls back to the 3rd-class mean.
    reference_class = pclass if pclass in (1, 2) else 3
    return data.loc[data['Pclass'] == reference_class, 'Age'].mean()
    
# Run the imputation row by row (axis=1): each row passes its (Age, Pclass)
# pair to the helper, and the returned values replace the Age column.
combined_data['Age'] = combined_data[['Age','Pclass']].apply(fill_missing_ages_with_mean_for_pclass, axis=1)
# Display the first rows to check the result.
combined_data.head()

In [None]:
# Mean survival per port of embarkation, split by sex.
# Plot from the raw `train` frame: `train_data` is only created further down
# in the notebook (NameError when run top to bottom), and by then the
# Embarked column has been dropped.
sns.barplot(x="Embarked", y="Survived", hue="Sex", data=train);

In [None]:
# Mean survival per passenger class, one line per sex.
# Plot from the raw `train` frame: `train_data` is only created further down
# in the notebook (NameError when run top to bottom), and by then Sex has
# been label-encoded, so the "male"/"female" palette keys would not match.
sns.pointplot(x="Pclass", y="Survived", hue="Sex", data=train,
              palette={"male": "green", "female": "orange"},
              markers=["*", "o"], linestyles=["-", "--"]);

## Feature Engineering

In [None]:
def prepare_features(data):
    """Bin ages, fill missing fares and drop unused columns, in place.

    Age is replaced by a categorical age group (missing ages are first set
    to -0.5 so they land in the 'Unknown' bin), missing Fare values are
    replaced by the mean fare, and the Embarked, Ticket, Name and Cabin
    columns are removed. The same DataFrame object is modified and returned.
    """
    # Age: right-closed bins, e.g. (0, 5] -> 'Baby', (65, 120] -> 'Senior'.
    age_edges = (-1, 0, 5, 12, 20, 25, 40, 65, 120)
    age_labels = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
    data['Age'] = pd.cut(data['Age'].fillna(-0.5), age_edges, labels=age_labels)

    # TODO: categorise fares, split name?, cabin numbers?
    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

    # TODO: Convert Pclass to Social Class

    # Columns that are not used as model features.
    data.drop(columns=['Embarked', 'Ticket', 'Name', 'Cabin'], inplace=True)

    return data

# Apply the feature preparation to the combined train+test frame, then
# display the first rows to check the result.
combined_data = prepare_features(combined_data)
combined_data.head()

In [None]:
# Mean survival per age group, split by sex (Age holds the binned categories
# produced by prepare_features at this point).
sns.barplot(x="Age", y="Survived", hue="Sex", data=combined_data);

### Normalise labels
Use scikit-learn's `LabelEncoder` to convert each unique value of the `Age` and `Sex` columns into an integer code.

In [None]:
def encode_features(combined_data):
    """Replace the 'Age' and 'Sex' columns with integer label codes.

    A fresh LabelEncoder is fitted on each column and the column is
    overwritten with the encoder's output. The frame is modified in place
    and also returned.
    """
    for column in ('Age', 'Sex'):
        encoder = preprocessing.LabelEncoder().fit(combined_data[column])
        combined_data[column] = encoder.transform(combined_data[column])
    return combined_data
    
# Encode Age and Sex as integers on the combined frame and print the first
# rows to check the result.
combined_data = encode_features(combined_data)
print(combined_data.head())

#### Feature heatmap

In [None]:
# Columns used as model inputs (Sex and Age are integer-encoded by now).
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X = combined_data[features]

# Annotated heatmap of the pairwise correlations between the features.
plt.subplots(figsize=(8, 5))
sns.heatmap(X.corr(), annot=True, cmap="RdYlGn")
plt.show()

## Training 
#### Split the Training Data

In [None]:
# Rows with a Survived label are the original training rows; rows without
# one came from the unlabelled test file.
train_data = combined_data[combined_data['Survived'].notna()]
test_data = combined_data[combined_data['Survived'].isna()].drop(columns='Survived')

# Features exclude the target and the passenger identifier.
X_all = train_data.drop(columns=['Survived', 'PassengerId'])
y_all = train_data['Survived']

# Hold out 20% of the labelled rows for evaluation; fixed seed for a
# repeatable split.
test_proportion = 0.20
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=test_proportion,
    random_state=42,
)

In [None]:
# Display the first rows of the held-out feature set.
X_test.head()

## Machine Learning
#### Fit, predict and fine tune the algorithms

### Logistic Regression

In [None]:
# Fit a logistic regression classifier with default settings on the training split.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [None]:
# Score on the data the model was trained on, so this is an optimistic figure;
# the held-out evaluation follows below.
log_reg.score(X_train, y_train)

In [None]:
# Predict survival for the held-out split and display the predicted labels.
lr_predictions = log_reg.predict(X_test)
lr_predictions

In [None]:
# Store the result under its own name: assigning it to `confusion_matrix`
# would shadow the imported sklearn function, so re-running this cell (or
# building another matrix later) would fail with "object is not callable".
lr_confusion_matrix = confusion_matrix(y_test, lr_predictions)
lr_confusion_matrix

In [None]:
# Accuracy of the logistic regression predictions on the held-out split.
print(accuracy_score(y_test, lr_predictions))

#### Cross Validation

In [None]:
# 10-fold cross-validation of the logistic regression on the training split;
# the per-fold scores are kept in ascending order for easier reading.
scores = np.sort(cross_val_score(log_reg, X_train, y_train, cv=10))
accuracy = np.mean(scores)

print(scores)
print(accuracy)

#### Evaluation

In [None]:
# Per-class evaluation report for the held-out predictions.
print(classification_report(y_test, lr_predictions))

### Random Forest

In [None]:
# Fixed seed (the same one used for the train/test split) so the search and
# the selected forest are reproducible between runs.
rf_clf = RandomForestClassifier(random_state=42)

# Hyper-parameter grid searched exhaustively by GridSearchCV.
# 'auto' is intentionally absent from max_features: for classifiers it was an
# alias of 'sqrt', it was deprecated in scikit-learn 1.1 and removed in 1.3,
# where every candidate using it fails to fit.
parameters = {'n_estimators': [4, 7, 10],
              'max_features': ['log2', 'sqrt'],
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10],
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1, 5, 8]
             }

# Rank candidates by plain accuracy.
scorer = make_scorer(accuracy_score)

grid_search = GridSearchCV(rf_clf, parameters, scoring=scorer)
grid_search = grid_search.fit(X_train, y_train)

# With GridSearchCV's default refit=True, best_estimator_ has already been
# refitted on all of X_train with the winning parameters, so no extra fit()
# call is needed (an unseeded refit would replace the selected model).
rf_clf = grid_search.best_estimator_

# Display the selected estimator and its parameters.
rf_clf

In [None]:
# Accuracy of the tuned random forest on the held-out split.
rf_predictions = rf_clf.predict(X_test)
print(accuracy_score(y_test, rf_predictions))

### Predict on Test Data

In [None]:
# Keep the identifiers for the output; they are not a model feature.
ids = test_data['PassengerId']

# The model was trained on a float Survived column (concatenating the
# unlabelled test rows introduced NaNs, which forces float), so predict()
# returns 0.0 / 1.0. Cast to int so the output holds plain 0 / 1 labels.
predictions = rf_clf.predict(test_data.drop('PassengerId', axis=1)).astype(int)

output = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions })
# Uncomment to write the submission file:
# output.to_csv('titanic/titanic-predictions.csv', index = False)
output.head()