# 0. Configure Package Dependencies

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# NOTE: with no category or module filter this silences every warning for the rest of the
# session, including deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

# Input data files are available in the "../input/" directory.
# Running this cell (click Run or press Shift+Enter) lists the files in that directory.

import os
print(os.listdir("../input"))

# Any results written to the current directory are saved as output.

# 1. Import the Dataset

In [None]:
# Read the training and testing splits from the input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

# 2. Preview the Dataset

## Training Data

In [None]:
# Preview the first five training rows (head() defaults to n=5).
train.head()

In [None]:
# Report the shape of the training set and list its column names.
rows, columns = train.shape
feature_set = train.columns.values
for label, value in (
    ('Total Number of Features: ', columns),
    ('Total Number of Instances: ', rows),
    ('Feature Set includes: ', feature_set),
):
    print(label, value)

In [None]:
# Print a schema summary of the training set: column names, non-null counts and dtypes.
train.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of the training set.
train.describe()

## Testing Data

In [None]:
# Preview the first five testing rows (head() defaults to n=5).
test.head()

In [None]:
# Report the shape of the testing set and list its column names.
rows, columns = test.shape
feature_set = test.columns.values
for label, value in (
    ('Total Number of Features: ', columns),
    ('Total Number of Instances: ', rows),
    ('Feature Set includes: ', feature_set),
):
    print(label, value)

In [None]:
# Print a schema summary of the testing set: column names, non-null counts and dtypes.
test.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of the testing set.
test.describe()

# 3. Feature Engineering

## 3.1 Feature Analysis

### 3.1.1 Univariate Analysis

In [None]:
# Target feature analysis: number of training rows for each value of "Survived".
train['Survived'].value_counts()

In [None]:
# Pairwise correlation of the numeric features, excluding the ID column (NA/null values are skipped).
# Non-numeric columns are filtered out explicitly: pandas >= 2.0 no longer drops them
# silently inside DataFrame.corr() and raises on string data instead.
train_corr = (
    train
    .drop('PassengerId', axis=1)
    .select_dtypes(include='number')
    .corr()
)
train_corr

In [None]:
# Render the correlation matrix as an annotated heatmap on an enlarged canvas.
fig, ax = plt.subplots(figsize=(15, 9))
a = sns.heatmap(train_corr, vmin=-1, vmax=1, annot=True, square=True, ax=ax)

### 3.1.2 Bivariate Analysis

In [None]:
# Mean of "Survived" for each "Pclass" value, shown as a bar chart.
train.groupby('Pclass')[['Survived']].mean().plot.bar()

In [None]:
# Mean of "Survived" for each "Sex" value, shown as a bar chart.
train.groupby('Sex')[['Survived']].mean().plot.bar()

In [None]:
# Mean of "Survived" for each "Parch" value, shown as a bar chart.
train.groupby('Parch')[['Survived']].mean().plot.bar()

In [None]:
# Association between "Age" and "Survived": one age histogram per survival outcome.
# `height` is the current name of the facet-size argument; the old `size` keyword was
# deprecated in seaborn 0.9 and is rejected by later releases.
g = sns.FacetGrid(train, col='Survived', height=5)
g.map(plt.hist, 'Age', bins=40)

In [None]:
# Association between "Embarked" and "Survived": passenger counts per port, split by outcome.
# The column is passed as `x=` because current seaborn no longer accepts it positionally.
sns.countplot(x='Embarked', hue='Survived', data=train)

## 3.2. Feature Processing

In [None]:
# Stack the training and testing rows into one frame so the feature processing below
# is applied to both. The testing split gets a placeholder "Survived" column first.
test['Survived'] = 0
train_test = pd.concat([train, test], ignore_index=True)
train_test.info()

### Categorical Variables

In [None]:
# Engineer and one-hot encode the categorical variables.

# Family size: siblings & spouses plus parents & children.
train_test['SibSp_Parch'] = train_test['SibSp'] + train_test['Parch']

# Fill missing ports with the most frequent value. The result is assigned back instead of
# calling fillna(inplace=True) on the selected column: that chained in-place form does not
# update the parent frame under pandas Copy-on-Write.
train_test['Embarked'] = train_test['Embarked'].fillna(train_test['Embarked'].mode()[0])

train_test = pd.get_dummies(
    train_test,
    columns=['Pclass', 'Sex', 'SibSp', 'Parch', 'SibSp_Parch', 'Embarked'],
)
train_test.head(5)

In [None]:
# Remove the remaining columns that are not used as model features.
train_test = train_test.drop(columns=['PassengerId', 'Name', 'Cabin', 'Ticket'])

### Numerical Variables

In [None]:
# Fill missing fares with the mean fare. The result is assigned back (rather than using a
# chained fillna(inplace=True)) so the fill also takes effect under pandas Copy-on-Write.
train_test['Fare'] = train_test['Fare'].fillna(train_test['Fare'].mean())

In [None]:
# Re-check non-null counts and dtypes of the combined frame after the fills above.
train_test.info()

In [None]:
# Populate missing "Age" values with a regression model trained on the rows where Age is known,
# then discretize Age and one-hot encode the resulting bins.
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model

# Drop the target so the age model only sees the feature columns.
missing_age = train_test.drop(['Survived'], axis=1)

age_is_known = missing_age['Age'].notnull()
missing_age_train = missing_age[age_is_known]    # rows with a recorded age -> training data
missing_age_test = missing_age[~age_is_known]    # rows with a missing age  -> rows to predict

missing_age_train_X = missing_age_train.drop(['Age'], axis=1)  # features
missing_age_train_Y = missing_age_train['Age']                 # regression target
missing_age_test_X = missing_age_test.drop(['Age'], axis=1)

# Nothing to impute when no age is missing; skipping also avoids transforming an empty frame.
if len(missing_age_test_X) > 0:
    # Standardize with statistics learned from the known-age rows only.
    ss = StandardScaler()
    ss.fit(missing_age_train_X)
    missing_age_X_train = ss.transform(missing_age_train_X)
    missing_age_X_test = ss.transform(missing_age_test_X)

    # Fit and predict on the standardized features. The model was previously fit on the raw
    # frames, which left the scaled arrays computed above unused.
    lin = linear_model.BayesianRidge()
    lin.fit(missing_age_X_train, missing_age_train_Y)
    predicted_age = lin.predict(missing_age_X_test)

    # A linear model can extrapolate to ages <= 0, which fall outside the bins below and would
    # become NaN (an all-zero one-hot row). Keep predictions within the observed age range.
    predicted_age = np.clip(predicted_age, missing_age_train_Y.min(), missing_age_train_Y.max())

    # Populate the data set with the predicted values.
    train_test.loc[train_test['Age'].isnull(), 'Age'] = predicted_age

# Bin continuous values into discrete intervals (0-10, 10-18, 18-30, 30-50, 50-100).
train_test['Age'] = pd.cut(train_test['Age'], bins=[0, 10, 18, 30, 50, 100], labels=[1, 2, 3, 4, 5])

# One-hot encode the age bins.
train_test = pd.get_dummies(train_test, columns=['Age'])

In [None]:
# Recover the training and testing rows from the combined frame.
# The frame was built as `train` followed by `test` with a fresh 0..n-1 index, so the first
# len(train) rows are the training rows; deriving the boundary avoids a hard-coded row count.
n_train = len(train)
train_data = train_test.iloc[:n_train]
test_data = train_test.iloc[n_train:]

# Create X and Y. "Survived" in the testing rows is only a placeholder and is dropped.
train_data_X = train_data.drop(['Survived'], axis=1)
train_data_Y = train_data['Survived']
test_data_X = test_data.drop(['Survived'], axis=1)

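Scale-sensitive models (logistic regression, SVM) should be trained on standardized features, whereas tree-based models are insensitive to feature scale. A standardized copy of the features is therefore created alongside the raw one.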

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply them to both splits.
ss2 = StandardScaler().fit(train_data_X)
train_data_X_sd, test_data_X_sd = (
    ss2.transform(features) for features in (train_data_X, test_data_X)
)

# 4. Modeling

## 4.1 Build Models
- Single model: 
  - Random Forest
  - Logistic Regression
  - SVM
  - XGBoost
  - GBDT (gradient boosted decision trees)
- Multiple model combination: 
  - ensemble
  - voting
  - stacking

In [None]:
# ======Random Forest======
from sklearn.ensemble import RandomForestClassifier

# Random forests draw bootstrap samples and random feature subsets; a fixed random_state
# makes the fitted model, and therefore the submission file, reproducible across runs.
RF_model = RandomForestClassifier(
    n_estimators=150,
    min_samples_leaf=3,
    max_depth=6,
    oob_score=True,
    random_state=42,
)
RF_model.fit(train_data_X, train_data_Y)

# If the score is good enough, you can save the model and use it next time.
# (joblib is a standalone package; the old `sklearn.externals.joblib` path has been removed.)
# import joblib
# joblib.dump(RF_model, 'rf10.pkl')

# Predict and output the submission file.
result = RF_model.predict(test_data_X)
output = pd.DataFrame(data={"PassengerId": test["PassengerId"], "Survived": result})
output.to_csv("RF_model.csv", index=False)

In [None]:
# ======Logistic Regression======
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Build the model and optimise hyperparameters with a 5-fold grid search scored by ROC AUC.
LR_model = LogisticRegression()
param = {'C': [0.001, 0.01, 0.1, 1, 10], "max_iter": [100, 250]}
clf = GridSearchCV(LR_model, param, cv=5, n_jobs=-1, verbose=1, scoring="roc_auc")
clf.fit(train_data_X_sd, train_data_Y)

# Output the optimal parameters.
print(clf.best_params_)

# Train the final model with the parameters the search selected. The dict must be unpacked
# with ** so that each entry becomes a keyword argument; previously fixed values were
# hard-coded here, so the search result was never used.
LR_model = LogisticRegression(**clf.best_params_)
LR_model.fit(train_data_X_sd, train_data_Y)

# Predict and output the submission file.
result = LR_model.predict(test_data_X_sd)
output = pd.DataFrame(data={"PassengerId": test["PassengerId"], "Survived": result})
output.to_csv("LR_model.csv", index=False)

In [None]:
# ======SVM======
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Build the model and optimise hyperparameters with a 5-fold grid search scored by ROC AUC.
# The grid is declared here (same values as the logistic-regression grid) so this cell does
# not depend on a variable left over from another cell.
svm_param = {'C': [0.001, 0.01, 0.1, 1, 10], "max_iter": [100, 250]}
svc = svm.SVC()
clf = GridSearchCV(svc, svm_param, cv=5, n_jobs=-1, verbose=1, scoring="roc_auc")
clf.fit(train_data_X_sd, train_data_Y)

# Output the optimal parameters.
print(clf.best_params_)

# Train the final model with the parameters the search selected, instead of fixed values
# that ignore the search result.
svc = svm.SVC(**clf.best_params_)
svc.fit(train_data_X_sd, train_data_Y)

# Predict and output the submission file.
result = svc.predict(test_data_X_sd)
output = pd.DataFrame(data={"PassengerId": test["PassengerId"], "Survived": result})
output.to_csv("SVM_model.csv", index=False)

In [None]:
# ======Gradient Boosting======
from sklearn.ensemble import GradientBoostingClassifier

# Configure the booster from a settings dict and fit it on the unscaled training features.
gbdt_settings = dict(learning_rate=0.7, max_depth=6, n_estimators=100, min_samples_leaf=2)
gbdt = GradientBoostingClassifier(**gbdt_settings).fit(train_data_X, train_data_Y)

# Predict the testing rows and write the submission file.
result = gbdt.predict(test_data_X)
output = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": result})
output.to_csv("GDBT_model.csv", index=False)

In [None]:
# ======XGBoost======
import xgboost as xgb

# `min_samples_leaf` is a scikit-learn tree parameter that XGBoost does not recognise (it was
# ignored with a warning), so it is no longer passed. XGBoost's closest leaf-size control is
# `min_child_weight`, should that constraint be wanted.
xgb_model = xgb.XGBClassifier(n_estimators=150, max_depth=6)
xgb_model.fit(train_data_X, train_data_Y)

# Predict and output the submission file.
result = xgb_model.predict(test_data_X)
output = pd.DataFrame(data={"PassengerId": test["PassengerId"], "Survived": result})
output.to_csv("XGB_model.csv", index=False)

In [None]:
# ======voting======
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Build four models: LR, XGBoost, RF, GBDT.
lr = LogisticRegression(C=0.1, max_iter=100)

# Only parameters XGBClassifier recognises are passed: `min_samples_leaf` (a scikit-learn tree
# parameter) and `num_round` were ignored with a warning. The number of boosting rounds is
# set by `n_estimators`.
xgb_model = xgb.XGBClassifier(max_depth=6, n_estimators=100)

rf = RandomForestClassifier(n_estimators=200, min_samples_leaf=2, max_depth=6, oob_score=True)

gbdt = GradientBoostingClassifier(learning_rate=0.1, min_samples_leaf=2, max_depth=6, n_estimators=100)

# Use predicted class labels for majority-rule (hard) voting.
vot = VotingClassifier(
    estimators=[('lr', lr), ('rf', rf), ('gbdt', gbdt), ('xgb', xgb_model)],
    voting='hard',
)
vot.fit(train_data_X_sd, train_data_Y)

# Predict and output the submission file.
result = vot.predict(test_data_X_sd)
output = pd.DataFrame(data={"PassengerId": test["PassengerId"], "Survived": result})
output.to_csv("VOT_Ensemble.csv", index=False)

In [None]:
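# # ======stacking======
# # NOTE: this cell is disabled and will not run as written if uncommented:
# #   1. `StratifiedKFold(y, n_folds)` / `len(skf)` follow the pre-0.18 scikit-learn API; current
# #      versions take `n_splits=` and produce the folds via `.split(X, y)`.
# #   2. The loop variables `train` and `test` would overwrite the DataFrames of the same name.
# #   3. "test.csv" is read from the working directory, not from "../input/" like the rest of the notebook.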
# # Divide dataset: training features, testing features, target feature.
# X = train_data_X_sd
# X_predict = test_data_X_sd
# y = train_data_Y

# from sklearn.linear_model import LogisticRegression
# from sklearn import svm
# import xgboost as xgb
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import GradientBoostingClassifier

# clfs = [LogisticRegression(C=0.1,max_iter=100),
#         xgb.XGBClassifier(max_depth=6,n_estimators=100,num_round = 5),
#         RandomForestClassifier(n_estimators=100,max_depth=6,oob_score=True),
#         GradientBoostingClassifier(learning_rate=0.3,max_depth=6,n_estimators=100)]

# # Create  n_folds = 5.
# from sklearn.model_selection import StratifiedKFold
# n_folds = 5
# # skf = list(StratifiedKFold(y, n_folds))
# skf = StratifiedKFold(y, n_folds)

# # Create a zero matrix.
# dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
# dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)))

# # Build model.
# for j, clf in enumerate(clfs):
#     '''Train each single model in turn'''
#     dataset_blend_test_j = np.zeros((X_predict.shape[0], len(skf)))
#     for i, (train, test) in enumerate(skf):
#         '''The i part is used as the prediction, 
#         the rest part is used to train the model, 
#         and the predicted output is obtained as the new feature of the i part'''
#         X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
#         clf.fit(X_train, y_train)
#         y_submission = clf.predict_proba(X_test)[:, 1]
#         dataset_blend_train[test, j] = y_submission
#         dataset_blend_test_j[:, i] = clf.predict_proba(X_predict)[:, 1]
#     '''For the test set, the predicted mean value of the k models is directly used as the new feature'''
#     dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

# # Use to build the second layer model.
# clf2 = LogisticRegression(C=0.1,max_iter=100)
# clf2.fit(dataset_blend_train, y)
# y_submission = clf2.predict_proba(dataset_blend_test)[:, 1]

# test = pd.read_csv("test.csv")

# # Predict and Output "submission.csv" file.
# result = clf2.predict(dataset_blend_test)
# output = pd.DataFrame(data={"PassengerId":test["PassengerId"], "Survived":result})
# output.to_csv("STACK_Ensemble.csv", index=False)