In [28]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import MinMaxScaler

train_data = pd.read_csv("train.csv")

In [29]:
# ---  Pre-processing train data ---

# Removing insignificant data
train_data = train_data.drop(columns=['Ticket', 'Name', 'Cabin', 'PassengerId'])

# Removing NaN values
train_data = train_data.fillna(0)

# Extracting features
features_raw = train_data.drop(columns = 'Survived')

# Extracting Survived status
survived_data = train_data['Survived']

# Log-transform the skewed features
skewed = ['Fare']
features_log_transformed = pd.DataFrame(data = features_raw)
features_log_transformed[skewed] = train_data[skewed].apply(lambda x: np.log(x + 1))

# Min Max Scaler (Normalization)
scaler = MinMaxScaler()
features_log_transformed.head(20)
numerical = ['Age', 'Fare']
features_log_minmax_transform = pd.DataFrame(data = features_log_transformed)
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_minmax_transform[numerical])

# Replacing: Male = 0 and Female = 1
gender = {"male": 0, "female": 1}
features_log_minmax_transform = features_log_minmax_transform.replace({"Sex":gender})

# # One-hot encode the 'features_log_minmax_transform' data using pandas.get_dummies()
# features_final = pd.get_dummies(features_log_minmax_transform)

# Replaces: Embarked C = 0, Q = 1, S = 2
gate = {"C":0, "Q":1, "S":2}
features_final = features_log_minmax_transform.replace({"Embarked":gate})

In [30]:
features_final.head(20)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,0.275,1,0,0.338125,2
1,1,1,0.475,1,0,0.685892,0
2,3,1,0.325,0,0,0.350727,2
3,1,1,0.4375,1,0,0.639463,2
4,3,0,0.4375,0,0,0.352955,2
5,3,0,0.0,0,0,0.360026,1
6,1,0,0.675,0,0,0.635755,2
7,3,0,0.025,3,1,0.495832,2
8,3,1,0.3375,0,2,0.399934,2
9,2,1,0.175,1,0,0.550603,0


In [31]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = \
# train_test_split(features_final, survived_data, random_state=30, test_size=0.2 )

from sklearn.model_selection import ShuffleSplit
cv_sets = ShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 30)

In [32]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score

parameters = {'max_depth':list(range(1,8)),
              'min_samples_split':[5, 10, 15, 20, 25, 30],
              'min_samples_leaf':[3, 4, 5, 6, 7, 8, 9, 10]}
scorer = make_scorer(accuracy_score)

grid_obj = GridSearchCV(clf, parameters, scoring = scorer, cv = cv_sets)

# grid_fit = grid_obj.fit(X_train, y_train)
grid_fit = grid_obj.fit(features_final, survived_data)

best_clf = grid_fit.best_estimator_

In [34]:
# y_pred_train = best_clf.predict(X_train)
# y_pred_test = best_clf.predict(X_test)

y_pred_train = best_clf.predict(features_final)
print(best_clf)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=10,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [37]:
from sklearn.metrics import accuracy_score

acc_train = accuracy_score(survived_data, y_pred_train)
# acc_test = accuracy_score(y_test, y_pred_test)

print("Accuracy training score: {}".format(acc_train))
# print("Accuracy testing score: {}".format(acc_test))

Accuracy training score: 0.8630751964085297


In [38]:
# --- Final prediction with test_data ---
test_data = pd.read_csv("test.csv")

# ---  Pre-processing test data ---

# Saving PassengerId
passengerid = test_data['PassengerId']

# Removing insignificant data for testing
test_features_raw = test_data.drop(columns = ['Ticket', 'Name', 'Cabin', 'PassengerId'])

# Removing NaN values
test_features_raw = test_features_raw.fillna(0)

# Log-transform the skewed features
skewed = ['Fare']
test_features_log_transformed = pd.DataFrame(data = test_features_raw)
test_features_log_transformed[skewed] = test_data[skewed].apply(lambda x: np.log(x + 1))

# Min Max Scaler (Normalization)
test_scaler = MinMaxScaler()
numerical = ['Age', 'Fare']
test_features_log_minmax_transform = pd.DataFrame(data = test_features_log_transformed)
test_features_log_minmax_transform[numerical] =\
test_scaler.fit_transform(test_features_log_minmax_transform[numerical])

# Replacing: Male = 0 and Female = 1
gender = {"male": 0, "female": 1}
test_features_log_minmax_transform = test_features_log_minmax_transform.replace({"Sex":gender})

# Replaces: Embarked C = 0, Q = 1, S = 2
gate = {"C":0, "Q":1, "S":2}
test_features_final = test_features_log_minmax_transform.replace({"Embarked":gate})

test_features_final = test_features_final.fillna(0)

In [39]:
y_pred_test = best_clf.predict(test_features_final)

submission = pd.DataFrame({'PassengerId':passengerid, 'Survived':y_pred_test})

filename = "titanic_submission.csv"
submission.to_csv(filename, index = False)
print("saved file: " + filename)

saved file: titanic_submission.csv
