In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
dataset = pd.read_csv('train.csv')
dataset = dataset.drop(columns=['Name', 'PassengerId', 'Ticket'])

dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [3]:
i = 0
for row in dataset.iloc[:,7]:
    row = str(row)    
    if row == 'nan' or row[0] == 'n':
        cabins = 0
    else:
        cabins = row.count(' ') + 1
    dataset.iat[i, 7] = cabins
    i += 1

In [4]:
C = 0
S = 0
Q = 0
for row in dataset.iloc[:, 8]:
    row = str(row)
    if row == 'C':
        C += 1
    elif row  == 'S':
        S += 1
    elif row == 'Q':
        Q += 1

In [5]:
i = 0
for row in dataset.iloc[:,8]:
    row = str(row)
    if row == 'nan':
        dataset.iat[i, 8] = 'S' # S is most frequent
    i += 1

In [6]:
embarked_dict = {"Embarked": {"S": 0, "C": 1, "Q": 2}}
dataset.replace(embarked_dict, inplace = True)

In [7]:
dataset['TotFamSize'] = dataset['Parch'] + dataset['SibSp']

dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,TotFamSize
0,0,3,male,22.0,1,0,7.25,0,0,1
1,1,1,female,38.0,1,0,71.2833,1,1,1
2,1,3,female,26.0,0,0,7.925,0,0,0
3,1,1,female,35.0,1,0,53.1,1,0,1
4,0,3,male,35.0,0,0,8.05,0,0,0


In [8]:
X = dataset.iloc[:, 1:].values
y = dataset.iloc[:, 0].values

print (X.shape)
print (y.shape)

(891, 9)
(891,)


In [9]:
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = 'NaN', strategy = 'median', axis = 0)
imputer = imputer.fit(X[:, 2:3])
X[:, 2:3] = imputer.transform(X[:, 2:3])



In [10]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 1] = labelencoder_X.fit_transform(X[:, 1])

In [11]:
print (X.shape)

(891, 9)


In [12]:
onehotencoder = OneHotEncoder(categorical_features = [7])
X = onehotencoder.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [14]:
print (X.shape)

(891, 11)


In [15]:
# Dummy variable trap
X = np.delete(X, obj = 0, axis = 1)

print (X.shape)

(891, 10)


In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [37]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [38]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

classifier = xgb.XGBClassifier()
# classifier = RandomForestClassifier(n_estimators = 20, max_depth = 10, criterion = 'entropy', bootstrap = False)
# classifier = SVC(kernel = 'rbf', gamma = 0.01, C = 100)

classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [39]:
y_pred = classifier.predict(X_test)

In [40]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

cm

array([[119,  17],
       [ 24,  63]], dtype=int64)

In [41]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
accuracies.mean()

0.8294436906377205

In [42]:
accuracies.std()

0.04184580057687378

In [None]:
from sklearn.model_selection import GridSearchCV
parameters = [{'max_depth': [5, 10, 25, 50, None], 'n_estimators': [15, 20, 25, 30], 'bootstrap': [True, False]}]
grid_search = GridSearchCV(estimator = classifier,
                          param_grid = parameters,
                          scoring = 'accuracy',
                          cv = 10,
                          n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

In [None]:
from sklearn.metrics import mean_squared_error
rms = np.sqrt(mean_squared_error(y_test, y_pred))
print ("RMS error:" rms)

In [None]:
# Test data

In [43]:
dataset_submission = pd.read_csv('test.csv')
dataset_submission = dataset_submission.drop(columns=['Name', 'Ticket'])

dataset_submission.shape

(418, 9)

In [44]:
i = 0
for row in dataset_submission.iloc[:,7]:
    row = str(row)
    if row == 'nan':
        cabins = 0
    else:
        cabins = row.count(' ') + 1
    dataset_submission.iat[i, 7] = cabins
    i += 1

In [45]:
C = 0
S = 0
Q = 0
for row in dataset_submission.iloc[:, 8]:
    row = str(row)
    if row == 'C':
        C += 1
    elif row  == 'S':
        S += 1
    elif row == 'Q':
        Q += 1

In [46]:
i = 0
for row in dataset_submission.iloc[:,8]:
    row = str(row)
    if row == 'nan':
        dataset_subission.iat[i, 8] = 'S' # S is most frequent
    i += 1

In [47]:
embarked_dict = {"Embarked": {"S": 0, "C": 1, "Q": 2}}
dataset_submission.replace(embarked_dict, inplace = True)

In [48]:
dataset_submission['TotFamSize'] = dataset_submission['Parch'] + dataset_submission['SibSp']

dataset_submission.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,TotFamSize
0,892,3,male,34.5,0,0,7.8292,0,2,0
1,893,3,female,47.0,1,0,7.0,0,0,1
2,894,2,male,62.0,0,0,9.6875,0,2,0
3,895,3,male,27.0,0,0,8.6625,0,0,0
4,896,3,female,22.0,1,1,12.2875,0,0,2


In [49]:
X_submission = dataset_submission.iloc[:, 1:].values

In [50]:
from sklearn.preprocessing import Imputer

imputer = Imputer(missing_values = 'NaN', strategy = 'median', axis = 0)

imputer = imputer.fit(X_submission[:, 2:3])
X_submission[:, 2:3] = imputer.transform(X_submission[:, 2:3]) # needs to be vector but just does 2
imputer = imputer.fit(X_submission[:, 5:6])
X_submission[:, 5:6] = imputer.transform(X_submission[:, 5:6])



In [51]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X_submission[:, 1] = labelencoder_X.fit_transform(X_submission[:, 1])

In [52]:
print (X_submission.shape)

for i in range(X_submission.shape[1]):
    print (X_submission[:, i])

(418, 9)
[3 3 2 3 3 3 3 2 3 3 3 1 1 2 1 2 2 3 3 3 1 3 1 1 1 3 1 3 1 3 2 2 3 3 1 3 3
 3 3 3 3 1 3 2 1 3 1 3 1 3 1 2 2 1 2 3 3 3 3 1 3 2 3 3 1 2 3 1 1 1 3 3 3 1
 1 1 3 1 2 3 3 1 1 3 2 3 3 3 3 2 3 3 1 3 1 3 1 3 3 3 1 2 3 3 3 3 3 3 3 2 2
 3 1 3 1 3 3 3 1 2 2 3 1 3 3 3 3 3 2 3 3 1 3 3 3 3 3 2 3 3 3 1 1 2 1 3 1 3
 1 2 1 3 3 3 3 3 1 3 1 3 3 3 2 3 2 3 1 3 1 3 3 3 3 3 3 2 2 1 2 1 2 1 1 3 1
 2 2 3 3 2 2 1 3 2 2 3 1 3 2 3 3 3 1 2 2 1 3 2 1 3 3 3 2 2 3 1 3 1 1 3 2 3
 2 3 1 3 3 3 3 2 2 1 3 3 1 3 1 3 2 1 1 2 1 3 3 1 2 2 2 3 2 3 1 3 3 3 3 3 2
 3 3 3 2 3 2 3 1 3 3 3 1 3 1 3 3 2 2 2 2 2 3 3 3 3 3 3 3 1 3 3 1 3 3 1 3 3
 2 3 1 3 3 2 2 3 3 1 1 3 1 3 3 3 3 3 1 3 1 2 3 2 3 3 2 1 1 3 2 1 2 2 2 1 3
 3 3 1 2 3 2 3 2 3 3 1 3 3 2 3 2 2 1 2 2 2 3 1 1 3 3 3 3 2 2 3 1 3 3 3 1 2
 2 1 1 2 1 1 3 2 1 3 3 3 3 3 2 2 3 2 3 3 1 1 3 2 3 1 3 1 3 3 1 2 1 1 1 2 2
 1 3 3 3 1 3 3 1 3 3 3]
[1 0 1 1 0 1 0 1 0 1 1 1 0 1 0 0 1 1 0 0 1 1 0 1 0 1 0 1 1 1 1 1 0 0 1 1 0
 0 1 1 1 1 1 0 0 1 1 1 0 0 1 1 0 0 1 1 1 1 1 0 1 1 1 0 1 0 0 1 1 0 

In [53]:
onehotencoder = OneHotEncoder(categorical_features = [7])
X_submission = onehotencoder.fit_transform(X_submission).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [55]:
print (X_submission.shape)

(418, 11)


In [56]:
# Dummy variable trap
print (X_submission[:, 0])
X_submission = np.delete(X_submission, obj = 0, axis = 1)

print (X_submission.shape)

[0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1. 0.
 0. 1. 0. 0. 1. 0. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0.
 0. 1. 1. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1.
 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1.
 1. 1. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1.
 1. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1.
 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 1.
 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1.
 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0.
 0. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1.
 0. 1. 1. 0. 0. 1. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0.
 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1.

In [57]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_submission = sc.fit_transform(X_submission)

In [58]:
prediction = classifier.predict(X_submission)

In [59]:
submission = pd.DataFrame({"PassengerId": dataset_submission["PassengerId"], "Survived": prediction})

In [60]:
submission.to_csv('Predictions-XG.csv', index = False)