In [1]:
# Import libraries and the data set
import pandas as pd
import numpy as np
import sidetable
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline
import DataScience

df = pd.read_csv('train.csv', index_col=['PassengerId'])
X_test = pd.read_csv('test.csv', index_col=['PassengerId'])

In [2]:
# Have a look at the first five rows
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Column __Name__ and __Ticket__ has nothing to do with predicting __Survived__, it will be dropped

In [3]:
df.drop(columns=['Name', 'Ticket'], inplace=True)
X_test.drop(columns=['Name', 'Ticket'], inplace=True)

In [4]:
# See the missing data in columns
df.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Cabin,687,891,77.10%
Age,177,891,19.87%
Embarked,2,891,0.22%
Survived,0,891,0.00%
Pclass,0,891,0.00%
Sex,0,891,0.00%
SibSp,0,891,0.00%
Parch,0,891,0.00%
Fare,0,891,0.00%


As column __Cabin__ has 77% missing values, the column will be dropped.
<br>Column __Age__ can be imputed.
<br>As column __Embarked__ has 2 missing rows, the rows will be dropped.

In [5]:
# As Pclass includes fare, I will drop 'Fare'
df.drop(columns=['Cabin', 'Fare'], inplace=True, axis=1)
df.dropna(subset=['Embarked'], axis=0, inplace=True)

In [6]:
X_test.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Cabin,327,418,78.23%
Age,86,418,20.57%
Fare,1,418,0.24%
Pclass,0,418,0.00%
Sex,0,418,0.00%
SibSp,0,418,0.00%
Parch,0,418,0.00%
Embarked,0,418,0.00%


As column __Cabin__ has 77% missing values, the column will be dropped.
<br>Column __Age__ can be imputed.
<br>As column __Fare__ has 2 missing rows, the rows will be dropped.

In [7]:
# As there are only two missing rows of column Embarked, I will drop the rows
X_test.drop(columns=['Cabin', 'Fare'], axis=1, inplace=True)
# X_test.dropna(subset=['Fare'], axis=0, inplace=True)

In [8]:
# Recheck the missing data in columns
df.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Age,177,889,19.91%
Survived,0,889,0.00%
Pclass,0,889,0.00%
Sex,0,889,0.00%
SibSp,0,889,0.00%
Parch,0,889,0.00%
Embarked,0,889,0.00%


In [9]:
X_test.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Age,86,418,20.57%
Pclass,0,418,0.00%
Sex,0,418,0.00%
SibSp,0,418,0.00%
Parch,0,418,0.00%
Embarked,0,418,0.00%


In [10]:
# Get X_train and y_train
y_train = df.Survived.copy()
X_train = df.drop(columns='Survived').copy()

In [11]:
# Retrive num_cols and cat_cols
num_cols = list(X_train._get_numeric_data().columns)
cat_cols = list(set(X_train.columns) - set(num_cols))

In [12]:
print(num_cols, cat_cols)

['Pclass', 'Age', 'SibSp', 'Parch'] ['Sex', 'Embarked']


In [13]:
# Impute cat_cols
from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy='most_frequent')
X_train[cat_cols] = si.fit_transform(X_train[cat_cols])
X_test[cat_cols] = si.fit_transform(X_test[cat_cols])

In [14]:
# Encode cat_cols
X_train = pd.get_dummies(data=X_train, columns=cat_cols, drop_first=True)
X_test = pd.get_dummies(data=X_test, columns=cat_cols, drop_first=True)

In [15]:
# Have a look at the current columns
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 1 to 891
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      889 non-null    int64  
 1   Age         712 non-null    float64
 2   SibSp       889 non-null    int64  
 3   Parch       889 non-null    int64  
 4   Sex_male    889 non-null    uint8  
 5   Embarked_Q  889 non-null    uint8  
 6   Embarked_S  889 non-null    uint8  
dtypes: float64(1), int64(3), uint8(3)
memory usage: 37.3 KB


In [16]:
train_num_cols_to_be_imputed = X_train._get_numeric_data().columns[X_train.isnull().any()]
test_num_cols_to_be_imputed = X_test._get_numeric_data().columns[X_test.isnull().any()]

In [17]:
# # This is the module I wrote included in folder 'package'
# # Test for the best imputation algorithm for num_cols
# DataScience.test_imputations(X_train, y_train, train_num_cols_to_be_imputed)

We can see that Iterative yields the best Adjusted R Squared score

In [18]:
# # # Test for the best k neighors in KNN for num_cols
# DataScience.test_KNN_imputation(X_train, y_train, num_cols, range(2, 10))

In [19]:
# Impute num_cols
from fancyimpute import IterativeImputer
ii = IterativeImputer()
X_train[train_num_cols_to_be_imputed] = ii.fit_transform(X_train[train_num_cols_to_be_imputed])
X_test[test_num_cols_to_be_imputed] = ii.fit_transform(X_test[test_num_cols_to_be_imputed])

In [20]:
# Scale num_cols
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
num_scaled = sc.fit_transform(X_train[train_num_cols_to_be_imputed])
X_train[train_num_cols_to_be_imputed] = pd.DataFrame(num_scaled, index=X_train.index, columns=train_num_cols_to_be_imputed)
num_scaled = sc.fit_transform(X_test[test_num_cols_to_be_imputed])
X_test[test_num_cols_to_be_imputed] = pd.DataFrame(num_scaled, index=X_test.index, columns=test_num_cols_to_be_imputed)

In [21]:
# Verify X_train
X_train.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Sex_male,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,-0.58962,1,0,1,0,1
2,1,0.644848,1,0,0,0,0
3,3,-0.281003,0,0,0,0,1
4,1,0.413385,1,0,0,0,1
5,3,0.413385,0,0,1,0,1


In [22]:
from sklearn.model_selection import GridSearchCV, LeavePOut
from sklearn.ensemble import RandomForestClassifier

parameters = {'criterion':['gini','entropy'],
          'n_estimators':[10,15,20,25,30],
          'min_samples_leaf':range(4),
          'min_samples_split':range(3, 8)
             }
rfc = RandomForestClassifier()
              
grid_search = GridSearchCV(estimator=rfc,
                        param_grid=parameters,
                        cv = LeavePOut(1),
                        verbose=True,
                        n_jobs = -1)
# Try fitting training data sets with all parameters
grid_search.fit(X_train,y_train)

# Print the best parameters
print(grid_search.best_params_)

# # Fit the training tests using the best parameters
best_search = RandomForestClassifier(**grid_search.best_params_, random_state=1)

SyntaxError: invalid syntax (<ipython-input-22-cf833981c0e4>, line 8)

In [None]:
# best_search.fit(X_train,y_train)

# # Get the predicted values
# predictions = best_search.predict(X_test_copy)

In [None]:
# from sklearn.model_selection import GridSearchCV
# from xgboost import XGBClassifier
# from sklearn.model_selection import StratifiedKFold
# parameters = {'objective':['reg:linear'],
#               'learning_rate': [0.1, 0.2], #so called `eta` value
#               'gamma': [0.5, 1, 1.5, 2, 5],
#               'subsample': [0.6, 0.8, 1.0],
#               'min_child_weight': [1, 5, 10],
#               'n_estimators': [500],
#                 'colsample_bytree': [0.6, 0.8, 1.0],
#                 'max_depth': [3, 4, 5],
#                 'reg_alpha': [1.1, 1.2, 1.3],
#                 'reg_lambda': [1.1, 1.2, 1.3],
#                 'subsample': [0.7, 0.8, 0.9]
#              }

# xgb_grid = GridSearchCV(XGBClassifier(random_state=1),
#                         parameters,
#                         cv = StratifiedKFold(),
#                         n_jobs = -1,
#                         verbose=True)

# # Try fitting training data sets with all parameters
# xgb_grid.fit(X_train,y_train)

# # Print the best parameters
# print(xgb_grid.best_params_)

# #Fit the training tests using the best parameters
# gbm = XGBClassifier(**xgb_grid.best_params_, random_state=1)
# gbm.fit(X_train,y_train)

# # Print the accuracy of prediction
# predictions = gbm.predict(X_test_copy)

In [None]:
# Get the index of the predicted table
index = X_test.index
predictions = pd.DataFrame(predictions, columns=['Survived'], index=index)

# saving the DataFrame as a CSV file 
csv_data = predictions.to_csv('Predictions.csv', index = True) 

In [None]:
# Verify the prediction's format
print(predictions.head())