In [1]:
#now that we have a rough idea of our model, let's build a pipeline around and conduct cross-validation on the models, this is an excercise following Kaggle's course

In [21]:
#setting up libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRFClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

In [3]:
#load data
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
gs = pd.read_csv('data/gender_submission.csv') 

In [4]:
# Separate target from predictors
y = train_data.Survived
X = train_data.drop(['Survived'], axis=1)

# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [5]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [6]:
rf_model = RandomForestClassifier(n_estimators=500, random_state=0)

In [7]:
# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', rf_model)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = accuracy_score(y_valid, preds)
print('accuracy score:', score)

accuracy score: 0.8659217877094972


In [8]:
scores = cross_val_score(my_pipeline, X_train, y_train,
                              cv=5,
                              scoring='accuracy')


In [9]:
scores.mean()

0.8019600118191667

In [10]:
selector = SelectKBest(f_classif, k=3)

In [11]:
kv_pipeline = Pipeline(steps=[('preprocessor', preprocessor),('selector', selector),
                              ('model', rf_model)
                             ])

In [12]:
scores = cross_val_score(kv_pipeline, X_train, y_train,
                              cv=5,
                              scoring='accuracy')
scores.mean()

0.772441642864178

In [13]:
X_train.head()

Unnamed: 0,Sex,Embarked,PassengerId,Pclass,Age,SibSp,Parch,Fare
140,female,C,141,3,,0,2,15.2458
439,male,S,440,2,31.0,0,0,10.5
817,male,C,818,2,31.0,1,1,37.0042
378,male,C,379,3,20.0,0,0,4.0125
491,male,S,492,3,21.0,0,0,7.25


In [14]:
dt_model = DecisionTreeClassifier(random_state=1)
xgb_model = XGBRFClassifier(n_estimators=10000, learning_rate=0.05)

In [15]:
models = [dt_model, xgb_model, rf_model]

In [16]:
for model in models:
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])
    scores = cross_val_score(my_pipeline, X_train, y_train,
                              cv=5,
                              scoring='accuracy')
    print("score for {0} is {1}".format(model, scores.mean()))

score for DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best') is 0.8019600118191667
score for XGBRFClassifier(base_score=0.5, colsample_bylevel=1, colsample_bynode=0.8,
                colsample_bytree=1, gamma=0, learning_rate=0.05,
                max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
                n_estimators=10000, n_jobs=1, nthread=None,
                objective='binary:logistic', random_state=0, reg_alpha=0,
                reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
                subsample=0.8, verbosity=1) is 0.8019600118191667
score for RandomForestClassifier(bootst

In [17]:
selector = SelectKBest(f_classif, k=5)
for model in models:
    kv_pipeline = Pipeline(steps=[('preprocessor', preprocessor),('selector', selector),
                              ('model', model)
                             ])
    scores = cross_val_score(my_pipeline, X_train, y_train,
                              cv=5,
                              scoring='accuracy')
    print("score for {0} is {1}".format(model, scores.mean()))

score for DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best') is 0.8019600118191667
score for XGBRFClassifier(base_score=0.5, colsample_bylevel=1, colsample_bynode=0.8,
                colsample_bytree=1, gamma=0, learning_rate=0.05,
                max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
                n_estimators=10000, n_jobs=1, nthread=None,
                objective='binary:logistic', random_state=0, reg_alpha=0,
                reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
                subsample=0.8, verbosity=1) is 0.8019600118191667
score for RandomForestClassifier(bootst

In [18]:
n_est_list = []
i = 1000
while i < 10000:
    n_est_list.append(i)
    i = i + 500

n_est_list

[1000,
 1500,
 2000,
 2500,
 3000,
 3500,
 4000,
 4500,
 5000,
 5500,
 6000,
 6500,
 7000,
 7500,
 8000,
 8500,
 9000,
 9500]

In [19]:
for ne in n_est_list:
    rf_model = RandomForestClassifier(n_estimators=ne, random_state=0)
    
    rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', rf_model)
                             ])
    scores = cross_val_score(my_pipeline, X_train, y_train,
                              cv=5,
                              scoring='accuracy')
    print("score for n_estimators {0} is {1}".format(ne, scores.mean()))
    

score for n_estimators 1000 is 0.8019600118191667
score for n_estimators 1500 is 0.8019600118191667
score for n_estimators 2000 is 0.8019600118191667
score for n_estimators 2500 is 0.8019600118191667
score for n_estimators 3000 is 0.8019600118191667
score for n_estimators 3500 is 0.8019600118191667
score for n_estimators 4000 is 0.8019600118191667
score for n_estimators 4500 is 0.8019600118191667
score for n_estimators 5000 is 0.8019600118191667
score for n_estimators 5500 is 0.8019600118191667
score for n_estimators 6000 is 0.8019600118191667
score for n_estimators 6500 is 0.8019600118191667
score for n_estimators 7000 is 0.8019600118191667
score for n_estimators 7500 is 0.8019600118191667
score for n_estimators 8000 is 0.8019600118191667
score for n_estimators 8500 is 0.8019600118191667
score for n_estimators 9000 is 0.8019600118191667
score for n_estimators 9500 is 0.8019600118191667


In [20]:
for ne in n_est_list:
    xgb_model = XGBRFClassifier(n_estimators=ne, learning_rate=0.05)
    
    xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', xgb_model)
                             ])
    scores = cross_val_score(my_pipeline, X, y,
                              cv=5,
                              scoring='accuracy')
    print("score for n_estimators {0} is {1}".format(ne, scores.mean()))

score for n_estimators 1000 is 0.8014562802083987


KeyboardInterrupt: 

In [None]:
rf_model = RandomForestClassifier(n_estimators=500, random_state=0)
# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', rf_model)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(test_data)

In [None]:
preds

In [None]:
preds_df = pd.DataFrame(preds)

In [None]:
preds_df.index = test_data.index
preds_df

In [None]:
preds_df.columns = gs[['Survived']].columns

In [None]:
preds_df

In [None]:
sub = preds_df.join(test_data.PassengerId)

In [None]:
sub

In [None]:
sub.to_csv('titanic_submission.csv')

In [None]:
xgb_model = XGBRFClassifier(n_estimators=10000, learning_rate=0.05)
    
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', xgb_model)
                             ])
xgb_pipeline.fit(X_train, y_train)

In [None]:
preds = xgb_pipeline.predict(test_data)

In [None]:
preds

In [None]:
preds_df = pd.DataFrame(preds)

In [None]:
preds_df

In [None]:
preds_df.columns = gs[['Survived']].columns

In [None]:
sub = preds_df.join(test_data.PassengerId)

In [None]:
sub

In [None]:
sub.to_csv('titanic_submission.csv')

In [None]:
selector = SelectKBest(f_classif, k=5)
kv_pipeline = Pipeline(steps=[('preprocessor', preprocessor),('selector', selector),
                              ('model', xgb_model)
                             ])
kv_pipeline.fit(X_train, y_train)
preds_kv = kv_pipeline.predict(test_data)

In [None]:
preds_kv

In [23]:
def con_sub (prediction, filename):
    prediction = pd.DataFrame(prediction)
    prediction.columns = gs[['Survived']].columns
    sub = prediction.join(test_data.PassengerId)
    sub.to_csv(filename, index=False)

In [None]:
con_sub(preds_kv, 'kv_sub_xgb.csv')

In [None]:
model

In [None]:
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', xgb_model)
                             ])
xgb_pipeline.fit(X_train, y_train)
preds_full = xgb_pipeline.predict(test_data)

In [None]:
preds_full

In [None]:
con_sub(preds_full, 'xgb_full.csv')

In [22]:
svc = SVC()

In [24]:
svc_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', svc)
                             ])
svc_pipeline.fit(X_train, y_train)
svc_preds = svc_pipeline.predict(test_data)
svc_preds



array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [25]:
con_sub(svc_preds, 'svc_full.csv')