# Pipeline Models
## From raw data to outputs in single pipeline

## Initial Imports

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
import seaborn as sns
sns.set()
pd.set_option('display.max_columns',500)

from IPython.display import display
from scipy import stats

import os
import pickle

import sys
sys.path.append(r'C:\Users\User\Documents\Programming Practice\my_modules')

from trav_functions import *

## Load Data

In [7]:
data_raw = '../data/raw/'
data_interim = '../data/interim/'
data_external = '../data/external/'
data_processed = '../data/processed/'
model_dir = '../models/'

In [15]:
df_train = pd.read_feather(data_interim + 'train_01.ftr')
df_test = pd.read_feather(data_interim + 'test_01.ftr')

df_train = df_train.set_index(df_train.columns[0])
df_test = df_test.set_index(df_test.columns[0])

  labels, = index.labels


In [16]:
label = 'Survived'

# Pipelines

## Sklearn Import Statements

In [17]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [46]:
X_train = df_train.drop(label,axis=1)
y_train = df_train[label]

X_test = df_test

cols_to_drop = ['Cabin','Name']
num_cols = X_train.select_dtypes(np.number).columns
cat_cols = X_train.select_dtypes('category').columns

## Preprocessing

In [47]:
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant',fill_value = 'missing')),
    ('encoder', OneHotEncoder(handle_unknown = 'ignore'))
])

num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median'))
])

preprocessor = ColumnTransformer(
    transformers= [
        ('cat',cat_transformer,cat_cols),
        ('num',num_transformer,num_cols)]
)

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

## Model Building

In [54]:
%%time

param_distributions = {
    'n_estimators':[20],
    'criterion': ['gini','entropy'],
    'max_depth':stats.randint(2,100),
    'min_samples_split':stats.randint(2,100),
    'min_samples_leaf':stats.randint(1,100)
}

rs = RandomizedSearchCV(
    RandomForestClassifier(),
    param_distributions,
    n_iter = 10,
    scoring = 'accuracy',
    cv = 5,
    n_jobs=-1,
    verbose=1
)

scores = cross_val_score(
    rs,
    X_train_processed,
    y_train,
    scoring = 'accuracy',
    cv = 2
)

print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

rs.fit(X_train_processed,y_train)
results = pd.DataFrame(rs.cv_results_).sort_values('rank_test_score')

model = rs.best_estimator_
# display(results)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  35 out of  50 | elapsed:    3.4s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    3.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


CV accuracy: 0.762 +/- 0.009
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Wall time: 4.18 s


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.2s finished


In [51]:
def create_predictions(model,X_test,X_test_processed):
    predictions = (pd.DataFrame(model.predict(X_test_processed),index=X_test.index)
                   .reset_index()
                   .rename(columns = {0:'Survived'})
                   .assign(Survived = lambda x: x['Survived'].astype(int))
                  )
    return(predictions)

In [55]:
test_predictions = create_predictions(model,X_test,X_test_processed)

## Save model dataset, model, & results serialized together

Write function which will save the model, model predictions .csv file, and the notebook used to create & run the model at that point in time

In [58]:
def output_model(model,predictions,model_dir):
    """Output serialized model, predictions, and notebook used to make model/predictions"""
    
    for i in range(1,1000):
        model_name = '{}model_{:03d}.pkl'.format(model_dir,i)
        
        # Check if filename exists, if not write files
        if not os.path.isfile(model_name):
            with open(model_name,'wb') as f:
                pickle.dump(model,f)
                
            prediction_name = '{}predictions_{:03d}.csv'.format(model_dir,i)
            predictions.to_csv(prediction_name,index=False)
            
            notebook_name = '{}notebook_{:03d}'.format(model_dir,i)
            copy_current_nb(notebook_name)
            return(prediction_name)
    print('Error: More than 1000 models in folder')
    return

In [70]:
def submit_kaggle(competition,prediction_filename,message):
    """Submit prediction file to kaggle"""
    !kaggle competitions submit -c $competition -f $prediction_filename -m $message
    return

In [59]:
prediction_name = output_model(model,test_predictions,model_dir)

In [72]:
submit_kaggle('titanic',prediction_name,'\"My second attempt at submitting a file via the kaggle command line interface\"')

Successfully submitted to Titanic: Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 5.73kB/s]
