# Section 3: Developing ML models
## 3.1 Pipeline Models

In [1]:
import pandas as pd
# Read the training features and the training labels from their CSV files
# (the feature file name suggests PCA-transformed inputs -- confirm upstream).
X_train, y_train = (
    pd.read_csv(csv_name) for csv_name in ('X_train_pca.csv', 'y_train.csv')
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection
import xgboost as xgb #Downloaded package in Environmnet before run of this notebook
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,fbeta_score,matthews_corrcoef
from sklearn.metrics import log_loss,roc_auc_score,precision_score,f1_score,recall_score,roc_curve,auc

In [None]:
def PipelineModels():
    """Build the list of baseline classifiers to compare.

    Returns
    -------
    list of (str, estimator)
        ``(label, unfitted estimator)`` pairs, in the order they should be
        evaluated. Every estimator is created with ``random_state=42``.
    """
    candidates = [
        ('LR_L2', LogisticRegression(penalty='l2', random_state=42)),
        ('CART', DecisionTreeClassifier(random_state=42)),
        ('AB', AdaBoostClassifier(random_state=42)),
        ('GBM', GradientBoostingClassifier(n_estimators=100, max_features='sqrt', random_state=42)),
        ('BC', BaggingClassifier(n_estimators=50, random_state=42)),
        # The two random forests differ only in their split criterion.
        ('RF_Ent100', RandomForestClassifier(criterion='entropy', random_state=42)),
        ('RF_Gini100', RandomForestClassifier(criterion='gini', random_state=42)),
        ('ET100', ExtraTreesClassifier(n_estimators=100, random_state=42)),
        ('ET500', ExtraTreesClassifier(n_estimators=500, random_state=42)),
        ('XGB1000', xgb.XGBClassifier(n_estimators=1000, random_state=42)),
        ('XGB2000', xgb.XGBClassifier(n_estimators=2000, random_state=42)),
    ]
    return candidates

# function for performing 10-fold cross validation of all the baseline models, source Kaggle
def PipelineModel2(X_train, y_train, models, num_folds=10, scoring='f1'):
    """Cross-validate every candidate model and print its mean/std score.

    Parameters
    ----------
    X_train :
        Training features, forwarded unchanged to ``cross_val_score``.
    y_train :
        Training targets, forwarded unchanged to ``cross_val_score``.
    models : iterable of (str, estimator)
        ``(label, estimator)`` pairs, e.g. the list built by ``PipelineModels()``.
    num_folds : int, default 10
        Number of K-fold splits.
    scoring : str, default 'f1'
        Scorer name forwarded to ``cross_val_score``.

    Returns
    -------
    names : list of str
        Model labels, in evaluation order.
    results : list
        Per-fold score arrays, parallel to ``names``.
    """
    # One unshuffled splitter is shared by all models, so every model is
    # scored on exactly the same folds.
    kfold = model_selection.KFold(n_splits=num_folds)

    names = []
    results = []
    for name, model in models:
        cv_results = model_selection.cross_val_score(
            model, X_train, y_train, cv=kfold, scoring=scoring, n_jobs=-1
        )
        names.append(name)
        results.append(cv_results)
        print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

    # Return the labels with their scores (previously the last printed message
    # was returned in place of the labels, and an empty `models` crashed).
    return names, results

In [None]:
# Build the candidate models, then cross-validate each one on the training data.
models = PipelineModels()
names, results = PipelineModel2(X_train=X_train, y_train=y_train, models=models)