In [4]:
import sys
import pandas as pd
import matplotlib
import sklearn
import numpy as np
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib
from matplotlib import pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import joblib
from sklearn.metrics import plot_confusion_matrix
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import  SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator,TransformerMixin
import warnings; warnings.simplefilter('ignore')
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
from mlxtend.feature_selection import SequentialFeatureSelector as SFS


In [5]:
%config Completer.use_jedi=False

In [6]:
''' 
End to End Machine Learning Pipeline with hyperparameter tuning and Feature Selection

'''

class Pipeline(BaseEstimator,TransformerMixin):
    
    '''
    Function Name:__init__

    Parameters
    1)label_name: str
    Name of the target_variable

    2)algo:str
    Algorithm to use to train models. (Currently available options: 1. Random Forest; 2. Logistic Regression; 3. SVC)
    3)scoring:str 
    metric to maximize

    4)categorical_cols:List(str) default:[]
    List of Columns to convert to Ordinal values

    5)want_preprocess:bool default:True
    Whether to avail preprocessing provided by the pipeline

    ##########################################################################################################################
    6)first_iter:bool default:False
    Train base models 
    Note:Only set true while running code for first time or using a previously unused algorithm(set by algo parameter) 
    for the first time.

    7)want_train:bool default:False
    train models on transformed dataset
    Note:Pipeline return estimator(model) when this parameter is set to True

    ##########################################################################################################################
    8)recheck:bool default:False
    recompute best features
    '''    

    def __init__(self,label_name,algo='Random_Forest',scoring='roc_auc',categorical_cols=[],want_preprocess=True,first_iter=False,want_train=False,recheck=False):
        
        self.label_name=label_name
        
        self.model_name=algo
        self.scoring=scoring
        self.categorical_cols=categorical_cols
        self.want_preprocess=want_preprocess
        self.first_iter=first_iter
        self.want_train=want_train
        self.recheck=recheck
        self.best_estimator=None
        
    '''
     Function Name:categorical_column_dealing

     Parameter

     1)categorical_columns:List(str)
     List of columns to convert to Ordinal Values

     returns Pandas Dataframe
    '''       
        
        
        
    def categorical_column_dealing(self,categorical_columns=[]):
        self.categorical_columns=categorical_columns
        all_category=self.df.dtypes[self.df.dtypes==object].index
        unwanted_category=all_category[~(all_category.isin(categorical_columns))]
        self.df=self.df.drop(columns=unwanted_category)
        self.reqd_features=self.df.columns
        if len(self.categorical_cols)!=0:
            col_transformer=ColumnTransformer([
                ("ord",OrdinalEncoder(),categorical_columns),
                    ])
            self.encoder=col_transformer.fit_transform(self.df)
            self.df[categorical_columns]=self.encoder
        return self.df
    
    '''
     Function Name:preprocess

     Parameter

     1)dataframe:dataframe default:Empty Dataframe
     Dataframe to be processed

     2)test:bool default False
     Whether the dataset being passed is testset or not

    #########################################################
     3)want_sample bool default False
      Balance dataset based on target variable
      Note:IF want_sample is set to true then test should be false(default)
    ###########################################################

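     returns Pandas Dataframe,Pandas Series i.e: Dataset,label; when no dataframe is passed (empty default) a 10% held-out split is returned as well i.e: Dataset,label,test Dataset,test label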
    '''       
    
    def preprocess(self,dataframe=None,test=False,want_sample=False):
        """Impute, scale and optionally rebalance a dataset, then split off the label.

        Parameters
        ----------
        dataframe : pandas.DataFrame, optional
            Frame to process. When omitted (or empty), the frame already stored
            in ``self.df`` is processed and a 10% hold-out split is produced.
        test : bool, default False
            Whether the frame is a test set; a test set is never resampled.
        want_sample : bool, default False
            When true (and ``test`` is false), rows whose label equals 1 are
            resampled with replacement to half the number of label-0 rows and
            the result is shuffled.

        Returns
        -------
        tuple
            ``(features, label)`` when a non-empty ``dataframe`` was passed,
            otherwise ``(features, label, test_features, test_label)``.

        Side effects: prints the column index, replaces ``self.df`` with the
        processed frame and sets ``self.reqd_features``.
        """
        # A real frame passed by the caller replaces the stored one; with no
        # frame (None or empty) the method keeps working on ``self.df``.
        use_stored_frame = dataframe is None or dataframe.empty
        if not use_stored_frame:
            self.df = dataframe
        self.df = self.df.drop(columns=self.df.filter(like='Unnamed', axis=1).columns)

        print(self.df.columns)
        # Drop the first identifier-like column that is present (at most one).
        for id_colname in ('index', 'id', 'ID', 'Index', 'INDEX'):
            if id_colname in self.df.columns:
                self.df = self.df.drop(columns=[id_colname])
                break

        feature_df = self.df[self.df.columns.drop([self.label_name])]

        # NOTE(review): the imputer and scaler are fit on whichever frame is
        # being processed, including one passed with test=True - confirm that
        # refitting on test data is intended.
        pipe = sklearn.pipeline.Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ])
        arr = pipe.fit_transform(feature_df)

        # Re-attach the untouched label as the last column of the scaled data.
        labels = np.array(self.df[self.label_name].values).reshape(arr.shape[0], 1)
        self.reqd_features = np.append(np.array(feature_df.columns), self.label_name)
        self.df = pd.DataFrame(np.concatenate([arr, labels], axis=1), columns=self.reqd_features)

        if not test and want_sample:
            zero_df = self.df.loc[self.df[self.label_name] == 0]
            one_df = self.df.loc[self.df[self.label_name] == 1]
            one_df = one_df.sample(np.int64(np.floor(zero_df.shape[0] / 2.0)), replace=True, random_state=42)
            self.df = pd.concat([zero_df, one_df])
            self.df = self.df.sample(frac=1, random_state=42)

        if use_stored_frame:
            self.df, testset = train_test_split(self.df, test_size=0.10, random_state=42)
            return (self.df[feature_df.columns], self.df[self.label_name],
                    testset[feature_df.columns], testset[self.label_name])
        return self.df[feature_df.columns], self.df[self.label_name]
    
    
    '''
     Function Name:train_models

     Parameter

     1)traindata:Dataframe
     TrainSet

     2)trainlabel:Series or array
     Labels

     #######################################################################################################
     Note:i)This function trains models and stores them in your current directory
     #####################################################################################################

     returns None
    '''       
    
    
    
    
    def train_models(self,traindata,trainlabel):
        
        if self.model_name in ['logistic_regression','Logistic_regression','logistic regression','Logistic_Regression','Logistic Regression']:
            from sklearn.linear_model import LogisticRegression
            logreg=LogisticRegression(penalty='l2',solver='liblinear',max_iter=200,random_state=42)
            parameters=[{'C':[0.001,0.01,0.1,1,10]}]
            grlogreg=GridSearchCV(logreg,parameters,cv=3,n_jobs=-1,scoring=self.scoring)
            grlogreg.fit(traindata,trainlabel)
            logreg=grlogreg.best_estimator_
            logreg.fit(traindata,trainlabel)
            joblib.dump(logreg,self.model_name+'_'+self.scoring + '_' + self.label_name)
            
        elif self.model_name in ['Random Forest','Random_Forest','Random_forest','random_forest','random forest']:       
            n_estimators=[int(x) for x in np.linspace(start=200,stop=2000,num=10)]
            max_features=['auto','sqrt']
            max_depth=[int(x) for x in np.linspace(10,110,num=11)]
            max_depth.append(None)
            min_samples_split=[2,5,10]
            min_samples_leaf=[1,2,4]
            bootstrap=[True,False]
            param_grid={'n_estimators':n_estimators,'max_features':max_features,'max_depth':max_depth,'min_samples_split':min_samples_split
                           ,'min_samples_leaf':min_samples_leaf,'bootstrap':bootstrap}
            rf=RandomForestClassifier()
            rrf=RandomizedSearchCV(estimator=rf,param_distributions=param_grid,cv=3,scoring=self.scoring,verbose=2,n_iter=50,random_state=42,n_jobs=-1,refit=True)
            rrf.fit(traindata,trainlabel)
            joblib.dump(rrf.best_estimator_,self.model_name+ '_' +self.scoring + '_' + self.label_name)
            
        elif self.model_name in ['SVC','LinearSVC','svc']:
            param_grid = {'C': [0.1, 1, 10, 100, 1000]}
            svm=SVC(kernel='linear')
            svsearch=GridSearchCV(svm,param_grid,cv=3,n_jobs=-1,scoring=self.scoring)
            svsearch.fit(traindata,trainlabel)
            joblib.dump(svsearch.best_estimator_,self.model_name+ '_' +self.scoring + '_' + self.label_name)
            
        
        
        
    '''
     Function Name:features_selection

     Parameter

     1)trainset:Dataframe
     TrainSet

     2)trainlabel:Series or array
     Labels

     3)refit:bool default:False
       Set to true if want to rerun base models to extract features

     #######################################################################################################
     Note:i)This function trains models and stores them in your current directory(doesnt store if refit=False)
     #####################################################################################################

     returns 
     1)statsdf:Dataframe
     Statistical data of all columns

     2)self.features:estimator
     ###################################################################################################
     Variable List:
     self.features.k_scores_:Returns score of model trained on Best feature subset
     self.features.k_feature_names_:Returns the tuple of best feature names
     self.features.subsets_:Returns a detailed dictionary holding data of all feature subsets
     ###################################################################################################
     3)final_ranks:List
     features subset filtered out by statistical methods

    '''
        
    def features_selection(self,trainset,trainlabel,refit=False):
        """Rank features statistically, then obtain the wrapper-selected best subset.

        Three per-feature statistics are computed: mutual information with the
        label, absolute correlation with the label, and variance. Their sum
        (each weighted 0.01) is stored as ``Total``; features whose total is
        below the mean total are filtered out.

        With ``refit`` true, the model saved as
        ``<model_name>_<scoring>_<label_name>`` is loaded, the surviving
        features are narrowed with RFECV and then forward sequential selection,
        and the fitted selector is saved to
        ``exhaustive_<model_name>_<scoring>_<label_name>_results``. With
        ``refit`` false, that results file is loaded instead.

        Parameters
        ----------
        trainset : pandas.DataFrame
            Training features. The caller's frame is not modified.
        trainlabel : pandas.Series or array-like
            Training labels.
        refit : bool, default False
            Re-run the wrapper-based selection instead of loading saved results.

        Returns
        -------
        tuple
            ``(statsdf, features, final_ranks)``: the statistics table, the
            fitted (or loaded) selector, and the ``Total`` scores of the
            features that passed the statistical filter.

        Side effects: sets ``self.statsdf``, ``self.features``,
        ``self.best_features_`` and ``self.final_ranks``.
        """
        info_gain = mutual_info_classif(trainset, trainlabel)
        impo = pd.Series(info_gain, trainset.columns).sort_values(ascending=False)

        # Attach the label to a copy so the caller's frame is never touched,
        # even if something below raises.
        labelled = trainset.copy()
        labelled.insert(labelled.shape[1], self.label_name, trainlabel, True)
        target_corr = labelled.corr().loc[self.label_name].drop(index=self.label_name)
        cor_mat_ranks = np.abs(target_corr.sort_values(ascending=False))
        variance_rank = np.var(trainset, axis=0).sort_values(axis=0, ascending=False)

        # Name each series explicitly rather than renaming the automatic
        # labels pandas assigns to unnamed rows.
        statsdf = pd.DataFrame([
            impo.rename('info gain'),
            cor_mat_ranks.rename('Corelation'),
            variance_rank.rename('Variance'),
        ]).T
        statsdf['Total'] = 0.01 * cor_mat_ranks + 0.01 * impo + 0.01 * variance_rank

        # Keep everything that is not strictly below the mean. Rows whose
        # total is NaN compare False and are therefore kept, as before.
        total = statsdf['Total']
        final_ranks = total[~(total < np.mean(total))]
        self.statsdf = statsdf
        trainset = trainset[final_ranks.index]

        model_tag = self.model_name + '_' + self.scoring + '_' + self.label_name
        results_file = 'exhaustive_' + model_tag + '_results'

        if refit:
            model = joblib.load(model_tag)
            rfecv = RFECV(model, step=15, cv=2, scoring=self.scoring,
                          min_features_to_select=1, n_jobs=-1)
            rfecv.fit(trainset, trainlabel)
            trainset = trainset.drop(columns=trainset.columns[~rfecv.support_])

            sfs = SFS(model, k_features=(1, trainset.shape[1]), cv=2, forward=True,
                      scoring=self.scoring, n_jobs=-1)
            self.features = sfs.fit(trainset, trainlabel, custom_feature_names=trainset.columns)
            joblib.dump(self.features, results_file)
        else:
            self.features = joblib.load(results_file)

        # Expose the chosen names on both paths, not only after a refit.
        self.best_features_ = getattr(self.features, 'k_feature_names_', None)
        self.final_ranks = final_ranks
        return self.statsdf, self.features, final_ranks
    '''
     Function Name:train_model

     Parameter

     1)trainset:Dataframe
     TrainSet

     2)trainlabel:Series or array
     Labels

     Detail:Trains model on best feature subset

      #######################################################################################################
     Note:i)This function trains models and stores them in your current directory

     The estimator trained on the best features can be accessed through the variable self.best_estimator
     #####################################################################################################
    '''   
    
    def train_model(self,trainset,trainlabel):
        
        best_features=self.features.k_feature_names_
        if self.model_name in ['logistic_regression','Logistic_regression','logistic regression','Logistic_Regression','Logistic Regression']:
            from sklearn.linear_model import LogisticRegression
            logreg=LogisticRegression(penalty='l2',solver='liblinear',max_iter=200,random_state=42)
            parameters=[{'C':[0.001,0.01,0.1,1,10]}]
            grlogreg=GridSearchCV(logreg,parameters,cv=3,n_jobs=-1,scoring=self.scoring)
            grlogreg.fit(trainset,trainlabel)
            logreg=grlogreg.best_estimator_
            logreg.fit(trainset[list(best_features)],trainlabel)
            joblib.dump(logreg,'final'+self.model_name+'_'+self.scoring + '_' + self.label_name)
    
    
    
        elif self.model_name in ['Random Forest','Random_Forest','Random_forest','random_forest','random forest']:       
            n_estimators=[int(x) for x in np.linspace(start=200,stop=2000,num=10)]
            max_features=['auto','sqrt']
            max_depth=[int(x) for x in np.linspace(10,110,num=11)]
            max_depth.append(None)
            min_samples_split=[2,5,10]
            min_samples_leaf=[1,2,4]
            bootstrap=[True,False]
            param_grid={'n_estimators':n_estimators,'max_features':max_features,'max_depth':max_depth,'min_samples_split':min_samples_split
                           ,'min_samples_leaf':min_samples_leaf,'bootstrap':bootstrap}
            rf=RandomForestClassifier()
            rrf=RandomizedSearchCV(estimator=rf,param_distributions=param_grid,cv=3,scoring=self.scoring,verbose=2,n_iter=40,random_state=42,n_jobs=-1,refit=True)
            rrf.fit(trainset[list(best_features)],trainlabel)
            joblib.dump(rrf.best_estimator_,'final'+self.model_name+ '_' +self.scoring + '_' + self.label_name)
            
        elif self.model_name in ['SVC','LinearSVC','svc']:
            param_grid = {'C': [0.1, 1, 10, 100, 1000]}
            svm=SVC(kernel='linear')
            svsearch=GridSearchCV(svm,param_grid,cv=3,n_jobs=-1,scoring=self.scoring)
            svsearch.fit(trainset,trainlabel)
            joblib.dump(svsearch.best_estimator_,'final'+self.model_name+ '_' +self.scoring + '_' + self.label_name)
    
        self.best_estimator=joblib.load('final'+self.model_name+ '_' +self.scoring + '_' + self.label_name)
        
        return self.best_estimator
    
    
    '''
    Function Name:predict_and_plot

     Parameter

     1)trainset:Dataframe
     TrainSet

     2)trainlabel:Series or array
     Labels

     3)option:string
     #############################################################
     option list
     1)predictions
     2)'accuracy_score'
     3)precision_score
     4)recall_score
     5)roc_auc_score
     6)plot_roc_curve
     7)confusion_matrix

     4)estimator:sklearn estimator
     model trained on dataset containing best features subset provided by Features_selection method
    '''
    
    
    def predict_and_plot(self,option,trainset,trainlabel,estimator=None):
    
        
        
        
        y=estimator.predict(trainset)
        if option=='predictions':
            return y
        
        if option=='accuracy_score':
            return accuracy_score(trainlabel,y)
        
        if option=='precision_score':
            return sklearn.metrics.precision_score(trainlabel,y)
        if option=='recall_score':
            return sklearn.metrics.recall_score(trainlabel,y)
        if option=='roc_auc_score':
            return sklearn.metrics.roc_auc_score(trainlabel,y)
        if option=='plot_roc_curve':
            sklearn.metrics.plot_roc_curve(estimator,trainset,trainlabel)
            plt.grid(True)
            plt.show()
            
        if option=='confusion_matrix':
            plot_confusion_matrix(estimator,trainset,trainlabel)
            plt.show()
        
        
        
    def fit(self,trainset,trainlabel=None):
        return self

    '''
    Function Name:transform

    Parameter

    1)path:str or os.path
    path to csv file which contains the data

    Detail:Applies all above mentioned processing to the dataset based on the parameters provided to __init__ function.

    returns (Dataframe,Series): the processed dataset restricted to the best feature subset (best_features.k_feature_names_) and its labels
    '''
    def transform(self,path):
        """Run the whole pipeline on a CSV file and return the reduced dataset.

        Steps, driven by the constructor flags:

        1. Read the CSV and handle categorical columns.
        2. ``want_preprocess``: run ``preprocess``; otherwise only split the
           label column off the features.
        3. ``first_iter``: train and save the base model (``train_models``).
        4. Select features (``features_selection``; ``recheck`` is passed as
           its ``refit`` argument).
        5. ``want_train``: train the final model on the best features;
           otherwise load ``final<model_name>_<scoring>_<label_name>``.

        Parameters
        ----------
        path : str or path-like
            Location of the CSV file holding the data.

        Returns
        -------
        tuple
            ``(features, label)`` where ``features`` contains only the columns
            in the selector's ``k_feature_names_``.

        Side effects: sets ``self.df``, ``self.transform_df``,
        ``self.transform_label`` and ``self.best_estimator``.
        """
        self.df = pd.read_csv(path)
        self.reqd_features = self.df.columns.drop([self.label_name])

        self.transform_df = self.categorical_column_dealing(self.categorical_cols)
        if self.want_preprocess:
            self.transform_df, self.transform_label = self.preprocess(dataframe=self.transform_df, want_sample=False)
        else:
            # Without preprocessing the label still has to be separated out;
            # otherwise ``transform_label`` is undefined (or left over from an
            # earlier call) and the label stays among the features.
            self.transform_label = self.transform_df[self.label_name]
            self.transform_df = self.transform_df.drop(columns=[self.label_name])

        if self.first_iter:
            self.train_models(self.transform_df, self.transform_label)

        stats, best_features, filtered_features = self.features_selection(
            self.transform_df, self.transform_label, self.recheck)
        selected = list(best_features.k_feature_names_)

        if self.want_train:
            self.best_estimator = self.train_model(self.transform_df[selected], self.transform_label)
        else:
            self.best_estimator = joblib.load('final' + self.model_name + '_' + self.scoring + '_' + self.label_name)

        return (self.transform_df[selected], self.transform_label)
    