<a href="https://colab.research.google.com/github/vikpy/AISem3/blob/master/HW/ML_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Creating a standard ML pipeline which can used for solving supervised ML problems 


We can create fundamentally two types of pipelines, 
one for the classification problems or pne for the regression problems. 
Thus let us create a common class which can cater to both the problems and create pipeline accordingly. Here we will assume that the target variable is already converted to numeric values using encoders

Reference:
https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf

In [49]:
from sklearn.pipeline import Pipeline 
from pandas import read_csv
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder    
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

class CustomPipeline:
  def __init__(self, X, y, type_of_problem="regression"):
    '''
    X-> Pandas DataFrame
    y-> Pandas Series
    type_of_problem -> regression or classification
    '''
    self.X = X
    self.y = y
    self.type_of_problem = type_of_problem


  def _tts(self, X,y, train_fraction=0.8):
    X = X.sample(frac=1).reset_index(drop=True)
    train_size = int(np.ceil(train_fraction*X.shape[0]))
    X_train = X.head(train_size)
    X_test = X.tail(X.shape[0] - train_size)
    y_train = y[:train_size]
    y_test = y[train_size:]
    return X_train, X_test, y_train, y_test

  def _get_classifier_scores(self):
    # Splitting the data into train and test  
    X_train, X_test, y_train, y_test = self._tts(self.X, self.y)
    numeric_var_trans = []
    # Adding Imputer to impute missing values 
    numeric_var_trans.append(('imputer', SimpleImputer(strategy='median')))
    # Scaling the values
    numeric_var_trans.append(('scaler', StandardScaler()))
    # Creating a pipeline
    num_pipe = Pipeline(steps=numeric_var_trans)

    categorical_var_trans = []
    #Adding imputer to the categorical missing values
    categorical_var_trans.append(('imputer', SimpleImputer(strategy='most_frequent')))
    categorical_var_trans.append(('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')))
    categorical_pipe = Pipeline(steps=categorical_var_trans)

    # Selecting Numeric features from the data 
    numeric_features = self.X.select_dtypes(include=['int64', 'float64']).columns
    
    categorical_features = self.X.select_dtypes(include=['object']).columns    

    # Creating preprocessing pipeline which will be common for regression 
    # or classification problem 
    preprocessing_pipe = ColumnTransformer(
    transformers=[
        ('num', num_pipe, numeric_features),
        ('cat', categorical_pipe, categorical_features)])
    
    #for classification problems 

    classifiers = [
      ('knn', KNeighborsClassifier(3)),
      ('svc', SVC(kernel="rbf", C=0.025, probability=True)),
      ('decision tree', DecisionTreeClassifier(max_depth=10)),
      ('random_forest', RandomForestClassifier(max_depth=10)),
      ('ada_boost', AdaBoostClassifier()),
      ('gradient_boost', GradientBoostingClassifier())
      ]
       
    classifier_scores = {}
    for (classifier_name, classifier_obj) in classifiers:
      pipe = Pipeline(steps=[('preprocessor', preprocessing_pipe),
                    (classifier_name, classifier_obj)])
      pipe.fit(X_train, y_train)   
      classifier_scores[classifier_name] = {"acc_train" : pipe.score(X_train, y_train),\
                                              "acc_test":  pipe.score(X_test, y_test)
      }
    return classifier_scores

  def _get_regressor_scores(self):
    # Splitting the data into train and test  
    X_train, X_test, y_train, y_test = self._tts(self.X, self.y)
    numeric_var_trans = []
    # Adding Imputer to impute missing values 
    numeric_var_trans.append(('imputer', SimpleImputer(strategy='median')))
    # Scaling the values
    numeric_var_trans.append(('scaler', StandardScaler()))
    # Creating a pipeline
    num_pipe = Pipeline(steps=numeric_var_trans)

    categorical_var_trans = []
    #Adding imputer to the categorical missing values
    categorical_var_trans.append(('imputer', SimpleImputer(strategy='most_frequent')))
    categorical_var_trans.append(('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')))
    categorical_pipe = Pipeline(steps=categorical_var_trans)

    # Selecting Numeric features from the data 
    numeric_features = self.X.select_dtypes(include=['int64', 'float64']).columns
    
    categorical_features = self.X.select_dtypes(include=['object']).columns    

    # Creating preprocessing pipeline which will be common for regression 
    # or classification problem 
    preprocessing_pipe = ColumnTransformer(
    transformers=[
        ('num', num_pipe, numeric_features),
        ('cat', categorical_pipe, categorical_features)])
    
    #for classification problems 

    regressors = [
        ('dtc', DecisionTreeRegressor(max_depth=10))
      ]
       
    regressor_scores = {}
    for (regressor_name, regressor_obj) in regressors:
      pipe = Pipeline(steps=[('preprocessor', preprocessing_pipe),
                    (regressor_name, regressor_obj)])
      pipe.fit(X_train, y_train)   
      regressor_scores[regressor_name] = {"error_train" : pipe.score(X_train, y_train),\
                                              "error_test":  pipe.score(X_test, y_test)
      }
    return regressor_scores

  def get_result(self):
      if self.type_of_problem == "classification":
        return self._get_classifier_scores()
      elif self.type_of_problem == "regression":
        return self._get_regressor_scores()



In [34]:
# Importing the data 
# Datasource link: https://archive.ics.uci.edu/ml/datasets/Dermatology
import pandas as pd
df = pd.read_csv("/content/dermatology.csv")
df.head()

Unnamed: 0,erythema,scaling,definite borders,itching,koebner phenomenon,polygonal papules,follicular papules,oral mucosal involvement,knee and elbow involvement,scalp involvement,"family history, (0 or 1)",melanin incontinence,eosinophils in the infiltrate,PNL infiltrate,fibrosis of the papillary dermis,exocytosis,acanthosis,hyperkeratosis,parakeratosis,clubbing of the rete ridges,elongation of the rete ridges,thinning of the suprapapillary epidermis,spongiform pustule,munro microabcess,focal hypergranulosis,disappearance of the granular layer,vacuolisation and damage of basal layer,: spongiosis,saw-tooth appearance of retes,follicular horn plug,perifollicular parakeratosis,inflammatory monoluclear inflitrate,band-like infiltrate,Age (linear),Dermatology
0,2,2,0,3,0,0,0,0,1,0,0,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,3,0,0,0,1,0,55,2
1,3,3,3,2,1,0,0,0,1,1,1,0,0,1,0,1,2,0,2,2,2,2,2,1,0,0,0,0,0,0,0,1,0,8,1
2,2,1,2,3,1,3,0,3,0,0,0,1,0,0,0,1,2,0,2,0,0,0,0,0,2,0,2,3,2,0,0,2,3,26,3
3,2,2,2,0,0,0,0,0,3,2,0,0,0,3,0,0,2,0,3,2,2,2,2,0,0,3,0,0,0,0,0,3,0,40,1
4,2,3,2,2,2,2,0,2,0,0,0,1,0,0,0,1,2,0,0,0,0,0,0,0,2,2,3,2,3,0,0,2,3,45,3


In [35]:
# just to check if the pipeline is handling categorical values 
# properly we convert some of the values to string
df.itching = df.itching.astype(str) 
df.exocytosis = df.exocytosis.astype(str)
df["Age (linear)"] = df["Age (linear)"].str.replace("?", "99").astype(np.int64)
df.dtypes

erythema                                     int64
scaling                                      int64
definite borders                             int64
itching                                     object
koebner phenomenon                           int64
polygonal papules                            int64
follicular papules                           int64
oral mucosal involvement                     int64
knee and elbow involvement                   int64
scalp involvement                            int64
family history, (0 or 1)                     int64
melanin incontinence                         int64
eosinophils in the infiltrate                int64
PNL infiltrate                               int64
fibrosis of the papillary dermis             int64
exocytosis                                  object
acanthosis                                   int64
hyperkeratosis                               int64
parakeratosis                                int64
clubbing of the rete ridges    

       Class code:   Class:                  Number of instances:
       1             psoriasis			    112
       2             seboreic dermatitis             61
       3             lichen planus                   72
       4             pityriasis rosea                49
       5             cronic dermatitis               52    
       6             pityriasis rubra pilaris        20

In [42]:
X = df.iloc[:, :34]
y = df.iloc[:, 34]
pipe = CustomPipeline(X, y, type_of_problem="classification")
pipe.get_result()

{'ada_boost': {'acc_test': 0.1506849315068493,
  'acc_train': 0.3242320819112628},
 'decision tree': {'acc_test': 0.2054794520547945,
  'acc_train': 0.7406143344709898},
 'gradient_boost': {'acc_test': 0.1780821917808219, 'acc_train': 1.0},
 'knn': {'acc_test': 0.2465753424657534, 'acc_train': 0.5187713310580204},
 'random_forest': {'acc_test': 0.2876712328767123,
  'acc_train': 0.9931740614334471},
 'svc': {'acc_test': 0.3561643835616438, 'acc_train': 0.2935153583617747}}

In [50]:
df  = pd.read_csv("/content/sample_data/california_housing_train.csv")
X = df.iloc[:, :8]
y = df.iloc[:, 8]
pipe = CustomPipeline(X, y)
pipe.get_result()

{'dtc': {'error_test': -0.1855026453706992,
  'error_train': 0.053103115386128885}}