In [1]:
#base library
import os
import pandas as pd
import numpy as np
import sklearn
import gc
import pickle
import lightgbm

#visualization
import seaborn as sns
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px
import matplotlib.pyplot as plt


#datacleaning
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline

#preprocessing
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler,PolynomialFeatures
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer

#models
from sklearn.model_selection import GridSearchCV,cross_validate,ParameterGrid,train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC,SVC
from xgboost import XGBClassifier 
from lightgbm import LGBMClassifier

#model evaluation
from sklearn.metrics import classification_report

#Version control
import mlflow

## Main config file
This config will be transformed into a `config.yaml` file for production.

In [4]:
# Single source of truth for the seed, so the estimator seed below cannot
# drift away from CONFIG["random_seed"].
RANDOM_SEED = 43

CONFIG = dict(
    # Seed used for NumPy's global generator and for the train/validation split.
    random_seed = RANDOM_SEED,
    train_filepath = os.path.join("..", "data", "raw_data" ,"Train.csv"),
    test_filepath  = os.path.join("..", "data", "raw_data" ,"Test.csv"),
    # Encoding applied to the raw STATUS labels.
    label_map = dict(PASS=1, FAIL=0),
    # Folder that receives exported figures.
    fig_root = os.path.join("..", "figures"),

    # Grid entries merged into every model's parameter grid; the key must
    # match the name of the scaling step in the pipeline.
    normalizers = {
        "normalizer":[MinMaxScaler(),StandardScaler(),RobustScaler()]
    },

    # One entry per candidate model: the estimator, a display name, and the
    # grid of hyper-parameters. The "clf__" prefix targets the pipeline step
    # named "clf".
    models = [dict(estimator=RandomForestClassifier(random_state=RANDOM_SEED),
                   name = "RandomForest",
                   params={"clf__n_estimators":[70,75,85]}),

#              dict(estimator=LinearSVC(),
#                   name = "LinearSVC",
#                   params={"clf__C":[0.5,1.0,1.2]}),

              dict(estimator=SVC(),
                   name = "SVC",
                   params={"clf__C":[0.7,0.8,0.9],
                           "clf__kernel":['poly','rbf']}),

              dict(estimator=XGBClassifier(use_label_encoder=False),
                   name = "XGBClassifier",
                   params={"clf__n_estimators":[90,100,110],
                           "clf__max_depth":[3,4,5],
                           "clf__eval_metric":['mlogloss'],
                           }),

              dict(estimator=LGBMClassifier(),
                   name = "LGBMClassifier",
                   params={"clf__n_estimators":[70,80,90],
                           "clf__num_leaves":[16,17,18]}),
              ],

    # Metrics computed for every grid-search candidate.
    scorings = ["f1","accuracy","f1_micro",
                "f1_macro","f1_weighted", "roc_auc",
                "precision","recall"],

    # NOTE: despite the "_dir" suffix this is the path of the pickle file itself.
    trained_model_dir = os.path.join("..", "models","trained_model.pkl"),

    # CSV that receives the test-set features plus the predicted label.
    prediction_file_path = os.path.join("..", "data", "raw_data" ,"test_prediction.csv"),

)

## Utility and custom transformers

In [None]:
class TransformToNumeric(BaseEstimator,TransformerMixin):
    '''
    Restrict a frame to the columns seen during ``fit`` and make them numeric.

    This class serves 2 purposes:
    1. Ensure correct conversion of "O" (object) columns to float64; values
       that cannot be parsed become NaN.
    2. Keep only the features that were present in the training features, so
       that downstream steps always receive the same columns.
    '''
    def __init__(self):
        # Filled in by ``fit``; empty until then.
        self.columns_to_transfrom =[]

    def fit(self, X, y=None):
        '''Remember the column labels of ``X``; ``y`` is ignored. Returns self.'''
        self.columns_to_transfrom = X.columns
        return self

    def transform(self, X, y=None):
        '''
        Return a float copy of ``X`` limited to the columns recorded by ``fit``.

        Raises KeyError (from pandas) when ``X`` lacks one of those columns.
        '''
        X_copy = X[self.columns_to_transfrom].copy()
        for col in X_copy.columns:
            try:
                # Fast path: the whole column casts cleanly.
                X_copy[col] = X_copy[col].astype(np.float64)
            except (ValueError, TypeError):
                # Only conversion failures are handled here, so interrupts and
                # unrelated errors are no longer silently swallowed.
                # Unparseable entries are coerced to NaN.
                X_copy[col] = pd.to_numeric(X_copy[col],errors="coerce").values
        return X_copy


class PolyFeatures(BaseEstimator,TransformerMixin):
    '''
    Add one interaction column per unordered pair of the selected features.

    For columns ``a`` and ``b`` (in the order given) the new column is named
    ``"a_b"`` and holds the element-wise product ``a * b``.
    '''
    def __init__(self, columns_to_transform):
        self.columns_to_transform= columns_to_transform

    def fit(self, X, y=None):
        '''Nothing is learned from the data; returns self.'''
        return self

    def transform(self, df, y= None):
        '''Return a copy of ``df`` extended with the pairwise product columns.'''
        augmented = df.copy()
        names = self.columns_to_transform
        for position, left in enumerate(names):
            # Pair each feature only with the ones that follow it, so every
            # unordered pair is produced exactly once.
            for right in names[position + 1:]:
                product_name = "{}_{}".format(left, right)
                augmented[product_name] = augmented[left]*augmented[right]
        return augmented
    
class LabelMapper():
    '''
    Callable that encodes labels through a fixed mapping and can decode them.

    label_map: dict from raw label to encoded value.
    '''
    def __init__(self,label_map):
        self.label_map = label_map

    def __call__(self,row):
        '''Return the encoded value for ``row``; raises KeyError if unknown.'''
        return self.label_map[row]

    def reverse_map(self, row):
        '''
        Return the raw label whose encoded value is ``row``.

        Raises KeyError if no label maps to ``row``. If several labels share
        one encoded value, the last one in ``label_map`` wins.
        '''
        # Fixed: the original referenced the undefined name ``selt``, so every
        # call failed with NameError.
        reverse_mapping = {value:key for key, value in self.label_map.items()}
        return reverse_mapping[row]
    
def get_best_model(train_models):
    '''
    Pick the search object with the highest ``best_score_``.

    train_models: iterable of fitted objects exposing ``best_score_``.
    Returns the first object holding the maximum score, or None when the
    iterable is empty or no score is comparable (e.g. all NaN).
    '''
    # Start from -inf rather than -1 so that scores <= -1 (as produced by
    # "neg_*" scorers) can still win instead of yielding None.
    best_score = float("-inf")
    best_model = None
    all_scores = []

    for train_model in train_models:
        model_best_score = train_model.best_score_
        all_scores.append(model_best_score)
        # Strict comparison keeps the earliest model on ties and skips NaN.
        if model_best_score > best_score:
            best_score = model_best_score
            best_model = train_model

    if best_model is None:
        print("No usable best_score_ found among {}".format(all_scores))
        return None

    print("Best score of {:.2f} among {}".format(best_score*100, all_scores))

    return best_model


## Random seeds

In [None]:
# Seed NumPy's global random generator with the value stored in the config.
np.random.seed(CONFIG["random_seed"])

## Loading of data and changing of data into correct datatypes

In [None]:
#loading the data
train_df = pd.read_csv(CONFIG['train_filepath'],index_col=0)
test_df  = pd.read_csv(CONFIG['test_filepath'], index_col=0)

#split training data into features and labels
X_train = train_df.drop("STATUS",axis=1)
y_train = train_df["STATUS"]

#Converting X_train and test_df into float64/nan. The transformer is fitted on
#the training features, so test_df is also reduced to the training columns.
data_type_transformer = TransformToNumeric()
X_train = data_type_transformer.fit_transform(X_train)
test_df = data_type_transformer.transform(test_df)

#mapping of y_train data into numeric
label_mapper = LabelMapper(CONFIG['label_map'])
laber_mapper = label_mapper  # misspelled legacy name kept for any cell that still uses it
y_train = y_train.map(label_mapper)

#removing rows with more than 50% missing data
train_df = pd.concat([X_train, y_train], axis = 1)
print("Number of training data before removing rows with more than 50% missing data:", len(train_df) )
# thresh = minimum number of non-missing values a row needs in order to be kept
train_df = train_df.dropna(thresh=len(train_df.columns)//2)
print("Number of training data after removing rows with more than 50% missing data:", len(train_df))




#split training data into features and labels
X_train = train_df.drop("STATUS",axis=1)
y_train = train_df["STATUS"]

# Hold out 10% of the rows for validation, keeping the class ratio (stratify).
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,test_size=0.1,random_state=CONFIG["random_seed"],stratify=y_train)


## EDA
This is the exploration phase. We do this to understand the data
1. Histogram plot
2. crossplot
3. Number of missing data
4. Outliers
5. Check the distribution of rows with missing data (plot with seaborn)
6. Check whether the distribution of the test file is the same as that of the training set (not practical, as we want to optimize for purely unseen data)


In [None]:
label_counts = train_df['STATUS'].value_counts()

# Percentage of all rows that each class accounts for; shown on the bars.
label_share = label_counts.values*100/label_counts.values.sum()

label_bars = go.Bar(
    x=label_counts.index,
    y=label_counts.values,
    text=label_share,
    textposition='auto',
)

fig = go.Figure(data=[label_bars])
fig.update_layout(title="Label distribution")

fig.show()

In [None]:
#Checking of missing data heat map
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20,5))

sns.heatmap(train_df.isna(),ax=ax[0])
sns.heatmap(test_df.isna(),ax=ax[1])

plt.show()

# Percentage of missing values per column, recorded only for columns whose
# missing count exceeds the threshold (more than 4 for train, more than 3 for test).
X_train_na_cols = {}
test_na_cols = {}

total_row = len(X_train)
for col in X_train.columns:
    num_missing_value = X_train[col].isna().sum()
    if num_missing_value > 4:
        X_train_na_cols[col] = num_missing_value/total_row *100

total_row = len(test_df)
for col in test_df.columns:
    num_missing_value = test_df[col].isna().sum()
    if num_missing_value > 3:
        test_na_cols[col] = num_missing_value/total_row *100


train_missing_df = pd.DataFrame(X_train_na_cols,index=["Train_data"])
test_missing_df = pd.DataFrame(test_na_cols,index=["Test_data"])
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
# one-row frames the same way (test row first, union of columns).
pd.concat([test_missing_df, train_missing_df])


### We can see that the number of missing values is not a large percentage of the total number of observations. Since the training set is 4 times as large as the test set, we set the missing-value threshold to 4. One column stands out, and we may choose to investigate it further.

## Violin Plot

In [None]:
#plot the violin plot for train and test data
N_PLOT_COLS = 4
feature_names = list(X_train.columns)  # plain list: safe to pass as subplot titles

# Ceiling division: enough rows for every feature instead of a fixed 42, which
# only fits up to 168 features.
n_plot_rows = max(1, -(-len(feature_names) // N_PLOT_COLS))

fig = make_subplots(rows=n_plot_rows, cols=N_PLOT_COLS, subplot_titles=feature_names)
# Keep the original scale of 12000 px for 42 rows, with a floor for tiny grids.
fig_height = max(600, round(n_plot_rows * 12000 / 42))
fig.update_layout(height=fig_height, width=1200, title_text="Distribution plot")

for i, col_name in enumerate(feature_names):

    # 1-based grid position, filled row by row
    row, col = divmod(i, N_PLOT_COLS)
    row += 1
    col += 1

    fig.add_trace(go.Violin(y=X_train[col_name], box_visible=True,
                  meanline_visible=True, opacity=0.6, name="TRAIN", showlegend=False,
                            points="outliers",pointpos=-1), row=row, col= col,)

    fig.add_trace(go.Violin(y=test_df[col_name], box_visible=True,
                  meanline_visible=True, opacity=0.6, name="TEST", showlegend=False,
                            points="outliers",pointpos=1), row=row, col= col)

# Make sure the output folder exists before writing the figure.
os.makedirs(CONFIG['fig_root'], exist_ok=True)
fig.write_html(os.path.join(CONFIG['fig_root'],"raw_violin_plot.html"))

## Correlation

In [None]:
# Pairwise correlation between the training features, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(12,8))
feature_corr = X_train.corr()
sns.heatmap(feature_corr, ax=ax);

# Preprocessing pipeline
1. Remove zero variance columns
2. Remove outlier (not going to remove outlier as data does not support removing any outliers)
3. Normalization (has to come before imputation)
4. Imputation 

In [None]:
# Testing zero variance removal
zero_variance_removal = VarianceThreshold(threshold=0.0)

n_columns_before = X_train.shape[1]
print("Number of columns before removing zero variance cols", n_columns_before)

after_remove = zero_variance_removal.fit_transform(X_train)

# get_support() is a boolean mask over the input columns; True means "kept".
n_columns_after = zero_variance_removal.get_support().sum()
print("Number of columns after removing zero variance cols", n_columns_after)

## Training

In [None]:
# One fitted GridSearchCV object per configured model, in CONFIG order.
trained_models = []

normalizers = CONFIG['normalizers']

for model in CONFIG["models"]:
    model_obj, model_name, model_params = (
        model[key] for key in ("estimator", "name", "params")
    )

    # Shared preprocessing in front of the classifier; the scaler set here is
    # only a placeholder because the grid swaps the "normalizer" step.
    pipeline_steps = [
        ("feature_selector", VarianceThreshold()),
        ("normalizer", StandardScaler()),
        ("imputer", SimpleImputer()),
        ("clf", model_obj),
    ]
    model_pipeline = Pipeline(steps=pipeline_steps)

    # Scaler choices first, then the model-specific hyper-parameters.
    param_grid = dict(normalizers)
    param_grid.update(model_params)

    print("Running model:",model_obj, "Model name:",model_name)
    print("model_params:", param_grid)

    # All metrics are recorded; the final refit uses the best macro-F1 setting.
    search_model = GridSearchCV(
        model_pipeline,
        param_grid=param_grid,
        cv=5,
        scoring=CONFIG['scorings'],
        refit="f1_macro",
    )
    search_model.fit(X_train, y_train)
    trained_models.append(search_model)


## Saving the trained model and loading them

In [None]:
# Create the target folder first so a fresh checkout does not fail on open().
model_folder = os.path.dirname(CONFIG["trained_model_dir"])
if model_folder:
    os.makedirs(model_folder, exist_ok=True)

with open(CONFIG["trained_model_dir"] , 'wb') as handles:
    pickle.dump(trained_models, handles, protocol = pickle.HIGHEST_PROTOCOL)

# Reading the file back confirms the models survive serialization.
# NOTE: unpickling runs arbitrary code; only load files this notebook wrote.
with open(CONFIG["trained_model_dir"] , 'rb') as handles:
    trained_models = pickle.load(handles)

best_model = get_best_model(trained_models)

## Validating model

In [None]:
# Score the selected search object on the held-out validation split.
preds = best_model.predict(X_val)
print(classification_report(y_val, preds))

In [None]:
# Names of the training columns kept by the fitted variance filter of the
# best pipeline (get_support() is a boolean mask over the input columns).
feature_selector = best_model.best_estimator_['feature_selector']
new_columns = X_train.columns[feature_selector.get_support()]
# NOTE(review): 76 is a hard-coded position; presumably it looks up the name of
# a feature that a model reported by index. TODO confirm. Raises IndexError when
# fewer than 77 columns are kept.
new_columns[76]

## Plotting feature importance

In [None]:
# Classifier step of the best pipeline.
# NOTE(review): plotting assumes this step is a LightGBM model; confirm.
lightgbm_model = best_model.best_estimator_.named_steps['clf']
lightgbm.plot_importance(lightgbm_model, figsize=(20, 30));

## Feature cross attempt

In [None]:
# Square root of C94 against C3, one marker per training row, coloured by label.
cross_scatter = go.Scatter(
    x=X_train['C3'],
    y=X_train['C94']**0.5,
    mode='markers',
    marker=dict(color=y_train),
)
fig = go.Figure(data=[cross_scatter])
fig

In [None]:
# Same search as before, with pairwise product features added in front.
trained_models = []
features_to_enhance =['C94', 'C3', 'C5', 'C2', 'C17']

normalizers = CONFIG['normalizers']

for model in CONFIG["models"]:
    model_obj, model_name, model_params = model["estimator"], model['name'], model["params"]

    # Product columns are created first so they pass through the same
    # variance filter, scaling and imputation as the raw features.
    model_pipeline = Pipeline(steps=[
        ("feature_enchance", PolyFeatures(features_to_enhance)),
        ("feature_selector", VarianceThreshold()),
        ("normalizer", StandardScaler()),
        ("imputer", SimpleImputer()),
        ("clf", model_obj),
    ])

    # Scaler alternatives combined with this model's own hyper-parameters.
    param_grid = dict(normalizers, **model_params)

    print("Running model:",model_obj, "Model name:",model_name)
    print("model_params:", param_grid)

    # Every metric is tracked; the refit picks the best macro-F1 candidate.
    search_model = GridSearchCV(model_pipeline, param_grid=param_grid, cv=5,
                                scoring=CONFIG['scorings'], refit="f1_macro")
    search_model.fit(X_train, y_train)
    trained_models.append(search_model)

In [None]:
# Create the target folder first so a fresh checkout does not fail on open().
model_folder = os.path.dirname(CONFIG["trained_model_dir"])
if model_folder:
    os.makedirs(model_folder, exist_ok=True)

# NOTE: this writes to the same path as the earlier save cell and replaces
# that file.
with open(CONFIG["trained_model_dir"] , 'wb') as handles:
    pickle.dump(trained_models, handles, protocol = pickle.HIGHEST_PROTOCOL)

# Reading the file back confirms the models survive serialization.
# NOTE: unpickling runs arbitrary code; only load files this notebook wrote.
with open(CONFIG["trained_model_dir"] , 'rb') as handles:
    trained_models = pickle.load(handles)

best_model = get_best_model(trained_models)

In [None]:
# Score the best feature-cross search object on the held-out validation split.
preds = best_model.predict(X_val)
print(classification_report(y_val, preds))

## Prediction 

In [None]:
# Predict from the feature columns only. Dropping a "prediction" column left
# by an earlier run keeps this cell re-runnable; otherwise the pipeline would
# be given one column more than it was trained on.
test_features = test_df.drop(columns="prediction", errors="ignore")
test_prediction = best_model.predict(test_features)

# Predictions are in the encoded label space of CONFIG['label_map'].
test_df['prediction'] = test_prediction
test_df.to_csv(CONFIG['prediction_file_path'])
test_df.head()