# Training pipeline validation

In [None]:
import pandas as pd
import numpy as np
import pickle 

from sklearn import datasets, ensemble, model_selection

from evidently.report import Report
from evidently.metric_preset import ClassificationPreset, DataDriftPreset, DataQualityPreset
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric

## Load Data

In [None]:
# Pin the OpenML dataset version: without it scikit-learn resolves the name to
# whichever version is currently "active", so the downloaded data (and every
# row-position split below) could change between runs.
# as_frame=True guarantees that `.frame` is a populated pandas DataFrame.
bank_marketing = datasets.fetch_openml(name='bank-marketing', version=1, as_frame=True)
bank_marketing_data = bank_marketing.frame

## Attribute Information:

### Bank client data:
* **Age** (numeric)
* **Job :** type of job (categorical: 'admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed', 'unknown')
* **Marital :** marital status (categorical: 'divorced', 'married', 'single', 'unknown' ; note: 'divorced' means divorced or widowed)
* **Education** (categorical: 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'illiterate', 'professional.course', 'university.degree', 'unknown')
* **Default:** has credit in default? (categorical: 'no', 'yes', 'unknown')
* **Balance:** average yearly balance, in euros (numeric)
* **Housing:** has housing loan? (categorical: 'no', 'yes', 'unknown')
* **Loan:** has personal loan? (categorical: 'no', 'yes', 'unknown')

### Related with the last contact of the current campaign:
* **Contact:** contact communication type (categorical:
'cellular','telephone')
* **Day:** last contact day of the month (numeric)
* **Month:** last contact month of year (categorical: 'jan', 'feb', 'mar', …, 'nov', 'dec')
* **Duration:** last contact duration, in seconds (numeric). Important
note: this attribute highly affects the output target (e.g., if
duration=0 then y='no'). Yet, the duration is not known before a call
is performed. Also, after the end of the call y is obviously known.
Thus, this input should only be included for benchmark purposes and
should be discarded if the intention is to have a realistic
predictive model.

### Other attributes:
* **Campaign:** number of contacts performed during this campaign and for
this client (numeric, includes last contact)
* **Pdays:** number of days that passed by after the client was last
contacted from a previous campaign (numeric; 999 means client was not
previously contacted)
* **Previous:** number of contacts performed before this campaign and for
this client (numeric)
* **Poutcome:** outcome of the previous marketing campaign (categorical:
'failure','nonexistent','success')

In [None]:
# Print the column dtypes and non-null counts of the raw frame.
bank_marketing_data.info()

In [None]:
# Preview the first rows of the raw frame, before the columns are renamed.
bank_marketing_data.head()

In [None]:
# Replace the raw column labels with descriptive names (positional, 17 columns;
# the last one is the label column `class`).
bank_marketing_data.columns = [
    # bank client data
    'age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan',
    # last contact of the current campaign
    'contact', 'day', 'month', 'duration',
    # other attributes
    'campaign', 'pdays', 'previous', 'poutcome',
    # label
    'class',
]

In [None]:
# Preview the first rows again to check the renamed columns.
bank_marketing_data.head()

In [None]:
# Count rows per label value to see the class balance.
bank_marketing_data['class'].value_counts()

In [None]:
def feature_engineering(raw_data: pd.DataFrame) -> pd.DataFrame:
    """Convert a raw bank-marketing frame into a numeric, model-ready frame.

    The input is copied, so ``raw_data`` itself is left untouched. Its 17
    columns are renamed positionally, the yes/no columns are turned into
    integer flags, the remaining categorical columns are one-hot encoded and
    the label column is converted into a binary ``target``.

    Args:
        raw_data: Frame with exactly 17 columns in the bank-marketing order
            (the last column being the class label).

    Returns:
        A new frame whose columns are, in order: the numeric inputs, the three
        ``has_*`` flags, the one-hot groups (``marital_``, ``job_``, ``edu_``,
        ``contact_type_``, ``month_``, ``prev_camp_outcome_``) and finally
        ``target``.
    """

    def yes_no_flag(answer):
        # 'no' -> 0, 'yes' -> 1, anything else (e.g. 'unknown') -> -1
        if answer == 'no':
            return 0
        if answer == 'yes':
            return 1
        return -1

    def encode_class(label):
        # class label '1' becomes 0; every other value becomes 1
        return 0 if label == '1' else 1

    frame = raw_data.copy(deep=True)
    frame.columns = [
        'age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan',
        'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous',
        'poutcome', 'class',
    ]

    # client data: yes/no answers become integer flags
    flag_columns = (
        ('default', 'has_default'),
        ('housing', 'has_housing_loan'),
        ('loan', 'has_personal_loan'),
    )
    for source_column, flag_column in flag_columns:
        frame[flag_column] = frame[source_column].apply(yes_no_flag)

    # client data, last contact data and other attributes: one-hot encoding,
    # listed in the order the dummy groups must appear in the result
    one_hot_columns = (
        ('marital', 'marital'),
        ('job', 'job'),
        ('education', 'edu'),
        ('contact', 'contact_type'),
        ('month', 'month'),
        ('poutcome', 'prev_camp_outcome'),
    )
    dummy_groups = [
        pd.get_dummies(frame[source_column], prefix=dummy_prefix)
        for source_column, dummy_prefix in one_hot_columns
    ]
    frame = pd.concat([frame, *dummy_groups], axis=1)

    # the encoded source columns are no longer needed
    encoded_sources = [source_column for source_column, _ in flag_columns]
    encoded_sources += [source_column for source_column, _ in one_hot_columns]
    frame = frame.drop(columns=encoded_sources)

    # target preprocessing: `target` is appended last, then `class` is removed
    frame['target'] = frame['class'].apply(encode_class)
    frame = frame.drop(columns=['class'])

    return frame
    

In [None]:
# Split the raw frame by row position:
#   rows 0..4999    -> model training
#   rows 5000..6999 -> reference set used for evaluation
#   rows 7000..end  -> data kept aside to simulate production traffic
train_data = bank_marketing_data.iloc[:5000]
reference_data = bank_marketing_data.iloc[5000:7000]
prod_simulation_data = bank_marketing_data.iloc[7000:]

# Number of rows per batch (not used in the cells shown here).
batch_size = 2000

In [None]:
# Build the model-ready training frame (feature_engineering works on a copy).
processed_train = feature_engineering(train_data)

In [None]:
# Apply the same preprocessing to the reference slice.
processed_reference = feature_engineering(reference_data)

In [None]:
# Preview the engineered training features.
processed_train.head()

In [None]:
# Print the columns and dtypes of the reference frame; they should line up with the training frame.
processed_reference.info()

## Model Training

In [None]:
# Random forest with 100 trees; random_state is fixed so training is repeatable.
model = ensemble.RandomForestClassifier(random_state=42, n_estimators=100)

In [None]:
# Preview the model inputs: every column except the last one.
# NOTE(review): at this point the last column is `target` (feature_engineering
# appends it last); this no longer holds once a `prediction` column is added.
processed_train.iloc[:, :-1]

In [None]:
# Select the features by name rather than by position. `iloc[:, :-1]` only
# excludes `target` while it is the last column; once the prediction cell has
# appended `prediction`, re-running this cell would silently train on the
# target itself. errors='ignore' covers the first run, where `prediction`
# does not exist yet.
model.fit(
    processed_train.drop(columns=['target', 'prediction'], errors='ignore'),
    processed_train['target'],
)

In [None]:
# Score both frames and store the result in a `prediction` column.
# Features are selected by name so the cell can be re-run safely: with
# `iloc[:, :-1]` a second run would drop the existing `prediction` column but
# keep `target`, so the inputs would no longer match what the model was
# fitted on. errors='ignore' covers the first run, where `prediction` does
# not exist yet.
processed_train['prediction'] = model.predict(
    processed_train.drop(columns=['target', 'prediction'], errors='ignore')
)
processed_reference['prediction'] = model.predict(
    processed_reference.drop(columns=['target', 'prediction'], errors='ignore')
)

## Model Evaluation

In [None]:
# Build an Evidently report from the classification preset.
model_quality_report = Report(metrics=[ClassificationPreset()])
# The training frame is passed as the report's reference and the held-out
# reference slice as its current data, so the report compares the two.
# NOTE(review): no column mapping is passed — presumably Evidently picks up
# the `target` and `prediction` columns by their default names; confirm.
model_quality_report.run(reference_data=processed_train, current_data=processed_reference)
# Render the report in the notebook output.
model_quality_report.show(mode='inline')

## Save assets

In [None]:
# Serialize the fitted model to disk (binary mode is required by pickle).
with open('model.pckl', 'wb') as model_file:
    pickle.dump(model, model_file)