Autoreload notebook

In [4]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to imported .py files take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
import os
import warnings
import sys

import dvc.api
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.preprocessing import LabelEncoder
from fast_ml.model_development import train_valid_test_split
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn
import logging

#### Import local modules

In [7]:
# Put the sibling ../scripts directory on the import path (as an absolute path),
# then import the project's helper classes from it.
sys.path.append(os.path.abspath('../scripts'))
from ml import Ml
from preprocess import Preprocess

Instantiate the preprocessing and ML helper classes

In [8]:
# Create one instance of each local helper class imported from ../scripts.
# `preprocess` is used by the data-preparation cells that follow.
ml = Ml()
preprocess = Preprocess()

In [42]:
# Configure this notebook's logger to write INFO-and-above records to a log file.
LOG_PATH = '../logs/decission_trees.log'

# logging.getLogger returns the same object for the same name on every call.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# FileHandler does not create missing directories; make sure ../logs exists first.
os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)

# Because the logger object survives cell re-runs, attaching a handler
# unconditionally would add a duplicate each time and every record would be
# written more than once. Only attach when no handler for this file exists yet.
already_attached = any(
    isinstance(handler, logging.FileHandler)
    and handler.baseFilename == os.path.abspath(LOG_PATH)
    for handler in logger.handlers
)
if not already_attached:
    file_handler = logging.FileHandler(LOG_PATH)
    formatter = logging.Formatter(
        '%(asctime)s : %(levelname)s : %(name)s : %(message)s', '%m-%d-%Y %H:%M:%S')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

In [10]:
# Get URL from DVC
path = 'data/AdSmartABdata.csv'
repo = 'https://github.com/jedisam/abtest-mlops'
version = '6db449393c9626c4fbca44946dfa103660685a27'

In [11]:
# Load data from dvc using the dvc.api.Dataset class
data_url = dvc.api.get_url(
    path=path,
    repo=repo,
    rev=version
)

In [21]:
# Read CSV file from remote repository
data = pd.read_csv(data_url, sep=',')

Convert date column to datetime

In [24]:
# change the date column to datetime
data = preprocess.convert_to_datetime(data, 'date')

Get numerical & categorical features

In [29]:
# Ask the Preprocess helper which columns of `data` are numerical and which are
# categorical. `categorical_column` is iterated over and passed to
# DataFrame.drop in later cells, so it is presumably a list of column names —
# TODO confirm against the helper.
numerical_column = preprocess.get_numerical_columns(data)
categorical_column = preprocess.get_categorical_columns(data)

Remove the id column

In [30]:
# drop auction_id from categorical_column
categorical_column.remove('auction_id')

In [31]:
# Get column names have less than 10 more than 2 unique values
to_one_hot_encoding = [col for col in categorical_column if data[col].nunique() <= 10 and data[col].nunique() > 2]

# Get Categorical Column names thoose are not in "to_one_hot_encoding"
to_label_encoding = [col for col in categorical_column if not col in to_one_hot_encoding]

Encode categorical features

In [34]:
# Label encoding
label_encoded_columns = preprocess.label_encode(data, to_label_encoding)

Build the feature matrix and select relevant rows

In [36]:
# Copy our DataFrame to X variable
X = data.copy()

# Droping Categorical Columns,
# "inplace" means replace our data with new one
# Don't forget to "axis=1"
X.drop(categorical_column, axis=1, inplace=True)

# Merge DataFrames
X = pd.concat([X, label_encoded_columns], axis=1)

In [38]:
# Select only rows with responses
X = X.query('yes == 1 | no == 1')

In [39]:
# Drop auction_id column
X.drop(["auction_id"], axis=1, inplace=True)

In [40]:
# add 1 to yes and 0 to no
X['yes'] = X['yes'].replace(1, 2)