In [3]:
# Auto-reload imported modules before each cell executes, so edits to local scripts are picked up
%load_ext autoreload
%autoreload 2

In [4]:
#import python libraries
import os
import warnings
import sys
import datetime

import dvc.api
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet, LogisticRegression, LinearRegression
from sklearn.preprocessing import LabelEncoder
from fast_ml.model_development import train_valid_test_split
from sklearn.tree import DecisionTreeClassifier
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import pickle

In [5]:
# Make the project's local helper modules (kept in ../scripts) importable, then load them
sys.path.append(os.path.abspath('../scripts'))
from plot_data import PlotData
from preprocessing import Preprocess
from load_data import LoadData

In [6]:
# LoadData comes from the local load_data script; its read_csv method is used below to read the datasets
loader = LoadData()

In [7]:
# Preprocess comes from the local preprocessing script
# NOTE(review): `preprocess` is not referenced in the cells visible here — confirm it is still needed
preprocess = Preprocess()

In [8]:
# Where the DVC-tracked datasets live: file paths inside the repo, the repo URL,
# and the revisions passed as `rev` to dvc.api.get_url for the train / test files
train_path = 'data/train_cleaned.csv'
test_path = 'data/test_cleaned.csv'
repo = 'https://github.com/yonamg/Pharmaceutical-Sales-Prediction'
tra_ver = '48de72c'
tst_ver = '3e8c890'

In [9]:
# Resolve the storage URL of the train CSV at revision `tra_ver` via dvc.api.get_url
# (this only produces the URL; the CSV itself is read from it afterwards)
data_url = dvc.api.get_url(
    path=train_path,
    repo=repo,
    rev=tra_ver
)

In [10]:
# Resolve the storage URL of the test CSV at revision `tst_ver` via dvc.api.get_url
# (this only produces the URL; the CSV itself is read from it afterwards)
data_urlt = dvc.api.get_url(
    path=test_path,
    repo=repo,
    rev=tst_ver
)

In [11]:
# Read the train CSV, discard the leftover 'Unnamed: 0' column and index the rows by 'Date'
train_data = (
    loader.read_csv(data_url)
    .drop(columns=['Unnamed: 0'])
    .set_index('Date')
)

In [12]:
# Read the test CSV from the URL resolved by DVC
test_data = loader.read_csv(data_urlt)

In [13]:
# Discard the leftover 'Unnamed: 0' column and index the test rows by 'Date'
test_data = test_data.drop(columns=['Unnamed: 0']).set_index('Date')

In [14]:
# Preview the first rows to confirm the load and the 'Date' index
train_data.head()

Unnamed: 0_level_0,Store,DayOfWeek,Sales,Open,Promo,StateHoliday,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,...,Promo2SinceYear,Year,Month,Day,weekday,weekofyear,weekend,PromoInterval,Assortment,StoreType
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-07-31,1,4,5263.0,1,1,0,1,1270.0,9.0,2008.0,...,2012.0,2015,7,31,4,31,0,1,0,2
2015-07-31,2,4,6064.0,1,1,0,1,570.0,11.0,2007.0,...,2010.0,2015,7,31,4,31,0,1,0,0
2015-07-31,3,4,8314.0,1,1,0,1,14130.0,12.0,2006.0,...,2011.0,2015,7,31,4,31,0,1,0,0
2015-07-31,4,4,13995.0,1,1,0,1,620.0,9.0,2009.0,...,2012.0,2015,7,31,4,31,0,1,2,2
2015-07-31,5,4,4822.0,1,1,0,1,29910.0,4.0,2015.0,...,2012.0,2015,7,31,4,31,0,1,0,0


In [15]:
# Remove 'StateHoliday' from the training frame
# NOTE(review): test_data still carries 'StateHoliday' (and 'Id'), so its columns must be aligned before predicting
train_data = train_data.drop(columns=['StateHoliday'])

Train only on rows where the store was open and Sales is greater than zero

In [16]:
# Keep only rows for open stores that recorded positive sales
is_open = train_data['Open'] == 1
has_sales = train_data['Sales'] > 0.0
train_data = train_data[is_open & has_sales]

In [17]:
# Project logger (from the local `log` module), named "Prediction"; used below to report progress and failures
# NOTE(review): this import sits mid-notebook — consider moving it to the imports cell at the top
from log import get_logger
my_logger = get_logger("Prediction")

In [18]:
# List the training columns that remain after dropping 'StateHoliday'
train_data.columns

Index(['Store', 'DayOfWeek', 'Sales', 'Open', 'Promo', 'SchoolHoliday',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'Year', 'Month', 'Day', 'weekday', 'weekofyear',
       'weekend', 'PromoInterval', 'Assortment', 'StoreType'],
      dtype='object')

In [19]:
# List the test columns for comparison with the training columns
test_data.columns

Index(['Id', 'Store', 'DayOfWeek', 'Open', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'Year', 'Month',
       'Day', 'weekday', 'weekofyear', 'weekend'],
      dtype='object')

### Scaling the Data

In [20]:
from sklearn.preprocessing import MinMaxScaler

In [21]:
# Min-max scale every listed column of a copy of the training frame.
#
# MinMaxScaler scales each column independently, so one fit over all columns
# yields the same values as fitting column by column, while leaving `scaler`
# holding the parameters of every column (a per-column loop would leave it
# fitted on the last column only, making it useless for transforming new data).
#
# NOTE(review): 'Sales' (the target) is in the list, so the target is scaled too.
# NOTE(review): the scaler is fitted on the full frame before the train/test
#               split, so hold-out rows influence the scaling — confirm intended.
scaler = MinMaxScaler()

trainining_columns = ['Store', 'DayOfWeek', 'Sales', 'Open', 'Promo', 'SchoolHoliday',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'Year', 'Month', 'Day', 'weekday', 'weekofyear',
       'weekend', 'PromoInterval', 'Assortment', 'StoreType']

# Work on a copy so the unscaled train_data stays available
temp_train_data = train_data.copy()

try:
    # fit once on all training columns (`scale` is kept as a name for the fitted scaler)
    scale = scaler.fit(temp_train_data[trainining_columns])

    # transform all columns in one pass
    temp_train_data[trainining_columns] = scale.transform(temp_train_data[trainining_columns])

    my_logger.debug("Data is Normalized successfully.")

except Exception as e:
    # Log with traceback and stop: continuing would feed a partially scaled frame to the model
    my_logger.exception(f"Exception occured while Normalizing the dataset, {e}")
    raise

2022-09-08 20:35:01,452 — Prediction — DEBUG — Data is Normalized successfully.


In [22]:
temp_train_data.describe()

Unnamed: 0,Store,DayOfWeek,Sales,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,...,Promo2SinceYear,Year,Month,Day,weekday,weekofyear,weekend,PromoInterval,Assortment,StoreType
count,844338.0,844338.0,844338.0,844338.0,844338.0,844338.0,844338.0,844338.0,844338.0,844338.0,...,844338.0,844338.0,844338.0,844338.0,844338.0,844338.0,844338.0,844338.0,844338.0,844338.0
mean,0.500378,0.420058,0.486304,0.0,0.446356,0.193578,0.071599,0.588312,0.9488,0.49867,...,0.479549,0.415973,0.440525,0.494524,0.420058,0.444058,0.337782,0.489727,0.468247,0.401814
std,0.288807,0.287285,0.196722,0.0,0.497114,0.395102,0.102862,0.243206,0.043248,0.499999,...,0.196556,0.388636,0.302178,0.289446,0.287285,0.282156,0.472954,0.229762,0.49655,0.454458
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.250449,0.166667,0.3437,0.0,0.0,0.0,0.009098,0.454545,0.93913,0.0,...,0.5,0.0,0.181818,0.233333,0.166667,0.196078,0.0,0.5,0.0,0.0
50%,0.5,0.333333,0.45153,0.0,0.0,0.0,0.030459,0.636364,0.956522,0.0,...,0.5,0.5,0.454545,0.5,0.333333,0.431373,0.0,0.5,0.0,0.0
75%,0.750449,0.666667,0.593709,0.0,1.0,0.0,0.090454,0.727273,0.965217,1.0,...,0.5,0.5,0.636364,0.733333,0.666667,0.666667,1.0,0.5,1.0,1.0
max,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [23]:
# Separate the predictors from the 'Sales' target column
x_features = temp_train_data.drop(columns=['Sales'])
y_target = temp_train_data['Sales']

Train-test splitting the dataset

In [24]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so the split is reproducible
try:
    x_train, x_train_test, y_train, y_train_test = train_test_split(
        x_features, y_target, test_size=0.20, random_state=15
    )
    my_logger.info("Train Test split done successfully.")

except Exception as e:
    # Log with traceback and stop: without the split, every later cell would fail with a NameError
    my_logger.exception(f"Exception occured in separating dataset into x & y_training dataset, {e}")
    raise

2022-09-08 20:35:25,623 — Prediction — INFO — Train Test split done successfully.


In [25]:
# Sanity-check the size of the training split as (rows, feature columns)
x_train.shape

(675470, 20)

### Use Sklearn Pipeline

In [26]:
# Preprocessing stage: fill missing values with the column mean, then standardise
data_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Full model: preprocessing followed by a random forest regressor (fixed seed for reproducibility)
pipeline = Pipeline(steps=[
    ('preprocessor', data_transformer),
    ('regressor', RandomForestRegressor(max_depth=64, random_state=0)),
])

In [27]:
# Fit the preprocessing steps and the random forest on the training split.
# Pipeline.fit returns the pipeline itself, so rf_model and pipeline are the same fitted object.
rf_model = pipeline.fit(x_train, y_train)

In [29]:
# Evaluate on the held-out split; score() delegates to the final estimator (R^2 for scikit-learn regressors)
rf_model.score(x_train_test, y_train_test)

0.9181377178741617

In [30]:
# Persist the fitted pipeline as a timestamped pickle under ../model/
try:
    rf_model_path = '../model/' + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + '.pkl'

    # Ensure the target directory exists so the save does not fail on a fresh checkout
    os.makedirs(os.path.dirname(rf_model_path), exist_ok=True)

    # Context manager guarantees the file is flushed and closed even if pickling fails
    with open(rf_model_path, 'wb') as model_file:
        pickle.dump(rf_model, model_file)
    print("RandomForestRegressor model saved")

except Exception as e:
    my_logger.exception(f"Exception occured saving the model, {e}")

RandomForestRegressor model saved
