In [1]:
#Auto relode the notebooks
%load_ext autoreload
%autoreload 2

In [2]:
#import python libraries
import os
import warnings
import sys
import datetime

import dvc.api
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet, LogisticRegression, LinearRegression
from sklearn.preprocessing import LabelEncoder
from fast_ml.model_development import train_valid_test_split
from sklearn.tree import DecisionTreeClassifier
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import pickle

In [5]:
#import Scripts
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from plot_data import PlotData
from preprocessing import Preprocess
from load_data import LoadData

In [6]:
loader = LoadData()

In [33]:
preprocess = Preprocess()

In [7]:
#get the data
train_path = 'data/train_cleaned.csv'
test_path = 'data/test_cleaned.csv'
repo = 'https://github.com/yonamg/Pharmaceutical-Sales-Prediction'
tra_ver = '48de72c'
tst_ver = '3e8c890'

In [9]:
#Load train data from dvc using the dvc.api.Dataset class
data_url = dvc.api.get_url(
    path=train_path,
    repo=repo,
    rev=tra_ver
)

In [21]:
#Load test data from dvc using the dvc.api.Dataset class
data_urlt = dvc.api.get_url(
    path=test_path,
    repo=repo,
    rev=tst_ver
)

In [11]:
train_data = loader.read_csv(data_url)
train_data.drop(['Unnamed: 0'], axis=1, inplace=True)
train_data.set_index('Date', inplace=True)

In [22]:
test_data = loader.read_csv(data_urlt)

In [29]:
test_data.drop(['Unnamed: 0'], axis=1, inplace=True)
test_data.set_index('Date', inplace=True)

In [13]:
train_data.head()

Unnamed: 0_level_0,Store,DayOfWeek,Sales,Open,Promo,StateHoliday,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,...,Promo2SinceYear,Year,Month,Day,weekday,weekofyear,weekend,PromoInterval,Assortment,StoreType
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-07-31,1,4,5263.0,1,1,0,1,1270.0,9.0,2008.0,...,2012.0,2015,7,31,4,31,0,1,0,2
2015-07-31,2,4,6064.0,1,1,0,1,570.0,11.0,2007.0,...,2010.0,2015,7,31,4,31,0,1,0,0
2015-07-31,3,4,8314.0,1,1,0,1,14130.0,12.0,2006.0,...,2011.0,2015,7,31,4,31,0,1,0,0
2015-07-31,4,4,13995.0,1,1,0,1,620.0,9.0,2009.0,...,2012.0,2015,7,31,4,31,0,1,2,2
2015-07-31,5,4,4822.0,1,1,0,1,29910.0,4.0,2015.0,...,2012.0,2015,7,31,4,31,0,1,0,0


In [14]:
train_data.drop(['StateHoliday'], axis=1, inplace=True)

Train with only by the opened stores and Sales greater than zero

In [15]:
train_data = train_data[train_data['Open'] == 1]
train_data = train_data[train_data['Sales'] > 0.0]

In [17]:
from log import get_logger
my_logger = get_logger("Prediction")

In [27]:
train_data.columns

Index(['Store', 'DayOfWeek', 'Sales', 'Open', 'Promo', 'SchoolHoliday',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'Year', 'Month', 'Day', 'weekday', 'weekofyear',
       'weekend', 'PromoInterval', 'Assortment', 'StoreType'],
      dtype='object')

In [40]:
test_data.columns

Index(['Id', 'Store', 'DayOfWeek', 'Open', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'Year', 'Month', 'Day', 'weekday',
       'weekofyear', 'weekend'],
      dtype='object')

In [42]:
test_data.dtypes

Id                             int64
Store                          int64
DayOfWeek                      int64
Open                         float64
Promo                          int64
StateHoliday                  object
SchoolHoliday                  int64
StoreType                     object
Assortment                    object
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                         int64
Promo2SinceWeek              float64
Promo2SinceYear              float64
Year                           int64
Month                          int64
Day                            int64
weekday                        int64
weekofyear                     int64
weekend                        int64
dtype: object

In [39]:
test_data.head()

Unnamed: 0_level_0,Id,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,...,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,Year,Month,Day,weekday,weekofyear,weekend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-09-17,1,1,3,1.0,1,0,0,c,a,1270.0,...,2008.0,0,22.0,2012.0,2015,9,17,3,38,0
2015-09-17,2,3,3,1.0,1,0,0,a,a,14130.0,...,2006.0,1,14.0,2011.0,2015,9,17,3,38,0
2015-09-17,3,7,3,1.0,1,0,0,a,c,24000.0,...,2013.0,0,22.0,2012.0,2015,9,17,3,38,0
2015-09-17,4,8,3,1.0,1,0,0,a,a,7520.0,...,2014.0,0,22.0,2012.0,2015,9,17,3,38,0
2015-09-17,5,9,3,1.0,1,0,0,a,c,2030.0,...,2000.0,0,22.0,2012.0,2015,9,17,3,38,0


### Scaling the Data

In [19]:
from sklearn.preprocessing import MinMaxScaler

In [41]:
scaler = MinMaxScaler()

trainining_columns = ['Store', 'DayOfWeek', 'Sales', 'Open', 'Promo', 'SchoolHoliday',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'Year', 'Month', 'Day', 'weekday', 'weekofyear',
       'weekend', 'PromoInterval', 'Assortment', 'StoreType']

testing_columns = ['Id', 'Store', 'DayOfWeek', 'Open', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'Year', 'Month',
       'Day', 'weekday', 'weekofyear', 'weekend']

temp_train_data = train_data.copy()
temp_test_data = test_data.copy()   

try:
    for i in trainining_columns:
    
        # fit on training data column
        scale = scaler.fit(temp_train_data[[i]])
        
        # transform the training data column
        temp_train_data[i] = scale.transform(temp_train_data[[i]])
    
    for i in testing_columns:
    
        # fit on training data column
        scale = scaler.fit(temp_test_data[[i]])
        
        # transform the training data column
        temp_test_data[i] = scale.transform(temp_test_data[[i]]) 
    
    my_logger.debug("Data is Normalized successfully.")
    
except Exception as e:
    my_logger.debug(f"Exception occured while Normalizing the dataset, {e}")

2022-09-08 18:36:13,440 — Prediction — DEBUG — Exception occured while Normalizing the dataset, could not convert string to float: 'a'
