In [2]:
%matplotlib inline

import pandas as pd  
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV

from sklearn.tree import DecisionTreeRegressor

from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import Lasso

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

%matplotlib inline
from dmba import regressionSummary, adjusted_r2_score, plotDecisionTree

In [3]:
data_df = pd.read_csv("historical_data.csv")


In [4]:
X = data_df.drop(columns=['sales'])
y = data_df['sales']

print(X.columns)
print(len(X.columns))

Index(['Type', 'Immediate_vs_Future', 'Segment', 'Package_Type', 'Mega_Flavor',
       'Seasonality', 'q2_2018', 'q3_2018', 'q4_2018', 'q1_2019'],
      dtype='object')
10


In [5]:
X = pd.get_dummies(X, drop_first = True)

print(X.columns)
len(X.columns)

Index(['q2_2018', 'q3_2018', 'q4_2018', 'q1_2019', 'Type_AO CHEWY',
       'Type_ASSORTMENT MIXES', 'Type_CANDY CANE', 'Type_CANDY CORN',
       'Type_CONVERSATION HEARTS', 'Type_FRUITY CHEWY', 'Type_GUMMY',
       'Type_HARD CANDY', 'Type_JELLY BEANS', 'Type_LICORICE', 'Type_LOLLIPOP',
       'Type_NOVELTY', 'Type_NUT', 'Immediate_vs_Future_IMMEDIATE',
       'Segment_CLASSIC FAVORITES', 'Segment_ETHNIC', 'Segment_GUMMY',
       'Segment_KIDS NOVELTY', 'Segment_LICORICE', 'Segment_MAINLINE',
       'Segment_NOT APPLICABLE', 'Segment_NUT', 'Segment_SUGAR FREE CANDY',
       'Package_Type_BOX', 'Package_Type_CHANGEMAKER',
       'Package_Type_LAYDOWN BAG', 'Package_Type_PEG BAG',
       'Package_Type_SHARE SIZE', 'Package_Type_SINGLE SIZE',
       'Package_Type_STANDUP BAG', 'Package_Type_THEATER BOX',
       'Package_Type_TUB', 'Mega_Flavor_ASSORTED', 'Mega_Flavor_CARAMEL',
       'Mega_Flavor_FRUIT', 'Mega_Flavor_NUT', 'Mega_Flavor_SOUR',
       'Seasonality_SEASONAL'],
      dtype='o

42

In [6]:
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.5, random_state=1)


In [7]:
scaler = StandardScaler()
train_X=scaler.fit_transform(train_X)
valid_X=scaler.transform(valid_X)

In [8]:
data_lm = LinearRegression()

data_lm.fit(train_X, train_y)

LinearRegression()

In [9]:
print('intercept ', data_lm.intercept_)
print(pd.DataFrame({'Predictor': X.columns, 'coefficient': data_lm.coef_}))

intercept  73984.69301464407
                        Predictor   coefficient
0                         q2_2018  1.943479e+04
1                         q3_2018  2.229749e+04
2                         q4_2018  5.784488e+04
3                         q1_2019  7.930519e+04
4                   Type_AO CHEWY  4.644499e+03
5           Type_ASSORTMENT MIXES -1.130840e+02
6                 Type_CANDY CANE  4.258224e+03
7                 Type_CANDY CORN  2.122071e+03
8        Type_CONVERSATION HEARTS -8.558365e+02
9               Type_FRUITY CHEWY  6.579439e+03
10                     Type_GUMMY  9.142013e+03
11                Type_HARD CANDY  3.861653e+03
12               Type_JELLY BEANS  2.645430e+03
13                  Type_LICORICE  3.515097e+03
14                  Type_LOLLIPOP  4.137588e+03
15                   Type_NOVELTY  8.722288e+03
16                       Type_NUT -2.804348e+02
17  Immediate_vs_Future_IMMEDIATE -1.608447e+16
18      Segment_CLASSIC FAVORITES -3.880819e+03
19         

In [12]:
#First predictive model performance
data_lm_pred = data_lm.predict(valid_X)

regressionSummary(valid_y, data_lm_pred)


Regression statistics

                      Mean Error (ME) : -909.2199
       Root Mean Squared Error (RMSE) : 105983.1605
            Mean Absolute Error (MAE) : 34865.8249
          Mean Percentage Error (MPE) : -1370.1962
Mean Absolute Percentage Error (MAPE) : 2285.9623


In [11]:
#Second predictive linear model
lambda_val = 35000

data_lasso = Lasso(alpha = lambda_val)

data_lasso.fit(train_X, train_y)

data_lasso_predict = data_lasso.predict(valid_X)

In [19]:
# Set lambdas
lambda_val = 35000

# Fit lasso regression for each lambda, save coefficients
coefs = []
for val in lambdas:
    data_lasso = Lasso(alpha = val)
    data_lasso.fit(train_X, train_y)
    coefs.append(data_lasso.coef_)
    
    
fig, ax = plt.subplots(figsize=(30, 20))

ax.plot(lambdas, coefs, marker="o")
ax.set_xscale('log')
plt.xlabel('lambda')
plt.ylabel('Coefficients (beta hats)')
plt.title('Lasso coefficients as a function of choice of lambda')
plt.axis('tight')

NameError: name 'lambdas' is not defined

In [20]:
regressionSummary(valid_y, data_lasso_predict)



Regression statistics

                      Mean Error (ME) : 427.6519
       Root Mean Squared Error (RMSE) : 112660.8668
            Mean Absolute Error (MAE) : 45118.2497
          Mean Percentage Error (MPE) : -4290.8230
Mean Absolute Percentage Error (MAPE) : 4302.9991


In [21]:
#Tree based model
X = data_df.drop(columns=['sales'])
y = data_df['sales']

print(X.columns)
print(len(X.columns))

Index(['Type', 'Immediate_vs_Future', 'Segment', 'Package_Type', 'Mega_Flavor',
       'Seasonality', 'q2_2018', 'q3_2018', 'q4_2018', 'q1_2019'],
      dtype='object')
10


In [22]:
X = pd.get_dummies(X, drop_first = False)

print(X.columns)
len(X.columns)

Index(['q2_2018', 'q3_2018', 'q4_2018', 'q1_2019', 'Type_ALL OTHER',
       'Type_AO CHEWY', 'Type_ASSORTMENT MIXES', 'Type_CANDY CANE',
       'Type_CANDY CORN', 'Type_CONVERSATION HEARTS', 'Type_FRUITY CHEWY',
       'Type_GUMMY', 'Type_HARD CANDY', 'Type_JELLY BEANS', 'Type_LICORICE',
       'Type_LOLLIPOP', 'Type_NOVELTY', 'Type_NUT',
       'Immediate_vs_Future_FUTURE', 'Immediate_vs_Future_IMMEDIATE',
       'Segment_ASSORTMENT MIXES', 'Segment_CLASSIC FAVORITES',
       'Segment_ETHNIC', 'Segment_GUMMY', 'Segment_KIDS NOVELTY',
       'Segment_LICORICE', 'Segment_MAINLINE', 'Segment_NOT APPLICABLE',
       'Segment_NUT', 'Segment_SUGAR FREE CANDY', 'Package_Type_ALL OTHER',
       'Package_Type_BOX', 'Package_Type_CHANGEMAKER',
       'Package_Type_LAYDOWN BAG', 'Package_Type_PEG BAG',
       'Package_Type_SHARE SIZE', 'Package_Type_SINGLE SIZE',
       'Package_Type_STANDUP BAG', 'Package_Type_THEATER BOX',
       'Package_Type_TUB', 'Mega_Flavor_ALL OTHER', 'Mega_Flavor_ASSORT

48

In [23]:
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size =.5,random_state =1 )


In [None]:
param_grid = {
    'n_estimators': [210], 
    'max_depth' : [5],
    'criterion' : ['mae'],
    'oob_score': [True],
    'min_impurity_decrease': [0.001], 
    'min_samples_split': [5], 
    'min_samples_leaf':[2],
    'random_state':[1],
}

In [None]:
gridSearch = GridSearchCV(RandomForestRegressor(), param_grid, cv=2, n_jobs=12)

gridSearch.fit(train_X, train_y)

In [None]:
print('Initial parameters: ', gridSearch.best_params_)

rfTree = gridSearch.best_estimator_

rfTree.oob_score_

In [None]:
smallClassTree = DecisionTreeRegressor(random_state=1,max_depth=5, min_samples_split=5, min_impurity_decrease=0.001)

smallClassTree.fit(train_X, train_y)

rf = RandomForestRegressor(n_estimators= 210, random_state=1)
rf.fit(valid_X, valid_y)

plotDecisionTree(smallClassTree, feature_names=train_X.columns)