# Pre-Processing and Training Data

# Imports

In [179]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime


# Load Data

In [180]:
df_merged = pd.read_csv('/Users/swatisharma/Documents/GitHub/Capstone2_Demand_Forecast/df_merged_features.csv')
df_merged.head().T

Unnamed: 0,0,1,2,3,4
record_ID,1,2,3,4,5
week,2011-01-17,2011-01-17,2011-01-17,2011-01-17,2011-01-17
store_id,8091,8091,8091,8091,8091
sku_id,216418,216419,216425,216233,217390
total_price,99.0375,99.0375,133.95,133.95,141.075
base_price,111.8625,99.0375,133.95,133.95,141.075
is_featured_sku,0,0,0,0,0
is_display_sku,0,0,0,0,0
units_sold,20,28,19,44,52
year,2011,2011,2011,2011,2011


In [181]:
# drop the column "week"
df_merged = df_merged.drop('week', axis=1)


In [182]:
df_merged.shape

(150150, 13)

In [189]:
df_merged["month"].unique()

array([ 1,  7,  2,  3,  4, 11,  9,  5,  6,  8, 12, 10])

In [183]:
#Create dummy or indicator features for categorical variable "month"
df = pd.get_dummies(df_merged, columns = ["month"], prefix = "M")
df

Unnamed: 0,record_ID,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold,year,day,...,M_3,M_4,M_5,M_6,M_7,M_8,M_9,M_10,M_11,M_12
0,1,8091,216418,99.0375,111.8625,0,0,20,2011,17,...,False,False,False,False,False,False,False,False,False,False
1,2,8091,216419,99.0375,99.0375,0,0,28,2011,17,...,False,False,False,False,False,False,False,False,False,False
2,3,8091,216425,133.9500,133.9500,0,0,19,2011,17,...,False,False,False,False,False,False,False,False,False,False
3,4,8091,216233,133.9500,133.9500,0,0,44,2011,17,...,False,False,False,False,False,False,False,False,False,False
4,5,8091,217390,141.0750,141.0750,0,0,52,2011,17,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150145,212638,9984,223245,235.8375,235.8375,0,0,38,2013,7,...,False,False,False,False,False,False,True,False,False,False
150146,212639,9984,223153,235.8375,235.8375,0,0,30,2013,7,...,False,False,False,False,False,False,True,False,False,False
150147,212642,9984,245338,357.6750,483.7875,1,1,31,2013,7,...,False,False,False,False,False,False,True,False,False,False
150148,212643,9984,547934,141.7875,191.6625,0,1,12,2013,7,...,False,False,False,False,False,False,True,False,False,False


In [198]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
# creating initial dataframe
month_list = (1, 7, 2, 3, 4, 11, 9, 5, 6, 8, 12, 10)
month_df = pd.DataFrame(month_list, columns=['month'])
# creating instance of labelencoder
labelencoder = LabelEncoder()
# Assigning numerical values and storing in another column
month_df['month_encoded'] = labelencoder.fit_transform(month_df['month'])
month_df

Unnamed: 0,month,month_encoded
0,1,0
1,7,6
2,2,1
3,3,2
4,4,3
5,11,10
6,9,8
7,5,4
8,6,5
9,8,7


In [205]:
result = pd.concat([df_merged, month_df], axis=1)
result

Unnamed: 0,record_ID,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold,year,month,day,average_price,revenue,month.1,month_encoded
0,1,8091,216418,99.0375,111.8625,0,0,20,2011,1,17,219,4380,1.0,0.0
1,2,8091,216419,99.0375,99.0375,0,0,28,2011,1,17,219,6132,7.0,6.0
2,3,8091,216425,133.9500,133.9500,0,0,19,2011,1,17,219,4161,2.0,1.0
3,4,8091,216233,133.9500,133.9500,0,0,44,2011,1,17,219,9636,3.0,2.0
4,5,8091,217390,141.0750,141.0750,0,0,52,2011,1,17,219,11388,4.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150145,212638,9984,223245,235.8375,235.8375,0,0,38,2013,9,7,219,8322,,
150146,212639,9984,223153,235.8375,235.8375,0,0,30,2013,9,7,219,6570,,
150147,212642,9984,245338,357.6750,483.7875,1,1,31,2013,9,7,219,6789,,
150148,212643,9984,547934,141.7875,191.6625,0,1,12,2013,9,7,219,2628,,


In [212]:
result.isnull().sum(axis = 0)

record_ID               0
store_id                0
sku_id                  0
total_price             1
base_price              0
is_featured_sku         0
is_display_sku          0
units_sold              0
year                    0
month                   0
day                     0
average_price           0
revenue                 0
month              150138
month_encoded      150138
dtype: int64

In [286]:
result = result.dropna(subset = ["total_price"])
result

Unnamed: 0,record_ID,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold,year,month,day,average_price,revenue,month.1,month_encoded
0,1,8091,216418,99.0375,111.8625,0,0,20,2011,1,17,219,4380,1.0,0.0
1,2,8091,216419,99.0375,99.0375,0,0,28,2011,1,17,219,6132,7.0,6.0
2,3,8091,216425,133.9500,133.9500,0,0,19,2011,1,17,219,4161,2.0,1.0
3,4,8091,216233,133.9500,133.9500,0,0,44,2011,1,17,219,9636,3.0,2.0
4,5,8091,217390,141.0750,141.0750,0,0,52,2011,1,17,219,11388,4.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150145,212638,9984,223245,235.8375,235.8375,0,0,38,2013,9,7,219,8322,,
150146,212639,9984,223153,235.8375,235.8375,0,0,30,2013,9,7,219,6570,,
150147,212642,9984,245338,357.6750,483.7875,1,1,31,2013,9,7,219,6789,,
150148,212643,9984,547934,141.7875,191.6625,0,1,12,2013,9,7,219,2628,,


In [287]:
result.isnull().sum(axis = 0)

record_ID               0
store_id                0
sku_id                  0
total_price             0
base_price              0
is_featured_sku         0
is_display_sku          0
units_sold              0
year                    0
month                   0
day                     0
average_price           0
revenue                 0
month              150137
month_encoded      150137
dtype: int64

In [341]:
# dealing with missing values in two numeric columns month and month_encoded using suitable mean value, a measure of central tendency
result["month"] = result["month"].fillna(result["month"].mean())
result["month_encoded"] = result["month_encoded"].fillna(result["month_encoded"].mean())

In [342]:
# Chceking if there are still nan values
result.isnull().sum(axis = 0)

record_ID          0
store_id           0
sku_id             0
total_price        0
base_price         0
is_featured_sku    0
is_display_sku     0
units_sold         0
year               0
month              0
day                0
average_price      0
revenue            0
month              0
month_encoded      0
dtype: int64

# Train/Test Split

Partition sizes with a 70/30 train/test split

In [343]:
len(result) * .7, len(result) * .3

(105104.29999999999, 45044.7)

In [344]:
X_train, X_test, y_train, y_test = train_test_split(result.drop(columns='units_sold'), 
                                                    result.units_sold, test_size=0.3, 
                                                    random_state=47)

In [345]:
X_train.shape, X_test.shape

((105104, 14), (45045, 14))

In [346]:
y_train.shape, y_test.shape

((105104,), (45045,))

In [347]:
#Code task 1#
#Save the ''record_ID', 'total_price', 'revenue', 'month' columns from the train/test data into units_sold_train and units_sold_test
#Then drop those columns from `X_train` and `X_test`. Use 'inplace=True'
column_list = ['record_ID', 'total_price', 'revenue']
column_list_train = X_train[column_list]
column_list_test = X_test[column_list]
X_train.drop(columns=column_list, inplace=True)
X_test.drop(columns=column_list, inplace=True)
X_train.shape, X_test.shape

((105104, 11), (45045, 11))

In [348]:
#Code task 2#
#Check the `dtypes` attribute of `X_train` to verify all features are numeric
X_train.dtypes

store_id             int64
sku_id               int64
base_price         float64
is_featured_sku      int64
is_display_sku       int64
year                 int64
month                int64
day                  int64
average_price        int64
month              float64
month_encoded      float64
dtype: object

In [349]:
#Code task 3#
#Repeat this check for the test split in `X_test`
X_test.dtypes

store_id             int64
sku_id               int64
base_price         float64
is_featured_sku      int64
is_display_sku       int64
year                 int64
month                int64
day                  int64
average_price        int64
month              float64
month_encoded      float64
dtype: object

We have only numeric features in our X now!

# Initial Not-Even-A-Model

A good place to start is to see how good the mean is as a predictor. In other words, what if we simply say our best guess is the average price?

In [350]:
#Code task 4#
#Calculate the mean of `y_train`
train_mean = y_train.mean()
train_mean

51.70557733292738

sklearn's DummyRegressor easily does this:

In [351]:
#Code task 5#
#Fit the dummy regressor on the training data
#Hint, call its `.fit()` method with `X_train` and `y_train` as arguments
#Then print the object's `constant_` attribute and verify it's the same as the mean above
dumb_reg = DummyRegressor(strategy='mean')
dumb_reg.fit(X_train, y_train)
dumb_reg.constant_

array([[51.70557733]])

# Metrics

In [352]:
#Code task 6#
#Calculate the R^2 as defined above
def r_squared(y, ypred):
    """R-squared score.
    
    Calculate the R-squared, or coefficient of determination, of the input.
    
    Arguments:
    y -- the observed values
    ypred -- the predicted values
    """
    ybar = np.sum(y) / len(y) #yes, we could use np.mean(y)
    sum_sq_tot = np.sum((y - ybar)**2) #total sum of squares error
    sum_sq_res = np.sum((y - ypred)**2) #residual sum of squares error
    R2 = 1.0 - sum_sq_res / sum_sq_tot
    return R2

In [353]:
y_tr_pred_ = train_mean * np.ones(len(y_train))
y_tr_pred_[:5]

array([51.70557733, 51.70557733, 51.70557733, 51.70557733, 51.70557733])

In [354]:
y_tr_pred = dumb_reg.predict(X_train)
y_tr_pred[:5]

array([51.70557733, 51.70557733, 51.70557733, 51.70557733, 51.70557733])

We see that DummyRegressor produces exactly the same results and saves us having to mess about broadcasting the mean (or whichever other statistic we used. It also gives us an object with fit() and predict() methods as well so we can use them as conveniently as any other sklearn estimator.

In [355]:
r_squared(y_train, y_tr_pred)

0.0

Exactly as expected, if we use the average value as your prediction, we get an 
 of zero on our training set. 

Make our predictions by creating an array of length the size of the test set with the single value of the (training) mean.

In [356]:
y_te_pred = train_mean * np.ones(len(y_test))
r_squared(y_test, y_te_pred)

-3.0974410798467744e-06

# Mean Absolute Error

This is very simply the average of the absolute errors:

 
 


In [357]:
#Code task 7#
#Calculate the MAE as defined above
def mae(y, ypred):
    """Mean absolute error.
    
    Calculate the mean absolute error of the arguments

    Arguments:
    y -- the observed values
    ypred -- the predicted values
    """
    abs_error = np.abs(y - ypred)
    mae = np.mean(abs_error)
    return mae

In [358]:
mae(y_train, y_tr_pred)

35.373134508889166

In [359]:
mae(y_test, y_te_pred)

35.23044624963676

Mean absolute error is arguably the most intuitive of all the metrics.

# Mean Squared Error

Another common metric (and an important one internally for optimizing machine learning models) is the mean squared error. This is simply the average of the square of the errors.

In [360]:
#Code task 8#
#Calculate the MSE as defined above
def mse(y, ypred):
    """Mean square error.
    
    Calculate the mean square error of the arguments

    Arguments:
    y -- the observed values
    ypred -- the predicted values
    """
    sq_error = (y - ypred)**2
    mse = np.mean(sq_error)
    return mse


In [361]:
mse(y_train, y_tr_pred)

3697.8953806759732

In [362]:
mse(y_test, y_te_pred)

3454.830889700147

In [363]:
np.sqrt([mse(y_train, y_tr_pred), mse(y_test, y_te_pred)])

array([60.81032298, 58.7778095 ])

# sklearn metrics

Functions are good, but you don't want to have to define functions every time we want to assess performance. sklearn.metrics provides many commonly used metrics, included the ones above.

R-squared

In [364]:
r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)

(0.0, -3.0974410798467744e-06)

Mean absolute error

In [365]:
mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)

(35.373134508889166, 35.23044624963676)

Mean squared error

In [366]:
mean_squared_error(y_train, y_tr_pred), mean_squared_error(y_test, y_te_pred)

(3697.8953806759732, 3454.830889700147)

Explore what happens when we reverse the order of the arguments and compare behaviour of sklearn's function and yours.

In [367]:
# train set - sklearn
# correct order, incorrect order
r2_score(y_train, y_tr_pred), r2_score(y_tr_pred, y_train)

(0.0, -8.138262671692872e+30)

In [368]:
# test set - sklearn
# correct order, incorrect order
r2_score(y_test, y_te_pred), r2_score(y_te_pred, y_test)

(-3.0974410798467744e-06, -6.8429975797115925e+31)

In [369]:
# train set - using our homebrew function
# correct order, incorrect order
r_squared(y_train, y_tr_pred), r_squared(y_tr_pred, y_train)

(0.0, -8.138262671692872e+30)

In [370]:
# test set - using our homebrew function
# correct order, incorrect order
r_squared(y_test, y_te_pred), r_squared(y_te_pred, y_test)

(-3.0974410798467744e-06, -6.8429975797115925e+31)

# Initial Models

Impute missing values with median

In [371]:
# These are the values we'll use to fill in any missing values
X_defaults_median = X_train.median()
X_defaults_median

store_id             9371.000000
sku_id             222087.000000
base_price            205.912500
is_featured_sku         0.000000
is_display_sku          0.000000
year                 2012.000000
month                   6.000000
day                    16.000000
average_price         219.000000
month                   6.130783
month_encoded           5.500000
dtype: float64

Apply the imputation to both train and test splits

In [372]:
#Code task 9#
#Call `X_train` and `X_test`'s `fillna()` method, passing `X_defaults_median` as the values to use
#Assign the results to `X_tr` and `X_te`, respectively
X_tr = X_train.fillna(X_defaults_median)
X_te = X_test.fillna(X_defaults_median)

# Scale the data

As we have features measured in many different units, with numbers that vary by orders of magnitude, start off by scaling them to put them all on a consistent scale. The StandardScaler scales each feature to zero mean and unit variance.

In [373]:
#Code task 10#
#Call the StandardScaler`s fit method on `X_tr` to fit the scaler
#then use it's `transform()` method to apply the scaling to both the train and test split
#data (`X_tr` and `X_te`), naming the results `X_tr_scaled` and `X_te_scaled`, respectively
scaler = StandardScaler()
scaler.fit(X_tr)
X_tr_scaled = scaler.transform(X_tr)
X_te_scaled = scaler.transform(X_te)

Train the model on the train split

In [374]:
lm = LinearRegression().fit(X_tr_scaled, y_train)

Make predictions using the model on both train and test splits

In [375]:
#Code task 11#
#Call the `predict()` method of the model (`lm`) on both the (scaled) train and test data
#Assign the predictions to `y_tr_pred` and `y_te_pred`, respectively
y_tr_pred = lm.predict(X_tr_scaled)
y_te_pred = lm.predict(X_te_scaled)

Assess model performance

In [376]:
# r^2 - train, test
median_r2 = r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)
median_r2

(0.23914516018291965, 0.24776279027215353)

Recall that you estimated ticket price by simply using a known average. As expected, this produced an 
 of zero for both the training and test set, because 
 tells us how much of the variance you're explaining beyond that of using just the mean, and you were using just the mean. Here we see that our simple linear regression model explains over 80% of the variance on the train set and over 70% on the test set. Clearly you are onto something, although the much lower value for the test set suggests you're overfitting somewhat. This isn't a surprise as you've made no effort to select a parsimonious set of features or deal with multicollinearity in our data.

In [377]:
#Code task 12#
#Now calculate the mean absolute error scores using `sklearn`'s `mean_absolute_error` function
# as we did above for R^2
# MAE - train, test
median_mae = mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)
median_mae

(30.36836872177887, 30.180082907976857)

In [378]:
#Code task 13#
#And also do the same using `sklearn`'s `mean_squared_error`
# MSE - train, test
median_mse = mean_squared_error(y_train, y_tr_pred), mean_squared_error(y_test, y_te_pred)
median_mse

(2813.5615975245387, 2598.84429878252)

# Impute missing values with the mean

In [379]:
#Code task 14#
#As we did for the median above, calculate mean values for imputing missing values
# These are the values we'll use to fill in any missing values
X_defaults_mean = X_train.mean()
X_defaults_mean

store_id             9199.419128
sku_id             254910.859625
base_price            219.184582
is_featured_sku         0.095705
is_display_sku          0.132811
year                 2011.831662
month                   6.132583
day                    15.685607
average_price         219.000000
month                   6.130772
month_encoded           5.499957
dtype: float64

Apply the imputation to both train and test splits

In [380]:
X_tr = X_train.fillna(X_defaults_mean)
X_te = X_test.fillna(X_defaults_mean)

Scale the data

In [381]:
scaler = StandardScaler()
scaler.fit(X_tr)
X_tr_scaled = scaler.transform(X_tr)
X_te_scaled = scaler.transform(X_te)

Train the model on the train split

In [382]:
lm = LinearRegression().fit(X_tr_scaled, y_train)

Make predictions using the model on both train and test splits

In [383]:
y_tr_pred = lm.predict(X_tr_scaled)
y_te_pred = lm.predict(X_te_scaled)

Assess model performance

In [384]:
r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)

(0.23914516018291965, 0.24776279027215353)

In [385]:
mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)

(30.36836872177887, 30.180082907976857)

In [386]:
mean_squared_error(y_train, y_tr_pred), mean_squared_error(y_test, y_te_pred)

(2813.5615975245387, 2598.84429878252)

These results don't seem very different to when you used the median for imputing missing values. Perhaps it doesn't make much difference here. Maybe your overtraining dominates. Maybe other feature transformations, such as taking the log, would help. You could try with just a subset of features rather than using all of them as inputs.

To perform the median/mean comparison, you copied and pasted a lot of code just to change the function for imputing missing values. It would make more sense to write a function that performed the sequence of steps:

impute missing values
scale the features
train a model
calculate model performance
But these are common steps and sklearn provides something much better than writing custom functions.

# Pipelines

One of the most important and useful components of sklearn is the pipeline. In place of panda's fillna DataFrame method, there is sklearn's SimpleImputer. Remember the first linear model above performed the steps:

replace missing values with the median for each feature
scale the data to zero mean and unit variance
train a linear regression model
and all these steps were trained on the train split and then applied to the test split for assessment.

The pipeline below defines exactly those same steps. Crucially, the resultant Pipeline object has a fit() method and a predict() method, just like the LinearRegression() object itself. Just as you might create a linear regression model and train it with .fit() and predict with .predict(), you can wrap the entire process of imputing and feature scaling and regression in a single object you can train with .fit() and predict with .predict(). And that's basically a pipeline: a model on steroids.



Define the pipeline

In [387]:
pipe = make_pipeline(
    SimpleImputer(strategy='median'), 
    StandardScaler(), 
    LinearRegression()
)

In [388]:
type(pipe)

sklearn.pipeline.Pipeline

In [389]:
hasattr(pipe, 'fit'), hasattr(pipe, 'predict')

(True, True)

In next step we will perform modeling and build two to three different models including linear regression, randome forest etc., identify the best one and perform hyperparameter tuning. Also, access the data quality using various metrics. 