<a href="https://colab.research.google.com/github/victoriawhite17/Sales_Predictions/blob/main/Transforming_Sales_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Sales Predictions
- Victoria White
- 17 August 2022

In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn import set_config
set_config(display='diagram')
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
df = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vTm38yOK7rXwn_p6FXOnqmqh_Wbq_C1RrpevFVocGFoK7d7cCqW80-yQwpQMP0t-H7Cei94Wo0E8cEO/pub?gid=1919646656&single=true&output=csv')

In [3]:
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
ml_df = df.copy()

In [5]:
df.duplicated().sum()

0

In [6]:
df.isna().sum()
#there is some missing data. We will drop the item weight column since it's not necessary for this data set and we will address the missing outlet sizes after the split.

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

##Data Exploration

In [7]:
#checking unique values
df.nunique()

Item_Identifier              1559
Item_Weight                   415
Item_Fat_Content                5
Item_Visibility              7880
Item_Type                      16
Item_MRP                     5938
Outlet_Identifier              10
Outlet_Establishment_Year       9
Outlet_Size                     3
Outlet_Location_Type            3
Outlet_Type                     4
Item_Outlet_Sales            3493
dtype: int64

In [8]:
df['Item_Fat_Content'].unique()

array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

In [9]:
#correcting unique values
df['Item_Fat_Content'].replace({'low fat':'Low Fat', 'LF':'Low Fat','reg':'Regular'}, inplace=True)

In [10]:
df['Item_Fat_Content'].unique()

array(['Low Fat', 'Regular'], dtype=object)

In [11]:
X = df.drop(columns=['Item_Identifier', 'Item_Weight', 'Outlet_Identifier', 'Item_Outlet_Sales'])
y = df['Item_Outlet_Sales']

##Train Test Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
display(X_train.info())
#nominal features are item type, location type, outlet type, and item fat content
#numerical features are item visibility, item establishment year, and item mrp
#orindal features are outlet location type, and outlet size

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6392 entries, 4776 to 7270
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Fat_Content           6392 non-null   object 
 1   Item_Visibility            6392 non-null   float64
 2   Item_Type                  6392 non-null   object 
 3   Item_MRP                   6392 non-null   float64
 4   Outlet_Establishment_Year  6392 non-null   int64  
 5   Outlet_Size                4580 non-null   object 
 6   Outlet_Location_Type       6392 non-null   object 
 7   Outlet_Type                6392 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 449.4+ KB


None

##Data Cleaning

In [13]:
#instantiating column selectors
cat_selector = make_column_selector(dtype_include='object')
num_selector = make_column_selector(dtype_include='number')

In [14]:
#instantiating transformers
freq_imputer = SimpleImputer(strategy='most_frequent')
mean_imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [15]:
#instantiating pipeline for numerical features
num_pipe = make_pipeline(mean_imputer, scaler)
num_pipe

In [16]:
#instantiating pipeline for nominal features
cat_pipe = make_pipeline(freq_imputer, ohe)
cat_pipe

##Data Transformation

In [17]:
#instantiating column transformer
num_tuple = (num_pipe, num_selector)
cat_tuple = (cat_pipe, cat_selector)
preprocessor = make_column_transformer(num_tuple, cat_tuple, remainder='drop')
preprocessor

In [62]:
#Fitting the transformer
preprocessor.fit(X_train)

In [19]:
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [20]:
X_train_processed

array([[-0.71277507,  1.82810922,  1.32784893, ...,  0.        ,
         1.        ,  0.        ],
       [-1.29105225,  0.60336888,  1.32784893, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.81331864,  0.24454056,  0.13618724, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.92052713,  1.52302674,  0.49368575, ...,  1.        ,
         0.        ,  0.        ],
       [-0.2277552 , -0.38377708,  1.0895166 , ...,  1.        ,
         0.        ,  0.        ],
       [-0.95867683, -0.73836105, -0.10214509, ...,  1.        ,
         0.        ,  0.        ]])

In [21]:
X_test_processed

array([[-0.77664625, -0.99881554, -1.29380678, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.1003166 , -1.58519423, -0.10214509, ...,  1.        ,
         0.        ,  0.        ],
       [-0.48299432, -1.59578435,  0.13618724, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 1.21832428,  1.09397975,  0.49368575, ...,  1.        ,
         0.        ,  0.        ],
       [-0.77809567, -0.36679966,  0.13618724, ...,  1.        ,
         0.        ,  0.        ],
       [-0.77976293,  0.11221189,  1.0895166 , ...,  1.        ,
         0.        ,  0.        ]])

##Linear Regression

In [24]:
#importing and instantiating the model
from sklearn.linear_model import LinearRegression
reg = LinearRegression()

In [25]:
#creating a model pipeline
from sklearn.pipeline import make_pipeline
reg_pipe = make_pipeline(scaler, reg)

In [28]:
#Training the model on your training data
reg_pipe.fit(X_train_processed, y_train)

In [32]:
train_pred = reg_pipe.predict(X_train_processed)
test_pred = reg_pipe.predict(X_test_processed)

In [63]:
#creating function to easily check all error values
def eval_regression(y_true, y_pred):
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  rmse = np.sqrt(mse)
  r2 = r2_score(y_true, y_pred)

  print(f'MAE {mae},\n MSE {mse},\n RMSE: {rmse},\n R^2: {r2} ')

In [64]:
eval_regression(y_train, train_pred)

MAE 847.5155601831553,
 MSE 1298706.3133285826,
 RMSE: 1139.6079647530473,
 R^2: 0.5611671729561014 


In [65]:
eval_regression(y_test, test_pred)

MAE 804.2622015721729,
 MSE 1194483.0430061328,
 RMSE: 1092.924079250765,
 R^2: 0.567055862226496 


We have all our error values, but we're focusing on the r^2 and rmse values. The rmse value shows us that our model is making an average error of 1,139 on the training data and 1,092 on our testing data. Our r^2 shows both train and test data have ~56% variation accuracy.

##Regression Tree

In [39]:
from sklearn.tree import DecisionTreeRegressor

In [41]:
dec_tree = DecisionTreeRegressor(random_state = 42)

In [43]:
dec_tree.fit(X_train_processed, y_train)

In [68]:
dec_trains_preds = dec_tree.predict(X_train_processed)
dec_test_preds = dec_tree.predict(X_test_processed)

In [45]:
dec_train_score = dec_tree.score(X_train_processed, y_train)
dec_test_score = dec_tree.score(X_test_processed, y_test)
print(dec_train_score)
print(dec_test_score)

1.0
0.2139336336340717


The train score being 1 and being drastically different from the test score tells us that our model is overfitting. We'll need to make some adjustments

In [46]:
#finding what params we can adjust
dec_tree.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 42,
 'splitter': 'best'}

In [47]:
dec_tree.get_depth()

39

In [58]:
#creating a loop to easily find where depth has the highest accuracy
depths = list(range(2, 39))
scores = pd.DataFrame(index=depths, columns=['Test Score', 'Train Score'])
for depth in depths:
    dec_tree = DecisionTreeRegressor(max_depth=depth, random_state=42)
    dec_tree.fit(X_train_processed, y_train)
    dec_train_score = dec_tree.score(X_train_processed, y_train)
    dec_test_score = dec_tree.score(X_test_processed, y_test)
    scores.loc[depth, 'Train Score'] = dec_train_score
    scores.loc[depth, 'Test Score'] = dec_test_score

In [56]:
#finding exactly where our accuracy is highest
sorted_scores = scores.sort_values(by='Test Score', ascending=False)
sorted_scores.head()

Unnamed: 0,Test Score,Train Score
5,0.594747,0.603925
4,0.584005,0.582625
6,0.583674,0.614877
7,0.578045,0.625969
8,0.559352,0.641612


In [None]:
plt.show

In [59]:
#fitting the model on our data at depth 5
dec_tree_5 = DecisionTreeRegressor(max_depth = 5, random_state=42)
dec_tree_5.fit(X_train_processed, y_train)
train_5_score = dec_tree_5.score(X_train_processed, y_train)
test_5_score = dec_tree_5.score(X_test_processed, y_test)
print(train_5_score)
print(test_5_score)

0.6039254897160836
0.5947470502499344


In [69]:
eval_regression(y_train, dec_trains_preds)

MAE 0.0019790675844806073,
 MSE 0.012517800378598327,
 RMSE: 0.11188297626805575,
 R^2: 0.9999999957702356 


In [70]:
eval_regression(y_test, dec_test_preds)

MAE 1045.5214242609104,
 MSE 2257321.0437885625,
 RMSE: 1502.4383660531844,
 R^2: 0.18182688427163574 


Our RMSE score on our testing model for our linear regression and decision tree match closely. The training RMSE score is greatly different, as is the R^2 score which shows overfitting. Between these two models, the linear regression shows numbers that match closer and have an R^2 score of 56% which is greater than the testing model for decision tree at only 18% variance. 

After seeing the different results, linear regression shows more consistent variance and can account for 56% of the variance. The RMSE scores match showing both models have the same erorrs. For these reason, I would use the linear regression model to predict sales. 