# Import Libraries

In [1]:
import pickle
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler

# Import Data

In [2]:
DATA_RAW_PATH = '../data/raw/'

In [3]:
test_data_raw = pd.read_csv(DATA_RAW_PATH + 'Test_BigMart.csv')

In [4]:
test_data_raw.shape

(5681, 11)

In [5]:
test_data_raw.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


# Data transformation

We need the test data to have the same format as the train data, which was used as input to the model.

Therefore a future improvement would be to generate a Sklearn pipeline, which can be exported and reused to apply the test data transformations efficiently.

In [6]:
test_data = test_data_raw.copy()

**Removing uninteresting columns from the model**

In [7]:
test_data = test_data.drop(['Item_Weight','Outlet_Establishment_Year'], axis=1)

**'Outlet_Size' nulls imputation**

In [8]:
test_data.isnull().sum()/test_data.shape[0]*100

Item_Identifier          0.000000
Item_Fat_Content         0.000000
Item_Visibility          0.000000
Item_Type                0.000000
Item_MRP                 0.000000
Outlet_Identifier        0.000000
Outlet_Size             28.269671
Outlet_Location_Type     0.000000
Outlet_Type              0.000000
dtype: float64

In [9]:
imp_most_frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent') 
test_data["Outlet_Size"]  = imp_most_frequent.fit_transform(test_data["Outlet_Size"].to_numpy().reshape(-1,1))

**Categorical data transformation**

In [10]:
outlet_size_dict = { 
    'Small': 0,
    'Medium': 1,
    'High': 2,
}
test_data['Outlet_Size'].replace(outlet_size_dict, inplace = True)

In [11]:
label = ['Item_Identifier', 'Item_Type', 'Outlet_Identifier']
le = LabelEncoder()
for col in label:
    test_data[col]=le.fit_transform(test_data[col]) 

In [12]:
test_data['Item_Fat_Content'].replace({'Low Fat':0, 'Regular':1}, inplace=True)

In [13]:
transformer = make_column_transformer(
    (OneHotEncoder(sparse_output=False), ['Outlet_Location_Type', 'Outlet_Type']),
    remainder='passthrough')

transformed = transformer.fit_transform(test_data)
test_data = pd.DataFrame(transformed, columns=transformer.get_feature_names_out().tolist())

**Delete columns with high correlation with each other and columns with low correlation with the target**

In [14]:
test_data.drop(['remainder__Outlet_Identifier', # correlation 0.61 with "onehotencoder__Outlet_Location_Type_Tier 1" 
                       'onehotencoder__Outlet_Location_Type_Tier 3', # 0,05
                       'onehotencoder__Outlet_Type_Supermarket Type2', # -0,04
                       'remainder__Item_Identifier', # 0
                       'remainder__Item_Fat_Content', # 0.02
                       'remainder__Item_Type', # 0.02
                      ],axis=1,inplace=True)

In [15]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5681 entries, 0 to 5680
Data columns (total 8 columns):
 #   Column                                        Non-Null Count  Dtype 
---  ------                                        --------------  ----- 
 0   onehotencoder__Outlet_Location_Type_Tier 1    5681 non-null   object
 1   onehotencoder__Outlet_Location_Type_Tier 2    5681 non-null   object
 2   onehotencoder__Outlet_Type_Grocery Store      5681 non-null   object
 3   onehotencoder__Outlet_Type_Supermarket Type1  5681 non-null   object
 4   onehotencoder__Outlet_Type_Supermarket Type3  5681 non-null   object
 5   remainder__Item_Visibility                    5681 non-null   object
 6   remainder__Item_MRP                           5681 non-null   object
 7   remainder__Outlet_Size                        5681 non-null   object
dtypes: object(8)
memory usage: 355.2+ KB


In [16]:
test_data.head()

Unnamed: 0,onehotencoder__Outlet_Location_Type_Tier 1,onehotencoder__Outlet_Location_Type_Tier 2,onehotencoder__Outlet_Type_Grocery Store,onehotencoder__Outlet_Type_Supermarket Type1,onehotencoder__Outlet_Type_Supermarket Type3,remainder__Item_Visibility,remainder__Item_MRP,remainder__Outlet_Size
0,1.0,0.0,0.0,1.0,0.0,0.007565,107.8622,1
1,0.0,1.0,0.0,1.0,0.0,0.038428,87.3198,1
2,0.0,0.0,1.0,0.0,0.0,0.099575,241.7538,1
3,0.0,1.0,0.0,1.0,0.0,0.015388,155.034,1
4,0.0,0.0,0.0,0.0,1.0,0.118599,234.23,1


In [17]:
test_data[test_data.columns] = test_data[test_data.columns].astype(float)

In [18]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5681 entries, 0 to 5680
Data columns (total 8 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   onehotencoder__Outlet_Location_Type_Tier 1    5681 non-null   float64
 1   onehotencoder__Outlet_Location_Type_Tier 2    5681 non-null   float64
 2   onehotencoder__Outlet_Type_Grocery Store      5681 non-null   float64
 3   onehotencoder__Outlet_Type_Supermarket Type1  5681 non-null   float64
 4   onehotencoder__Outlet_Type_Supermarket Type3  5681 non-null   float64
 5   remainder__Item_Visibility                    5681 non-null   float64
 6   remainder__Item_MRP                           5681 non-null   float64
 7   remainder__Outlet_Size                        5681 non-null   float64
dtypes: float64(8)
memory usage: 355.2 KB


# Data Model Preparation

In [19]:
X_test=test_data

**Scale data**

In [20]:
scaler=StandardScaler()
X_test=scaler.fit_transform(X_test)

**Load model**

In [21]:
DATA_MODELS_PATH = '../data/models/'

In [22]:
pickled_model_linear_regression = pickle.load(open(DATA_MODELS_PATH+'linear_regression.pkl', 'rb'))

In [23]:
y_pred_test = pickled_model_linear_regression.predict(X_test)

In [24]:
len(y_pred_test)

5681

## Export Predictions

In [25]:
DATA_TEST_PREDICTION_PATH = '../data/test_predictions/'

In [26]:
Item_Outlet_Sales_pred = pd.DataFrame()
Item_Outlet_Sales_pred['Item_Outlet_Sales'] = y_pred_test

In [27]:
Item_Outlet_Sales_pred.shape

(5681, 1)

In [28]:
Item_Outlet_Sales_pred.head()

Unnamed: 0,Item_Outlet_Sales
0,1788.343256
1,1486.577042
2,1932.881693
3,2556.982157
4,5140.7106


In [29]:
Item_Outlet_Sales_pred.to_csv(DATA_TEST_PREDICTION_PATH + 'Item_Outlet_Sales_pred.csv', index=True)

## Backup code

Note: we also load the linear model, which was the second with the best metrics, because when I upload the polynomial pikle file, it requires a much larger number of arguments for the predict than the existing ones and fails.

In [30]:
pickled_model_polynomial = pickle.load(open(DATA_MODELS_PATH+'polynomial_model.pkl', 'rb'))

In [31]:
pickled_model_polynomial.predict(X_test)

ValueError: X has 8 features, but LinearRegression is expecting 45 features as input.

## Possible improvements

- Therefore a future improvement would be to generate a transformer Sklearn pipeline, which can be exported and reused to apply the test data transformations efficiently.
- Test other models or hyperparameter adjustment to find better metrics.
- Try to create new columns, maybe regrouping categories (with the Items Id prefix).
- We did not have data such as whether the stores are comparable (if they are still operating, for example). We also did not have the units in which the sales are expressed.
- - It would be interesting to have some data such as whether the stores are comparable (if they are still operating, for example). As well as the units in which the sales are expressed.
- It would also be interesting to have the information disaggregated by dates, in order to be able to make a prediction with some granularity over time into the future.
- Having demographic data on the areas of the stores could also be a useful factor for the model.
- If there were a product catalog, with a more detailed description of the products, it would be useful to analyze.
- The best model obtained, polynomial, fails to use its pickle to predict, I would like to analyze it to understand why. Maybe the lack of experience in its use made me unable to solve it.
- I found the challenge interesting to solve using each step of the Machine Learning process, the description was also very clear.