## Preprocessing

In [1]:
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt

# Figure-export settings. Neither name is referenced again in the visible
# part of this notebook -- NOTE(review): confirm they are still needed.
save_figures = False
figure_path = "../figures/preprocessing/"
# Load the raw observations; every later cell adds to or rewrites columns of this frame.
df_init = pd.read_parquet("../data/init.parquet")

### Data Cleaning

In [2]:
# Show the frame exactly as loaded, before any cleaning.
df_init

Unnamed: 0,y,Year,Country,Supermarket,Time,Weather,ObsSize,FemaleN,MaleN,Gender,Age,Child
0,1,2016,Wales,Tesco Metro,Morning (10.30-11.30),Sunny,1.0,0.0,1.0,Male,36 to 55,
1,1,2016,Wales,Tesco Metro,Morning (10.30-11.30),Sunny,1.0,0.0,1.0,Male,36 to 55,
2,1,2016,Wales,Tesco Metro,Morning (10.30-11.30),Sunny,1.0,1.0,0.0,Female,18-35,
3,1,2016,Wales,Tesco Metro,Morning (10.30-11.30),Sunny,2.0,2.0,0.0,Group: Gender not applicable,Group: Age not applicable,
4,1,2016,Wales,Tesco Metro,Morning (10.30-11.30),Sunny,1.0,1.0,0.0,Female,56+,
...,...,...,...,...,...,...,...,...,...,...,...,...
3759,1,2015,England,Tesco Extra,Saturday Morning (11-12),Sunny,1.0,1.0,0.0,Female,56+,
3760,1,2015,England,Tesco Extra,Saturday Morning (11-12),Sunny,1.0,1.0,0.0,Female,18-35,
3761,1,2015,England,Tesco Extra,Saturday Morning (11-12),Sunny,1.0,1.0,0.0,Female,56+,
3762,0,2015,England,Tesco Extra,Saturday Morning (11-12),Sunny,1.0,1.0,0.0,Female,56+,


In [3]:
# Prefix each year label with "Y" so the category reads as a name, not a number.
year_rename = {year: f"Y{year}" for year in ("2015", "2016")}

year_col = df_init["Year"]
df_init["Year"] = year_col.cat.rename_categories(year_rename)

In [4]:
# Collapse the verbose observation-slot labels to short identifiers.
time_rename = dict([
    ("Evening (4.30-5.30)", "Evening"),
    ("Morning (10.30-11.30)", "Morning"),
    ("Saturday Morning (11-12)", "SatMorning"),
    ("Saturday evening (1-2)", "SatEvening"),
])

time_col = df_init["Time"]
df_init["Time"] = time_col.cat.rename_categories(time_rename)

In [5]:
# Show the frame after the Year and Time categories have been renamed.
df_init

Unnamed: 0,y,Year,Country,Supermarket,Time,Weather,ObsSize,FemaleN,MaleN,Gender,Age,Child
0,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1.0,0.0,1.0,Male,36 to 55,
1,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1.0,0.0,1.0,Male,36 to 55,
2,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1.0,1.0,0.0,Female,18-35,
3,1,Y2016,Wales,Tesco Metro,Morning,Sunny,2.0,2.0,0.0,Group: Gender not applicable,Group: Age not applicable,
4,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1.0,1.0,0.0,Female,56+,
...,...,...,...,...,...,...,...,...,...,...,...,...
3759,1,Y2015,England,Tesco Extra,SatMorning,Sunny,1.0,1.0,0.0,Female,56+,
3760,1,Y2015,England,Tesco Extra,SatMorning,Sunny,1.0,1.0,0.0,Female,18-35,
3761,1,Y2015,England,Tesco Extra,SatMorning,Sunny,1.0,1.0,0.0,Female,56+,
3762,0,Y2015,England,Tesco Extra,SatMorning,Sunny,1.0,1.0,0.0,Female,56+,


In [6]:
# The three head-count columns are loaded as floats; store them as integers.
for count_col in ("ObsSize", "FemaleN", "MaleN"):
    df_init[count_col] = df_init[count_col].astype("int")

In [7]:
# Turn the "not applicable" placeholder labels into real missing values.
# The result is assigned back to the column instead of using
# `df_init[col].replace(..., inplace=True)`: that form mutates a column
# selection (chained assignment), which warns in recent pandas and does not
# update `df_init` at all once Copy-on-Write is active.
df_init["Gender"] = df_init["Gender"].replace({"Group: Gender not applicable": np.nan})
df_init["Age"] = df_init["Age"].replace({"Group: Age not applicable": np.nan})

# Replace the age-band labels with identifier-friendly group names.
age_rename = {"18-35": "Age_g1",
              "36 to 55": "Age_g2",
              "56+": "Age_g3"}

df_init["Age"] = df_init["Age"].cat.rename_categories(age_rename)

In [8]:
# Show the frame after the integer casts and the Gender/Age clean-up.
df_init

Unnamed: 0,y,Year,Country,Supermarket,Time,Weather,ObsSize,FemaleN,MaleN,Gender,Age,Child
0,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,0,1,Male,Age_g2,
1,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,0,1,Male,Age_g2,
2,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,1,0,Female,Age_g1,
3,1,Y2016,Wales,Tesco Metro,Morning,Sunny,2,2,0,,,
4,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,1,0,Female,Age_g3,
...,...,...,...,...,...,...,...,...,...,...,...,...
3759,1,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g3,
3760,1,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g1,
3761,1,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g3,
3762,0,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g3,


In [9]:
# Shorten the "With child" label, then turn the literal string "None" into a
# real missing value. The two replacements stay separate steps, in the
# original order. The result is assigned back rather than using
# `inplace=True` on a column selection, which is chained assignment and does
# not update `df_init` under pandas Copy-on-Write.
child_col = df_init["Child"].replace({"With child": "Child"})
df_init["Child"] = child_col.replace({"None": np.nan})

In [10]:
# Show the frame after the Child column has been cleaned.
df_init

Unnamed: 0,y,Year,Country,Supermarket,Time,Weather,ObsSize,FemaleN,MaleN,Gender,Age,Child
0,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,0,1,Male,Age_g2,
1,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,0,1,Male,Age_g2,
2,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,1,0,Female,Age_g1,
3,1,Y2016,Wales,Tesco Metro,Morning,Sunny,2,2,0,,,
4,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,1,0,Female,Age_g3,
...,...,...,...,...,...,...,...,...,...,...,...,...
3759,1,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g3,
3760,1,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g1,
3761,1,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g3,
3762,0,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g3,


### Interaction Terms

In [11]:
# Year x Country interaction: concatenate the two labels and store as a category.
df_init["YearCountry"] = (
    df_init["Year"].astype("str") + df_init["Country"].astype("str")
).astype("category")

In [12]:
# Three-way interactions: append Time, then Supermarket, to the YearCountry label.
# Iteration order fixes the column order (YearCountryTime, YearCountrySupermarket).
for suffix in ("Time", "Supermarket"):
    combined = df_init["YearCountry"].astype("str") + df_init[suffix].astype("str")
    df_init["YearCountry" + suffix] = combined.astype("category")
df_init

Unnamed: 0,y,Year,Country,Supermarket,Time,Weather,ObsSize,FemaleN,MaleN,Gender,Age,Child,YearCountry,YearCountryTime,YearCountrySupermarket
0,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,0,1,Male,Age_g2,,Y2016Wales,Y2016WalesMorning,Y2016WalesTesco Metro
1,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,0,1,Male,Age_g2,,Y2016Wales,Y2016WalesMorning,Y2016WalesTesco Metro
2,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,1,0,Female,Age_g1,,Y2016Wales,Y2016WalesMorning,Y2016WalesTesco Metro
3,1,Y2016,Wales,Tesco Metro,Morning,Sunny,2,2,0,,,,Y2016Wales,Y2016WalesMorning,Y2016WalesTesco Metro
4,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,1,0,Female,Age_g3,,Y2016Wales,Y2016WalesMorning,Y2016WalesTesco Metro
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3759,1,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g3,,Y2015England,Y2015EnglandSatMorning,Y2015EnglandTesco Extra
3760,1,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g1,,Y2015England,Y2015EnglandSatMorning,Y2015EnglandTesco Extra
3761,1,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g3,,Y2015England,Y2015EnglandSatMorning,Y2015EnglandTesco Extra
3762,0,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g3,,Y2015England,Y2015EnglandSatMorning,Y2015EnglandTesco Extra


In [13]:
# YearCountry x Gender interaction, left missing wherever Gender is missing.
# The mask is applied BEFORE the category conversion: `astype("str")` can turn
# a missing Gender into the text "nan", and converting first would leave
# phantom categories such as "Y2016Walesnan" in the dtype (and in the saved
# parquet) even after those rows are blanked out.
gender_known = df_init["Gender"].notna()
year_country_gender = df_init["YearCountry"].astype("str") + df_init["Gender"].astype("str")
df_init["YearCountryGender"] = year_country_gender.where(gender_known).astype("category")
df_init

Unnamed: 0,y,Year,Country,Supermarket,Time,Weather,ObsSize,FemaleN,MaleN,Gender,Age,Child,YearCountry,YearCountryTime,YearCountrySupermarket,YearCountryGender
0,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,0,1,Male,Age_g2,,Y2016Wales,Y2016WalesMorning,Y2016WalesTesco Metro,Y2016WalesMale
1,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,0,1,Male,Age_g2,,Y2016Wales,Y2016WalesMorning,Y2016WalesTesco Metro,Y2016WalesMale
2,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,1,0,Female,Age_g1,,Y2016Wales,Y2016WalesMorning,Y2016WalesTesco Metro,Y2016WalesFemale
3,1,Y2016,Wales,Tesco Metro,Morning,Sunny,2,2,0,,,,Y2016Wales,Y2016WalesMorning,Y2016WalesTesco Metro,
4,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,1,0,Female,Age_g3,,Y2016Wales,Y2016WalesMorning,Y2016WalesTesco Metro,Y2016WalesFemale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3759,1,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g3,,Y2015England,Y2015EnglandSatMorning,Y2015EnglandTesco Extra,Y2015EnglandFemale
3760,1,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g1,,Y2015England,Y2015EnglandSatMorning,Y2015EnglandTesco Extra,Y2015EnglandFemale
3761,1,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g3,,Y2015England,Y2015EnglandSatMorning,Y2015EnglandTesco Extra,Y2015EnglandFemale
3762,0,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g3,,Y2015England,Y2015EnglandSatMorning,Y2015EnglandTesco Extra,Y2015EnglandFemale


In [14]:
# YearCountry x Age interaction, left missing wherever Age is missing.
# The mask is applied BEFORE the category conversion: `astype("str")` can turn
# a missing Age into the text "nan", and converting first would leave phantom
# categories such as "Y2016Walesnan" in the dtype (and in the saved parquet)
# even after those rows are blanked out.
age_known = df_init["Age"].notna()
year_country_age = df_init["YearCountry"].astype("str") + df_init["Age"].astype("str")
df_init["YearCountryAge"] = year_country_age.where(age_known).astype("category")
df_init

Unnamed: 0,y,Year,Country,Supermarket,Time,Weather,ObsSize,FemaleN,MaleN,Gender,Age,Child,YearCountry,YearCountryTime,YearCountrySupermarket,YearCountryGender,YearCountryAge
0,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,0,1,Male,Age_g2,,Y2016Wales,Y2016WalesMorning,Y2016WalesTesco Metro,Y2016WalesMale,Y2016WalesAge_g2
1,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,0,1,Male,Age_g2,,Y2016Wales,Y2016WalesMorning,Y2016WalesTesco Metro,Y2016WalesMale,Y2016WalesAge_g2
2,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,1,0,Female,Age_g1,,Y2016Wales,Y2016WalesMorning,Y2016WalesTesco Metro,Y2016WalesFemale,Y2016WalesAge_g1
3,1,Y2016,Wales,Tesco Metro,Morning,Sunny,2,2,0,,,,Y2016Wales,Y2016WalesMorning,Y2016WalesTesco Metro,,
4,1,Y2016,Wales,Tesco Metro,Morning,Sunny,1,1,0,Female,Age_g3,,Y2016Wales,Y2016WalesMorning,Y2016WalesTesco Metro,Y2016WalesFemale,Y2016WalesAge_g3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3759,1,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g3,,Y2015England,Y2015EnglandSatMorning,Y2015EnglandTesco Extra,Y2015EnglandFemale,Y2015EnglandAge_g3
3760,1,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g1,,Y2015England,Y2015EnglandSatMorning,Y2015EnglandTesco Extra,Y2015EnglandFemale,Y2015EnglandAge_g1
3761,1,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g3,,Y2015England,Y2015EnglandSatMorning,Y2015EnglandTesco Extra,Y2015EnglandFemale,Y2015EnglandAge_g3
3762,0,Y2015,England,Tesco Extra,SatMorning,Sunny,1,1,0,Female,Age_g3,,Y2015England,Y2015EnglandSatMorning,Y2015EnglandTesco Extra,Y2015EnglandFemale,Y2015EnglandAge_g3


### Data Preprocessing 

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

In [16]:
# Persist the cleaned frame (including the interaction columns) before any
# splitting or encoding is applied.
df_init.to_parquet("../data/before_split.parquet")

#### Test, Validation, Training 

In [17]:
random_state = 132

# Target vector and predictor frame (every column except "y").
y = df_init["y"].to_numpy()
X = df_init.drop(columns=["y"])

# Stage 1: hold back 20% of the rows, stratified on the target.
X_train, X_other, y_train, y_other = train_test_split(
    X, y,
    train_size=0.8,
    random_state=random_state,
    stratify=y,
)

# Stage 2: cut the held-back rows in half to get validation and test sets,
# again stratified on the target.
X_val, X_test, y_val, y_test = train_test_split(
    X_other, y_other,
    train_size=0.5,
    random_state=random_state,
    stratify=y_other,
)

In [18]:
# Columns that get one-hot encoded. Order matters: it determines the order of
# the encoded feature columns.
onehot_ftrs = [
    # raw categorical columns
    'Year',
    'Country',
    'Supermarket',
    'Time',
    'Weather',
    'Gender',
    'Age',
    'Child',
    # interaction columns built from Year and Country
    'YearCountry',
    'YearCountryTime',
    'YearCountrySupermarket',
    'YearCountryGender',
    'YearCountryAge',
]

# Head-count columns that get standardized.
std_ftrs = [
    'ObsSize',
    'FemaleN',
    'MaleN',
]

In [19]:
# Build a dense-output one-hot encoder. scikit-learn renamed the `sparse`
# keyword to `sparse_output` and later removed the old spelling, so try the
# new keyword first and fall back for older installations (an unknown keyword
# raises TypeError at construction time).
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# One-hot encode the categorical columns and standardize the count columns;
# columns listed in neither group are not passed to either transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', onehot_encoder, onehot_ftrs),
        ('std', StandardScaler(), std_ftrs)])

clf = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit on the training split only; validation and test are transformed with
# the categories and scaling statistics learned from training.
X_train_prep = clf.fit_transform(X_train)
X_val_prep = clf.transform(X_val)
X_test_prep = clf.transform(X_test)

In [20]:
# Row/column counts of the encoded train, validation and test matrices.
for split_matrix in (X_train_prep, X_val_prep, X_test_prep):
    print(split_matrix.shape)

(3011, 85)
(376, 85)
(377, 85)


In [21]:
# Column names of the encoded matrix: one-hot names first, then the
# standardized columns, matching the transformer order in `preprocessor`.
# `get_feature_names` was removed from scikit-learn in favour of
# `get_feature_names_out`; prefer the new method and fall back on old versions.
onehot_encoder = preprocessor.named_transformers_["onehot"]
if hasattr(onehot_encoder, "get_feature_names_out"):
    onehot_names = onehot_encoder.get_feature_names_out(onehot_ftrs)
else:
    onehot_names = onehot_encoder.get_feature_names(onehot_ftrs)

# transformers_[1] is the ('std', scaler, columns) entry; its last item is the
# list of column names handed to the scaler.
feature_names = list(onehot_names) + list(preprocessor.transformers_[1][-1])

In [22]:
# Print every feature name on a single line, each followed by ", ".
print("".join(f"{name}, " for name in feature_names), end="")

Year_Y2015, Year_Y2016, Country_England, Country_Wales, Supermarket_Asda, Supermarket_Tesco Extra, Supermarket_Tesco Metro, Supermarket_Waitrose, Time_Evening, Time_Morning, Time_SatEvening, Time_SatMorning, Weather_Cloudy, Weather_Rainy, Weather_Sunny, Gender_Female, Gender_Male, Gender_nan, Age_Age_g1, Age_Age_g2, Age_Age_g3, Age_nan, Child_Child, Child_nan, YearCountry_Y2015England, YearCountry_Y2015Wales, YearCountry_Y2016England, YearCountry_Y2016Wales, YearCountryTime_Y2015EnglandEvening, YearCountryTime_Y2015EnglandMorning, YearCountryTime_Y2015EnglandSatEvening, YearCountryTime_Y2015EnglandSatMorning, YearCountryTime_Y2015WalesEvening, YearCountryTime_Y2015WalesMorning, YearCountryTime_Y2015WalesSatEvening, YearCountryTime_Y2015WalesSatMorning, YearCountryTime_Y2016EnglandEvening, YearCountryTime_Y2016EnglandMorning, YearCountryTime_Y2016EnglandSatEvening, YearCountryTime_Y2016EnglandSatMorning, YearCountryTime_Y2016WalesEvening, YearCountryTime_Y2016WalesMorning, YearCountryTi

In [23]:
# Wrap each encoded split in a labelled frame and append the target as the last column.
df_train = pd.DataFrame(X_train_prep, columns=feature_names).assign(y=y_train)
df_val = pd.DataFrame(X_val_prep, columns=feature_names).assign(y=y_val)
df_test = pd.DataFrame(X_test_prep, columns=feature_names).assign(y=y_test)

In [24]:
# Write the three encoded splits to disk, in train / val / test order.
for split_frame, split_path in (
    (df_train, "../data/prep_train.parquet"),
    (df_val, "../data/prep_val.parquet"),
    (df_test, "../data/prep_test.parquet"),
):
    split_frame.to_parquet(split_path)

#### Test, Training 

In [25]:
random_state = 132

# Rebuild the target vector and predictor frame from the cleaned data.
y = df_init["y"].to_numpy()
X = df_init.drop(columns=["y"])

# Two-way variant: 90% train, remainder test, stratified on the target.
X_tt_train, X_tt_test, y_tt_train, y_tt_test = train_test_split(
    X, y,
    train_size=0.9,
    random_state=random_state,
    stratify=y,
)



In [26]:
# Refit the same pipeline object on the larger training split, then encode
# the test split with that refitted state.
# NOTE(review): `feature_names` was derived from the earlier fit; the column
# count matches here, but the names are not recomputed after refitting.
X_tt_train_prep = clf.fit_transform(X_tt_train)
X_tt_test_prep = clf.transform(X_tt_test)

for split_matrix in (X_tt_train_prep, X_tt_test_prep):
    print(split_matrix.shape)

(3387, 85)
(377, 85)


In [27]:
# Labelled frames for the two-way split, with the target appended as the last column.
df_tt_train = pd.DataFrame(X_tt_train_prep, columns=feature_names).assign(y=y_tt_train)
df_tt_test = pd.DataFrame(X_tt_test_prep, columns=feature_names).assign(y=y_tt_test)

In [28]:
# Write the encoded two-way (train/test) split to disk.
df_tt_train.to_parquet("../data/prep_tt_train.parquet")
df_tt_test.to_parquet("../data/prep_tt_test.parquet")

#### All

In [29]:
# Refit the pipeline on every row and encode the full data set.
X_all_prep = clf.fit_transform(X)
df_all = pd.DataFrame(X_all_prep, columns=feature_names)

# Report the shape before the target column is added (features only).
print(df_all.shape)

df_all = df_all.assign(y=y)
df_all.to_parquet("../data/prep_all.parquet")

(3764, 85)


In [30]:
# Shape of the two-way test frame: the encoded feature columns plus the appended "y".
df_tt_test.shape

(377, 86)