In [41]:
# Import libraries and the data set
import pandas as pd
import numpy as np
import sidetable
import DataScience

df = pd.read_csv('train_v9rqX0R.csv', index_col=['Item_Identifier', 'Outlet_Identifier'])
X_test = pd.read_csv('test_AbJTz2l.csv', index_col=['Item_Identifier', 'Outlet_Identifier'])

In [42]:
# Have a look at the first five rows
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
Item_Identifier,Outlet_Identifier,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
FDA15,OUT049,9.3,Low Fat,0.016047,Dairy,249.8092,1999,Medium,Tier 1,Supermarket Type1,3735.138
DRC01,OUT018,5.92,Regular,0.019278,Soft Drinks,48.2692,2009,Medium,Tier 3,Supermarket Type2,443.4228
FDN15,OUT049,17.5,Low Fat,0.01676,Meat,141.618,1999,Medium,Tier 1,Supermarket Type1,2097.27
FDX07,OUT010,19.2,Regular,0.0,Fruits and Vegetables,182.095,1998,,Tier 3,Grocery Store,732.38
NCD19,OUT013,8.93,Low Fat,0.0,Household,53.8614,1987,High,Tier 3,Supermarket Type1,994.7052


In [43]:
# See the missing data in columns
df.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Outlet_Size,2410,8523,28.28%
Item_Weight,1463,8523,17.17%
Item_Fat_Content,0,8523,0.00%
Item_Visibility,0,8523,0.00%
Item_Type,0,8523,0.00%
Item_MRP,0,8523,0.00%
Outlet_Establishment_Year,0,8523,0.00%
Outlet_Location_Type,0,8523,0.00%
Outlet_Type,0,8523,0.00%
Item_Outlet_Sales,0,8523,0.00%


In [44]:
# Check data types of columns
df.dtypes

Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [45]:
# Get X_train and y_train
y_train = df.Item_Outlet_Sales.copy()
X_train = df.drop(columns='Item_Outlet_Sales').copy()

In [46]:
# Retrive num_cols and cat_cols
num_cols = X_train._get_numeric_data().columns
cat_cols = list(set(X_train.columns) - set(num_cols))

In [47]:
# Have a look at cat_cols values
print('cat')
for cat_col in cat_cols:
    print(cat_col, X_train[cat_col].unique())

cat
Item_Type ['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']
Outlet_Location_Type ['Tier 1' 'Tier 3' 'Tier 2']
Outlet_Type ['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']
Item_Fat_Content ['Low Fat' 'Regular' 'low fat' 'LF' 'reg']
Outlet_Size ['Medium' nan 'High' 'Small']


In [48]:
# Unify cat_cols values
def fat_content(v):
    if v in ['Low Fat', 'LF']:
        return 'low fat'
    elif v in ['reg', 'Regular']:
        return 'regular'
    else:
        return np.nan

def outlet_size(size):
    if size == 'Small':
        return 0
    elif size == 'Medium':
        return 1
    elif size == 'High':
        return 2
    else:
        return np.nan
    
def outlet_type(t):
    if t == 'Supermarket Type1':
        return 1
    elif t == 'Supermarket Type2':
        return 2
    elif t == 'Supermarket Type3':
        return 3
    else:
        return 0
X_train['Item_Fat_Content'] = X_train['Item_Fat_Content'].apply(fat_content)
X_train['Outlet_Size'] = X_train['Outlet_Size'].apply(outlet_size)
X_train['Outlet_Type'] = X_train['Outlet_Type'].apply(outlet_type)
X_test['Item_Fat_Content'] = X_test['Item_Fat_Content'].apply(fat_content)
X_test['Outlet_Size'] = X_test['Outlet_Size'].apply(outlet_size)
X_test['Outlet_Type'] = X_test['Outlet_Type'].apply(outlet_type)

In [49]:
# Retrive num_cols and cat_cols
num_cols = X_train._get_numeric_data().columns
cat_cols = list(set(X_train.columns) - set(num_cols))

In [50]:
# Verify the columns
print('cat')
for cat_col in cat_cols:
    print(cat_col, X_train[cat_col].unique())


cat
Item_Fat_Content ['low fat' 'regular' nan]
Item_Type ['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']
Outlet_Location_Type ['Tier 1' 'Tier 3' 'Tier 2']


In [51]:
# Impute cat_cols
from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy='most_frequent')
X_train[cat_cols] = si.fit_transform(X_train[cat_cols])
X_test[cat_cols] = si.fit_transform(X_test[cat_cols])

In [52]:
# Encode cat_cols
from sklearn.preprocessing import OrdinalEncoder
X_train = pd.get_dummies(data=X_train, columns=['Item_Type'], drop_first=True)
X_test = pd.get_dummies(data=X_test, columns=['Item_Type'], drop_first=True)
oe = OrdinalEncoder()
X_train[['Outlet_Location_Type', 'Item_Fat_Content']] = oe.fit_transform(X_train[['Outlet_Location_Type', 'Item_Fat_Content']])
X_test[['Outlet_Location_Type', 'Item_Fat_Content']] = oe.fit_transform(X_test[['Outlet_Location_Type', 'Item_Fat_Content']])

In [53]:
# Have a look at the current columns
X_train.columns

Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_MRP',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Type_Breads', 'Item_Type_Breakfast',
       'Item_Type_Canned', 'Item_Type_Dairy', 'Item_Type_Frozen Foods',
       'Item_Type_Fruits and Vegetables', 'Item_Type_Hard Drinks',
       'Item_Type_Health and Hygiene', 'Item_Type_Household', 'Item_Type_Meat',
       'Item_Type_Others', 'Item_Type_Seafood', 'Item_Type_Snack Foods',
       'Item_Type_Soft Drinks', 'Item_Type_Starchy Foods'],
      dtype='object')

In [54]:
# This is the module I wrote included in folder 'package'
# Test for the best imputation algorithm for num_cols
DataScience.test_imputations(X_train, y_train, num_cols)

Imputing row 1/8523 with 0 missing, elapsed time: 11.969
Imputing row 101/8523 with 1 missing, elapsed time: 11.973
Imputing row 201/8523 with 1 missing, elapsed time: 11.977
Imputing row 301/8523 with 1 missing, elapsed time: 11.981
Imputing row 401/8523 with 1 missing, elapsed time: 11.985
Imputing row 501/8523 with 1 missing, elapsed time: 11.988
Imputing row 601/8523 with 0 missing, elapsed time: 11.992
Imputing row 701/8523 with 1 missing, elapsed time: 11.996
Imputing row 801/8523 with 0 missing, elapsed time: 11.999
Imputing row 901/8523 with 1 missing, elapsed time: 12.002
Imputing row 1001/8523 with 1 missing, elapsed time: 12.006
Imputing row 1101/8523 with 1 missing, elapsed time: 12.010
Imputing row 1201/8523 with 0 missing, elapsed time: 12.014
Imputing row 1301/8523 with 0 missing, elapsed time: 12.018
Imputing row 1401/8523 with 1 missing, elapsed time: 12.021
Imputing row 1501/8523 with 0 missing, elapsed time: 12.025
Imputing row 1601/8523 with 1 missing, elapsed time:

We can see that KNN yields the best Adjusted R Squared score

In [55]:
# Test for the best k neighors in KNN for num_cols
DataScience.test_KNN_imputation(X_train, y_train, num_cols, range(2, 10))

Imputing row 1/8523 with 0 missing, elapsed time: 12.019
Imputing row 101/8523 with 1 missing, elapsed time: 12.022
Imputing row 201/8523 with 1 missing, elapsed time: 12.026
Imputing row 301/8523 with 1 missing, elapsed time: 12.030
Imputing row 401/8523 with 1 missing, elapsed time: 12.034
Imputing row 501/8523 with 1 missing, elapsed time: 12.037
Imputing row 601/8523 with 0 missing, elapsed time: 12.041
Imputing row 701/8523 with 1 missing, elapsed time: 12.044
Imputing row 801/8523 with 0 missing, elapsed time: 12.047
Imputing row 901/8523 with 1 missing, elapsed time: 12.051
Imputing row 1001/8523 with 1 missing, elapsed time: 12.055
Imputing row 1101/8523 with 1 missing, elapsed time: 12.058
Imputing row 1201/8523 with 0 missing, elapsed time: 12.062
Imputing row 1301/8523 with 0 missing, elapsed time: 12.066
Imputing row 1401/8523 with 1 missing, elapsed time: 12.069
Imputing row 1501/8523 with 0 missing, elapsed time: 12.073
Imputing row 1601/8523 with 1 missing, elapsed time:

The best k neighbors is 2

In [56]:
# Impute num_cols
from fancyimpute import KNN
ii = KNN(2)
X_train[num_cols] = ii.fit_transform(X_train[num_cols])
X_test[num_cols] = ii.fit_transform(X_test[num_cols])

Imputing row 1/8523 with 0 missing, elapsed time: 12.464
Imputing row 101/8523 with 1 missing, elapsed time: 12.467
Imputing row 201/8523 with 1 missing, elapsed time: 12.471
Imputing row 301/8523 with 1 missing, elapsed time: 12.475
Imputing row 401/8523 with 1 missing, elapsed time: 12.479
Imputing row 501/8523 with 1 missing, elapsed time: 12.483
Imputing row 601/8523 with 0 missing, elapsed time: 12.487
Imputing row 701/8523 with 1 missing, elapsed time: 12.490
Imputing row 801/8523 with 0 missing, elapsed time: 12.494
Imputing row 901/8523 with 1 missing, elapsed time: 12.497
Imputing row 1001/8523 with 1 missing, elapsed time: 12.501
Imputing row 1101/8523 with 1 missing, elapsed time: 12.505
Imputing row 1201/8523 with 0 missing, elapsed time: 12.508
Imputing row 1301/8523 with 0 missing, elapsed time: 12.512
Imputing row 1401/8523 with 1 missing, elapsed time: 12.515
Imputing row 1501/8523 with 0 missing, elapsed time: 12.519
Imputing row 1601/8523 with 1 missing, elapsed time:

In [57]:
# Scale num_cols
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
num_scaled = sc.fit_transform(X_train[num_cols])
X_train[num_cols] = pd.DataFrame(num_scaled, index=X_train.index, columns=num_cols)
num_scaled = sc.fit_transform(X_test[num_cols])
X_test[num_cols] = pd.DataFrame(num_scaled, index=X_test.index, columns=num_cols)

In [58]:
# Verify X_train
X_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Type_Breads,Item_Type_Breakfast,...,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods
Item_Identifier,Outlet_Identifier,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
FDA15,OUT049,-0.803677,0.0,-0.970732,1.747454,0.139541,0.4804,0.0,-0.252658,0,0,...,0,0,0,0,0,0,0,0,0,0
DRC01,OUT018,-1.561878,1.0,-0.908111,-1.489023,1.334103,0.4804,2.0,1.002972,0,0,...,0,0,0,0,0,0,0,0,1,0
FDN15,OUT049,1.035744,0.0,-0.956917,0.01004,0.139541,0.4804,0.0,-0.252658,0,0,...,0,0,0,0,1,0,0,0,0,0
FDX07,OUT010,1.417088,1.0,-1.281758,0.66005,0.020085,-0.231304,2.0,-1.508289,0,0,...,1,0,0,0,0,0,0,0,0,0
NCD19,OUT013,-0.886676,0.0,-1.281758,-1.39922,-1.293934,2.019415,2.0,-0.252658,0,0,...,0,0,0,1,0,0,0,0,0,0


In [59]:
# Import XGBGeressor and GridSearchCV libraries
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

parameters = {
        'n_estimators': [200, 500],
        'max_features': ['auto', 'sqrt', 'log2'],
        'max_depth' : [4,5,6,7,8],
        'criterion' :['mse', 'mae']
             }
rfr = RandomForestRegressor()
xgb_grid = GridSearchCV(estimator=rfr,
                        param_grid=parameters,
                        cv = KFold(5),
                        verbose=True,
                        n_jobs = -1)
# Try fitting training data sets with all parameters
xgb_grid.fit(X_train,y_train)

# Print the best parameters
print(xgb_grid.best_params_)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 30.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 96.9min finished


{'criterion': 'mse', 'max_depth': 6, 'max_features': 'auto', 'n_estimators': 500}


In [60]:
# Fit the training tests using the best parameters
gbm = RandomForestRegressor(**xgb_grid.best_params_)
gbm.fit(X_train,y_train)

# Get the predicted values
predictions = gbm.predict(X_test)

In [61]:
# Get the index of the predicted table
index = X_test.index
predictions = pd.DataFrame(predictions, columns=['Item_Outlet_Sales'], index=index)

# saving the DataFrame as a CSV file 
csv_data = predictions.to_csv('Predictions.csv', index = True) 

In [62]:
# Verify the prediction's format
print(predictions.head())

                                   Item_Outlet_Sales
Item_Identifier Outlet_Identifier                   
FDW58           OUT049                   1653.280812
FDW14           OUT017                   1365.826046
NCN55           OUT010                    572.994079
FDQ58           OUT017                   2486.481676
FDY38           OUT027                   6065.373986


In [None]:
# from sklearn.pipeline import make_pipeline, Pipeline
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
# from sklearn.compose import make_column_transformer, ColumnTransformer, make_column_selector
# from sklearn.model_selection import KFold
# from fancyimpute import IterativeImputer
# from xgboost import XGBRegressor
# from sklearn.ensemble import RandomForestRegressor

# # define the data preparation for the columns
# t = [('cat_impute', SimpleImputer(strategy='most_frequent', missing_values=np.nan), cat_cols),
#      ('cat_ordinal_encode', OrdinalEncoder(), ['Outlet_Location_Type', 'Item_Fat_Content']),
#      ('cat_encode', OneHotEncoder(handle_unknown='ignore'), ['Item_Type']),
#      ('num_impute', IterativeImputer(missing_values=np.nan), num_cols), 
#      ('num_scale', StandardScaler(), num_cols)]
# col_transform = ColumnTransformer(transformers=t)

# # Create an object of XGBRegressor
# xgbr = XGBRegressor(random_state=1)

# # define the data preparation and modeling pipeline
# pipeline = Pipeline(steps=[('prep',col_transform), ('model', xgbr)])