In [112]:
# Import libraries and the data set
import pandas as pd
import numpy as np
import sidetable

df = pd.read_csv('train_v9rqX0R.csv', index_col=['Item_Identifier', 'Outlet_Identifier'])
X_test = pd.read_csv('test_AbJTz2l.csv', index_col=['Item_Identifier', 'Outlet_Identifier'])

In [113]:
# Have a look at the first five rows
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
Item_Identifier,Outlet_Identifier,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
FDA15,OUT049,9.3,Low Fat,0.016047,Dairy,249.8092,1999,Medium,Tier 1,Supermarket Type1,3735.138
DRC01,OUT018,5.92,Regular,0.019278,Soft Drinks,48.2692,2009,Medium,Tier 3,Supermarket Type2,443.4228
FDN15,OUT049,17.5,Low Fat,0.01676,Meat,141.618,1999,Medium,Tier 1,Supermarket Type1,2097.27
FDX07,OUT010,19.2,Regular,0.0,Fruits and Vegetables,182.095,1998,,Tier 3,Grocery Store,732.38
NCD19,OUT013,8.93,Low Fat,0.0,Household,53.8614,1987,High,Tier 3,Supermarket Type1,994.7052


In [114]:
# See the missing data in columns
df.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Outlet_Size,2410,8523,28.28%
Item_Weight,1463,8523,17.17%
Item_Fat_Content,0,8523,0.00%
Item_Visibility,0,8523,0.00%
Item_Type,0,8523,0.00%
Item_MRP,0,8523,0.00%
Outlet_Establishment_Year,0,8523,0.00%
Outlet_Location_Type,0,8523,0.00%
Outlet_Type,0,8523,0.00%
Item_Outlet_Sales,0,8523,0.00%


In [115]:
# Check data types of columns
df.dtypes

Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [116]:
# Get X_train and y_train
y_train = df.Item_Outlet_Sales.copy()
X_train = df.drop(columns='Item_Outlet_Sales').copy()

In [117]:
# Retrive num_cols and cat_cols
num_cols = X_train._get_numeric_data().columns
cat_cols = list(set(X_train.columns) - set(num_cols))

In [118]:
# Have a look at cat_cols values
print('cat')
for cat_col in cat_cols:
    print(cat_col, X_train[cat_col].unique())

cat
Outlet_Size ['Medium' nan 'High' 'Small']
Outlet_Type ['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']
Item_Type ['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']
Outlet_Location_Type ['Tier 1' 'Tier 3' 'Tier 2']
Item_Fat_Content ['Low Fat' 'Regular' 'low fat' 'LF' 'reg']


In [119]:
# Unify cat_cols values
def fat_content(v):
    if v in ['Low Fat', 'LF']:
        return 'low fat'
    elif v in ['reg', 'Regular']:
        return 'regular'
    else:
        return np.nan

def outlet_size(size):
    if size == 'Small':
        return 0
    elif size == 'Medium':
        return 1
    elif size == 'High':
        return 2
    else:
        return np.nan
    
def outlet_type(t):
    if t == 'Supermarket Type1':
        return 1
    elif t == 'Supermarket Type2':
        return 2
    elif t == 'Supermarket Type3':
        return 3
    else:
        return 0
X_train['Item_Fat_Content'] = X_train['Item_Fat_Content'].apply(fat_content)
X_train['Outlet_Size'] = X_train['Outlet_Size'].apply(outlet_size)
X_train['Outlet_Type'] = X_train['Outlet_Type'].apply(outlet_type)
X_test['Item_Fat_Content'] = X_test['Item_Fat_Content'].apply(fat_content)
X_test['Outlet_Size'] = X_test['Outlet_Size'].apply(outlet_size)
X_test['Outlet_Type'] = X_test['Outlet_Type'].apply(outlet_type)

In [120]:
# Retrive num_cols and cat_cols
num_cols = X_train._get_numeric_data().columns
cat_cols = list(set(X_train.columns) - set(num_cols))

In [121]:
# Verify the columns
print('cat')
for cat_col in cat_cols:
    print(cat_col, X_train[cat_col].unique())


cat
Outlet_Location_Type ['Tier 1' 'Tier 3' 'Tier 2']
Item_Type ['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']
Item_Fat_Content ['low fat' 'regular' nan]


In [122]:
# Impute cat_cols
from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy='most_frequent')
X_train[cat_cols] = si.fit_transform(X_train[cat_cols])
X_test[cat_cols] = si.fit_transform(X_test[cat_cols])

In [123]:
# Encode cat_cols
from sklearn.preprocessing import OrdinalEncoder
X_train = pd.get_dummies(data=X_train, columns=['Item_Type'], drop_first=True)
X_test = pd.get_dummies(data=X_test, columns=['Item_Type'], drop_first=True)
oe = OrdinalEncoder()
X_train[['Outlet_Location_Type', 'Item_Fat_Content']] = oe.fit_transform(X_train[['Outlet_Location_Type', 'Item_Fat_Content']])
X_test[['Outlet_Location_Type', 'Item_Fat_Content']] = oe.fit_transform(X_test[['Outlet_Location_Type', 'Item_Fat_Content']])

In [124]:
X_train.columns

Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_MRP',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Type_Breads', 'Item_Type_Breakfast',
       'Item_Type_Canned', 'Item_Type_Dairy', 'Item_Type_Frozen Foods',
       'Item_Type_Fruits and Vegetables', 'Item_Type_Hard Drinks',
       'Item_Type_Health and Hygiene', 'Item_Type_Household', 'Item_Type_Meat',
       'Item_Type_Others', 'Item_Type_Seafood', 'Item_Type_Snack Foods',
       'Item_Type_Soft Drinks', 'Item_Type_Starchy Foods'],
      dtype='object')

In [125]:
# Impute num_cols
from fancyimpute import IterativeImputer
ii = IterativeImputer(random_state=1)
X_train[num_cols] = ii.fit_transform(X_train[num_cols])
X_test[num_cols] = ii.fit_transform(X_test[num_cols])

In [126]:
# Scale num_cols
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
num_scaled = sc.fit_transform(X_train[num_cols])
X_train[num_cols] = pd.DataFrame(num_scaled, index=X_train.index, columns=num_cols)
num_scaled = sc.fit_transform(X_test[num_cols])
X_test[num_cols] = pd.DataFrame(num_scaled, index=X_test.index, columns=num_cols)

In [127]:
# Verify X_train
X_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Type_Breads,Item_Type_Breakfast,...,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods
Item_Identifier,Outlet_Identifier,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
FDA15,OUT049,-0.842112,0.0,-0.970732,1.747454,0.139541,0.534133,0.0,-0.252658,0,0,...,0,0,0,0,0,0,0,0,0,0
DRC01,OUT018,-1.6419,1.0,-0.908111,-1.489023,1.334103,0.534133,2.0,1.002972,0,0,...,0,0,0,0,0,0,0,0,1,0
FDN15,OUT049,1.098201,0.0,-0.956917,0.01004,0.139541,0.534133,0.0,-0.252658,0,0,...,0,0,0,0,1,0,0,0,0,0
FDX07,OUT010,1.500461,1.0,-1.281758,0.66005,0.020085,-0.320761,2.0,-1.508289,0,0,...,1,0,0,0,0,0,0,0,0,0
NCD19,OUT013,-0.929663,0.0,-1.281758,-1.39922,-1.293934,2.177615,2.0,-0.252658,0,0,...,0,0,0,1,0,0,0,0,0,0


In [128]:
# Import XGBGeressor and GridSearchCV libraries
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.model_selection import KFold

parameters = {'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
             }
xgbr = XGBRegressor()
xgb_grid = GridSearchCV(estimator=xgbr,
                        param_grid=parameters,
                        cv = KFold(5),
                        verbose=True,
                        n_jobs = -1)
# Try fitting training data sets with all parameters
xgb_grid.fit(X_train,y_train)

# Print the best parameters
print(xgb_grid.best_params_)

#Fit the training tests using the best parameters
gbm = XGBRegressor(**xgb_grid.best_params_)
gbm.fit(X_train,y_train)

# Print the accuracy of prediction
predictions = gbm.predict(X_test)

Fitting 5 folds for each of 405 candidates, totalling 2025 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   39.3s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 2025 out of 2025 | elapsed:  3.7min finished


{'colsample_bytree': 0.8, 'gamma': 0.5, 'max_depth': 3, 'min_child_weight': 5, 'subsample': 1.0}


In [129]:
index = X_test.index
predictions = pd.DataFrame(predictions, columns=['Item_Outlet_Sales'], index=index)

# saving the DataFrame as a CSV file 
csv_data = predictions.to_csv('Predictions.csv', index = True) 

In [130]:
# from sklearn.pipeline import make_pipeline, Pipeline
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
# from sklearn.compose import make_column_transformer, ColumnTransformer, make_column_selector
# from sklearn.model_selection import KFold
# from fancyimpute import IterativeImputer
# from xgboost import XGBRegressor
# from sklearn.ensemble import RandomForestRegressor

# # define the data preparation for the columns
# t = [('cat_impute', SimpleImputer(strategy='most_frequent', missing_values=np.nan), cat_cols),
#      ('cat_ordinal_encode', OrdinalEncoder(), ['Outlet_Location_Type', 'Item_Fat_Content']),
#      ('cat_encode', OneHotEncoder(handle_unknown='ignore'), ['Item_Type']),
#      ('num_impute', IterativeImputer(missing_values=np.nan), num_cols), 
#      ('num_scale', StandardScaler(), num_cols)]
# col_transform = ColumnTransformer(transformers=t)

# # Create an object of XGBRegressor
# xgbr = XGBRegressor(random_state=1)

# # define the data preparation and modeling pipeline
# pipeline = Pipeline(steps=[('prep',col_transform), ('model', xgbr)])