In [1]:
# Import libraries and the data set
import pandas as pd
import numpy as np
import sidetable
import DataScience

df = pd.read_csv('train_v9rqX0R.csv', index_col=['Item_Identifier', 'Outlet_Identifier'])
X_test = pd.read_csv('test_AbJTz2l.csv', index_col=['Item_Identifier', 'Outlet_Identifier'])

In [2]:
# Have a look at the first five rows
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
Item_Identifier,Outlet_Identifier,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
FDA15,OUT049,9.3,Low Fat,0.016047,Dairy,249.8092,1999,Medium,Tier 1,Supermarket Type1,3735.138
DRC01,OUT018,5.92,Regular,0.019278,Soft Drinks,48.2692,2009,Medium,Tier 3,Supermarket Type2,443.4228
FDN15,OUT049,17.5,Low Fat,0.01676,Meat,141.618,1999,Medium,Tier 1,Supermarket Type1,2097.27
FDX07,OUT010,19.2,Regular,0.0,Fruits and Vegetables,182.095,1998,,Tier 3,Grocery Store,732.38
NCD19,OUT013,8.93,Low Fat,0.0,Household,53.8614,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
# See the missing data in columns
df.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Outlet_Size,2410,8523,28.28%
Item_Weight,1463,8523,17.17%
Item_Fat_Content,0,8523,0.00%
Item_Visibility,0,8523,0.00%
Item_Type,0,8523,0.00%
Item_MRP,0,8523,0.00%
Outlet_Establishment_Year,0,8523,0.00%
Outlet_Location_Type,0,8523,0.00%
Outlet_Type,0,8523,0.00%
Item_Outlet_Sales,0,8523,0.00%


In [4]:
# Check data types of columns
df.dtypes

Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [5]:
# Get X_train and y_train
y_train = df.Item_Outlet_Sales.copy()
X_train = df.drop(columns='Item_Outlet_Sales').copy()

In [6]:
# Retrive num_cols and cat_cols
num_cols = X_train._get_numeric_data().columns
cat_cols = list(set(X_train.columns) - set(num_cols))

In [7]:
# Have a look at cat_cols values
print('cat')
for cat_col in cat_cols:
    print(cat_col, X_train[cat_col].unique())

cat
Item_Fat_Content ['Low Fat' 'Regular' 'low fat' 'LF' 'reg']
Outlet_Type ['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']
Outlet_Size ['Medium' nan 'High' 'Small']
Item_Type ['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']
Outlet_Location_Type ['Tier 1' 'Tier 3' 'Tier 2']


In [8]:
# Unify cat_cols values
def fat_content(v):
    if v in ['Low Fat', 'LF']:
        return 'low fat'
    elif v in ['reg', 'Regular']:
        return 'regular'
    else:
        return np.nan

def outlet_size(size):
    if size == 'Small':
        return 0
    elif size == 'Medium':
        return 1
    elif size == 'High':
        return 2
    else:
        return np.nan
    
def outlet_type(t):
    if t == 'Supermarket Type1':
        return 1
    elif t == 'Supermarket Type2':
        return 2
    elif t == 'Supermarket Type3':
        return 3
    else:
        return 0
X_train['Item_Fat_Content'] = X_train['Item_Fat_Content'].apply(fat_content)
X_train['Outlet_Size'] = X_train['Outlet_Size'].apply(outlet_size)
X_train['Outlet_Type'] = X_train['Outlet_Type'].apply(outlet_type)
X_test['Item_Fat_Content'] = X_test['Item_Fat_Content'].apply(fat_content)
X_test['Outlet_Size'] = X_test['Outlet_Size'].apply(outlet_size)
X_test['Outlet_Type'] = X_test['Outlet_Type'].apply(outlet_type)

In [9]:
# Retrive num_cols and cat_cols
num_cols = X_train._get_numeric_data().columns
cat_cols = list(set(X_train.columns) - set(num_cols))

In [10]:
# Verify the columns
print('cat')
for cat_col in cat_cols:
    print(cat_col, X_train[cat_col].unique())


cat
Outlet_Location_Type ['Tier 1' 'Tier 3' 'Tier 2']
Item_Fat_Content ['low fat' 'regular' nan]
Item_Type ['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']


In [11]:
# Impute cat_cols
from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy='most_frequent')
X_train[cat_cols] = si.fit_transform(X_train[cat_cols])
X_test[cat_cols] = si.fit_transform(X_test[cat_cols])

In [12]:
# Encode cat_cols
from sklearn.preprocessing import OrdinalEncoder
X_train = pd.get_dummies(data=X_train, columns=['Item_Type'], drop_first=True)
X_test = pd.get_dummies(data=X_test, columns=['Item_Type'], drop_first=True)
oe = OrdinalEncoder()
X_train[['Outlet_Location_Type', 'Item_Fat_Content']] = oe.fit_transform(X_train[['Outlet_Location_Type', 'Item_Fat_Content']])
X_test[['Outlet_Location_Type', 'Item_Fat_Content']] = oe.fit_transform(X_test[['Outlet_Location_Type', 'Item_Fat_Content']])

In [13]:
X_train.columns

Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_MRP',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Type_Breads', 'Item_Type_Breakfast',
       'Item_Type_Canned', 'Item_Type_Dairy', 'Item_Type_Frozen Foods',
       'Item_Type_Fruits and Vegetables', 'Item_Type_Hard Drinks',
       'Item_Type_Health and Hygiene', 'Item_Type_Household', 'Item_Type_Meat',
       'Item_Type_Others', 'Item_Type_Seafood', 'Item_Type_Snack Foods',
       'Item_Type_Soft Drinks', 'Item_Type_Starchy Foods'],
      dtype='object')

In [14]:
for i in num_cols:
    print(X_train[i].unique())

[ 9.3    5.92  17.5   19.2    8.93  10.395 13.65     nan 16.2   11.8
 18.5   15.1   17.6   16.35   9.    13.35  18.85  14.6   13.85  13.
  7.645 11.65   5.925 19.25  18.6   18.7   17.85  10.     8.85   9.8
 13.6   21.35  12.15   6.42  19.6   15.85   7.39  10.195  9.895 10.895
  7.905  9.195  8.365  7.97  17.7   19.35   8.645 15.6   18.25   7.855
  7.825  8.39  12.85  19.     5.905  7.76  16.75  15.5    6.055  6.305
 20.85  20.75   8.895 19.7    8.75  13.3    8.31  19.75  17.1   10.5
  6.635 14.15   8.89   9.1    7.5   16.85   7.485 11.6   12.65  20.25
  8.6   12.6    8.88  20.5   13.5    7.235  6.92   8.02  12.8   16.6
 14.    16.    21.25   7.365 18.35   5.465  7.27   6.155 19.5   15.2
 14.5   13.1   12.3   11.1   11.3    5.75  11.35   6.525 10.3    5.78
 11.85  18.75   5.26  16.1    9.5   13.8   14.65   6.67   6.11  17.2
  6.32   4.88   5.425 14.1    7.55  17.25  12.    10.1    7.785 13.15
  8.5    7.63   9.285  7.975 15.7    8.985 20.35   6.59  19.85   6.26
 18.2    8.695  7.075  8.

In [15]:
# DataScience.test_imputations(X_train, y_train, num_cols)

We can see that KNN yields the best Adjusted R Squared score

In [16]:
DataScience.test_KNN_imputation(X_train, y_train, num_cols, range(2, 15))

NameError: name 'X_train' is not defined

In [None]:
# Impute num_cols
from fancyimpute import IterativeImputer
ii = IterativeImputer(random_state=1)
X_train[num_cols] = ii.fit_transform(X_train[num_cols])
X_test[num_cols] = ii.fit_transform(X_test[num_cols])

In [None]:
# Scale num_cols
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
num_scaled = sc.fit_transform(X_train[num_cols])
X_train[num_cols] = pd.DataFrame(num_scaled, index=X_train.index, columns=num_cols)
num_scaled = sc.fit_transform(X_test[num_cols])
X_test[num_cols] = pd.DataFrame(num_scaled, index=X_test.index, columns=num_cols)

In [None]:
# Verify X_train
X_train.head()

In [None]:
# Import XGBGeressor and GridSearchCV libraries
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

parameters = {
        'n_estimators': [200, 500],
        'max_features': ['auto', 'sqrt', 'log2'],
        'max_depth' : [4,5,6,7,8],
        'criterion' :['mse', 'mae']
             }
rfr = RandomForestRegressor()
xgb_grid = GridSearchCV(estimator=rfr,
                        param_grid=parameters,
                        cv = KFold(5),
                        verbose=True,
                        n_jobs = -1)
# Try fitting training data sets with all parameters
xgb_grid.fit(X_train,y_train)

# Print the best parameters
print(xgb_grid.best_params_)

In [None]:
# Fit the training tests using the best parameters
gbm = RandomForestRegressor(**xgb_grid.best_params_)
gbm.fit(X_train,y_train)

# Get the predicted values
predictions = gbm.predict(X_test)

In [None]:
# Get the index of the predicted table
index = X_test.index
predictions = pd.DataFrame(predictions, columns=['Item_Outlet_Sales'], index=index)

# saving the DataFrame as a CSV file 
csv_data = predictions.to_csv('Predictions.csv', index = True) 

In [None]:
# Verify the prediction's format
print(predictions.head())

In [None]:
# from sklearn.pipeline import make_pipeline, Pipeline
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
# from sklearn.compose import make_column_transformer, ColumnTransformer, make_column_selector
# from sklearn.model_selection import KFold
# from fancyimpute import IterativeImputer
# from xgboost import XGBRegressor
# from sklearn.ensemble import RandomForestRegressor

# # define the data preparation for the columns
# t = [('cat_impute', SimpleImputer(strategy='most_frequent', missing_values=np.nan), cat_cols),
#      ('cat_ordinal_encode', OrdinalEncoder(), ['Outlet_Location_Type', 'Item_Fat_Content']),
#      ('cat_encode', OneHotEncoder(handle_unknown='ignore'), ['Item_Type']),
#      ('num_impute', IterativeImputer(missing_values=np.nan), num_cols), 
#      ('num_scale', StandardScaler(), num_cols)]
# col_transform = ColumnTransformer(transformers=t)

# # Create an object of XGBRegressor
# xgbr = XGBRegressor(random_state=1)

# # define the data preparation and modeling pipeline
# pipeline = Pipeline(steps=[('prep',col_transform), ('model', xgbr)])