## Load Bigmart Sales data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the Bigmart training data from the CSV file in the working directory.
bigmart = pd.read_csv(filepath_or_buffer='train_bm.csv')

In [3]:
bigmart.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
bigmart.shape

(8523, 12)

In [5]:
bigmart.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

## One-Hot Encoding

In [6]:
bigmart['Outlet_Type'].value_counts()

Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: Outlet_Type, dtype: int64

In [7]:
pd.get_dummies(bigmart['Outlet_Type']).head()

Unnamed: 0,Grocery Store,Supermarket Type1,Supermarket Type2,Supermarket Type3
0,0,1,0,0
1,0,0,1,0
2,0,1,0,0
3,1,0,0,0
4,0,1,0,0


In [8]:
# One-hot encode the whole frame in a single call, then preview the result.
bigmart_encoded = pd.get_dummies(data=bigmart)
bigmart_encoded.head(5)

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales,Item_Identifier_DRA12,Item_Identifier_DRA24,Item_Identifier_DRA59,Item_Identifier_DRB01,Item_Identifier_DRB13,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,9.3,0.016047,249.8092,1999,3735.138,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
1,5.92,0.019278,48.2692,2009,443.4228,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
2,17.5,0.01676,141.618,1999,2097.27,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
3,19.2,0.0,182.095,1998,732.38,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
4,8.93,0.0,53.8614,1987,994.7052,0,0,0,0,0,...,1,0,0,0,0,1,0,1,0,0


We have two problems here: 

#### Problem 1:
Look at the newly created variables `Outlet_Size_High`, `Outlet_Size_Medium`, and `Outlet_Size_Small`: the natural order among the sizes (Small < Medium < High) is lost once each size becomes its own column. As a result, we are missing out on some important information.

In [9]:
bigmart_encoded[['Outlet_Size_High', 'Outlet_Size_Medium', 'Outlet_Size_Small']].head()

Unnamed: 0,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small
0,0,1,0
1,0,1,0
2,0,1,0
3,0,0,0
4,1,0,0


#### Problem 2
The number of features has increased from 12 to 1605, and most of the values in the new columns are 0.

In [10]:
bigmart_encoded.shape, bigmart.shape

((8523, 1605), (8523, 12))

In [11]:
# bigmart_encoded[['Item_Identifier_DRA12', 'Item_Identifier_DRA24',
#        'Item_Identifier_DRA59', 'Item_Identifier_DRB01',
#        'Item_Identifier_DRB13', 'Item_Identifier_DRB24',
#        'Item_Identifier_DRB25', 'Item_Identifier_DRB48',
#        'Item_Identifier_DRC01', 'Item_Identifier_DRC12',
#        'Item_Identifier_DRC13', 'Item_Identifier_DRC24',
#        'Item_Identifier_DRC25', 'Item_Identifier_DRC27',
#        'Item_Identifier_DRC36', 'Item_Identifier_DRC49',
#        'Item_Identifier_DRD01', 'Item_Identifier_DRD12',
#        'Item_Identifier_DRD13', 'Item_Identifier_DRD15',
#        'Item_Identifier_DRD24', 'Item_Identifier_DRD25',
#        'Item_Identifier_DRD27', 'Item_Identifier_DRD37',
#        'Item_Identifier_DRD49']].head()

## Problem 1 solution - Label Encoding

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
bigmart['Outlet_Size'].value_counts()

Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64

In [14]:
# Fit the encoder on the three size labels and show the integer each receives.
le = LabelEncoder()
le.fit(['Small', 'Medium', 'High']).transform(['Small', 'Medium', 'High'])

array([2, 1, 0], dtype=int64)

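`LabelEncoder` assigns codes in alphabetical order (High = 0, Medium = 1, Small = 2), which does not match the natural size order Small < Medium < High. So we map the categories to integers manually instead.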

In [15]:
# Encode Outlet_Size as an ordinal so the Small < Medium < High order is kept.
# Values that are not keys of the mapping (including missing sizes) become NaN.
# The dtype guard makes this cell safe to re-run: mapping the already-encoded
# numeric column a second time would match no key and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(bigmart['Outlet_Size']):
    bigmart['Outlet_Size'] = bigmart['Outlet_Size'].map({'Small': 0,
                                                         'Medium': 1,
                                                         'High': 2})

In [16]:
bigmart['Outlet_Size'].head()

0    1.0
1    1.0
2    1.0
3    NaN
4    2.0
Name: Outlet_Size, dtype: float64

So that solves the first challenge we encountered. Now we'll see how to deal with high cardinality.

## Problem 2 solution - Grouping rare categories

In [17]:
bigmart.nunique()

Item_Identifier              1559
Item_Weight                   415
Item_Fat_Content                5
Item_Visibility              7880
Item_Type                      16
Item_MRP                     5938
Outlet_Identifier              10
Outlet_Establishment_Year       9
Outlet_Size                     3
Outlet_Location_Type            3
Outlet_Type                     4
Item_Outlet_Sales            3493
dtype: int64

In [18]:
bigmart['Item_Identifier'].value_counts()

FDW13    10
FDG33    10
FDT07     9
NCF42     9
FDW26     9
NCJ30     9
DRN47     9
FDO19     9
DRE49     9
FDX04     9
FDP25     9
FDV60     9
FDF52     9
NCL31     9
NCB18     9
FDQ40     9
FDF56     9
NCQ06     9
FDV38     9
FDX31     9
NCY18     9
FDX20     9
NCI54     9
FDD38     9
FDU12     9
FDG09     9
FDW49     9
FDR52     8
FDR59     8
DRF27     8
         ..
FDW58     2
FDU43     2
FDU09     2
FDA48     2
FDD48     2
DRG25     2
FDG28     2
FDD22     2
NCW05     2
FDR03     2
FDR57     2
FDI36     2
FDM38     2
NCS41     2
DRE01     2
FDB10     2
FDP15     2
FDE38     2
DRC24     2
FDM16     2
DRL59     2
FDQ60     1
FDC23     1
FDT35     1
FDY43     1
DRF48     1
FDO33     1
FDN52     1
FDK57     1
FDE52     1
Name: Item_Identifier, Length: 1559, dtype: int64

In [19]:
# Number of rows per item identifier; the next cell uses this as a lookup table.
temp = bigmart.Item_Identifier.value_counts()
temp.head(5)

FDW13    10
FDG33    10
FDT07     9
NCF42     9
FDW26     9
Name: Item_Identifier, dtype: int64

In [20]:
# Attach to every row the number of times its Item_Identifier occurs.
# Series.map with the value_counts Series (`temp`) performs the lookup in one
# vectorised call instead of invoking a Python lambda once per row.
bigmart['Item_identifier_count'] = bigmart['Item_Identifier'].map(temp)
bigmart[['Item_Identifier', 'Item_identifier_count']].head()

Unnamed: 0,Item_Identifier,Item_identifier_count
0,FDA15,8
1,DRC01,6
2,FDN15,7
3,FDX07,6
4,NCD19,6


In [21]:
# Collapse every item that occurs fewer than 4 times into one 'other' bucket,
# which shrinks the number of distinct Item_Identifier values.
# A single .loc assignment with a boolean mask writes directly into `bigmart`.
# The previous chained form, bigmart['Item_Identifier'][i] = ..., assigns
# through an intermediate Series: pandas flags that with SettingWithCopyWarning
# (hidden here by warnings.filterwarnings("ignore")), and with Copy-on-Write
# enabled the assignment does not reach the DataFrame at all.
bigmart.loc[bigmart['Item_identifier_count'] < 4, 'Item_Identifier'] = 'other'

In [23]:
bigmart.head(7)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_identifier_count
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,1.0,Tier 1,Supermarket Type1,3735.138,8
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,1.0,Tier 3,Supermarket Type2,443.4228,6
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,1.0,Tier 1,Supermarket Type1,2097.27,7
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,6
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,2.0,Tier 3,Supermarket Type1,994.7052,6
5,other,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,1.0,Tier 3,Supermarket Type2,556.6088,3
6,FDO10,13.65,Regular,0.012741,Snack Foods,57.6588,OUT013,1987,2.0,Tier 3,Supermarket Type1,343.5528,8


In [23]:
 bigmart['Item_Identifier'].value_counts()

other    418
FDW13     10
FDG33     10
FDV60      9
FDX20      9
FDX04      9
NCJ30      9
NCQ06      9
NCL31      9
NCB18      9
NCF42      9
FDW49      9
FDD38      9
DRE49      9
FDF56      9
NCI54      9
NCY18      9
FDU12      9
FDQ40      9
FDX31      9
FDG09      9
FDT07      9
FDF52      9
FDO19      9
FDV38      9
FDW26      9
FDP25      9
DRN47      9
DRF23      8
DRP35      8
        ... 
FDO04      4
NCP41      4
FDS60      4
FDI58      4
FDQ55      4
FDM34      4
FDL27      4
FDQ57      4
NCC06      4
FDG04      4
NCQ17      4
FDP58      4
FDA55      4
FDC05      4
FDF39      4
NCG54      4
FDN22      4
FDN33      4
NCR29      4
FDV35      4
FDV36      4
FDM57      4
NCB54      4
FDK60      4
FDD33      4
FDO03      4
DRI13      4
DRH49      4
DRF60      4
FDB28      4
Name: Item_Identifier, Length: 1403, dtype: int64