In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import category_encoders as ce
from pandas_profiling import profile_report


In [2]:
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [3]:
%matplotlib inline

In [4]:
# Load data: the training rows, the test rows and the sample file
# (presumably the submission template) from the local data/ directory.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/test.csv", "data/sample.csv")
)

In [5]:
# Count the missing values in each column of the training frame.
train.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [6]:
# train_pr = profile_report.ProfileReport(train)
# train_pr.to_file("eda/train_pr.html")

In [7]:
# test_pr = profile_report.ProfileReport(test)
# test_pr.to_file("eda/test_pr.html")

In [8]:
# Item_Fat_Content: distinct raw labels in train and in test, shown as a pair.
tuple(frame["Item_Fat_Content"].unique() for frame in (train, test))

(array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object),
 array(['Low Fat', 'reg', 'Regular', 'LF', 'low fat'], dtype=object))

In [9]:
# Collapse the alternative spellings of Item_Fat_Content onto two labels.
# Series.map turns any label that is not a key of this dict into NaN.
fat_content_labels = {
    "Low Fat": 'Low Fat',
    "Regular": "Regular",
    "low fat": "Low Fat",
    "LF": "Low Fat",
    "reg": "Regular",
}
for frame in (train, test):
    frame["Item_Fat_Content"] = frame["Item_Fat_Content"].map(fat_content_labels)

In [10]:
# Item_Fat_Content: re-list the distinct labels in train and test to check
# the result of the label mapping.
tuple(frame["Item_Fat_Content"].unique() for frame in (train, test))

(array(['Low Fat', 'Regular'], dtype=object),
 array(['Low Fat', 'Regular'], dtype=object))

In [11]:
# Names of the object-dtype columns of the training frame.
cat_cols = train.select_dtypes(include='object').columns
cat_cols

Index(['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
       'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'],
      dtype='object')

In [12]:
# category_encoders CatBoostEncoder restricted to the columns in cat_cols.
# NOTE(review): it is only constructed here; no fit/transform call on it
# appears in the visible part of the notebook.
ce_encoder = ce.CatBoostEncoder(cols=cat_cols)

Columns with missing values:
1. Item_Weight
2. Outlet_Size

In [23]:
# Fill in the missing Item_Weight values

In [34]:
# Lookup table: the distinct (Item_Identifier, Item_Weight) pairs in train
# whose weight is known.
item_weight_df = (
    train.loc[:, ['Item_Identifier', 'Item_Weight']]
    .dropna()
    .drop_duplicates()
)

In [93]:
def fill_Item_Weight(x, weights=None):
    """Fill a row's missing Item_Weight from a table of known item weights.

    Intended for ``DataFrame.apply(fill_Item_Weight, axis=1)``.

    Parameters
    ----------
    x : pandas.Series
        One row carrying at least ``Item_Identifier`` and ``Item_Weight``.
    weights : pandas.DataFrame, optional
        Lookup table with ``Item_Identifier`` and ``Item_Weight`` columns.
        When omitted, the notebook-level ``item_weight_df`` is used, which is
        what the one-argument form has always done.

    Returns
    -------
    pandas.Series
        ``x`` itself when its weight is present or no lookup entry matches
        its identifier; otherwise a copy of ``x`` whose ``Item_Weight`` is the
        first matching weight in the lookup table.
    """
    lookup = item_weight_df if weights is None else weights

    # pd.isna also recognises None/NA, on which np.isnan would raise.
    if not pd.isna(x["Item_Weight"]):
        return x

    # A single scan of the lookup table serves both the membership test and
    # the value retrieval.
    matches = lookup.loc[
        lookup["Item_Identifier"] == x["Item_Identifier"], "Item_Weight"
    ]
    if matches.empty:
        return x

    # DataFrame.apply does not support functions that mutate the object they
    # are handed, so the change is made on a copy of the row.
    filled = x.copy()
    filled["Item_Weight"] = matches.iloc[0]
    return filled

In [94]:
# Run the weight lookup over every training row (axis="columns" is axis=1).
train = train.apply(fill_Item_Weight, axis="columns")

In [96]:
# Run the same weight lookup over every test row (axis="columns" is axis=1).
test = test.apply(fill_Item_Weight, axis="columns")

Item_Identifier                 0
Item_Weight                     1
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [110]:
# Fill the remaining weight with mode
# The mode is taken from train and applied to both frames. Filling NaNs with
# the mode cannot change the mode, so computing it once gives the same value
# the two separate calls produced.
item_weight_mode = train["Item_Weight"].mode()[0]

# Assign the filled column back explicitly. `frame.col.fillna(..., inplace=True)`
# works on an intermediate Series: pandas 2.2 warns about it, and under
# Copy-on-Write the parent frame is left unchanged.
train["Item_Weight"] = train["Item_Weight"].fillna(item_weight_mode)
test["Item_Weight"] = test["Item_Weight"].fillna(item_weight_mode)

### Predict for outlet size

In [165]:
# Split the training rows by whether Outlet_Size is known. Rows with a known
# size become the features/target for a classifier; rows without one are the
# rows to predict. Outlet_Size and Item_Outlet_Sales are excluded from the
# features.
size_known = train["Outlet_Size"].notna()
oi_features = train.drop(columns=["Outlet_Size", "Item_Outlet_Sales"])

oi_train = oi_features[size_known]
# Explicit copy: a later cell adds an Outlet_Size column to oi_test, and
# assigning onto a boolean-mask slice raises SettingWithCopyWarning.
oi_test = oi_features[~size_known].copy()
oi_target = train.loc[size_known, "Outlet_Size"]

In [144]:
# Independent (deep) copy of the training frame.
# NOTE(review): oi_data is not referenced again in the visible cells.
oi_data = train.copy(deep=True)

In [146]:
# Columns handed to the ordinal encoder for the Outlet_Size model.
oi_cat_cols = [
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Establishment_Year',
    'Outlet_Location_Type',
    'Outlet_Type',
]

In [147]:
# Fit an ordinal encoder on the rows with a known Outlet_Size and replace
# oi_train with its encoded form; ce_enc is kept for encoding oi_test later.
ce_enc = ce.OrdinalEncoder(cols=oi_cat_cols)
oi_train = ce_enc.fit_transform(X=oi_train)

  elif pd.api.types.is_categorical(cols):


In [141]:
# Cross-validate a random forest that predicts Outlet_Size from the encoded
# features. random_state pins the forest's random sampling so that re-running
# the notebook reproduces the same scores and the same fitted model.
# NOTE(review): Outlet_Identifier is one of the features, so perfect scores may
# only show that every outlet with a known size has a single size. Confirm
# before trusting predictions for outlets the model has never seen.
rf = RandomForestClassifier(random_state=42)
cross_val_score(rf, oi_train, oi_target)

array([1., 1., 1., 1., 1.])

In [148]:
# Train the classifier on every row whose Outlet_Size is known.
rf.fit(X=oi_train, y=oi_target)

RandomForestClassifier()

In [163]:
# Encode the rows with a missing Outlet_Size using the encoder fitted on oi_train.
# NOTE(review): categories that never occurred in oi_train are handled by the
# encoder's unknown-value setting. Confirm that this is acceptable here.
oi_test1 = ce_enc.transform(oi_test)

In [166]:
# Attach the classifier's predicted size to the rows that were missing one.
predicted_sizes = rf.predict(oi_test1)
oi_test['Outlet_Size'] = predicted_sizes

In [168]:
# Show the first row of each outlet among the rows that had no Outlet_Size,
# including the size predicted for it.
rows_by_outlet = oi_test.groupby(['Outlet_Identifier'])
rows_by_outlet.first()

Unnamed: 0_level_0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Location_Type,Outlet_Type,Outlet_Size
Outlet_Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
OUT010,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,1998,Tier 3,Grocery Store,Medium
OUT017,FDU28,19.2,Regular,0.09445,Frozen Foods,187.8214,2007,Tier 2,Supermarket Type1,Medium
OUT045,FDH17,16.2,Regular,0.016687,Frozen Foods,96.9726,2002,Tier 2,Supermarket Type1,Medium


In [169]:
# Fill Medium for Outlet_Size
# 'Medium' is the size the classifier predicted for the outlets displayed in
# the previous cell. The filled column is assigned back explicitly.
# `frame.col.fillna(..., inplace=True)` works on an intermediate Series:
# pandas 2.2 warns about it, and under Copy-on-Write the parent frame is left
# unchanged.
train["Outlet_Size"] = train["Outlet_Size"].fillna('Medium')
test["Outlet_Size"] = test["Outlet_Size"].fillna('Medium')

In [170]:
# Re-count missing values per column of train after the fills above.
train.isna().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [171]:
# Re-count missing values per column of test after the fills above.
test.isna().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64

In [172]:
# Write the cleaned frames next to the raw files, without the index column.
for frame, out_path in ((train, "data/train1.csv"), (test, "data/test1.csv")):
    frame.to_csv(out_path, index=False)