# BUNDAS ANALYSIS

# Group Members
- Jeovine Oruko
- Dennis Ogunde
- Thiga Mureithi

In [1]:
import pandas as pd

In [2]:
bundas_train = pd.read_csv("bundas_train.csv")
bundas_train.head()

Unnamed: 0,Item_ID,Weight,FatContent,Visibility,Category,Max_Price,Store_ID,Store_Establishment_Year,Store_Size,Store_Location_Type,Store_Type,Item_Store_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
bundas_train.describe(include='object')

Unnamed: 0,Item_ID,FatContent,Category,Store_ID,Store_Size,Store_Location_Type,Store_Type
count,8523,8523,8523,8523,6113,8523,8523
unique,1559,5,16,10,3,3,4
top,FDW13,Low Fat,Fruits and Vegetables,OUT027,Medium,Tier 3,Supermarket Type1
freq,10,5089,1232,935,2793,3350,5577


In [4]:
bundas_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
Item_ID                     8523 non-null object
Weight                      7060 non-null float64
FatContent                  8523 non-null object
Visibility                  8523 non-null float64
Category                    8523 non-null object
Max_Price                   8523 non-null float64
Store_ID                    8523 non-null object
Store_Establishment_Year    8523 non-null int64
Store_Size                  6113 non-null object
Store_Location_Type         8523 non-null object
Store_Type                  8523 non-null object
Item_Store_Sales            8523 non-null float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.1+ KB


In [5]:
bundas_train.shape

(8523, 12)

## Data Cleaning

In [6]:
bundas_train.isnull().any()

Item_ID                     False
Weight                       True
FatContent                  False
Visibility                  False
Category                    False
Max_Price                   False
Store_ID                    False
Store_Establishment_Year    False
Store_Size                   True
Store_Location_Type         False
Store_Type                  False
Item_Store_Sales            False
dtype: bool

### Weight and Store Size columns have missing data

In [7]:
bundas_train.isnull().sum()

Item_ID                        0
Weight                      1463
FatContent                     0
Visibility                     0
Category                       0
Max_Price                      0
Store_ID                       0
Store_Establishment_Year       0
Store_Size                  2410
Store_Location_Type            0
Store_Type                     0
Item_Store_Sales               0
dtype: int64

In [8]:
bundas_train[['Store_Size']].mode()

Unnamed: 0,Store_Size
0,Medium


### Fill null values in Store Size column with the mode which is Medium

In [9]:
bundas_train['Store_Size'] = bundas_train.Store_Size.fillna('Medium')

### Null values in Store Size columns have been dealt with by assigning the mode to them.

In [10]:
bundas_train[['Store_Size']].isnull().sum()

Store_Size    0
dtype: int64

In [11]:
weight = bundas_train[['Weight']]
weight.mean()

Weight    12.857645
dtype: float64

### Fill null values in Weight column with the mean which is 12.857645

In [12]:
bundas_train[['Weight']] = bundas_train.Weight.fillna(weight.mean())

In [13]:
bundas_train[['Weight']].isnull().sum()

Weight    1463
dtype: int64

In [14]:
bundas_train.sample(10)

Unnamed: 0,Item_ID,Weight,FatContent,Visibility,Category,Max_Price,Store_ID,Store_Establishment_Year,Store_Size,Store_Location_Type,Store_Type,Item_Store_Sales
5943,FDM58,16.85,Regular,0.079624,Snack Foods,111.6544,OUT013,1987,High,Tier 3,Supermarket Type1,2125.2336
4318,FDF12,8.235,Low Fat,0.082764,Baking Goods,146.5076,OUT018,2009,Medium,Tier 3,Supermarket Type2,1625.8836
3724,FDQ10,,Low Fat,0.033019,Snack Foods,170.4422,OUT027,1985,Medium,Tier 3,Supermarket Type3,5000.8238
2323,FDE04,19.75,Regular,0.018051,Frozen Foods,179.866,OUT049,1999,Medium,Tier 1,Supermarket Type1,2696.49
8026,NCS53,14.5,Low Fat,0.090286,Health and Hygiene,159.9604,OUT017,2007,Medium,Tier 2,Supermarket Type1,2218.4456
4608,FDN23,,Regular,0.075142,Breads,145.8444,OUT027,1985,Medium,Tier 3,Supermarket Type3,3773.7544
1724,FDQ11,5.695,Regular,0.067701,Breads,256.5988,OUT046,1997,Small,Tier 1,Supermarket Type1,2055.9904
6867,FDQ20,8.325,Low Fat,0.029845,Fruits and Vegetables,41.6138,OUT045,2002,Medium,Tier 2,Supermarket Type1,284.2966
302,NCQ54,17.7,Low Fat,0.01254,Household,167.0474,OUT035,2004,Small,Tier 2,Supermarket Type1,5895.659
3702,FDW24,6.8,Low Fat,0.062762,Baking Goods,50.4034,OUT010,1998,Medium,Tier 3,Grocery Store,48.6034


In [15]:
bundas_train.FatContent.unique()

array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

### Fat Content column has multiple values that are semantically the same thing, e.g Regular = reg

In [16]:
bundas_train.FatContent.replace('low fat', 'Low Fat')
bundas_train.FatContent.replace('LF', 'Low Fat')
bundas_train.FatContent.replace('reg', 'Regular')

0       Low Fat
1       Regular
2       Low Fat
3       Regular
4       Low Fat
5       Regular
6       Regular
7       Low Fat
8       Regular
9       Regular
10      Low Fat
11      Regular
12      Regular
13      Regular
14      Low Fat
15      Regular
16      Low Fat
17      Regular
18      Low Fat
19      Low Fat
20      Regular
21      Regular
22      Low Fat
23      Low Fat
24      Regular
25      Low Fat
26      Regular
27      low fat
28      Regular
29      Regular
         ...   
8493    Regular
8494    Low Fat
8495    Low Fat
8496    Regular
8497    Low Fat
8498    Low Fat
8499    Low Fat
8500    Low Fat
8501    Regular
8502    Low Fat
8503    Low Fat
8504    Low Fat
8505    Regular
8506    Low Fat
8507    Regular
8508    Regular
8509    Low Fat
8510    Regular
8511    Low Fat
8512    Low Fat
8513    Regular
8514    Regular
8515    Low Fat
8516    Low Fat
8517    Regular
8518    Low Fat
8519    Regular
8520    Low Fat
8521    Regular
8522    Low Fat
Name: FatContent, Length

In [17]:
bundas_train.Category.unique()

array(['Dairy', 'Soft Drinks', 'Meat', 'Fruits and Vegetables',
       'Household', 'Baking Goods', 'Snack Foods', 'Frozen Foods',
       'Breakfast', 'Health and Hygiene', 'Hard Drinks', 'Canned',
       'Breads', 'Starchy Foods', 'Others', 'Seafood'], dtype=object)

In [18]:
bundas_train.Store_Type.unique()

array(['Supermarket Type1', 'Supermarket Type2', 'Grocery Store',
       'Supermarket Type3'], dtype=object)

In [19]:
bundas_train.Store_Location_Type.unique()

array(['Tier 1', 'Tier 3', 'Tier 2'], dtype=object)

In [20]:
bundas_train[['Category', 'Store_Type']].sample(20)

Unnamed: 0,Category,Store_Type
7730,Baking Goods,Grocery Store
8521,Snack Foods,Supermarket Type2
1002,Snack Foods,Supermarket Type2
8299,Snack Foods,Supermarket Type1
7698,Canned,Supermarket Type1
8482,Frozen Foods,Supermarket Type2
5778,Baking Goods,Supermarket Type1
5434,Meat,Grocery Store
4206,Dairy,Supermarket Type1
7582,Breads,Supermarket Type1


In [21]:
bundas_train.Visibility.max()

0.328390948