# BUNDAS ANALYSIS

# Group Members
- Jeovine Oruko
- Dennis Ogunde
- Thiga Mureithi
- Brenda Cherono

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier

In [2]:
bundas_train = pd.read_csv("bundas_train.csv")
bundas_train.head()

Unnamed: 0,Item_ID,Weight,FatContent,Visibility,Category,Max_Price,Store_ID,Store_Establishment_Year,Store_Size,Store_Location_Type,Store_Type,Item_Store_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
bundas_train.describe(include='object')

Unnamed: 0,Item_ID,FatContent,Category,Store_ID,Store_Size,Store_Location_Type,Store_Type
count,8523,8523,8523,8523,6113,8523,8523
unique,1559,5,16,10,3,3,4
top,FDG33,Low Fat,Fruits and Vegetables,OUT027,Medium,Tier 3,Supermarket Type1
freq,10,5089,1232,935,2793,3350,5577


In [4]:
bundas_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
Item_ID                     8523 non-null object
Weight                      7060 non-null float64
FatContent                  8523 non-null object
Visibility                  8523 non-null float64
Category                    8523 non-null object
Max_Price                   8523 non-null float64
Store_ID                    8523 non-null object
Store_Establishment_Year    8523 non-null int64
Store_Size                  6113 non-null object
Store_Location_Type         8523 non-null object
Store_Type                  8523 non-null object
Item_Store_Sales            8523 non-null float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.1+ KB


In [5]:
bundas_train.shape

(8523, 12)

## Data Cleaning

In [6]:
bundas_train.isnull().any()

Item_ID                     False
Weight                       True
FatContent                  False
Visibility                  False
Category                    False
Max_Price                   False
Store_ID                    False
Store_Establishment_Year    False
Store_Size                   True
Store_Location_Type         False
Store_Type                  False
Item_Store_Sales            False
dtype: bool

### Weight and Store Size columns have missing data

In [7]:
bundas_train.isnull().sum()

Item_ID                        0
Weight                      1463
FatContent                     0
Visibility                     0
Category                       0
Max_Price                      0
Store_ID                       0
Store_Establishment_Year       0
Store_Size                  2410
Store_Location_Type            0
Store_Type                     0
Item_Store_Sales               0
dtype: int64

In [8]:
bundas_train[['Store_Size']].mode()

Unnamed: 0,Store_Size
0,Medium


### Fill null values in Store Size column with the mode which is Medium

In [9]:
bundas_train['Store_Size'] = bundas_train.Store_Size.fillna('Medium')

### Null values in Store Size columns have been dealt with by assigning the mode to them.

In [10]:
bundas_train[['Store_Size']].isnull().sum()

Store_Size    0
dtype: int64

In [11]:
weight = bundas_train[['Weight']]
weight.mean()

Weight    12.857645
dtype: float64

### Fill null values in Weight column with the mean which is 12.857645

In [12]:
bundas_train[['Weight']] = bundas_train.Weight.fillna(weight.mean())

In [13]:
bundas_train[['Weight']].isnull().sum()

Weight    1463
dtype: int64

In [14]:
bundas_train.sample(10)

Unnamed: 0,Item_ID,Weight,FatContent,Visibility,Category,Max_Price,Store_ID,Store_Establishment_Year,Store_Size,Store_Location_Type,Store_Type,Item_Store_Sales
760,FDC46,17.7,Low Fat,0.116445,Snack Foods,182.8266,OUT013,1987,High,Tier 3,Supermarket Type1,5163.9448
5713,FDY10,,Low Fat,0.04883,Snack Foods,113.1176,OUT027,1985,Medium,Tier 3,Supermarket Type3,2404.8696
4969,FDS02,10.195,Regular,0.146692,Dairy,194.5794,OUT017,2007,Medium,Tier 2,Supermarket Type1,4096.6674
3464,FDA58,9.395,Low Fat,0.103665,Snack Foods,233.6932,OUT013,1987,High,Tier 3,Supermarket Type1,1414.1592
2027,FDG41,8.84,Regular,0.076681,Frozen Foods,110.7228,OUT049,1999,Medium,Tier 1,Supermarket Type1,1657.842
1917,FDL45,15.6,Low Fat,0.037764,Snack Foods,124.9704,OUT045,2002,Medium,Tier 2,Supermarket Type1,2253.0672
4552,FDO10,,Regular,0.0,Snack Foods,58.7588,OUT019,1985,Small,Tier 1,Grocery Store,400.8116
357,FDN39,,Regular,0.065203,Meat,166.0816,OUT027,1985,Medium,Tier 3,Supermarket Type3,5033.448
6927,DRL11,10.5,Low Fat,0.048009,Hard Drinks,157.0946,OUT035,2004,Small,Tier 2,Supermarket Type1,2209.1244
882,NCQ05,11.395,Low Fat,0.036164,Health and Hygiene,150.1708,OUT010,1998,Medium,Tier 3,Grocery Store,300.9416


In [15]:
bundas_train.FatContent.unique()

array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

### Fat Content column has multiple values that are semantically the same thing, e.g Regular = reg

In [16]:
bundas_train.FatContent.replace('low fat', 'Low Fat', inplace=True)
bundas_train.FatContent.replace('LF', 'Low Fat', inplace=True)
bundas_train.FatContent.replace('reg', 'Regular', inplace=True)

In [17]:
bundas_train.Category.unique()

array(['Dairy', 'Soft Drinks', 'Meat', 'Fruits and Vegetables',
       'Household', 'Baking Goods', 'Snack Foods', 'Frozen Foods',
       'Breakfast', 'Health and Hygiene', 'Hard Drinks', 'Canned',
       'Breads', 'Starchy Foods', 'Others', 'Seafood'], dtype=object)

In [18]:
bundas_train.Store_Type.unique()

array(['Supermarket Type1', 'Supermarket Type2', 'Grocery Store',
       'Supermarket Type3'], dtype=object)

In [19]:
bundas_train.Store_Location_Type.unique()

array(['Tier 1', 'Tier 3', 'Tier 2'], dtype=object)

In [20]:
bundas_train[['Category', 'Store_Type']].sample(20)

Unnamed: 0,Category,Store_Type
1149,Baking Goods,Supermarket Type3
4321,Fruits and Vegetables,Grocery Store
1307,Baking Goods,Supermarket Type1
3633,Canned,Supermarket Type1
6811,Hard Drinks,Grocery Store
7359,Household,Supermarket Type1
286,Fruits and Vegetables,Grocery Store
7956,Baking Goods,Supermarket Type1
108,Soft Drinks,Grocery Store
7145,Starchy Foods,Supermarket Type1


In [21]:
bundas_train.Visibility.max()

0.328390948

## Feature Engineering

### Some columns are meaningless in this analysis so they will be dropped. We will also separate the features from the target variable

In [22]:
X_train = bundas_train.drop(['Item_ID', 'Item_Store_Sales'],axis=1)
y_train = bundas_train['Item_Store_Sales']
X_train

Unnamed: 0,Weight,FatContent,Visibility,Category,Max_Price,Store_ID,Store_Establishment_Year,Store_Size,Store_Location_Type,Store_Type
0,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2
2,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1
3,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,Medium,Tier 3,Grocery Store
4,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1
5,10.395,Regular,0.000000,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2
6,13.650,Regular,0.012741,Snack Foods,57.6588,OUT013,1987,High,Tier 3,Supermarket Type1
7,,Low Fat,0.127470,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3
8,16.200,Regular,0.016687,Frozen Foods,96.9726,OUT045,2002,Medium,Tier 2,Supermarket Type1
9,19.200,Regular,0.094450,Frozen Foods,187.8214,OUT017,2007,Medium,Tier 2,Supermarket Type1


### Now we need to encode the categorical variables

In [23]:
label_encoder = LabelEncoder()
encoded_fat_content = label_encoder.fit_transform(X_train["FatContent"])
encoded_category = label_encoder.fit_transform(X_train["Category"])
encoded_store_size = label_encoder.fit_transform(X_train["Store_Size"])
encoded_store_location_type = label_encoder.fit_transform(X_train["Store_Location_Type"])
encoded_store_type = label_encoder.fit_transform(X_train["Store_Type"])
# encoded_fat_content_df = pd.DataFrame(encoded_fat_content)
encoded_store_size

X_train_features = pd.DataFrame([encoded_fat_content, encoded_category, encoded_store_size, encoded_store_location_type, encoded_store_type]).T
X_train_features

Unnamed: 0,0,1,2,3,4
0,0,4,1,0,1
1,1,14,1,2,2
2,0,10,1,0,1
3,1,6,1,2,0
4,0,9,0,2,1
5,1,0,1,2,2
6,1,13,0,2,1
7,0,13,1,2,3
8,1,5,1,1,1
9,1,5,1,1,1


## Model

In [27]:
model = linear_model.LogisticRegression()
model.fit(X = X_train_features, y = y_train)
y_train.shape



ValueError: Unknown label type: 'continuous'

In [25]:
y_train.dtypes

dtype('float64')