In [1]:
# import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import FactorAnalysis

In [2]:
# read the data
train=pd.read_csv("data.csv")

In [3]:
# first five rows of the dataset
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
# checking number of missing values in each variable
train.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [5]:
# imputing missing values in Item_Weight with the median and in Outlet_Size with the mode of corresponding variables
train['Item_Weight'].fillna(train['Item_Weight'].median(), inplace=True)
train['Outlet_Size'].fillna(train['Outlet_Size'].mode()[0], inplace=True)

In [6]:
# rechecking the number of missing values
train.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [7]:
# dropping the ID variables
train=train.drop(['Item_Identifier', 'Outlet_Identifier'], axis=1)

In [8]:
# converting categorical variables to numerical values
train=pd.get_dummies(train)

In [9]:
# checking the shape of dataset
train.shape

(8523, 36)

In [10]:
# separating the target variable
df = train.drop('Item_Outlet_Sales',1)
target = train['Item_Outlet_Sales']

In [11]:
# creating the training and validation set
X_train, X_valid, y_train, y_valid = train_test_split(df, target, random_state = 10, test_size = 0.25)

## Checking the assumptions of applying Factor Analysis

### Assumption 1 - Sample size

In [12]:
# shape of the training data
X_train.shape

(6392, 35)

### Assumption 2 - Variable Sample ratio

In [13]:
# variable to sample ratio
X_train.shape[0]/X_train.shape[1]

182.62857142857143

### Assumption 3 - Correlation Value

In [14]:
# correlation between training variables
X_train.corr()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Fat_Content_LF,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Fat_Content_low fat,Item_Fat_Content_reg,Item_Type_Baking Goods,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
Item_Weight,1.0,-0.012235,0.035092,0.004093,-0.002178,0.033755,-0.032308,-0.000143,-0.00716,-0.028941,...,0.015916,-0.010365,0.000105,0.011652,-0.013041,0.001825,-0.006254,0.015304,0.001088,-0.017618
Item_Visibility,-0.012235,1.0,-0.002441,-0.072952,-0.002488,-0.042638,0.046715,-0.000701,-0.005563,0.015942,...,-0.041913,-0.032867,0.065237,0.063907,-0.067018,0.005686,0.287307,-0.143032,-0.033601,-0.056058
Item_MRP,0.035092,-0.002441,1.0,0.002182,-0.015249,0.002411,0.007186,-0.003303,-0.011501,-0.074499,...,0.001147,-0.01382,0.014244,-0.006409,0.004953,0.001126,0.006533,0.000451,-0.001061,-0.006596
Outlet_Establishment_Year,0.004093,-0.072952,0.002182,1.0,0.002636,0.002081,-0.00445,-0.018418,0.022945,0.004378,...,-0.453277,0.348528,-0.061089,-0.20174,0.545117,-0.338078,-0.28326,0.250469,0.463976,-0.536173
Item_Fat_Content_LF,-0.002178,-0.002488,-0.015249,0.002636,1.0,-0.237347,-0.139053,-0.022445,-0.023112,-0.011494,...,-0.00509,-0.007753,0.012019,-0.003644,-0.010941,0.013844,0.017747,-0.021439,0.015949,-0.002149
Item_Fat_Content_Low Fat,0.033755,-0.042638,0.002411,0.002081,-0.237347,1.0,-0.872407,-0.140817,-0.145005,-0.080278,...,-0.003404,0.00776,-0.006058,0.004926,0.002218,-0.006649,-0.006263,0.007857,-0.004731,-0.000569
Item_Fat_Content_Regular,-0.032308,0.046715,0.007186,-0.00445,-0.139053,-0.872407,1.0,-0.0825,-0.084953,0.082865,...,0.002645,-0.005767,0.004422,-0.006442,0.003121,0.002915,-0.005293,0.000368,-0.003954,0.008987
Item_Fat_Content_low fat,-0.000143,-0.000701,-0.003303,-0.018418,-0.022445,-0.140817,-0.0825,1.0,-0.013712,-0.001872,...,0.015976,-0.012355,0.00223,0.004847,0.004533,-0.008797,0.013245,0.003237,-0.013481,-0.005709
Item_Fat_Content_reg,-0.00716,-0.005563,-0.011501,0.022945,-0.023112,-0.145005,-0.084953,-0.013712,1.0,0.021505,...,-0.003821,0.015228,-0.013898,0.006515,-0.008786,0.002455,0.006319,-0.003213,0.023352,-0.024921
Item_Type_Baking Goods,-0.028941,0.015942,-0.074499,0.004378,-0.011494,-0.080278,0.082865,-0.001872,0.021505,1.0,...,-0.005335,-0.002504,0.006475,0.009321,-0.002746,-0.005916,0.005146,-0.00274,0.009359,-0.01057


In [15]:
# arranging the correlation in decsending order
c = X_train.corr().abs()
s = c.unstack()
so = s.sort_values(ascending=False)

In [16]:
# number of variables having correlation more than 0.5
count=0
for i in range(len(so.values)):
    if so.values[i] < 1.0 and so.values[i] >= 0.5:
        count = count + 1
print(count)

18


In [17]:
# creating the random forest regressor model
model = RandomForestRegressor(random_state=1, max_depth=3, n_estimators=100)

In [18]:
# fitting the model on original dataset
model.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [19]:
# predicting for the original validation set
pred_1 = model.predict(X_valid)

In [20]:
# checking the training performance (root mean squared error)
print(np.sqrt(mean_squared_error(model.predict(X_train), y_train)))

1144.2150863170395


In [21]:
# checking the validation performance (root mean squared error)
print(np.sqrt(mean_squared_error(pred_1, y_valid)))

1178.0230078335858


In [22]:
# creating the factor analysis model
fa = FactorAnalysis(n_components=9)

In [23]:
# transforming the original train and validation set
X_train_transformed = fa.fit_transform(X_train)
X_valid_transformed = fa.fit_transform(X_valid)

In [24]:
# fitting the same random forest model on transformed data
model.fit(X_train_transformed,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [25]:
# predicting for the transformed validation set
pred_2 = model.predict(X_valid_transformed)

In [26]:
# checking the training performance on transformed data (root mean squared error)
print(np.sqrt(mean_squared_error(model.predict(X_train_transformed), y_train)))

1137.4631254119918


In [27]:
# checking the validation performance on transformed data (root mean squared error)
print(np.sqrt(mean_squared_error(pred_2, y_valid)))

1169.664223904249


In [28]:
# correlation between transformed variables
pd.DataFrame(X_train_transformed).corr()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,-3.169991e-16,-9.519919000000001e-17,-7.794314e-17,2.045581e-13,1.794845e-14,-2.322929e-14,7.088934e-14,-3.713712e-12
1,-3.169991e-16,1.0,-5.3483330000000006e-17,-6.155500000000001e-17,1.969971e-13,1.846807e-14,-6.699403e-14,-3.778381e-14,2.518045e-13
2,-9.519919000000001e-17,-5.3483330000000006e-17,1.0,-3.821168e-18,-5.320847e-13,1.030732e-14,3.250305e-14,4.062658e-13,-1.710504e-11
3,-7.794314e-17,-6.155500000000001e-17,-3.821168e-18,1.0,-4.712597e-14,4.581425e-13,7.952595e-13,-5.562185e-14,1.511593e-12
4,2.045581e-13,1.969971e-13,-5.320847e-13,-4.712597e-14,1.0,-1.217117e-14,-1.601656e-15,-1.501887e-16,-1.608776e-15
5,1.794845e-14,1.846807e-14,1.030732e-14,4.581425e-13,-1.217117e-14,1.0,4.799371e-15,-7.007911e-15,-1.159864e-13
6,-2.322929e-14,-6.699403e-14,3.250305e-14,7.952595e-13,-1.601656e-15,4.799371e-15,1.0,3.578601e-13,3.21888e-12
7,7.088934e-14,-3.778381e-14,4.062658e-13,-5.562185e-14,-1.501887e-16,-7.007911e-15,3.578601e-13,1.0,1.047286e-10
8,-3.713712e-12,2.518045e-13,-1.710504e-11,1.511593e-12,-1.608776e-15,-1.159864e-13,3.21888e-12,1.047286e-10,1.0


In [29]:
# arranging the correlation in descending order
c = pd.DataFrame(X_train_transformed).corr().abs()
s = c.unstack()
so = s.sort_values(ascending=False)

In [30]:
# number of transformed variables having correlation more than 0.1
count=0
for i in range(len(so.values)):
    if so.values[i] < 1.0 and so.values[i] >= 0.1:
        count = count + 1
print(count)

0
