In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import plotly.express as py
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot
import cufflinks as cf
init_notebook_mode(connected=True)
cf.go_offline()

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error

In [5]:
df = pd.read_csv('Company_Data.csv')
df.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    object 
 10  US           400 non-null    object 
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


In [7]:
df.shape

(400, 11)

In [8]:
sum(df.duplicated())

0

In [9]:
df.isnull().sum()

Sales          0
CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

In [10]:
df.describe()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,7.496325,124.975,68.6575,6.635,264.84,115.795,53.3225,13.9
std,2.824115,15.334512,27.986037,6.650364,147.376436,23.676664,16.200297,2.620528
min,0.0,77.0,21.0,0.0,10.0,24.0,25.0,10.0
25%,5.39,115.0,42.75,0.0,139.0,100.0,39.75,12.0
50%,7.49,125.0,69.0,5.0,272.0,117.0,54.5,14.0
75%,9.32,135.0,91.0,12.0,398.5,131.0,66.0,16.0
max,16.27,175.0,120.0,29.0,509.0,191.0,80.0,18.0


In [11]:
le = LabelEncoder()
df['US'] = le.fit_transform(df['US'])
df['Urban'] = le.fit_transform(df['Urban'])
df['ShelveLoc'] = le.fit_transform(df['ShelveLoc'])
df.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,0,42,17,1,1
1,11.22,111,48,16,260,83,1,65,10,1,1
2,10.06,113,35,10,269,80,2,59,12,1,1
3,7.4,117,100,4,466,97,2,55,14,1,1
4,4.15,141,64,3,340,128,0,38,13,1,0


In [12]:
heat = py.imshow(df.corr(), text_auto=True, aspect="auto", color_continuous_scale='gnbu')
heat.show()

### Create X value & Y value

In [13]:
x=df.iloc[:, 1:]
x.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,138,73,11,276,120,0,42,17,1,1
1,111,48,16,260,83,1,65,10,1,1
2,113,35,10,269,80,2,59,12,1,1
3,117,100,4,466,97,2,55,14,1,1
4,141,64,3,340,128,0,38,13,1,0


In [14]:
y = df[['Sales']]
y.head()

Unnamed: 0,Sales
0,9.5
1,11.22
2,10.06
3,7.4
4,4.15


In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 10)

### Pipeline Testing

In [16]:
arr = []
li = [.65, .7, .75, .8]

for i in range(20, 401, 20):
    for j in range(1, 11):
        for s in range(len(li)):
            sample = li[s]
            model = RandomForestRegressor(n_estimators=i, max_samples=sample, max_features=j)
            model_final = model.fit(x_train, y_train)
            pred = model_final.predict(x_test)
            mse = mean_squared_error(y_test, pred)
            mape = mean_absolute_percentage_error(y_test, pred)

            obj = {
                'n_estimators': i,
                'max_features': j,
                'mse': mse,
                'mape': mape,
                'sample': sample,
                'model': model_final
            }
            arr.append(obj)

In [17]:
modelList = sorted(arr, key=lambda i: i['mse'])
top_model = modelList[0]
top_model

{'n_estimators': 40,
 'max_features': 9,
 'mse': 1.7995826054687505,
 'mape': 0.14882266643986566,
 'sample': 0.75,
 'model': RandomForestRegressor(max_features=9, max_samples=0.75, n_estimators=40)}

#### Based on Pipeline Used Variable

In [18]:
pred = top_model['model'].predict(x_test)

In [19]:
mean_absolute_percentage_error(y_test, pred)

0.14882266643986566

In [20]:
mean_squared_error(y_test, pred)

1.7995826054687505