In [78]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble  import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_absolute_error,mean_squared_error,mean_absolute_percentage_error
import numpy as np
from pickle import dump


## 1. Loading data

In [2]:
data = pd.read_csv('Datasets/Company_Data.csv')
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


## 2.Data Analysis

In [3]:
data.shape

(400, 11)

In [4]:
data.dtypes

Sales          float64
CompPrice        int64
Income           int64
Advertising      int64
Population       int64
Price            int64
ShelveLoc       object
Age              int64
Education        int64
Urban           object
US              object
dtype: object

In [5]:
data.isna().sum()

Sales          0
CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

## 3.data preprocessing

In [6]:
data = pd.get_dummies(data=data,columns=['Urban','US'])
data['ShelveLoc']=data['ShelveLoc'].map({'Good':1,'Medium':2,'Bad':3})
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban_No,Urban_Yes,US_No,US_Yes
0,9.5,138,73,11,276,120,3,42,17,0,1,0,1
1,11.22,111,48,16,260,83,1,65,10,0,1,0,1
2,10.06,113,35,10,269,80,2,59,12,0,1,0,1
3,7.4,117,100,4,466,97,2,55,14,0,1,0,1
4,4.15,141,64,3,340,128,3,38,13,0,1,1,0


In [7]:
data2 = data.drop(labels='Sales',axis=1)
col = data2.columns
data2.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban_No,Urban_Yes,US_No,US_Yes
0,138,73,11,276,120,3,42,17,0,1,0,1
1,111,48,16,260,83,1,65,10,0,1,0,1
2,113,35,10,269,80,2,59,12,0,1,0,1
3,117,100,4,466,97,2,55,14,0,1,0,1
4,141,64,3,340,128,3,38,13,0,1,1,0


In [10]:
scaler = StandardScaler()
data2 = pd.DataFrame(data=scaler.fit_transform(data2),columns=col)
data2.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban_No,Urban_Yes,US_No,US_Yes
0,0.850455,0.155361,0.657177,0.075819,0.177823,1.446917,-0.699782,1.184449,-0.646869,0.646869,-0.741881,0.741881
1,-0.912484,-0.73906,1.409957,-0.032882,-1.386854,-1.528747,0.721723,-1.490113,-0.646869,0.646869,-0.741881,0.741881
2,-0.781896,-1.204159,0.506621,0.028262,-1.513719,-0.040915,0.350895,-0.725953,-0.646869,0.646869,-0.741881,0.741881
3,-0.52072,1.121336,-0.396715,1.366649,-0.794814,-0.040915,0.103677,0.038208,-0.646869,0.646869,-0.741881,0.741881
4,1.046337,-0.166631,-0.547271,0.510625,0.516132,1.446917,-0.947,-0.343872,-0.646869,0.646869,1.347925,-1.347925


### seperating X and y

In [11]:
X = data2
y = data[['Sales']]

### train test split

In [69]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=34,shuffle=True)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((340, 12), (60, 12), (340, 1), (60, 1))

## 4.Base model creation
### 4.1. Model building and training

In [18]:
base_model = RandomForestRegressor()
base_model.fit(X_train,y_train)

RandomForestRegressor()

### 4.2 Model testing and evaluation

In [19]:
y_pred = base_model.predict(X_test)

In [22]:
mean_absolute_error(y_test,y_pred)

1.1700866666666665

In [23]:
mean_squared_error(y_test,y_pred)

2.2486192593333345

### 5. Hyperparameter tuning

In [31]:
params = {'criterion':["mse", "friedman_mse", "mae", "poisson"],'max_depth':[3,4,5,6,7],
          'n_estimators':[60,80,100,120]
         }
grid = GridSearchCV(estimator=RandomForestRegressor(),param_grid=params,cv=5)
grid.fit(X_train,y_train)

grid.best_params_

{'criterion': 'mse', 'max_depth': 7, 'n_estimators': 60}

## feature engineering

In [66]:
feature_list = list(X_train.columns)
# Get numerical feature importances
importances = list(base_model.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Price                Importance: 0.3
Variable: ShelveLoc            Importance: 0.3
Variable: CompPrice            Importance: 0.12
Variable: Age                  Importance: 0.08
Variable: Advertising          Importance: 0.07
Variable: Income               Importance: 0.05
Variable: Population           Importance: 0.04
Variable: Education            Importance: 0.03
Variable: Urban_No             Importance: 0.0
Variable: Urban_Yes            Importance: 0.0
Variable: US_No                Importance: 0.0
Variable: US_Yes               Importance: 0.0


## 6. final Model bulding with hyperparameter tuning and feature engineering

### 6.1 model building an dtraining

In [72]:
final_model = RandomForestRegressor(n_estimators=60,criterion='mse',max_depth=7)


In [71]:
## selecting only the two important features
important_indices = [feature_list.index('Price'), feature_list.index('ShelveLoc')]
train_data = X_train.iloc[:,important_indices]
test_data = X_test.iloc[:,important_indices]


In [73]:
final_model.fit(train_data,y_train)

RandomForestRegressor(max_depth=7, n_estimators=60)

### model testing and evaluation


In [74]:
prediction = final_model.predict(test_data)

In [75]:
mean_absolute_error(y_test,prediction)

1.5429802966235107

### model deployment

In [79]:
dump(final_model,open('final_model.pkl','wb'))