In [80]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder ,StandardScaler
from sklearn.model_selection import train_test_split,KFold,cross_val_score,GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,IsolationForest
import warnings
warnings.filterwarnings('ignore')
from numpy import mean,std,array,min
from sklearn.metrics import mean_absolute_error,mean_squared_error,mean_absolute_percentage_error
from sklearn.feature_selection import RFE
from pickle import dump

## 1.Load data

In [81]:
data = pd.read_csv('Datasets/Company_Data.csv')
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


## 2.data Analysis

In [3]:
data.shape

(400, 11)

In [4]:
data.dtypes

Sales          float64
CompPrice        int64
Income           int64
Advertising      int64
Population       int64
Price            int64
ShelveLoc       object
Age              int64
Education        int64
Urban           object
US              object
dtype: object

In [5]:
data.isna().sum()

Sales          0
CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

## 3.Data preprocessing

In [8]:
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [7]:

data.ShelveLoc.unique()

array(['Bad', 'Good', 'Medium'], dtype=object)

In [8]:
data.Urban.unique()

array(['Yes', 'No'], dtype=object)

In [9]:
data.US.unique()

array(['Yes', 'No'], dtype=object)

In [82]:
data = pd.get_dummies(data=data,columns=['Urban','US'])
data['ShelveLoc']=data['ShelveLoc'].map({'Good':1,'Medium':2,'Bad':3})
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban_No,Urban_Yes,US_No,US_Yes
0,9.5,138,73,11,276,120,3,42,17,0,1,0,1
1,11.22,111,48,16,260,83,1,65,10,0,1,0,1
2,10.06,113,35,10,269,80,2,59,12,0,1,0,1
3,7.4,117,100,4,466,97,2,55,14,0,1,0,1
4,4.15,141,64,3,340,128,3,38,13,0,1,1,0


In [83]:
data2 = data.drop(labels='Sales',axis=1)
col = data2.columns
data2.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban_No,Urban_Yes,US_No,US_Yes
0,138,73,11,276,120,3,42,17,0,1,0,1
1,111,48,16,260,83,1,65,10,0,1,0,1
2,113,35,10,269,80,2,59,12,0,1,0,1
3,117,100,4,466,97,2,55,14,0,1,0,1
4,141,64,3,340,128,3,38,13,0,1,1,0


In [84]:
scaler = StandardScaler()
data2 = pd.DataFrame(data=scaler.fit_transform(data2),columns=col)
data2.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban_No,Urban_Yes,US_No,US_Yes
0,0.850455,0.155361,0.657177,0.075819,0.177823,1.446917,-0.699782,1.184449,-0.646869,0.646869,-0.741881,0.741881
1,-0.912484,-0.73906,1.409957,-0.032882,-1.386854,-1.528747,0.721723,-1.490113,-0.646869,0.646869,-0.741881,0.741881
2,-0.781896,-1.204159,0.506621,0.028262,-1.513719,-0.040915,0.350895,-0.725953,-0.646869,0.646869,-0.741881,0.741881
3,-0.52072,1.121336,-0.396715,1.366649,-0.794814,-0.040915,0.103677,0.038208,-0.646869,0.646869,-0.741881,0.741881
4,1.046337,-0.166631,-0.547271,0.510625,0.516132,1.446917,-0.947,-0.343872,-0.646869,0.646869,1.347925,-1.347925


In [12]:
data2.isna().sum()

CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban_No       0
Urban_Yes      0
US_No          0
US_Yes         0
dtype: int64

In [85]:
X = data2
y = data[['Sales']]

In [86]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=34,shuffle=True)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((340, 12), (60, 12), (340, 1), (60, 1))

## 4.Model Crossvalidation

In [87]:
cv = KFold(n_splits=5,shuffle=True,random_state=23)

cv_scores = cross_val_score(estimator=DecisionTreeRegressor(),X=X_train,y=y_train,scoring='neg_mean_squared_error',cv=cv)
print(cv_scores)
print('minimum r2 quare :',mean(cv_scores))
print('with a std of : ',std(cv_scores))


[-4.66971618 -5.18384559 -4.70356029 -6.21232794 -4.89425735]
minimum r2 quare : -5.132741470588234
with a std of :  0.569838952321962


## 5.Base model bulding and training

In [88]:
base_model = DecisionTreeRegressor()
base_model.fit(X_train,y_train)

DecisionTreeRegressor()

## 6.Base model testing and evaluation

In [89]:
y_pred = base_model.predict(X_test)

In [90]:
base_model.score(X_train,y_train)

1.0

In [91]:
mean_absolute_error(y_test,y_pred)

1.7016666666666667

In [92]:
mean_squared_error(y_test,y_pred)

4.30622

In [93]:
mean_absolute_percentage_error(y_test,y_pred)

0.29585410990613187

## Hyper parameter tuning

In [94]:
params = {'criterion':["mse", "friedman_mse", "mae", "poisson"],'max_depth':[3,4,5,6,7]}
grid = GridSearchCV(estimator=DecisionTreeRegressor(),param_grid=params,cv=cv)
grid.fit(X_train,y_train)

grid.best_params_

{'criterion': 'mse', 'max_depth': 6}

## feature engeneering

In [95]:
def get_model(max_depth,features):    
    if type(features) == type(12):
        feture_selection = RFE(estimator=DecisionTreeRegressor(),n_features_to_select=features,)
    else :
        feture_selection = RFE(estimator=DecisionTreeRegressor(),no_of_features=None,)
    feture_selection.fit(X_train,y_train)
    X_train_slected = feture_selection.transform(X_train)    
    X_test_selected = feture_selection.transform(X_test)
    model = DecisionTreeRegressor(criterion='friedman_mse',max_depth=max_depth,random_state=34)
    model.fit(X_train_slected,y_train)
    y_pred = model.predict(X_test_selected)
    return mean_absolute_error(y_test,y_pred),features


### getting the best model

In [96]:
mae_ls = []
features_ls = []
for i in range(1,12):
    #print('{} features and max depth 7 '.format(i))
    mae,features = get_model(7,i)
    features_ls.append(features)
    mae_ls.append(mae)


print('model with {} features and mae {} and max_depth 7 ,is the best model till now'.format(features_ls[mae_ls.index(min(mae_ls))],min(mae_ls)))
features_ls[mae_ls.index(min(mae_ls))],min(mae_ls)

model with 4 features and mae 1.3310737638722934 and max_depth 7 ,is the best model till now


(4, 1.3310737638722934)

## Final model Building and training

In [97]:
final_model = DecisionTreeRegressor(criterion= 'friedman_mse', max_depth= 7,random_state=23)
feature_selection = RFE(estimator=final_model,n_features_to_select=4)
feature_selection.fit(X_train,y_train)
X_train_selected = feature_selection.transform(X_train)
X_test_selected = feature_selection.transform(X_test)

final_model.fit(X_train_selected,y_train)



DecisionTreeRegressor(criterion='friedman_mse', max_depth=7, random_state=23)

## model tetsing and evalation

In [98]:
y_pred2 = final_model.predict(X_test_selected)

In [99]:
mean_absolute_error(y_test,y_pred2)

1.3453237638722935

In [100]:
mean_absolute_error(y_test,y_pred2)

1.3453237638722935

In [101]:
final_model.score(X_train_selected,y_train)

0.8764998554917978

## outliers detection using IsolationForest

In [102]:
is_forest = IsolationForest(contamination=0.1)
yhat = is_forest.fit_predict(data2)

indi =[]
for i in range(len(yhat)):
    if yhat[i] == -1:
        indi.append(i)
        
        
data2['Sales'] = data['Sales']
        
data2['OUtlier'] = yhat
data2[data2['OUtlier']== -1]
data2.drop(data2[data2['OUtlier'] == -1].index, inplace = True)

X = data2.drop(labels=('Sales'),axis=1)
y = data2[['Sales']]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=23,shuffle=True)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((306, 13), (54, 13), (306, 1), (54, 1))

## model deployment

In [159]:
dump(final_model,open('sale_pred.pkl','wb'))