In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import plot_tree
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold,cross_val_score
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier,BaggingClassifier

In [2]:
# Load the raw dataset from a CSV file in the working directory.
DATA_PATH = 'Company_Data.csv'
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.40,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,Good,33,14,Yes,Yes
396,6.14,139,23,3,37,120,Medium,55,11,No,Yes
397,7.41,162,26,12,368,159,Medium,40,18,Yes,Yes
398,5.94,100,79,7,284,95,Bad,50,12,Yes,Yes


# EDA

In [3]:
# Cast Sales to integers; the fractional part is discarded (e.g. 9.50 becomes 9).
sales_as_int = df['Sales'].astype('int')
df['Sales'] = sales_as_int

In [4]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Sales        400 non-null    int32 
 1   CompPrice    400 non-null    int64 
 2   Income       400 non-null    int64 
 3   Advertising  400 non-null    int64 
 4   Population   400 non-null    int64 
 5   Price        400 non-null    int64 
 6   ShelveLoc    400 non-null    object
 7   Age          400 non-null    int64 
 8   Education    400 non-null    int64 
 9   Urban        400 non-null    object
 10  US           400 non-null    object
dtypes: int32(1), int64(7), object(3)
memory usage: 32.9+ KB


In [5]:
# Count missing values per column.
df.isnull().sum()

Sales          0
CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

In [6]:
# (rows, columns) of the frame.
df.shape

(400, 11)

In [7]:
# Replace the shelf-location labels with integer codes: Good=1, Medium=2, Bad=3.
# NOTE(review): Series.map produces NaN for any label missing from the dict,
# so running this cell a second time on the already-coded column blanks it out.
shelf_codes = {'Good': 1, 'Medium': 2, 'Bad': 3}
df['ShelveLoc'] = df['ShelveLoc'].map(shelf_codes)
df

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9,138,73,11,276,120,3,42,17,Yes,Yes
1,11,111,48,16,260,83,1,65,10,Yes,Yes
2,10,113,35,10,269,80,2,59,12,Yes,Yes
3,7,117,100,4,466,97,2,55,14,Yes,Yes
4,4,141,64,3,340,128,3,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,12,138,108,17,203,128,1,33,14,Yes,Yes
396,6,139,23,3,37,120,2,55,11,No,Yes
397,7,162,26,12,368,159,2,40,18,Yes,Yes
398,5,100,79,7,284,95,3,50,12,Yes,Yes


In [8]:
# Snapshot the column labels as a plain Python list.
colnames = df.columns.tolist()
colnames

['Sales',
 'CompPrice',
 'Income',
 'Advertising',
 'Population',
 'Price',
 'ShelveLoc',
 'Age',
 'Education',
 'Urban',
 'US']

In [9]:
# Frequency of each shelf-location code.
df['ShelveLoc'].value_counts()

2    219
3     96
1     85
Name: ShelveLoc, dtype: int64

In [10]:
# The first column is the target; every remaining column is a predictor.
target, *predictors = colnames

In [11]:
# Show which column is the target and which columns are predictors.
print('Target:',target,'\nPredictors:',predictors)

Target: Sales 
Predictors: ['CompPrice', 'Income', 'Advertising', 'Population', 'Price', 'ShelveLoc', 'Age', 'Education', 'Urban', 'US']


In [12]:
# Split the frame into predictor columns and the target column.
# NOTE(review): the upper-case X / Y are not referenced again in the visible
# cells; the modelling cells build their own lower-case x / y.
X, Y = df[predictors], df[target]


In [16]:
def filte(x, threshold=None):
    """Label a single sales figure as 'Low Sale' or 'High Sale'.

    Parameters
    ----------
    x : number
        Sales value to classify.
    threshold : number, optional
        Cut-off to compare against. When omitted, the mean of ``df.Sales``
        is computed on the spot, which keeps ``Series.apply(filte)`` working
        but recomputes the mean for every row. Pass a precomputed value to
        avoid that cost and the dependency on the global frame.

    Returns
    -------
    str
        'Low Sale' when ``x`` is less than or equal to the cut-off,
        otherwise 'High Sale'.
    """
    if threshold is None:
        # Fallback for single-argument callers; needs df to still hold 'Sales'.
        threshold = np.mean(df.Sales)
    return 'Low Sale' if x <= threshold else 'High Sale'

In [17]:
# Derive the binary target from Sales, then remove Sales so it cannot leak into
# the features. The guard makes the cell safe to re-run once Sales is gone.
if 'Sales' in df.columns:
    # Compute the cut-off once instead of once per row.
    sales_mean = df['Sales'].mean()
    # Values at or below the mean are 'Low Sale'; everything else is 'High Sale'.
    df['sales_cat'] = np.where(df['Sales'] <= sales_mean, 'Low Sale', 'High Sale')
    df = df.drop(columns='Sales')
df.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,sales_cat
0,138,73,11,276,120,3,42,17,Yes,Yes,High Sale
1,111,48,16,260,83,1,65,10,Yes,Yes,High Sale
2,113,35,10,269,80,2,59,12,Yes,Yes,High Sale
3,117,100,4,466,97,2,55,14,Yes,Yes,High Sale
4,141,64,3,340,128,3,38,13,Yes,No,Low Sale


In [18]:
from sklearn import preprocessing

# Integer-encode the categorical columns and the target, one column at a time.
# The same encoder object is refit for each column, so after this cell it only
# remembers the classes of the last one ('sales_cat').
# In the displayed result 'High Sale' is encoded as 0 and 'Low Sale' as 1.
label_encoder = preprocessing.LabelEncoder()
for column in ('ShelveLoc', 'Urban', 'US', 'sales_cat'):
    df[column] = label_encoder.fit_transform(df[column])
df.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,sales_cat
0,138,73,11,276,120,2,42,17,1,1,0
1,111,48,16,260,83,0,65,10,1,1,0
2,113,35,10,269,80,1,59,12,1,1,0
3,117,100,4,466,97,1,55,14,1,1,0
4,141,64,3,340,128,2,38,13,1,0,1


In [31]:
# Class balance of the encoded target. Read it straight from df because `y`
# is only assigned in a later cell, so `y.value_counts()` fails on a fresh kernel.
df['sales_cat'].value_counts()

0    216
1    184
Name: sales_cat, dtype: int64

# Model Building

In [22]:
# Features are every column except the target. Selecting by name instead of
# position keeps the target out of x even if the column order changes.
x = df.drop(columns='sales_cat')
y = df['sales_cat']

In [39]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)


In [44]:
# Baseline random forest.
# max_features='sqrt' is what the old 'auto' alias meant for classifiers;
# 'auto' has been removed from scikit-learn and now raises an error.
# random_state fixes the bootstrap/feature sampling so the report is repeatable.
model = RandomForestClassifier(
    n_estimators=10,
    max_depth=20,
    max_features='sqrt',
    random_state=1,
)
model.fit(xtrain, ytrain)

# Evaluate on the held-out rows.
ypred = model.predict(xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.81      0.81      0.81        43
           1       0.78      0.78      0.78        37

    accuracy                           0.80        80
   macro avg       0.80      0.80      0.80        80
weighted avg       0.80      0.80      0.80        80



In [45]:
# Helper for fitting and evaluating several models on the same split

def predict(model):
    """Fit ``model`` on the training split and print its evaluation.

    Prints, in order: the classification report for the test split, the
    score on the training split, and the score on the test split.
    Reads xtrain, ytrain, xtest and ytest from the notebook's globals and
    returns nothing.
    """
    fitted = model.fit(xtrain, ytrain)
    test_predictions = fitted.predict(xtest)

    print(classification_report(ytest, test_predictions))
    # Training score first, then test score, to expose any train/test gap.
    for features, labels in ((xtrain, ytrain), (xtest, ytest)):
        print(fitted.score(features, labels))

In [46]:
# Check multiple models using the predict() function, starting with a default decision tree

predict(DecisionTreeClassifier())

              precision    recall  f1-score   support

           0       0.79      0.72      0.76        43
           1       0.71      0.78      0.74        37

    accuracy                           0.75        80
   macro avg       0.75      0.75      0.75        80
weighted avg       0.75      0.75      0.75        80

1.0
0.75


In [49]:
# XGBoost classifier with default settings.
predict(XGBClassifier())

              precision    recall  f1-score   support

           0       0.82      0.74      0.78        43
           1       0.73      0.81      0.77        37

    accuracy                           0.78        80
   macro avg       0.78      0.78      0.77        80
weighted avg       0.78      0.78      0.78        80

1.0
0.775


In [52]:
# AdaBoost classifier with default settings.
predict(AdaBoostClassifier())

              precision    recall  f1-score   support

           0       0.88      0.81      0.84        43
           1       0.80      0.86      0.83        37

    accuracy                           0.84        80
   macro avg       0.84      0.84      0.84        80
weighted avg       0.84      0.84      0.84        80

0.95625
0.8375


In [54]:
# Gradient boosting classifier with default settings.
predict(GradientBoostingClassifier())

              precision    recall  f1-score   support

           0       0.85      0.81      0.83        43
           1       0.79      0.84      0.82        37

    accuracy                           0.82        80
   macro avg       0.82      0.83      0.82        80
weighted avg       0.83      0.82      0.83        80

0.99375
0.825


In [56]:
# 10-fold cross-validation of gradient boosting on the full dataset.
# NOTE(review): KFold is used without shuffling, so folds follow the row order.

kf = KFold(n_splits=10)
score = cross_val_score(
    estimator=GradientBoostingClassifier(),
    X=x,
    y=y,
    cv=kf,
)
score.mean()

0.835