In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from imblearn.over_sampling import SMOTE
import itertools

In [None]:
!pip install xgboost



In [None]:
import xgboost as xgb

In [None]:
sales_data = pd.read_csv("sales_data_csv.csv")

In [None]:
sales_data.head()

Unnamed: 0.1,Unnamed: 0,DB_ID,SKU,Store,Date,Unit Sales,Dollar Sales
0,0,95610,7312455520,632,19-08-2004,1,22.99
1,1,95611,7312455520,632,20-08-2004,2,45.98
2,2,95612,7312455520,632,21-08-2004,2,51.98
3,3,95613,7312455520,632,27-08-2004,2,45.98
4,4,95614,7312455520,632,28-08-2004,2,51.98


In [None]:
sales_data.drop(columns = ["Unnamed: 0"])
sales_data.head()

Unnamed: 0.1,Unnamed: 0,DB_ID,SKU,Store,Date,Unit Sales,Dollar Sales
0,0,95610,7312455520,632,19-08-2004,1,22.99
1,1,95611,7312455520,632,20-08-2004,2,45.98
2,2,95612,7312455520,632,21-08-2004,2,51.98
3,3,95613,7312455520,632,27-08-2004,2,45.98
4,4,95614,7312455520,632,28-08-2004,2,51.98


In [None]:
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4953 entries, 0 to 4952
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    4953 non-null   int64  
 1   DB_ID         4953 non-null   int64  
 2   SKU           4953 non-null   object 
 3   Store         4953 non-null   int64  
 4   Date          4953 non-null   object 
 5   Unit Sales    4903 non-null   object 
 6   Dollar Sales  4879 non-null   float64
dtypes: float64(1), int64(3), object(3)
memory usage: 271.0+ KB


In [None]:
sales_data.shape

(4953, 7)

In [None]:
sales_data.isna().sum()
sales_data["SKU"].unique()

array(['7312455520', '7312455530', '8000451112', '8000520021',
       '50012011240', '50012011250', '50012011340', '50012011341',
       '50013000110', '50*12011250'], dtype=object)

In [None]:
sales_data.loc[sales_data["Unit Sales"].notnull(), "SaleFlag"] = 1
sales_data.loc[sales_data["Unit Sales"].isnull(), "SaleFlag"] = 0
sales_data["SaleFlag"].unique()

array([1., 0.])

This implies that 50 records exhibit that the product was not sold.

In [None]:
sales_data = sales_data.fillna(0)
sales_data.isna().sum()

Unnamed: 0      0
DB_ID           0
SKU             0
Store           0
Date            0
Unit Sales      0
Dollar Sales    0
SaleFlag        0
dtype: int64

In [None]:
features = [ "SKU" , "Store" ,"Date"  , "Dollar Sales"]
X = sales_data[features]
Y = sales_data["SaleFlag"]
Y.info()
X["SKU"] = pd.to_numeric(X['SKU'], errors='coerce')

<class 'pandas.core.series.Series'>
RangeIndex: 4953 entries, 0 to 4952
Series name: SaleFlag
Non-Null Count  Dtype  
--------------  -----  
4953 non-null   float64
dtypes: float64(1)
memory usage: 38.8 KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["SKU"] = pd.to_numeric(X['SKU'], errors='coerce')


In [None]:
X = X.fillna(0)
X.isna().sum()

SKU             0
Store           0
Date            0
Dollar Sales    0
dtype: int64

In [None]:
def calculateAccuracy(Y_test , Y_pred):
  print(accuracy_score(Y_test , Y_pred))

In [None]:
X['Date'] = pd.to_datetime(X['Date'])

# Create separate columns for year and month
X['Year'] = X['Date'].dt.year
X['Month'] = X['Date'].dt.month
X = X.drop(columns = ["Date"])
# Display the updated DataFrame with new Year and Month columns
print(X)

               SKU  Store  Dollar Sales  Year  Month
0     7.312456e+09    632         22.99  2004      8
1     7.312456e+09    632         45.98  2004      8
2     7.312456e+09    632         51.98  2004      8
3     7.312456e+09    632         45.98  2004      8
4     7.312456e+09    632         51.98  2004      8
...            ...    ...           ...   ...    ...
4948  0.000000e+00    632         19.98  2005     12
4949  0.000000e+00    632          0.00  2005      6
4950  0.000000e+00    632          0.00  2006     12
4951  0.000000e+00    632         19.98  2006      6
4952  0.000000e+00    632         19.98  2006      4

[4953 rows x 5 columns]


  X['Date'] = pd.to_datetime(X['Date'])


In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)
print(X_train)
print(X_test)

               SKU  Store  Dollar Sales  Year  Month
1215  8.000451e+09    632        419.86  2006     10
280   8.000451e+09    632         91.96  2004      8
3558  5.001201e+10    632         31.96  2005      8
224   7.312456e+09    632         45.98  2006      6
1107  8.000451e+09    632        239.92  2005     11
...            ...    ...           ...   ...    ...
2453  8.000520e+09    632          9.99  2005      9
1245  8.000451e+09    632        285.78  2006      1
4943  5.001300e+10    632        363.72  2006      5
1543  8.000451e+09    632       1143.12  2006      5
1971  8.000520e+09    632        181.86  2004     12

[3714 rows x 5 columns]
               SKU  Store  Dollar Sales  Year  Month
439   8.000451e+09    632         77.97  2004      8
3396  5.001201e+10    632         39.95  2005      5
612   8.000451e+09    632        103.92  2005      3
2468  8.000520e+09    632         91.96  2005      5
1753  8.000451e+09    632        636.51  2006     12
...            ...   

In [None]:
clf_rf = RandomForestClassifier(n_estimators=35, random_state=42)
clf_rf.fit(X_train , Y_train)
Y_pred_rf = clf_rf.predict(X_test)
Y_pred_rf = pd.DataFrame(Y_pred_rf)

In [None]:

Y_pred_rf = np.array(Y_pred_rf).reshape(-1, 1)
print("Accuracy using Random Forest is :")
print(calculateAccuracy(Y_test , Y_pred_rf))
classification_rep = classification_report(Y_test, Y_pred_rf)
print(classification_rep)

Accuracy using Random Forest is :
0.9854721549636803
None
              precision    recall  f1-score   support

         0.0       0.47      0.41      0.44        17
         1.0       0.99      0.99      0.99      1222

    accuracy                           0.99      1239
   macro avg       0.73      0.70      0.72      1239
weighted avg       0.98      0.99      0.99      1239



In [None]:
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model.fit(X_train, Y_train)
y_pred_xgb = xgb_model.predict(X_test)
print("Accuracy using XGBoost is :")
calculateAccuracy(Y_test , y_pred_xgb)
classification_rep = classification_report(Y_test, y_pred_xgb)
print(classification_rep)

Accuracy using XGBoost is :
0.9862792574656981
              precision    recall  f1-score   support

         0.0       0.50      0.53      0.51        17
         1.0       0.99      0.99      0.99      1222

    accuracy                           0.99      1239
   macro avg       0.75      0.76      0.75      1239
weighted avg       0.99      0.99      0.99      1239



In [None]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train , Y_train)
y_pred_nb = model.predict(X_test)
print("Accuracy using Naive Bayes is :")
print(accuracy_score(Y_test , y_pred_nb))
classification_rep = classification_report(Y_test, y_pred_xgb)
print(classification_rep)

Accuracy using Naive Bayes is :
0.6844229217110573
              precision    recall  f1-score   support

         0.0       0.50      0.53      0.51        17
         1.0       0.99      0.99      0.99      1222

    accuracy                           0.99      1239
   macro avg       0.75      0.76      0.75      1239
weighted avg       0.99      0.99      0.99      1239

