In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
import seaborn as sns

In [2]:
df = pd.read_csv('datasets/Historical_Product_Demand.csv')
df.head()

Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand
0,Product_0993,Whse_J,Category_028,2012/7/27,100
1,Product_0979,Whse_J,Category_028,2012/1/19,500
2,Product_0979,Whse_J,Category_028,2012/2/3,500
3,Product_0979,Whse_J,Category_028,2012/2/9,500
4,Product_0979,Whse_J,Category_028,2012/3/2,500


In [3]:
df["Date"].value_counts()["2012/1/19"]

804

In [4]:
df.shape

(1048575, 5)

In [5]:
df.dropna(inplace=True)
df.shape

(1037336, 5)

In [6]:
df.drop(['Date'],axis=1,inplace=True)

In [7]:
df.head()

Unnamed: 0,Product_Code,Warehouse,Product_Category,Order_Demand
0,Product_0993,Whse_J,Category_028,100
1,Product_0979,Whse_J,Category_028,500
2,Product_0979,Whse_J,Category_028,500
3,Product_0979,Whse_J,Category_028,500
4,Product_0979,Whse_J,Category_028,500


In [8]:
#Check if Product_Category and Product_Code are correlated
CrosstabResults = pd.crosstab(index=df['Product_Category'],columns=df['Product_Code'])
#print(CrosstabResults)
from scipy.stats import chi2_contingency
ChiSqResults = chi2_contingency(CrosstabResults)
print("The p-value of the ChiSq Test is:",ChiSqResults[1])

The p-value of the ChiSq Test is: 0.0


In [9]:
df['Order_Demand'].value_counts()

1000       112263
100         85738
1           63333
2000        51525
500         48573
            ...  
244000          1
45800           1
(9400)          1
4110            1
758000          1
Name: Order_Demand, Length: 3749, dtype: int64

In [10]:
df.drop(df[df.Order_Demand.str.contains(r"\(.*\)")].index,inplace=True)
df['Order_Demand'].value_counts()

1000       112263
100         85738
1           63333
2000        51525
500         48573
            ...  
345200          1
99250           1
12648           1
861             1
297000          1
Name: Order_Demand, Length: 3283, dtype: int64

In [11]:
df.shape

(1031437, 4)

In [12]:
df["Product_Code"] = LabelEncoder().fit_transform(df["Product_Code"])
df["Warehouse"] = LabelEncoder().fit_transform(df["Warehouse"])
df["Product_Category"] = LabelEncoder().fit_transform(df["Product_Category"])

In [21]:
X = df[["Product_Code","Warehouse"]].values
X_scaled = StandardScaler().fit_transform(X)
y = StandardScaler().fit_transform(df["Order_Demand"].values.reshape(-1,1))
#y = df["Order_Demand"].values

In [22]:
X_scaled.shape

(1031437, 2)

In [23]:
#Split data into training, test, and validation sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.5, random_state=0)
X_validate, X_test, y_validate, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=0)

In [24]:
model_svr = SVR(kernel='rbf',C=1.0,gamma='auto',epsilon=0.1,max_iter=1000)
model_svr.fit(X_train, y_train.ravel())



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
    kernel='rbf', max_iter=1000, shrinking=True, tol=0.001, verbose=False)

In [25]:
score = model_svr.score(X_test,y_test)
print("R2 of SVR is:",round(score,2))

R2 of SVR is: -12.19


In [26]:
max_depth = range(1,5)
scores = []
results = 0
best_score = 0

for md in max_depth:
    
    print(md)

    model_tree = DecisionTreeRegressor(criterion='mae',max_depth=md,random_state=0)
    model_tree.fit(X_train, y_train)
    results = model_tree.score(X_test,y_test)
    scores.append(round(results,2))
    
    if results > best_score:
        best_score = results
        best_md = md
        best_model = model_tree
        
print("R2:",scores)
print("Optimal tree depth:",best_md)

1
2
3
4
R2: [-0.02, -0.01, -0.01, 0.0]
Optimal tree depth: 4
