In [1]:
# Import the packages
import pandas as pd 
pd.set_option('display.max_columns', 500)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
import warnings; warnings.simplefilter('ignore')
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, StandardScaler
from sklearn import svm

In [2]:
# Read the gzip-compressed parquet dataset from the working directory.
df = pd.read_parquet(path="dataset_dsba.parquet.gzip")
# Show five randomly chosen rows as a quick sanity check of the schema.
df.sample(n=5)

Unnamed: 0,product,location,date,volume,type,brand,is_holiday
1041019,490,24,2020-02-10,1.629843,18,14,0.0
487211,71,1,2016-06-13,0.152582,70,5,0.0
79185,168,3,2017-07-24,2.694495,42,4,0.0
7652,4,1,2017-10-02,0.032763,70,0,0.0
649830,241,27,2018-01-29,0.0,49,12,0.0


In [3]:
# Target: the "volume" column cast to integers so it can be used as a class label.
y = df["volume"].astype('int')
# Features: everything except the target itself and the raw "date" column.
df = df.drop(columns=["volume", "date"])

In [4]:
# One-hot encode every column whose dtype is object; other columns pass
# through unchanged.
df = pd.get_dummies(
    data=df,
    columns=df.select_dtypes(include="object").columns,
)
df

Unnamed: 0,product,location,type,brand,is_holiday
0,258,18,5,3,0.0
1,258,18,5,3,0.0
2,258,18,5,3,0.0
3,258,18,5,3,0.0
4,258,18,5,3,0.0
...,...,...,...,...,...
1090184,346,29,37,4,0.0
1090185,346,29,37,4,0.0
1090186,347,20,58,14,0.0
1090187,347,20,58,14,0.0


In [5]:
# Build two alternative scalings of the same feature frame so the models can
# be compared under each one.
# NOTE(review): both scalers are fitted on the full frame, i.e. before the
# train/test split performed in the next cell -- confirm this is intended.
SScaler = StandardScaler()
MMScaler = MinMaxScaler()

# Standardised features.
ssdf = SScaler.fit_transform(X=df)
# Min-max scaled features.
mmdf = MMScaler.fit_transform(X=df)

In [6]:
# Split each scaled feature matrix with the same test fraction and seed, so
# the MinMax and Standard variants are partitioned identically.
# The first result must be bound to `X_train_mm`: the evaluation cell below
# passes that name to `models`, and a fresh kernel would otherwise raise
# NameError.
X_train_mm, X_test_mm, y_train_mm, y_test_mm = train_test_split(
    mmdf, y, test_size=0.33, random_state=42
)
X_train_ss, X_test_ss, y_train_ss, y_test_ss = train_test_split(
    ssdf, y, test_size=0.33, random_state=42
)

In [7]:
def models(X_train, X_test, y_train, y_test):
    """Fit several classifiers and score each one on the test split.

    The candidates are, in order: an SVC for each of the 'linear', 'poly',
    'rbf' and 'sigmoid' kernels, a LogisticRegression, and a
    LinearDiscriminantAnalysis, all with default hyper-parameters.

    Parameters
    ----------
    X_train, X_test : feature matrices used for fitting and for prediction.
    y_train, y_test : labels matching ``X_train`` and ``X_test``.

    Returns
    -------
    ROC : list
        ``roc_auc_score(y_test, predictions)`` for each candidate.
    model : list of str
        Candidate names ('linear', 'poly', 'rbf', 'sigmoid', 'LR', 'LDA'),
        in the same order as ``ROC``.
    """
    # 'precomputed' is left out: that kernel expects a kernel matrix, not
    # raw feature rows.
    kernels = ['linear', 'poly', 'rbf', 'sigmoid']

    # (display name, unfitted estimator) pairs, in reporting order.
    candidates = [(k, svm.SVC(kernel=k)) for k in kernels]
    candidates.append(("LR", LogisticRegression()))
    candidates.append(("LDA", LDA()))

    ROC = []
    model = []
    for name, estimator in candidates:
        estimator.fit(X_train, y_train)
        results = estimator.predict(X_test)
        # NOTE(review): the score is computed from hard class predictions.
        # With a 1-D prediction vector roc_auc_score only handles binary
        # targets -- confirm y has exactly two classes.
        ROC.append(roc_auc_score(y_test, results))
        model.append(name)
    return ROC, model

In [None]:
# Score every candidate model on the standard-scaled split ...
roc_ss, model_ss = models(
    X_train=X_train_ss,
    X_test=X_test_ss,
    y_train=y_train_ss,
    y_test=y_test_ss,
)
# ... and again on the min-max-scaled split.
roc_mm, model_mm = models(
    X_train=X_train_mm,
    X_test=X_test_mm,
    y_train=y_train_mm,
    y_test=y_test_mm,
)

In [None]:
# One line per scaling: model names on the x-axis, their scores on the y-axis.
plt.plot(model_ss, roc_ss, label="Standard Scaler")
plt.plot(model_mm, roc_mm, label="MinMax Scaler")
plt.xlabel("model name")
plt.ylabel("ROC score")
# The legend entries come from the `label` given to each line above.
plt.legend()
plt.show()