In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr

# utils...
import json
import requests
import random
import time

import spacy #spacy for quick language prepro
nlp = spacy.load('da_core_news_md') #instantiating Danish module

from sklearn.model_selection import train_test_split

from imblearn.pipeline import Pipeline, make_pipeline
 #pipeline creation
from sklearn.feature_extraction.text import TfidfVectorizer #transforms text to sparse matrix
from sklearn.linear_model import LogisticRegression #Logit model

# For controlling warnings in the code
import warnings
from sklearn.exceptions import ConvergenceWarning

In [2]:
# We will call the below code to ignore all selected warnings in the code for getting more clearn outputs
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
# stream file from remote online
r = requests.get('https://raw.githubusercontent.com/tobiasmj97/M3_Semester_Project/main/convo_snt_chatbot.json')
json_data = json.loads(r.text)

In [4]:
# Reformating data and reformatting into useable data
data_1 = []

for i in json_data['intents']:
  l = len(i['patterns'])
  tuples = list(zip(i['patterns'], l*[i['tag']]))
  data_1.extend(tuples)

In [5]:
# Created a pandas DataFrame from our data
df = pd.DataFrame(data_1, columns=['txt','label'])

In [6]:
def text_prepro(texts):
  """
  takes in a list/iterable of texts
  lowercases, normalizes text
  """

  clean_container = []

  for text in nlp.pipe(texts, disable=["parser", "ner"]):

    txt = [token.lemma_.lower() for token in text # lemmatize and lower
          if token.is_alpha # remove numbers
          and not token.is_punct] # remove punctoation

    clean_container.append(" ".join(txt))

  return clean_container

# Modelling

In [7]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier


In [8]:
# new df.txt_p
df.txt_p = text_prepro(df.txt)

# Divide data between training and test data
X = df.txt_p
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)

### LogisticRegression

In [9]:
# make pipeline for LogisticRegression
textclassifier_mlg = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('mlg', LogisticRegression( multi_class='multinomial', # Set algorithm to multinomial since this is a multi class problem
                                warm_start = True, # To make the model converge faster
                                l1_ratio = 0.5)) # Set to the elastic net regularization
])

In [10]:
# Hyperparameters to tune
params = {'mlg__penalty': ['l1', 'l2', 'elasticnet', 'none' ],
          'mlg__C': [0.01, 0.1, 0.3, 0.5, 1.0,],
          'mlg__solver': ['lbfgs', 'newton-cg', 'sag', 'saga'], # List of compatible solver for multi_class = multinomial
          'mlg__random_state': [42],
         }

In [11]:
# definition of GridSearch for hyperparameter tuning
multinomial_lg_grid = GridSearchCV(estimator=textclassifier_mlg, param_grid=params)

In [12]:
# fit the model
multinomial_lg_grid.fit(X_train, y_train)

In [13]:
# evaluation
print('Train Accuracy : %.3f'%multinomial_lg_grid.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%multinomial_lg_grid.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%multinomial_lg_grid.best_score_)
print('Best Parameters : ',multinomial_lg_grid.best_params_)

Train Accuracy : 1.000
Test Accuracy : 0.529
Best Accuracy Through Grid Search : 0.654
Best Parameters :  {'mlg__C': 0.01, 'mlg__penalty': 'none', 'mlg__random_state': 42, 'mlg__solver': 'lbfgs'}


In [14]:
# defining y_pred
y_pred = multinomial_lg_grid.predict(X_test)

In [15]:
print("confusion_matrix")
print(confusion_matrix(y_test,y_pred))

confusion_matrix
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 

In [16]:
print("classification_report")
print(classification_report(y_test,y_pred))

classification_report
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       0.00      0.00      0.00         0
                                        Annullering af ordre       0.00      0.00      0.00         0
                                          Bestilling i butik       1.00      0.33      0.50         3
                                         Betalingsmuligheder       0.00      0.00      0.00         1
                                     Defekt produkt modtaget       0.00      0.00      0.00         0
                                                     Erhverv       1.00      1.00      1.00         1
                                                    Gavekort       0.00      0.00      0.00         1
                       Hvornår modtager jeg min refundering?       0.00      0.00      0.00         0
Kan jeg få besked, når en udsolgt vare kommer på lager igen

In [17]:
print("accuracy_score")
print(accuracy_score(y_test, y_pred))

accuracy_score
0.5294117647058824


### SVM

In [18]:
# make pipeline for svm
textclassifier_svm = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', svm.SVC(probability = True))
])

In [19]:
# Hyperparameters to tune
params_svm = {
    'svm__C': [1.0, 1.5, 2.0, 3.0],
    'svm__kernel': ['linear', 'rbf', 'poly'],  # Different kernel functions
    'svm__degree': [2, 3, 4],  # Degree of the polynomial kernel (for 'poly' kernel)
    'svm__class_weight': [None, 'balanced'],  # Adjusting class weights
    'svm__decision_function_shape': ['ovr','ovo'],  # One-vs-Rest
    'svm__random_state': [42]
}

In [20]:
# definition of GridSearch for hyperparameter tuning
svm_grid = GridSearchCV(estimator=textclassifier_svm, param_grid=params_svm)

In [21]:
# fit the model
svm_grid.fit(X_train, y_train)

In [22]:
# evaluate
print('Train Accuracy : %.3f'%svm_grid.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%svm_grid.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%svm_grid.best_score_)
print('Best Parameters : ',svm_grid.best_params_)

Train Accuracy : 0.985
Test Accuracy : 0.559
Best Accuracy Through Grid Search : 0.631
Best Parameters :  {'svm__C': 1.0, 'svm__class_weight': 'balanced', 'svm__decision_function_shape': 'ovr', 'svm__degree': 2, 'svm__kernel': 'linear', 'svm__random_state': 42}


In [23]:
# defining y_pred
y_pred = svm_grid.predict(X_test)

In [24]:
print("confusion_matrix")
print(confusion_matrix(y_test,y_pred))

confusion_matrix
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 

In [25]:
print("classification_report")
print(classification_report(y_test,y_pred))

classification_report
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       0.00      0.00      0.00         0
                                        Annullering af ordre       0.00      0.00      0.00         0
                                          Bestilling i butik       1.00      0.67      0.80         3
                                         Betalingsmuligheder       0.00      0.00      0.00         1
                                     Defekt produkt modtaget       0.00      0.00      0.00         0
                                                     Erhverv       0.50      1.00      0.67         1
                                                    Gavekort       0.00      0.00      0.00         1
Kan jeg få besked, når en udsolgt vare kommer på lager igen?       1.00      1.00      1.00         1
                                        Levering i weekende

In [26]:
print("accuracy_score")
print(accuracy_score(y_test, y_pred))

accuracy_score
0.5588235294117647


### Random Forest

In [27]:
# make pipeline for RandomForestClassifier
textclassifier_rn = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

In [79]:
# Hyperparameters to tune for Random Forest
params_rf = {
    'rf__n_estimators': [120, 130, 140, 150],            # Number of trees in the forest
    'rf__min_samples_split': [6, 7, 8, 9, 10],           # Minimum number of samples required to split an internal node
    'rf__max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider for the best split
    'rf__bootstrap': [True, False],                # Whether to bootstrap samples
    'rf__random_state': [42]                       # Random seed for reproducibility
}

In [80]:
# definition of GridSearch for hyperparameter tuning
rf_grid = GridSearchCV(estimator=textclassifier_rn, param_grid=params_rf)

In [81]:
# fit the model
rf_grid.fit(X_train, y_train)

In [82]:
# evaluate
print('Train Accuracy : %.3f'%rf_grid.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%rf_grid.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%rf_grid.best_score_)
print('Best Parameters : ',rf_grid.best_params_)

Train Accuracy : 0.977
Test Accuracy : 0.412
Best Accuracy Through Grid Search : 0.548
Best Parameters :  {'rf__bootstrap': False, 'rf__max_features': 'log2', 'rf__min_samples_split': 9, 'rf__n_estimators': 140, 'rf__random_state': 42}


In [83]:
# defining y_pred
y_pred = rf_grid.predict(X_test)

In [84]:
print("confusion_matrix")
print(confusion_matrix(y_test,y_pred))

confusion_matrix
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 

In [85]:
print("classification_report")
print(classification_report(y_test,y_pred))

classification_report
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       0.00      0.00      0.00         0
                                        Annullering af ordre       0.00      0.00      0.00         0
                                          Bestilling i butik       1.00      0.33      0.50         3
                                         Betalingsmuligheder       0.00      0.00      0.00         1
                                                     Erhverv       0.50      1.00      0.67         1
                                                    Gavekort       0.00      0.00      0.00         1
Kan jeg få besked, når en udsolgt vare kommer på lager igen?       1.00      1.00      1.00         1
                                        Levering i weekenden       1.00      1.00      1.00         1
                                         Leveringsmulighede

In [86]:
print("accuracy_score")
print(accuracy_score(y_test, y_pred))

accuracy_score
0.4117647058823529


### KNN

In [None]:
# make pipeline for KNeighborsClassifier
textclassifier_knn = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier())
])

In [37]:
# Hyperparameters to tune
params = {'knn__n_neighbors': [5,6,7,8,9,10],
          'knn__weights': ['uniform', 'distance'],
          'knn__p': [1, 2, 3]
          }

In [38]:
# definition of GridSearch for hyperparameter tuning
knn_grid = GridSearchCV(estimator=textclassifier_knn, param_grid=params)

In [39]:
# fit the model
knn_grid.fit(X_train, y_train)

In [40]:
# evaluate
print('Train Accuracy : %.3f'%knn_grid.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%knn_grid.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%knn_grid.best_score_)
print('Best Parameters : ',knn_grid.best_params_)

Train Accuracy : 1.000
Test Accuracy : 0.500
Best Accuracy Through Grid Search : 0.511
Best Parameters :  {'knn__n_neighbors': 5, 'knn__p': 2, 'knn__weights': 'distance'}


In [41]:
# defining y_pred
y_pred = knn_grid.predict(X_test)

In [42]:
print("confusion_matrix")
print(confusion_matrix(y_test,y_pred))

confusion_matrix
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 

In [43]:
print("classification_report")
print(classification_report(y_test,y_pred))

classification_report
                                                              precision    recall  f1-score   support

                                        Annullering af ordre       0.00      0.00      0.00         0
                                          Bestilling i butik       1.00      1.00      1.00         3
                                         Betaling med Klarna       0.00      0.00      0.00         0
                                         Betalingsmuligheder       0.00      0.00      0.00         1
                                                     Erhverv       1.00      1.00      1.00         1
                                                    Gavekort       0.00      0.00      0.00         1
Kan jeg få besked, når en udsolgt vare kommer på lager igen?       1.00      1.00      1.00         1
                                        Levering i weekenden       1.00      1.00      1.00         1
                                         Leveringsmulighede

In [44]:
print("accuracy_score")
print(accuracy_score(y_test, y_pred))

accuracy_score
0.5


### Multinomial Naive Bayes

In [45]:
# make pipeline for MultinomialNB
textclassifier_mnb = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('mnb', MultinomialNB())
])

In [46]:
# Hyperparameters to tune
params = {'mnb__alpha': [0.01, 0.1, 0.3, 0.5, 1.0]}

In [47]:
# definition of GridSearch for hyperparameter tuning
multinomial_nb_grid = GridSearchCV(estimator=textclassifier_mnb, param_grid=params)

In [48]:
# fit the model
multinomial_nb_grid.fit(X_train, y_train)

In [49]:
# evaluate
print('Train Accuracy : %.3f'%multinomial_nb_grid.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%multinomial_nb_grid.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%multinomial_nb_grid.best_score_)
print('Best Parameters : ',multinomial_nb_grid.best_params_)

Train Accuracy : 0.992
Test Accuracy : 0.559
Best Accuracy Through Grid Search : 0.571
Best Parameters :  {'mnb__alpha': 0.01}


In [50]:
# defining y_pred
y_pred = multinomial_nb_grid.predict(X_test)

In [51]:
print("confusion_matrix")
print(confusion_matrix(y_test,y_pred))

confusion_matrix
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 

In [52]:
print("classification_report")
print(classification_report(y_test,y_pred))

classification_report
                                                              precision    recall  f1-score   support

                                        Annullering af ordre       0.00      0.00      0.00         0
                                          Bestilling i butik       1.00      1.00      1.00         3
                                         Betaling med Klarna       0.00      0.00      0.00         0
                                         Betalingsmuligheder       0.00      0.00      0.00         1
                                                     Erhverv       0.50      1.00      0.67         1
                                    Forkert produkt modtaget       0.00      0.00      0.00         0
                                                    Gavekort       0.00      0.00      0.00         1
Kan jeg få besked, når en udsolgt vare kommer på lager igen?       1.00      1.00      1.00         1
                                        Levering i weekende

In [53]:
print("accuracy_score")
print(accuracy_score(y_test, y_pred))

accuracy_score
0.5588235294117647


### Decision Tree

In [54]:
# make pipeline for DecisionTreeClassifier
textclassifier_dt = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('dt', DecisionTreeClassifier())
])

In [73]:
# Hyperparameters to tune
params_dt = {
    'dt__min_samples_split': [1, 2, 3],
    'dt__splitter': ['best', 'random'], # 'best' chooses the best split, 'random' chooses the best random split
    'dt__min_samples_leaf': [1, 2, 3],
    'dt__random_state': [42]
}

In [74]:
# definition of GridSearch for hyperparameter tuning
dt_grid = GridSearchCV(estimator=textclassifier_dt, param_grid=params_dt)

In [77]:
# fit the model
dt_grid.fit(X_train, y_train)

In [78]:
# evaluate
print('Train Accuracy : %.3f'%dt_grid.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%dt_grid.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%dt_grid.best_score_)
print('Best Parameters : ',dt_grid.best_params_)

Train Accuracy : 1.000
Test Accuracy : 0.353
Best Accuracy Through Grid Search : 0.489
Best Parameters :  {'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2, 'dt__random_state': 42, 'dt__splitter': 'random'}


In [59]:
# defining y_pred
y_pred = dt_grid.predict(X_test)

In [60]:
print("confusion_matrix")
print(confusion_matrix(y_test,y_pred))

confusion_matrix
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 

In [61]:
print("classification_report")
print(classification_report(y_test,y_pred))

classification_report
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       0.00      0.00      0.00         0
                                          Bestilling i butik       0.00      0.00      0.00         3
                                         Betalingsmuligheder       0.00      0.00      0.00         1
                                                     Erhverv       1.00      1.00      1.00         1
                                                    Gavekort       0.00      0.00      0.00         1
Kan jeg få besked, når en udsolgt vare kommer på lager igen?       0.00      0.00      0.00         1
                                        Levering i weekenden       1.00      1.00      1.00         1
                                         Leveringsmuligheder       0.00      0.00      0.00         1
                           Leveringsmuligheder - DHL Expres

In [62]:
print("accuracy_score")
print(accuracy_score(y_test, y_pred))

accuracy_score
0.29411764705882354


### Gradient boosting Classifier

In [63]:
# make pipeline for KNeighborsClassifier
textclassifier_gbc = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('gbc', GradientBoostingClassifier())
])

In [64]:
# Hyperparameters to tune for Gradient Boosting Classifier
params_gbc = {
    'gbc__n_estimators': [50, 100, 150],
    'gbc__learning_rate': [0.01, 0.1, 0.2],
    'gbc__max_depth': [3, 5, 7],
    'gbc__min_samples_split': [2, 5, 10],
    'gbc__min_samples_leaf': [1, 2, 4],
    'gbc__subsample': [0.8, 0.9, 1.0],  # Fraction of samples used for fitting the trees
    'gbc__max_features': ['auto', 'sqrt', 'log2', None],
    'gbc__random_state': [42]
}

In [65]:
# definition of GridSearch for hyperparameter tuning
gbc_grid = GridSearchCV(estimator=textclassifier_gbc, param_grid=params_gbc)

In [66]:
# fit the model
gbc_grid.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# evaluate
print('Train Accuracy : %.3f'%gbc_grid.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%gbc_grid.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%gbc_grid.best_score_)
print('Best Parameters : ',gbc_grid.best_params_)

In [None]:
# defining y_pred
y_pred = gbc_grid.predict(X_test)

In [None]:
print("confusion_matrix")
print(confusion_matrix(y_test,y_pred))

In [None]:
print("classification_report")
print(classification_report(y_test,y_pred))

In [None]:
print("accuracy_score")
print(accuracy_score(y_test, y_pred))