In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr

# utils...
import json
import requests
import random
import time

import spacy #spacy for quick language prepro
nlp = spacy.load('da_core_news_md') #instantiating Danish module

from sklearn.model_selection import train_test_split

from imblearn.pipeline import Pipeline, make_pipeline
 #pipeline creation
from sklearn.feature_extraction.text import TfidfVectorizer #transforms text to sparse matrix
from sklearn.linear_model import LogisticRegression #Logit model

# For controlling warnings in the code
import warnings
from sklearn.exceptions import ConvergenceWarning

In [2]:
# We will call the below code to ignore all selected warnings in the code for getting more clearn outputs
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
# stream file from remote online
r = requests.get('https://raw.githubusercontent.com/tobiasmj97/M3_Semester_Project/main/convo_snt_chatbot.json')
json_data = json.loads(r.text)

In [4]:
# Reformating data and reformatting into useable data
data_1 = []

for i in json_data['intents']:
  l = len(i['patterns'])
  tuples = list(zip(i['patterns'], l*[i['tag']]))
  data_1.extend(tuples)

In [5]:
# Created a pandas DataFrame from our data
df = pd.DataFrame(data_1, columns=['txt','label'])

In [6]:
df.head(5)

Unnamed: 0,txt,label
0,Hej,hilsen
1,Halløj,hilsen
2,Goddag,hilsen
3,God dag,hilsen
4,Går det godt?,svar


In [7]:
filepath = 'revised_synthetic_data_copy.json'
revised_data = json.load(open(filepath,'r'))

In [8]:
# Reformating data and reformatting into useable data
data_revised = []

for i in revised_data['intents']:
  l = len(i['patterns'])
  tuples = list(zip(i['patterns'], l*[i['tag']]))
  data_revised.extend(tuples)

In [9]:
df_revised = pd.DataFrame(data_revised, columns=['txt','label'])

In [10]:
df_revised.head(5)

Unnamed: 0,txt,label
0,Hejsa,hilsen
1,Godmorgen,hilsen
2,Hej der,hilsen
3,Goddag til dig,hilsen
4,Alt vel?,svar


### Merge Dataset

In [11]:
# Concatenate along the rows (axis=0)
merged_df = pd.concat([df, df_revised], axis=0, ignore_index=True)

In [12]:
merged_df.shape

(265, 2)

In [13]:
merged_df['label'][170]

'hilsen'

# Text Preprocessing

In [14]:
def text_prepro(texts):
  """
  takes in a list/iterable of texts
  lowercases, normalizes text
  """

  clean_container = []

  for text in nlp.pipe(texts, disable=["parser", "ner"]):

    txt = [token.lemma_.lower() for token in text # lemmatize and lower
          if token.is_alpha # remove numbers
          and not token.is_punct] # remove punctoation

    clean_container.append(" ".join(txt))

  return clean_container

In [15]:
nlp(df['txt'][15]).vector.shape

(300,)

In [16]:
df['txt'][15]

'Hvordan bestiller jeg produkter fra butikken?'

# Modelling

In [108]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [18]:
# new df.txt_p
df.txt_p = text_prepro(df.txt)
df_revised.txt_p = text_prepro(df_revised.txt)


# Divide data between training and test data
X = df.txt_p
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)

#### Merged data split

In [19]:
merged_df.txt_p = text_prepro(merged_df.txt)
# Divide merged data between training and test data
X_merged = merged_df.txt_p
y_merged = merged_df['label']
X_train_merged, X_test_merged, y_train_merged, y_test_merged = train_test_split(X_merged, y_merged, test_size = .2, random_state = 42)

#### Synthetic data

In [20]:
X_val = df_revised.txt_p
y_val = df_revised['label']

# Word2Vec Models

In [21]:
# we grab the vectors for all texts and stack them into a matrix
X_train_W2V = np.vstack([txt.vector for txt in nlp.pipe(X_train, disable=["parser", "ner"])])

In [22]:
X_test_W2V = np.vstack([txt.vector for txt in nlp.pipe(X_test, disable=["parser", "ner"])])

In [23]:
X_train_merged_W2V = np.vstack([txt.vector for txt in nlp.pipe(X_train_merged, disable=["parser", "ner"])])

In [24]:
X_test_merged_W2V = np.vstack([txt.vector for txt in nlp.pipe(X_test_merged, disable=["parser", "ner"])])

### LogisticRegression

In [25]:
# LogisticRegression Model
model_lr = LogisticRegression(multi_class='multinomial')
# fit the model
model_lr.fit(X_train_W2V, y_train)

# predicting for train
y_pred_train = model_lr.predict(X_train_W2V)
# predicting for test
y_pred_test = model_lr.predict(X_test_W2V)
# predicting for merged train
y_pred_train_merged = model_lr.predict(X_train_merged_W2V)
# predicting for merged test
y_pred_test_merged = model_lr.predict(X_test_merged_W2V)


In [26]:
print("classification_report train orginal")
print(classification_report(y_train,y_pred_train))

classification_report train orginal
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       1.00      1.00      1.00         4
                                        Annullering af ordre       1.00      1.00      1.00         4
                                          Bestilling i butik       1.00      1.00      1.00         2
                                         Betaling med Klarna       1.00      1.00      1.00         4
                                         Betalingsmuligheder       1.00      1.00      1.00         2
                                     Defekt produkt modtaget       1.00      1.00      1.00         4
                                                     Erhverv       1.00      1.00      1.00         3
                                    Forkert produkt modtaget       1.00      1.00      1.00         4
                                             

In [27]:
print("classification_report test orginal")
print(classification_report(y_test,y_pred_test))


classification_report test orginal
                                                              precision    recall  f1-score   support

                                        Annullering af ordre       0.00      0.00      0.00         0
                                          Bestilling i butik       1.00      0.67      0.80         3
                                         Betaling med Klarna       0.00      0.00      0.00         0
                                         Betalingsmuligheder       0.00      0.00      0.00         1
                                     Defekt produkt modtaget       0.00      0.00      0.00         0
                                                     Erhverv       0.50      1.00      0.67         1
                                                    Gavekort       0.00      0.00      0.00         1
Kan jeg få besked, når en udsolgt vare kommer på lager igen?       1.00      1.00      1.00         1
                                        Leveri

In [28]:
print("classification_report train merged")
print(classification_report(y_train_merged,y_pred_train_merged))

classification_report train merged
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       0.55      0.86      0.67         7
                                        Annullering af ordre       0.56      0.71      0.63         7
                                          Bestilling i butik       0.67      0.86      0.75         7
                                         Betaling med Klarna       0.75      1.00      0.86         3
                                         Betalingsmuligheder       0.67      0.67      0.67         3
                                     Defekt produkt modtaget       0.88      1.00      0.93         7
                                                     Erhverv       0.67      1.00      0.80         4
                                    Forkert produkt modtaget       0.88      0.88      0.88         8
                                              

In [29]:
print("classification_report test merged")
print(classification_report(y_test_merged,y_pred_test_merged))

classification_report test merged
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       0.50      1.00      0.67         1
                                        Annullering af ordre       0.50      1.00      0.67         1
                                          Bestilling i butik       1.00      0.67      0.80         3
                                         Betaling med Klarna       1.00      1.00      1.00         1
                                     Defekt produkt modtaget       1.00      1.00      1.00         1
                                                     Erhverv       0.00      0.00      0.00         0
                       Hvornår modtager jeg min refundering?       0.75      1.00      0.86         3
Kan jeg få besked, når en udsolgt vare kommer på lager igen?       1.00      1.00      1.00         1
                                        Leverin

In [30]:
# accuracy score for train
print("accuracy_score for train")
print(accuracy_score(y_train, y_pred_train))
# accuracy score for test
print("accuracy_score for test")
print(accuracy_score(y_test, y_pred_test))
# accuracy score for validation
print("accuracy_score for train merged")
print(accuracy_score(y_train_merged, y_pred_train_merged))
# accuracy score for validation
print("accuracy_score for test merged")
print(accuracy_score(y_test_merged, y_pred_test_merged))

accuracy_score for train
0.9849624060150376
accuracy_score for test
0.4117647058823529
accuracy_score for train merged
0.7547169811320755
accuracy_score for test merged
0.7924528301886793


### SVM

In [31]:
# SVM Model
model_svm = svm.SVC()
# fit the model
model_svm.fit(X_train_W2V, y_train)

# predicting for train
y_pred_train = model_svm.predict(X_train_W2V)
# predicting for test
y_pred_test = model_svm.predict(X_test_W2V)
# predicting for merged train
y_pred_train_merged = model_svm.predict(X_train_merged_W2V)
# predicting for merged test
y_pred_test_merged = model_svm.predict(X_test_merged_W2V)



In [32]:
print("classification_report train orginal")
print(classification_report(y_train,y_pred_train))

classification_report train orginal
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       0.40      1.00      0.57         4
                                        Annullering af ordre       0.00      0.00      0.00         4
                                          Bestilling i butik       0.00      0.00      0.00         2
                                         Betaling med Klarna       0.80      1.00      0.89         4
                                         Betalingsmuligheder       0.00      0.00      0.00         2
                                     Defekt produkt modtaget       1.00      0.50      0.67         4
                                                     Erhverv       1.00      1.00      1.00         3
                                    Forkert produkt modtaget       0.67      1.00      0.80         4
                                             

In [33]:
print("classification_report test orginal")
print(classification_report(y_test,y_pred_test))

classification_report test orginal
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       0.00      0.00      0.00         0
                                          Bestilling i butik       0.00      0.00      0.00         3
                                         Betaling med Klarna       0.00      0.00      0.00         0
                                         Betalingsmuligheder       0.00      0.00      0.00         1
                                                     Erhverv       1.00      1.00      1.00         1
                                                    Gavekort       0.00      0.00      0.00         1
                       Hvornår modtager jeg min refundering?       0.00      0.00      0.00         0
Kan jeg få besked, når en udsolgt vare kommer på lager igen?       0.00      0.00      0.00         1
                                        Leveri

In [34]:
print("classification_report train merged")
print(classification_report(y_train_merged,y_pred_train_merged))

classification_report train merged
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       0.40      0.86      0.55         7
                                        Annullering af ordre       0.00      0.00      0.00         7
                                          Bestilling i butik       0.00      0.00      0.00         7
                                         Betaling med Klarna       0.43      1.00      0.60         3
                                         Betalingsmuligheder       0.00      0.00      0.00         3
                                     Defekt produkt modtaget       1.00      0.14      0.25         7
                                                     Erhverv       1.00      1.00      1.00         4
                                    Forkert produkt modtaget       0.75      0.75      0.75         8
                                              

In [35]:
print("classification_report test merged")
print(classification_report(y_test_merged,y_pred_test_merged))

classification_report test merged
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       0.17      1.00      0.29         1
                                        Annullering af ordre       0.00      0.00      0.00         1
                                          Bestilling i butik       0.00      0.00      0.00         3
                                         Betaling med Klarna       1.00      1.00      1.00         1
                                     Defekt produkt modtaget       1.00      1.00      1.00         1
                       Hvornår modtager jeg min refundering?       0.67      0.67      0.67         3
Kan jeg få besked, når en udsolgt vare kommer på lager igen?       0.00      0.00      0.00         1
                                        Levering i weekenden       0.00      0.00      0.00         3
                                      Leverings

In [36]:
# accuracy score for train
print("accuracy_score for train")
print(accuracy_score(y_train, y_pred_train))
# accuracy score for test
print("accuracy_score for test")
print(accuracy_score(y_test, y_pred_test))
# accuracy score for validation
print("accuracy_score for train merged")
print(accuracy_score(y_train_merged, y_pred_train_merged))
# accuracy score for validation
print("accuracy_score for test merged")
print(accuracy_score(y_test_merged, y_pred_test_merged))

accuracy_score for train
0.5789473684210527
accuracy_score for test
0.08823529411764706
accuracy_score for train merged
0.41037735849056606
accuracy_score for test merged
0.4528301886792453


### KNeighbors

In [37]:
#  Multinomial Naive Bayers Model
model_kn = KNeighborsClassifier()

# fit the model
model_kn.fit(X_train_W2V, y_train)

# predicting for train
y_pred_train = model_kn.predict(X_train_W2V)
# predicting for test
y_pred_test = model_kn.predict(X_test_W2V)
# predicting for merged train
y_pred_train_merged = model_kn.predict(X_train_merged_W2V)
# predicting for merged test
y_pred_test_merged = model_kn.predict(X_test_merged_W2V)

In [38]:
print("classification_report train orginal")
print(classification_report(y_train,y_pred_train))

classification_report train orginal
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       0.38      0.75      0.50         4
                                        Annullering af ordre       0.38      0.75      0.50         4
                                          Bestilling i butik       0.33      1.00      0.50         2
                                         Betaling med Klarna       0.50      0.50      0.50         4
                                         Betalingsmuligheder       0.00      0.00      0.00         2
                                     Defekt produkt modtaget       0.43      0.75      0.55         4
                                                     Erhverv       1.00      1.00      1.00         3
                                    Forkert produkt modtaget       0.36      1.00      0.53         4
                                             

In [39]:
print("classification_report test orginal")
print(classification_report(y_test,y_pred_test))

classification_report test orginal
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       0.00      0.00      0.00         0
                                        Annullering af ordre       0.00      0.00      0.00         0
                                          Bestilling i butik       0.67      0.67      0.67         3
                                         Betaling med Klarna       0.00      0.00      0.00         0
                                         Betalingsmuligheder       0.00      0.00      0.00         1
                                     Defekt produkt modtaget       0.00      0.00      0.00         0
                                                     Erhverv       0.50      1.00      0.67         1
                                    Forkert produkt modtaget       0.00      0.00      0.00         0
                                              

In [40]:
print("classification_report train merged")
print(classification_report(y_train_merged,y_pred_train_merged))

classification_report train merged
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       0.26      0.71      0.38         7
                                        Annullering af ordre       0.27      0.43      0.33         7
                                          Bestilling i butik       0.50      1.00      0.67         7
                                         Betaling med Klarna       0.17      0.33      0.22         3
                                         Betalingsmuligheder       0.00      0.00      0.00         3
                                     Defekt produkt modtaget       0.40      0.57      0.47         7
                                                     Erhverv       0.67      1.00      0.80         4
                                    Forkert produkt modtaget       0.32      0.75      0.44         8
                                              

In [41]:
print("classification_report test merged")
print(classification_report(y_test_merged,y_pred_test_merged))

classification_report test merged
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       0.00      0.00      0.00         1
                                        Annullering af ordre       0.50      1.00      0.67         1
                                          Bestilling i butik       0.33      0.33      0.33         3
                                         Betaling med Klarna       1.00      1.00      1.00         1
                                         Betalingsmuligheder       0.00      0.00      0.00         0
                                     Defekt produkt modtaget       1.00      1.00      1.00         1
                                                     Erhverv       0.00      0.00      0.00         0
                                                    Gavekort       0.00      0.00      0.00         0
                       Hvornår modtager jeg min

In [42]:
# accuracy score for train
print("accuracy_score for train")
print(accuracy_score(y_train, y_pred_train))
# accuracy score for test
print("accuracy_score for test")
print(accuracy_score(y_test, y_pred_test))
# accuracy score for validation
print("accuracy_score for train merged")
print(accuracy_score(y_train_merged, y_pred_train_merged))
# accuracy score for validation
print("accuracy_score for test merged")
print(accuracy_score(y_test_merged, y_pred_test_merged))

accuracy_score for train
0.5112781954887218
accuracy_score for test
0.20588235294117646
accuracy_score for train merged
0.4009433962264151
accuracy_score for test merged
0.4528301886792453


### Random Forrest

In [43]:
#  Random Forest Model
model_rf = RandomForestClassifier()

# fit the model
model_rf.fit(X_train_W2V, y_train)

# predicting for train
y_pred_train = model_rf.predict(X_train_W2V)
# predicting for test
y_pred_test = model_rf.predict(X_test_W2V)
# predicting for merged train
y_pred_train_merged = model_rf.predict(X_train_merged_W2V)
# predicting for merged test
y_pred_test_merged = model_rf.predict(X_test_merged_W2V)

In [44]:
print("classification_report train orginal")
print(classification_report(y_train,y_pred_train))

classification_report train orginal
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       1.00      1.00      1.00         4
                                        Annullering af ordre       1.00      1.00      1.00         4
                                          Bestilling i butik       1.00      1.00      1.00         2
                                         Betaling med Klarna       1.00      1.00      1.00         4
                                         Betalingsmuligheder       1.00      1.00      1.00         2
                                     Defekt produkt modtaget       1.00      1.00      1.00         4
                                                     Erhverv       1.00      1.00      1.00         3
                                    Forkert produkt modtaget       1.00      1.00      1.00         4
                                             

In [45]:
print("classification_report test orginal")
print(classification_report(y_test,y_pred_test))

classification_report test orginal
                                                              precision    recall  f1-score   support

                                        Annullering af ordre       0.00      0.00      0.00         0
                                          Bestilling i butik       1.00      0.33      0.50         3
                                         Betaling med Klarna       0.00      0.00      0.00         0
                                         Betalingsmuligheder       0.00      0.00      0.00         1
                                     Defekt produkt modtaget       0.00      0.00      0.00         0
                                                     Erhverv       0.50      1.00      0.67         1
                                                    Gavekort       0.00      0.00      0.00         1
       Hvor kan man købe produkter fra Shaping New Tomorrow?       0.00      0.00      0.00         0
                       Hvornår modtager jeg mi

In [46]:
print("classification_report train merged")
print(classification_report(y_train_merged,y_pred_train_merged))

classification_report train merged
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       0.67      0.86      0.75         7
                                        Annullering af ordre       0.56      0.71      0.63         7
                                          Bestilling i butik       1.00      0.57      0.73         7
                                         Betaling med Klarna       0.43      1.00      0.60         3
                                         Betalingsmuligheder       0.67      0.67      0.67         3
                                     Defekt produkt modtaget       0.67      0.57      0.62         7
                                                     Erhverv       0.67      1.00      0.80         4
                                    Forkert produkt modtaget       0.86      0.75      0.80         8
                                              

In [47]:
print("classification_report test merged")
print(classification_report(y_test_merged,y_pred_test_merged))

classification_report test merged
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       1.00      1.00      1.00         1
                                        Annullering af ordre       0.50      1.00      0.67         1
                                          Bestilling i butik       0.00      0.00      0.00         3
                                         Betaling med Klarna       1.00      1.00      1.00         1
                                     Defekt produkt modtaget       1.00      1.00      1.00         1
                                                     Erhverv       0.00      0.00      0.00         0
       Hvor kan man købe produkter fra Shaping New Tomorrow?       0.00      0.00      0.00         0
                       Hvornår modtager jeg min refundering?       0.75      1.00      0.86         3
Kan jeg få besked, når en udsolgt vare kommer p

In [48]:
# accuracy score for train
print("accuracy_score for train")
print(accuracy_score(y_train, y_pred_train))
# accuracy score for test
print("accuracy_score for test")
print(accuracy_score(y_test, y_pred_test))
# accuracy score for validation
print("accuracy_score for train merged")
print(accuracy_score(y_train_merged, y_pred_train_merged))
# accuracy score for validation
print("accuracy_score for test merged")
print(accuracy_score(y_test_merged, y_pred_test_merged))

accuracy_score for train
0.9849624060150376
accuracy_score for test
0.29411764705882354
accuracy_score for train merged
0.7028301886792453
accuracy_score for test merged
0.7547169811320755


# TF IDF + ML

### Pipeline TFIDF + LR

In [49]:
# Making Pipeline TFIDF + LR
tfidf = TfidfVectorizer()
cls = LogisticRegression()

pipe = make_pipeline(tfidf, cls)


In [50]:
# fit the model 
pipe.fit(X_train, y_train)
pipe.fit(X_train_merged, y_train_merged)

In [51]:
# making prediction for train, test and validation
y_pred_train = pipe.predict(X_train)

y_pred_test = pipe.predict(X_test)

y_pred_train_merged = pipe.predict(X_train_merged)

y_pred_test_merged = pipe.predict(X_test_merged)

In [52]:
# Classification report
print("classification_report")
print(classification_report(y_test,y_pred_test))

classification_report
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       0.00      0.00      0.00         0
                                          Bestilling i butik       1.00      1.00      1.00         3
                                         Betalingsmuligheder       0.00      0.00      0.00         1
                                                     Erhverv       1.00      1.00      1.00         1
                                    Forkert produkt modtaget       0.00      0.00      0.00         0
                                                    Gavekort       1.00      1.00      1.00         1
Kan jeg få besked, når en udsolgt vare kommer på lager igen?       0.00      0.00      0.00         1
                                        Levering i weekenden       1.00      1.00      1.00         1
                                         Leveringsmulighede

In [53]:
# evaluating the accuracy for train, test and validation
print("accuracy_score for train")
print(accuracy_score(y_train, y_pred_train))

print("accuracy_score for test")
print(accuracy_score(y_test, y_pred_test))

print("accuracy_score for merged train")
print(accuracy_score(y_train_merged, y_pred_train_merged))

print("accuracy_score for merged test")
print(accuracy_score(y_test_merged, y_pred_test_merged))



accuracy_score for train
0.7368421052631579
accuracy_score for test
0.7058823529411765
accuracy_score for merged train
0.8584905660377359
accuracy_score for merged test
0.5094339622641509


### Pipeline  TFIDF + SVM

In [54]:
tfidf = TfidfVectorizer()
svm = svm.SVC()

pipe_svm = make_pipeline(tfidf, svm)

In [55]:
pipe_svm.fit(X_train, y_train)
pipe_svm.fit(X_train_merged, y_train_merged)

In [56]:
# making prediction for train, test and validation
y_pred_train = pipe_svm.predict(X_train)

y_pred_test = pipe_svm.predict(X_test)

y_pred_train_merged = pipe_svm.predict(X_train_merged)

y_pred_test_merged = pipe_svm.predict(X_test_merged)

In [57]:
# Classification report
print("classification_report")
print(classification_report(y_test,y_pred_test))

classification_report
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       0.00      0.00      0.00         0
                                          Bestilling i butik       1.00      1.00      1.00         3
                                         Betalingsmuligheder       1.00      1.00      1.00         1
                                                     Erhverv       1.00      1.00      1.00         1
                                                    Gavekort       1.00      1.00      1.00         1
Kan jeg få besked, når en udsolgt vare kommer på lager igen?       1.00      1.00      1.00         1
                                        Levering i weekenden       1.00      1.00      1.00         1
                                         Leveringsmuligheder       0.25      1.00      0.40         1
                           Leveringsmuligheder - DHL Expres

In [58]:
# evaluating the accuracy for train, test and validation
print("accuracy_score for train")
print(accuracy_score(y_train, y_pred_train))

print("accuracy_score for test")
print(accuracy_score(y_test, y_pred_test))

print("accuracy_score for merged train")
print(accuracy_score(y_train_merged, y_pred_train_merged))

print("accuracy_score for merged test")
print(accuracy_score(y_test_merged, y_pred_test_merged))

accuracy_score for train
0.8646616541353384
accuracy_score for test
0.8235294117647058
accuracy_score for merged train
0.9811320754716981
accuracy_score for merged test
0.5094339622641509


### Pipeline  TFIDF + KNeighbors

In [59]:
tfidf = TfidfVectorizer()
kn = KNeighborsClassifier()

pipe_kn = make_pipeline(tfidf, kn)

In [60]:
pipe_kn.fit(X_train, y_train)
pipe_kn.fit(X_train_merged, y_train_merged)

In [61]:
# making prediction for train, test and validation
y_pred_train = pipe_kn.predict(X_train)

y_pred_test = pipe_kn.predict(X_test)

y_pred_train_merged = pipe_kn.predict(X_train_merged)

y_pred_test_merged = pipe_kn.predict(X_test_merged)

In [62]:
# Classification report
print("classification_report")
print(classification_report(y_test,y_pred_test))

classification_report
                                                              precision    recall  f1-score   support

                                          Afhentning i butik       0.00      0.00      0.00         0
                                        Annullering af ordre       0.00      0.00      0.00         0
                                          Bestilling i butik       0.75      1.00      0.86         3
                                         Betalingsmuligheder       0.00      0.00      0.00         1
                                     Defekt produkt modtaget       0.00      0.00      0.00         0
                                                     Erhverv       1.00      1.00      1.00         1
                                                    Gavekort       1.00      1.00      1.00         1
Kan jeg få besked, når en udsolgt vare kommer på lager igen?       1.00      1.00      1.00         1
                                        Levering i weekende

In [63]:
# evaluating the accuracy for train, test and validation
print("accuracy_score for train")
print(accuracy_score(y_train, y_pred_train))

print("accuracy_score for test")
print(accuracy_score(y_test, y_pred_test))

print("accuracy_score for merged train")
print(accuracy_score(y_train_merged, y_pred_train_merged))

print("accuracy_score for merged test")
print(accuracy_score(y_test_merged, y_pred_test_merged))

accuracy_score for train
0.7142857142857143
accuracy_score for test
0.7058823529411765
accuracy_score for merged train
0.75
accuracy_score for merged test
0.6037735849056604


### Pipeline  TFIDF + Random Forest

In [64]:
tfidf = TfidfVectorizer()
rf = RandomForestClassifier()

pipe_rf = make_pipeline(tfidf, rf)

In [65]:
pipe_rf.fit(X_train, y_train)
pipe_rf.fit(X_train_merged, y_train_merged)

In [66]:
# making prediction for train, test and validation
y_pred_train = pipe_rf.predict(X_train)

y_pred_test = pipe_rf.predict(X_test)

y_pred_train_merged = pipe_rf.predict(X_train_merged)

y_pred_test_merged = pipe_rf.predict(X_test_merged)

In [67]:
# Classification report
print("classification_report")
print(classification_report(y_test,y_pred_test))

classification_report
                                                              precision    recall  f1-score   support

                                          Bestilling i butik       1.00      1.00      1.00         3
                                         Betalingsmuligheder       1.00      1.00      1.00         1
                                                     Erhverv       1.00      1.00      1.00         1
                                                    Gavekort       1.00      1.00      1.00         1
Kan jeg få besked, når en udsolgt vare kommer på lager igen?       1.00      1.00      1.00         1
                                        Levering i weekenden       1.00      1.00      1.00         1
                                         Leveringsmuligheder       1.00      1.00      1.00         1
                           Leveringsmuligheder - DHL Express       1.00      1.00      1.00         1
                           Leveringsmuligheder - GLS Erhver

In [68]:
# evaluating the accuracy for train, test and validation
print("accuracy_score for train")
print(accuracy_score(y_train, y_pred_train))

print("accuracy_score for test")
print(accuracy_score(y_test, y_pred_test))

print("accuracy_score for merged train")
print(accuracy_score(y_train_merged, y_pred_train_merged))

print("accuracy_score for merged test")
print(accuracy_score(y_test_merged, y_pred_test_merged))

accuracy_score for train
0.9323308270676691
accuracy_score for test
0.9705882352941176
accuracy_score for merged train
1.0
accuracy_score for merged test
0.6792452830188679


# Hyperparameter Tuning

## Word2Vec pipelines

### Hyperparameter tuning Logistic Regression

In [69]:
params_lg = {'penalty': ['l1', 'l2', 'elasticnet', 'none' ],
          'C': [0.01, 0.1, 0.3, 0.5, 1.0,],
          'solver': ['lbfgs', 'newton-cg', 'sag', 'saga'], # List of compatible solver for multi_class = multinomial
          'random_state': [42]
         }

In [70]:
lg_grid = GridSearchCV(estimator=model_lr, param_grid=params_lg)

In [71]:
lg_grid_merged = GridSearchCV(estimator=model_lr, param_grid=params_lg)

In [72]:
lg_grid.fit(X_train_W2V, y_train)


In [73]:
lg_grid_merged.fit(X_train_merged_W2V, y_train_merged)

In [74]:
print('Train Accuracy : %.3f'%lg_grid.best_estimator_.score(X_train_W2V, y_train))
print('Test Accuracy : %.3f'%lg_grid.best_estimator_.score(X_test_W2V, y_test))
print('Merged Train Accuracy : %.3f'%lg_grid_merged.best_estimator_.score(X_train_merged_W2V, y_train_merged))
print('Merged Test Accuracy : %.3f'%lg_grid_merged.best_estimator_.score(X_test_merged_W2V, y_test_merged))
print('Best Accuracy Through Grid Search : %.3f'%lg_grid.best_score_)
print('Merged Best Accuracy Through Grid Search : %.3f'%lg_grid.best_score_)
print('Best Parameters : ',lg_grid.best_params_)
print('Merged Best Parameters : ',lg_grid.best_params_)

Train Accuracy : 0.985
Test Accuracy : 0.441
Merged Train Accuracy : 0.981
Merged Test Accuracy : 0.717
Best Accuracy Through Grid Search : 0.556
Merged Best Accuracy Through Grid Search : 0.556
Best Parameters :  {'C': 0.01, 'penalty': 'none', 'random_state': 42, 'solver': 'newton-cg'}
Merged Best Parameters :  {'C': 0.01, 'penalty': 'none', 'random_state': 42, 'solver': 'newton-cg'}


### Hyperparameter tuning SVM

In [75]:
# Hyperparameters to tune
params_svm = {
    'C': [1.0, 1.5, 2.0, 3.0],
    'kernel': ['linear', 'rbf', 'poly'],  # Different kernel functions
    'degree': [2, 3, 4],  # Degree of the polynomial kernel (for 'poly' kernel)
    'class_weight': [None, 'balanced'],  # Adjusting class weights
    'decision_function_shape': ['ovr','ovo'],  # One-vs-Rest
    'random_state': [42]
}

In [76]:
svm_grid = GridSearchCV(estimator=model_svm, param_grid=params_svm)
svm_grid_merged = GridSearchCV(estimator=model_svm, param_grid=params_svm)

In [77]:
svm_grid.fit(X_train_W2V, y_train)

In [78]:
svm_grid_merged.fit(X_train_merged_W2V, y_train_merged)

In [79]:
print('Train Accuracy : %.3f'%svm_grid.best_estimator_.score(X_train_W2V, y_train))
print('Test Accuracy : %.3f'%svm_grid.best_estimator_.score(X_test_W2V, y_test))
print('Merged Train Accuracy : %.3f'%svm_grid_merged.best_estimator_.score(X_train_merged_W2V, y_train_merged))
print('Merged Test Accuracy : %.3f'%svm_grid_merged.best_estimator_.score(X_test_merged_W2V, y_test_merged))
print('Best Accuracy Through Grid Search : %.3f'%svm_grid.best_score_)
print('Merged Best Accuracy Through Grid Search : %.3f'%svm_grid_merged.best_score_)
print('Best Parameters : ',svm_grid.best_params_)
print('Merged Best Parameters : ',svm_grid_merged.best_params_)

Train Accuracy : 0.985
Test Accuracy : 0.382
Merged Train Accuracy : 0.981
Merged Test Accuracy : 0.660
Best Accuracy Through Grid Search : 0.436
Merged Best Accuracy Through Grid Search : 0.481
Best Parameters :  {'C': 1.0, 'class_weight': 'balanced', 'decision_function_shape': 'ovr', 'degree': 2, 'kernel': 'linear', 'random_state': 42}
Merged Best Parameters :  {'C': 1.0, 'class_weight': 'balanced', 'decision_function_shape': 'ovr', 'degree': 2, 'kernel': 'linear', 'random_state': 42}


### Hyperparameter tuning KN

In [80]:
# Hyperparameters to tune for KNN
params_kn = {
    'n_neighbors': [1, 2, 3],      # Number of neighbors to consider
    'weights': ['uniform', 'distance'],   # Weight function used in prediction
    'leaf_size': [1, 2, 3],             # Leaf size for tree-based algorithms
}

In [81]:
kn_grid = GridSearchCV(estimator=model_kn, param_grid=params_kn)
kn_grid_merged = GridSearchCV(estimator=model_kn, param_grid=params_kn)

In [82]:
kn_grid.fit(X_train_W2V, y_train)


In [83]:
kn_grid_merged.fit(X_train_merged_W2V, y_train_merged)

In [84]:
print('Train Accuracy : %.3f'%kn_grid.best_estimator_.score(X_train_W2V, y_train))
print('Test Accuracy : %.3f'%kn_grid.best_estimator_.score(X_test_W2V, y_test))
print('Merged Train Accuracy : %.3f'%kn_grid_merged.best_estimator_.score(X_train_merged_W2V, y_train_merged))
print('Merged Test Accuracy : %.3f'%kn_grid_merged.best_estimator_.score(X_test_merged_W2V, y_test_merged))
print('Best Accuracy Through Grid Search : %.3f'%kn_grid.best_score_)
print('Merged Best Accuracy Through Grid Search : %.3f'%kn_grid_merged.best_score_)
print('Best Parameters : ',kn_grid.best_params_)
print('Merged Best Parameters : ',kn_grid_merged.best_params_)

Train Accuracy : 0.985
Test Accuracy : 0.294
Merged Train Accuracy : 0.981
Merged Test Accuracy : 0.528
Best Accuracy Through Grid Search : 0.398
Merged Best Accuracy Through Grid Search : 0.363
Best Parameters :  {'leaf_size': 1, 'n_neighbors': 2, 'weights': 'distance'}
Merged Best Parameters :  {'leaf_size': 1, 'n_neighbors': 2, 'weights': 'distance'}


### Hyperparameter tuning Random Forests

In [85]:
# Hyperparameters to tune for Random Forest
params_rf = {
    'n_estimators': [120, 130, 140, 150],            # Number of trees in the forest
    'min_samples_split': [6, 7, 8, 9, 10],           # Minimum number of samples required to split an internal node
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider for the best split
    'bootstrap': [True, False],                # Whether to bootstrap samples
    'random_state': [42]                       # Random seed for reproducibility
}

In [86]:
rf_grid = GridSearchCV(estimator=model_rf, param_grid=params_rf)

In [87]:
rf_grid_merged = GridSearchCV(estimator=model_rf, param_grid=params_rf)

In [88]:
rf_grid.fit(X_train_W2V, y_train)


In [89]:
rf_grid_merged.fit(X_train_merged_W2V, y_train_merged)


In [90]:
print('Train Accuracy : %.3f'%rf_grid.best_estimator_.score(X_train_W2V, y_train))
print('Test Accuracy : %.3f'%rf_grid.best_estimator_.score(X_test_W2V, y_test))
print('Merged Train Accuracy : %.3f'%rf_grid_merged.best_estimator_.score(X_train_merged_W2V, y_train_merged))
print('Merged Test Accuracy : %.3f'%rf_grid_merged.best_estimator_.score(X_test_merged_W2V, y_test_merged))
print('Best Accuracy Through Grid Search : %.3f'%rf_grid.best_score_)
print('Merged Best Accuracy Through Grid Search : %.3f'%rf_grid_merged.best_score_)
print('Best Parameters : ',rf_grid.best_params_)
print('Merged Best Parameters : ',rf_grid_merged.best_params_)

Train Accuracy : 0.985
Test Accuracy : 0.294
Merged Train Accuracy : 0.981
Merged Test Accuracy : 0.491
Best Accuracy Through Grid Search : 0.368
Merged Best Accuracy Through Grid Search : 0.400
Best Parameters :  {'bootstrap': False, 'max_features': 'log2', 'min_samples_split': 6, 'n_estimators': 120, 'random_state': 42}
Merged Best Parameters :  {'bootstrap': False, 'max_features': 'log2', 'min_samples_split': 7, 'n_estimators': 130, 'random_state': 42}


## Hyperparameter Tuning TFIDF pipelines

### Logistic Regression

In [91]:
# make pipeline for LogisticRegression
tuning_lg = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lg', LogisticRegression())
])

In [92]:
params_lg_tf = {'lg__penalty': ['l1', 'l2', 'elasticnet', 'none' ],
          'lg__C': [0.01, 0.1, 0.3, 0.5, 1.0,],
          'lg__solver': ['lbfgs', 'newton-cg', 'sag', 'saga'], # List of compatible solver for multi_class = multinomial
          'lg__random_state': [42]
         }

In [93]:
lg_tf_grid = GridSearchCV(estimator=tuning_lg, param_grid=params_lg_tf)
lg_tf_grid_merged = GridSearchCV(estimator=tuning_lg, param_grid=params_lg_tf)

In [94]:
lg_tf_grid.fit(X_train, y_train)


In [95]:
lg_tf_grid_merged.fit(X_train_merged, y_train_merged)

In [96]:
print('Train Accuracy : %.3f'%lg_tf_grid.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%lg_tf_grid.best_estimator_.score(X_test, y_test))
print('Merged Train Accuracy : %.3f'%lg_tf_grid_merged.best_estimator_.score(X_train_merged, y_train_merged))
print('Merged Test Accuracy : %.3f'%lg_tf_grid_merged.best_estimator_.score(X_test_merged, y_test_merged))
print('Best Accuracy Through Grid Search : %.3f'%lg_tf_grid.best_score_)
print('Merged Best Accuracy Through Grid Search : %.3f'%lg_tf_grid_merged.best_score_)
print('Best Parameters : ',lg_tf_grid.best_params_)
print('Merged Best Parameters : ',lg_tf_grid_merged.best_params_)

Train Accuracy : 1.000
Test Accuracy : 0.529
Merged Train Accuracy : 1.000
Merged Test Accuracy : 0.774
Best Accuracy Through Grid Search : 0.654
Merged Best Accuracy Through Grid Search : 0.712
Best Parameters :  {'lg__C': 0.01, 'lg__penalty': 'none', 'lg__random_state': 42, 'lg__solver': 'lbfgs'}
Merged Best Parameters :  {'lg__C': 0.01, 'lg__penalty': 'none', 'lg__random_state': 42, 'lg__solver': 'sag'}


### SVM

In [109]:
# make pipeline for svm
tuning_svm = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', svm.SVC())
])

In [110]:
# Hyperparameters to tune
params_svm_tf = {
    'svm__C': [1.0, 1.5, 2.0, 3.0],
    'svm__kernel': ['linear', 'rbf', 'poly'],  # Different kernel functions
    'svm__degree': [2, 3, 4],  # Degree of the polynomial kernel (for 'poly' kernel)
    'svm__class_weight': [None, 'balanced'],  # Adjusting class weights
    'svm__decision_function_shape': ['ovr','ovo'],  # One-vs-Rest
    'svm__random_state': [42]
}

In [111]:
svm_tf_grid = GridSearchCV(estimator=tuning_svm, param_grid=params_svm_tf)
svm_tf_grid_merged = GridSearchCV(estimator=tuning_svm, param_grid=params_svm_tf)

In [112]:
svm_tf_grid.fit(X_train, y_train)

In [113]:
svm_tf_grid_merged.fit(X_train_merged, y_train_merged)

In [114]:
print('Train Accuracy : %.3f'%svm_tf_grid.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%svm_tf_grid.best_estimator_.score(X_test, y_test))
print('Merged Train Accuracy : %.3f'%svm_tf_grid_merged.best_estimator_.score(X_train_merged, y_train_merged))
print('Merged Test Accuracy : %.3f'%svm_tf_grid_merged.best_estimator_.score(X_test_merged, y_test_merged))
print('Best Accuracy Through Grid Search : %.3f'%svm_tf_grid.best_score_)
print('Merged Best Accuracy Through Grid Search : %.3f'%svm_tf_grid_merged.best_score_)
print('Best Parameters : ',svm_tf_grid.best_params_)
print('Merged Best Parameters : ',svm_tf_grid_merged.best_params_)

Train Accuracy : 0.985
Test Accuracy : 0.559
Merged Train Accuracy : 0.995
Merged Test Accuracy : 0.755
Best Accuracy Through Grid Search : 0.631
Merged Best Accuracy Through Grid Search : 0.675
Best Parameters :  {'svm__C': 1.0, 'svm__class_weight': 'balanced', 'svm__decision_function_shape': 'ovr', 'svm__degree': 2, 'svm__kernel': 'linear', 'svm__random_state': 42}
Merged Best Parameters :  {'svm__C': 3.0, 'svm__class_weight': 'balanced', 'svm__decision_function_shape': 'ovr', 'svm__degree': 2, 'svm__kernel': 'linear', 'svm__random_state': 42}


### KN

In [115]:
# make pipeline for KNeighborsClassifier
tuning_kn = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('kn', KNeighborsClassifier())
])

In [116]:
# Hyperparameters to tune for KNN
params_kn_tf = {
    'kn__n_neighbors': [1, 2, 3],      # Number of neighbors to consider
    'kn__weights': ['uniform', 'distance'],   # Weight function used in prediction
    'kn__leaf_size': [1, 2, 3],             # Leaf size for tree-based algorithms
}

In [117]:
kn_tf_grid = GridSearchCV(estimator=tuning_kn, param_grid=params_kn_tf)
kn_tf_grid_merged = GridSearchCV(estimator=tuning_kn, param_grid=params_kn_tf)

In [118]:
kn_tf_grid.fit(X_train, y_train)

In [119]:
kn_tf_grid_merged.fit(X_train_merged, y_train_merged)

In [120]:
print('Train Accuracy : %.3f'%kn_tf_grid.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%kn_tf_grid.best_estimator_.score(X_test, y_test))
print('Merged Train Accuracy : %.3f'%kn_tf_grid_merged.best_estimator_.score(X_train_merged, y_train_merged))
print('Merged Test Accuracy : %.3f'%kn_tf_grid_merged.best_estimator_.score(X_test_merged, y_test_merged))
print('Best Accuracy Through Grid Search : %.3f'%kn_tf_grid.best_score_)
print('Merged Best Accuracy Through Grid Search : %.3f'%kn_tf_grid_merged.best_score_)
print('Best Parameters : ',kn_tf_grid.best_params_)
print('Merged Best Parameters : ',kn_tf_grid_merged.best_params_)

Train Accuracy : 1.000
Test Accuracy : 0.618
Merged Train Accuracy : 1.000
Merged Test Accuracy : 0.717
Best Accuracy Through Grid Search : 0.556
Merged Best Accuracy Through Grid Search : 0.595
Best Parameters :  {'kn__leaf_size': 1, 'kn__n_neighbors': 2, 'kn__weights': 'distance'}
Merged Best Parameters :  {'kn__leaf_size': 1, 'kn__n_neighbors': 3, 'kn__weights': 'distance'}


### Random Forests

In [121]:
# make pipeline for RandomForestClassifier
tuning_rf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

In [122]:
# Hyperparameters to tune for Random Forest
params_rf_tf = {
    'rf__n_estimators': [120, 130, 140, 150],            # Number of trees in the forest
    'rf__min_samples_split': [6, 7, 8, 9, 10],           # Minimum number of samples required to split an internal node
    'rf__max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider for the best split
    'rf__bootstrap': [True, False],                # Whether to bootstrap samples
    'rf__random_state': [42]                       # Random seed for reproducibility
}

In [123]:
rf_tf_grid = GridSearchCV(estimator=tuning_rf, param_grid=params_rf_tf)
rf_tf_grid_merged = GridSearchCV(estimator=tuning_rf, param_grid=params_rf_tf)

In [124]:
rf_tf_grid.fit(X_train, y_train)

In [125]:
rf_tf_grid_merged.fit(X_train_merged, y_train_merged)

In [126]:
print('Train Accuracy : %.3f'%rf_tf_grid.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%rf_tf_grid.best_estimator_.score(X_test, y_test))
print('Merged Train Accuracy : %.3f'%rf_tf_grid_merged.best_estimator_.score(X_train_merged, y_train_merged))
print('Merged Test Accuracy : %.3f'%rf_tf_grid_merged.best_estimator_.score(X_test_merged, y_test_merged))
print('Best Accuracy Through Grid Search : %.3f'%rf_tf_grid.best_score_)
print('Merged Best Accuracy Through Grid Search : %.3f'%rf_tf_grid_merged.best_score_)
print('Best Parameters : ',rf_tf_grid.best_params_)
print('Merged Best Parameters : ',rf_tf_grid_merged.best_params_)

Train Accuracy : 0.977
Test Accuracy : 0.412
Merged Train Accuracy : 1.000
Merged Test Accuracy : 0.698
Best Accuracy Through Grid Search : 0.548
Merged Best Accuracy Through Grid Search : 0.646
Best Parameters :  {'rf__bootstrap': False, 'rf__max_features': 'log2', 'rf__min_samples_split': 9, 'rf__n_estimators': 140, 'rf__random_state': 42}
Merged Best Parameters :  {'rf__bootstrap': False, 'rf__max_features': 'log2', 'rf__min_samples_split': 8, 'rf__n_estimators': 150, 'rf__random_state': 42}


## Testing on Synthetic Data 

In [127]:
X_val = df_revised.txt_p
y_val = df_revised['label']

### Word2Vec + LogisticRegression

In [128]:
# Vectorize X_train
X_val_W2V= np.vstack([txt.vector for txt in nlp.pipe(X_val, disable=["parser", "ner"])])

In [129]:
print('Accuracy : %.3f'%lg_grid.best_estimator_.score(X_val_W2V, y_val))


Accuracy : 0.582


### TFIDF + LogisticRegression

In [130]:
print('Accuracy : %.3f'%lg_tf_grid.best_estimator_.score(X_val, y_val))

Accuracy : 0.684


### TFIDF + SVM

In [131]:
print('Accuracy : %.3f'%svm_tf_grid.best_estimator_.score(X_val, y_val))

Accuracy : 0.684
