In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from my_nlp_module.preprocessing import preprocess_bbc_to_dict, PrepOption

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
import nltk
import re
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from tensorflow.keras.preprocessing.sequence import pad_sequences
import gc
import os
from my_nlp_module.tokenizer import Tokenizer
from my_nlp_module.metrics import Metrics
from collections import Counter

label_to_class = {
    "business": 0,
    "entertainment": 1,
    "politics": 2,
    "sport": 3,
    "tech": 4
}

path_to_model = "../pretrained_models/40/model.bin"
model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)
embed_dim = model.vector_size

2023-01-02 14:11:33.171049: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def mean_doc_vector(document, embed_dim, embed_matrix):
    result = np.zeros(embed_dim,)
    for token in document:
        if token != 0:
            result += embed_matrix[token]
    result /= len(document)
    return result

In [3]:
dataset_train_path = "../datasets/bbc-text/train"
dataset_test_path = "../datasets/bbc-text/test"

#options = [PrepOption.STOPWORDS, PrepOption.LOWERCASE, PrepOption.LEMMA, 
#           PrepOption.NUMBERS, PrepOption.RUBBISH, PrepOption.INTERPUNCTION]
options = [PrepOption.NUMBERS, PrepOption.RUBBISH, PrepOption.INTERPUNCTION]
preprocessed_train = preprocess_bbc_to_dict(dataset_train_path, options)
preprocessed_test = preprocess_bbc_to_dict(dataset_test_path, options)

documents_train = []
documents_test = []
labels_train = []
labels_test = []

for key in preprocessed_train.keys():
    for doc in preprocessed_train[key]:
        documents_train.append(doc)
        labels_train.append(label_to_class[key])
    for doc in preprocessed_test[key]:
        documents_test.append(doc)
        labels_test.append(label_to_class[key])
        
df_train = pd.DataFrame({"document": documents_train, "label": labels_train})
df_train = df_train.sample(frac=1, axis=0).reset_index(drop=True)

df_test = pd.DataFrame({"document": documents_test, "label": labels_test})
df_test = df_test.sample(frac=1, axis=0).reset_index(drop=True)

Couldn't read 1 files:
../datasets/bbc-text/train/sport/199.txt

Couldn't read 0 files:



In [4]:
MAX_WORDS = 8000
tok = Tokenizer(MAX_WORDS)
tok.fit(list(df_train['document']))
vocab_size = len(tok.vocab) + 1

embed_matrix=np.zeros(shape=(vocab_size,embed_dim))
for word,i in tok.vocab.items():
    try:
        embed_vector=model[word]
        embed_matrix[i]=embed_vector
    except KeyError:
        continue

print(f"number of words {vocab_size}")

number of words 8001


In [5]:
encoded_train = tok.texts_to_sequences(list(df_train['document']))
encoded_test = tok.texts_to_sequences(list(df_test['document']))

lengths = []
m = -1
for doc in encoded_train:
    lengths.append(len(doc))
lengths = np.array(lengths)
mean = np.mean(lengths)
std = np.std(lengths)
max_doc_len = int(mean + std/2)

print(f"Maksymalna długość dokumentu: {max_doc_len}")
pad_docs_train = pad_sequences(encoded_train, maxlen=max_doc_len, padding='post')
pad_docs_test = pad_sequences(encoded_test, maxlen=max_doc_len, padding='post')
print(f"Kształt macierzy dokumentów treningowych: {pad_docs_train.shape}")
print(f"Kształt macierzy dokumentów testowych: {pad_docs_test.shape}")

Maksymalna długość dokumentu: 470
Kształt macierzy dokumentów treningowych: (1999, 470)
Kształt macierzy dokumentów testowych: (225, 470)


In [6]:
seq_vectors_train = np.asarray([mean_doc_vector(pad_docs_train[i], embed_dim, embed_matrix) \
                                for i in range(pad_docs_train.shape[0])])
seq_vectors_test = np.asarray([mean_doc_vector(pad_docs_test[i], embed_dim, embed_matrix) \
                                for i in range(pad_docs_test.shape[0])])
print(f"Kształt wektorów treningowych: {seq_vectors_train.shape}\n"\
     f"Kształt wektorów testowych: {seq_vectors_test.shape}")

Kształt wektorów treningowych: (1999, 100)
Kształt wektorów testowych: (225, 100)


In [7]:
x_train = seq_vectors_train[:-225, :]
y_train = np.array(df_train['label'])[:-225]

x_test = np.concatenate((seq_vectors_test, seq_vectors_train[-225:, :]), axis=0)
y_test = np.concatenate((np.array(df_test['label']), np.array(df_train['label'])[-225:]), axis=0)

print("Kształ danych treningowych po zmianie ilości:")
print(x_train.shape)
print(y_train.shape)

print("\nKształ danych testowych po zmianie ilości:")
print(x_test.shape)
print(y_test.shape)

print(f"\nPodział na klasy zbioru treningowego: {Counter(y_train)}")
print(f"\nPodział na klasy zbioru testowego: {Counter(y_test)}")

Kształ danych treningowych po zmianie ilości:
(1774, 100)
(1774,)

Kształ danych testowych po zmianie ilości:
(450, 100)
(450,)

Podział na klasy zbioru treningowego: Counter({3: 409, 0: 407, 2: 323, 4: 323, 1: 312})

Podział na klasy zbioru testowego: Counter({0: 103, 3: 101, 2: 94, 4: 78, 1: 74})


In [8]:
parameters = {'kernel':['poly', 'rbf', 'sigmoid'], 'C':[0.1, 1, 10], 'degree':[3, 4]}
svc = SVC()
svc_grid = GridSearchCV(svc, parameters, scoring='accuracy')
svc_grid.fit(x_train,y_train)

In [9]:
def predict_function(model, x):
    return model.predict(x)

svc_classifier = svc_grid.best_estimator_
best_svc_params = svc_grid.best_params_
svc_train_score = svc_grid.best_score_

print(f"Najlepsze znalezione parametry dla SVC:\n{best_svc_params}\n\n"\
     f"Wynik na zbiorze treningowym dla SVC: {svc_train_score}\n")


metrics = Metrics(svc_classifier, predict_function)
print("Metryki dla zbioru testowego dla klasyfikatora SVC")
_ = metrics.accuracy(x_test, y_test)
_ = metrics.precision(x_test, y_test)
_ = metrics.recall(x_test, y_test)
_ = metrics.f1(x_test, y_test)
metrics.print_confusion_matrix(x_test, y_test)
metrics.print_metrics(x_test, y_test)

Najlepsze znalezione parametry dla SVC:
{'C': 10, 'degree': 3, 'kernel': 'rbf'}

Wynik na zbiorze treningowym dla SVC: 0.9605363252964112

Metryki dla zbioru testowego dla klasyfikatora SVC
Dokładność: 0.9577777777777777
Precyzja: [0.93396226 0.96       0.95604396 0.95283019 1.        ]
Czułość: [0.96116505 0.97297297 0.92553191 1.         0.92307692]
F1: [0.94736842 0.96644295 0.94054054 0.97584541 0.96      ]
[[ 99   1   3   0   0]
 [  1  72   0   1   0]
 [  5   0  87   2   0]
 [  0   0   0 101   0]
 [  1   2   1   2  72]]
╒═══════╤═══════╤═══════╤═══════╤═══════╕
│ 0.934 │ 0.96  │ 0.956 │ 0.953 │ 1     │
├───────┼───────┼───────┼───────┼───────┤
│ 0.961 │ 0.973 │ 0.926 │ 1     │ 0.923 │
├───────┼───────┼───────┼───────┼───────┤
│ 0.947 │ 0.966 │ 0.941 │ 0.976 │ 0.96  │
╘═══════╧═══════╧═══════╧═══════╧═══════╛
Dokładność: 0.958


In [10]:
parameters = {'criterion':['gini', 'entropy'], 'max_features':[None, "sqrt", 'log2'], 
              'min_samples_split': [2,5,15], 'splitter': ['best', 'random']}
tree = DecisionTreeClassifier()
tree_grid = GridSearchCV(tree, parameters, scoring='accuracy')
tree_grid.fit(x_train,y_train)

In [11]:
tree_classifier = tree_grid.best_estimator_
best_tree_params = tree_grid.best_params_
tree_train_score = tree_grid.best_score_

print(f"Najlepsze znalezione parametry dla drzewa decyzyjnego:\n{best_tree_params}\n\n"\
     f"Wynik na zbiorze treningowym dla drzewa decyzyjnego: {tree_train_score}\n"\
     f"Głębokość drzewa decyzyjnego: {tree_classifier.get_depth()}\n")


metrics = Metrics(tree_classifier, predict_function)
print("Metryki dla zbioru testowego dla drzewa decyzyjnego")
_ = metrics.accuracy(x_test, y_test)
_ = metrics.precision(x_test, y_test)
_ = metrics.recall(x_test, y_test)
_ = metrics.f1(x_test, y_test)

metrics.print_confusion_matrix(x_test, y_test)
metrics.print_metrics(x_test, y_test)

Najlepsze znalezione parametry dla drzewa decyzyjnego:
{'criterion': 'entropy', 'max_features': None, 'min_samples_split': 2, 'splitter': 'best'}

Wynik na zbiorze treningowym dla drzewa decyzyjnego: 0.7869308506405666
Głębokość drzewa decyzyjnego: 12

Metryki dla zbioru testowego dla drzewa decyzyjnego
Dokładność: 0.8
Precyzja: [0.82105263 0.77108434 0.73033708 0.89       0.77108434]
Czułość: [0.75728155 0.86486486 0.69148936 0.88118812 0.82051282]
F1: [0.78787879 0.81528662 0.71038251 0.88557214 0.79503106]
[[78  4 12  3  6]
 [ 1 64  5  4  0]
 [10  6 65  3 10]
 [ 1  5  3 89  3]
 [ 5  4  4  1 64]]
╒═══════╤═══════╤═══════╤═══════╤═══════╕
│ 0.821 │ 0.771 │ 0.73  │ 0.89  │ 0.771 │
├───────┼───────┼───────┼───────┼───────┤
│ 0.757 │ 0.865 │ 0.691 │ 0.881 │ 0.821 │
├───────┼───────┼───────┼───────┼───────┤
│ 0.788 │ 0.815 │ 0.71  │ 0.886 │ 0.795 │
╘═══════╧═══════╧═══════╧═══════╧═══════╛
Dokładność: 0.8


In [12]:
print(f"Wstępne przetwarzanie: {options}")

Wstępne przetwarzanie: [<PrepOption.NUMBERS: 7>, <PrepOption.RUBBISH: 6>, <PrepOption.INTERPUNCTION: 4>]
