In [1]:
import pandas as pd
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from my_nlp_module.preprocessing import PrepOption, preprocess_klej
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.preprocessing.sequence import pad_sequences
import gc
import os
from my_nlp_module.tokenizer import Tokenizer
from my_nlp_module.metrics import Metrics
import matplotlib.pyplot as plt

path_to_model = "../pretrained_models/62/model.bin"
model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)
embed_dim = model.vector_size

2022-12-11 23:32:30.887485: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def mean_doc_vector(document, embed_dim, embed_matrix):
    result = np.zeros(embed_dim,)
    for token in document:
        if token != 0:
            result += embed_matrix[token]
    result /= len(document)
    return result

In [3]:
train_path = '../datasets/klej_ar/train.tsv'
dev_path = '../datasets/klej_ar/dev.tsv'

df_train = pd.read_csv(train_path, sep='\t')
df_train = df_train.sample(frac=1, axis=0).reset_index(drop=True)

df_test = pd.read_csv(dev_path, sep='\t')
df_test = df_test.sample(frac=1, axis=0).reset_index(drop=True)

# Poniżej należy odkomentować testowany zestaw wstępnego przetwarzania

#options = [PrepOption.STOPWORDS, PrepOption.LOWERCASE, PrepOption.STEM,
#           PrepOption.NUMBERS, PrepOption.RUBBISH, PrepOption.INTERPUNCTION]
options = [PrepOption.NUMBERS, PrepOption.RUBBISH, PrepOption.INTERPUNCTION]

df_train = preprocess_klej(df_train, options, "../my_nlp_module/polish_stopwords.txt")
df_train['rating'] = df_train['rating'].apply(lambda x: x-1)

df_test = preprocess_klej(df_test, options, "../my_nlp_module/polish_stopwords.txt")
df_test['rating'] = df_test['rating'].apply(lambda x: x-1)


print(f"zbiór treningowy: {df_train.groupby(['rating'])['rating'].count()}")
print(f"zbiór testowy: {df_test.groupby(['rating'])['rating'].count()}")

Loading: 100%|██████████████| 11368252/11368252 [00:10<00:00, 1092975.36bytes/s]
Loading: 100%|██████████████| 11368252/11368252 [00:09<00:00, 1229682.99bytes/s]


zbiór treningowy: rating
0.0    1733
1.0    1069
2.0    1208
3.0    1644
4.0    3923
Name: rating, dtype: int64
zbiór testowy: rating
0.0    209
1.0    118
2.0    138
3.0    201
4.0    336
Name: rating, dtype: int64


In [4]:
MAX_WORDS = 8000
tok = Tokenizer(MAX_WORDS)
tok.fit(list(df_train['text']))
vocab_size = len(tok.vocab) + 1

embed_matrix=np.zeros(shape=(vocab_size,embed_dim))
for word,i in tok.vocab.items():
    try:
        embed_vector=model[word]
        embed_matrix[i]=embed_vector
    except KeyError:
        continue

print(f"number of words {vocab_size}")

number of words 8001


In [5]:
encoded_train = tok.texts_to_sequences(list(df_train['text']))
encoded_test = tok.texts_to_sequences(list(df_test['text']))

lengths = []
m = -1
for doc in encoded_train:
    lengths.append(len(doc))
    if len(doc) > m:
        m = len(doc)
lengths = np.array(lengths)
mean = np.mean(lengths)
std = np.std(lengths)
max_doc_len = int(mean + std/2)
        
print(f"Maksymalna długość dokumentu: {max_doc_len}")
pad_docs_train = pad_sequences(encoded_train, maxlen=max_doc_len, padding='post')
pad_docs_test = pad_sequences(encoded_test, maxlen=max_doc_len, padding='post')
print(f"Kształt macierzy dokumentów treningowych: {pad_docs_train.shape}")
print(f"Kształt macierzy dokumentów testowych: {pad_docs_test.shape}")

Maksymalna długość dokumentu: 80
Kształt macierzy dokumentów treningowych: (9577, 80)
Kształt macierzy dokumentów testowych: (1002, 80)


In [6]:
seq_vectors_train = np.asarray([mean_doc_vector(pad_docs_train[i], embed_dim, embed_matrix) \
                                for i in range(pad_docs_train.shape[0])])
seq_vectors_test = np.asarray([mean_doc_vector(pad_docs_test[i], embed_dim, embed_matrix) \
                                for i in range(pad_docs_test.shape[0])])
print(f"Kształt wektorów treningowych: {seq_vectors_train.shape}\n"\
     f"Kształt wektorów testowych: {seq_vectors_test.shape}")

Kształt wektorów treningowych: (9577, 100)
Kształt wektorów testowych: (1002, 100)


In [7]:
x_train = seq_vectors_train
y_train = np.array(df_train['rating'])
x_test = seq_vectors_test
y_test = np.array(df_test['rating'])

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(9577, 100)
(1002, 100)
(9577,)
(1002,)


In [8]:
parameters = {'kernel':['poly', 'rbf'], 'C':[0.1, 1, 10], 'degree':[3, 4]}
#parameters = {'kernel':['rbf'], 'C':[0.1, 1, 10]}
svc = SVC()
svc_grid = GridSearchCV(svc, parameters, scoring='accuracy', verbose=3)
svc_grid.fit(x_train,y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END ......C=0.1, degree=3, kernel=poly;, score=0.448 total time=   6.2s
[CV 2/5] END ......C=0.1, degree=3, kernel=poly;, score=0.445 total time=   6.3s
[CV 3/5] END ......C=0.1, degree=3, kernel=poly;, score=0.440 total time=   6.1s
[CV 4/5] END ......C=0.1, degree=3, kernel=poly;, score=0.428 total time=   6.1s
[CV 5/5] END ......C=0.1, degree=3, kernel=poly;, score=0.449 total time=   6.0s
[CV 1/5] END .......C=0.1, degree=3, kernel=rbf;, score=0.443 total time=   8.9s
[CV 2/5] END .......C=0.1, degree=3, kernel=rbf;, score=0.447 total time=   9.0s
[CV 3/5] END .......C=0.1, degree=3, kernel=rbf;, score=0.444 total time=   8.7s
[CV 4/5] END .......C=0.1, degree=3, kernel=rbf;, score=0.437 total time=   8.6s
[CV 5/5] END .......C=0.1, degree=3, kernel=rbf;, score=0.450 total time=   8.1s
[CV 1/5] END ......C=0.1, degree=4, kernel=poly;, score=0.449 total time=   5.6s
[CV 2/5] END ......C=0.1, degree=4, kernel=poly;

In [9]:
def predict_function(model, x):
    return model.predict(x)

svc_classifier = svc_grid.best_estimator_
best_svc_params = svc_grid.best_params_
svc_train_score = svc_grid.best_score_

print(f"Najlepsze znalezione parametry dla SVC:\n{best_svc_params}\n\n"\
     f"Wynik na zbiorze testowym dla SVC: {svc_train_score}\n")


metrics = Metrics(svc_classifier, predict_function)
print("Metryki dla zbioru testowego dla klasyfikatora SVC")
_ = metrics.accuracy(x_test, y_test)
_ = metrics.precision(x_test, y_test)
_ = metrics.recall(x_test, y_test)
_ = metrics.f1(x_test, y_test)

metrics.print_confusion_matrix(x_test, y_test)
metrics.print_metrics(x_test, y_test)

print(f"preprocessing: {options}\n\nmax words: {MAX_WORDS}")

Najlepsze znalezione parametry dla SVC:
{'C': 10, 'degree': 3, 'kernel': 'rbf'}

Wynik na zbiorze testowym dla SVC: 0.51122644543408

Metryki dla zbioru testowego dla klasyfikatora SVC
Dokładność: 0.46506986027944114
Precyzja: [0.5037037  0.19230769 0.10526316 0.35365854 0.51146384]
Czułość: [0.6507177  0.04237288 0.04347826 0.14427861 0.86309524]
F1: [0.56784969 0.06944444 0.06153846 0.204947   0.64230343]
[[136   9  14   3  47]
 [ 49   5  13  12  39]
 [ 39   7   6  24  62]
 [ 18   4  21  29 129]
 [ 28   1   3  14 290]]
╒═══════╤═══════╤═══════╤═══════╤═══════╕
│ 0.504 │ 0.192 │ 0.105 │ 0.354 │ 0.511 │
├───────┼───────┼───────┼───────┼───────┤
│ 0.651 │ 0.042 │ 0.043 │ 0.144 │ 0.863 │
├───────┼───────┼───────┼───────┼───────┤
│ 0.568 │ 0.069 │ 0.062 │ 0.205 │ 0.642 │
╘═══════╧═══════╧═══════╧═══════╧═══════╛
Dokładność: 0.465
preprocessing: [<PrepOption.NUMBERS: 7>, <PrepOption.RUBBISH: 6>, <PrepOption.INTERPUNCTION: 4>]

max words: 8000


In [10]:
parameters = {'criterion':['gini', 'entropy'], 'max_features':[None, "sqrt"]}
tree = DecisionTreeClassifier()
tree_grid = GridSearchCV(tree, parameters, scoring='accuracy', verbose=3)
tree_grid.fit(x_train,y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END .criterion=gini, max_features=None;, score=0.352 total time=   0.9s
[CV 2/5] END .criterion=gini, max_features=None;, score=0.350 total time=   0.9s
[CV 3/5] END .criterion=gini, max_features=None;, score=0.352 total time=   0.9s
[CV 4/5] END .criterion=gini, max_features=None;, score=0.364 total time=   0.8s
[CV 5/5] END .criterion=gini, max_features=None;, score=0.348 total time=   0.9s
[CV 1/5] END .criterion=gini, max_features=sqrt;, score=0.353 total time=   0.1s
[CV 2/5] END .criterion=gini, max_features=sqrt;, score=0.338 total time=   0.1s
[CV 3/5] END .criterion=gini, max_features=sqrt;, score=0.348 total time=   0.1s
[CV 4/5] END .criterion=gini, max_features=sqrt;, score=0.339 total time=   0.1s
[CV 5/5] END .criterion=gini, max_features=sqrt;, score=0.339 total time=   0.1s
[CV 1/5] END criterion=entropy, max_features=None;, score=0.339 total time=   1.6s
[CV 2/5] END criterion=entropy, max_features=No

In [11]:
tree_classifier = tree_grid.best_estimator_
best_tree_params = tree_grid.best_params_
tree_train_score = tree_grid.best_score_

print(f"Najlepsze znalezione parametry dla drzewa decyzyjnego:\n{best_tree_params}\n\n"\
     f"Wynik na zbiorze treningowym dla drzewa decyzyjnego: {tree_train_score}\n"\
     f"Głębokość drzewa decyzyjnego: {tree_classifier.get_depth()}\n")


metrics = Metrics(tree_classifier, predict_function)
print("Metryki dla zbioru testowego dla drzewa decyzyjnego")
_ = metrics.accuracy(x_test, y_test)
_ = metrics.precision(x_test, y_test)
_ = metrics.recall(x_test, y_test)
_ = metrics.f1(x_test, y_test)

metrics.print_confusion_matrix(x_test, y_test)
metrics.print_metrics(x_test, y_test)

print(f"preprocessing: {options}\n\nmax words: {MAX_WORDS}")

Najlepsze znalezione parametry dla drzewa decyzyjnego:
{'criterion': 'gini', 'max_features': None}

Wynik na zbiorze treningowym dla drzewa decyzyjnego: 0.3531382285767237
Głębokość drzewa decyzyjnego: 30

Metryki dla zbioru testowego dla drzewa decyzyjnego
Dokładność: 0.3413173652694611
Precyzja: [0.3908046  0.16176471 0.19847328 0.26130653 0.48066298]
Czułość: [0.32535885 0.18644068 0.1884058  0.25870647 0.51785714]
F1: [0.35509138 0.17322835 0.19330855 0.26       0.49856734]
[[ 68  25  35  40  41]
 [ 25  22  22  18  31]
 [ 19  18  26  28  47]
 [ 25  36  19  52  69]
 [ 37  35  29  61 174]]
╒═══════╤═══════╤═══════╤═══════╤═══════╕
│ 0.391 │ 0.162 │ 0.198 │ 0.261 │ 0.481 │
├───────┼───────┼───────┼───────┼───────┤
│ 0.325 │ 0.186 │ 0.188 │ 0.259 │ 0.518 │
├───────┼───────┼───────┼───────┼───────┤
│ 0.355 │ 0.173 │ 0.193 │ 0.26  │ 0.499 │
╘═══════╧═══════╧═══════╧═══════╧═══════╛
Dokładność: 0.341
preprocessing: [<PrepOption.NUMBERS: 7>, <PrepOption.RUBBISH: 6>, <PrepOption.INTERPUNCTI