In [4]:
import sys
import os
import glob
import json
import random
import unidecode
import time
import re

import numpy as np

import seaborn as sn
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline


import pandas as pd

from transformers import BertTokenizer, BertForSequenceClassification

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

from torchtext.utils import download_from_url, extract_archive
from torch.autograd import Variable
from torchtext import legacy
from torch import autograd
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
import torch
import torchtext

import gensim
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

from nltk.corpus import stopwords
from nltk.stem.snowball import HungarianStemmer
from nltk.tokenize import RegexpTokenizer
import nltk

import hu_core_ud_lg

In [5]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
def find_files(path):
    return glob.glob(path)

In [8]:
def read_json(filename):
    with open(filename) as json_file:
        data = json.load(json_file)
    return data

### Az adathalmaz letöltése

In [9]:
dataset_tar = download_from_url('https://drive.google.com/uc?id=1k7GfVRqrHFK00ABkit0oGQo62fCakMSZ', root='.data/')
extracted_files = extract_archive(dataset_tar)

## Adatok olvasása
Adatok beolvasása JSON fájlként, majd egy nagy pandas-os DataFrame-mé alakítása.

In [10]:
json_data = []

for file in find_files('.data/gyakori_*'):
    print("Reading file:", file)
    json_data.append(read_json(file))
    
data_frame = pd.DataFrame()

for data in json_data:
    frames  = [data_frame, pd.DataFrame(data)]
    data_frame = pd.concat(frames).reset_index(drop=True)

Reading file: .data/gyakori_szamitastechnika
Reading file: .data/gyakori_egeszseg
Reading file: .data/gyakori_egeszseg_20000
Reading file: .data/gyakori_allatok_14000
Reading file: .data/gyakori_szorakozas_30000


## Bepillantás a kérdésekbe
Csak hogy tudjuk pontosan mivel is állunk szemben. Minden kérdéshez tartozik a kérdés rövid, illetve hosszú verziója, egy válasz, amit a felhasználók a leghasznosabbnak találtak. Ezeken kívül kategóriák és kulcsszavak is vannak a kérdéshez.

In [11]:
data_frame.head(2)

Unnamed: 0,valasz,kategoriak,hosszu_kerdes,rovid_kerdes,keywords
0,Várak régen is voltak. Ha mindhárom tornyot le...,"[Számítástechnika, Internet]",Miért lett ilyen sz@r a honfoglaló? Régen tök ...,Miért lett ilyen sz@r a honfoglaló?,"[Honfoglaló, vár]"
1,"Ahogy írták, az stdio az a C-s függvénykönyvtá...","[Számítástechnika, Programozás]",C++-ban melyiket érdemesebb használni? Stdio v...,C++-ban melyiket érdemesebb használni? Stdio v...,"[C++, iostream, konzol, Windows, Visual Studio]"


### Túl kicsi kategóriák szűrése
Ha egy kategóriához túl kevés kérdés tartozik, akkor nem érdemes a továbbiakban foglalkozni vele. A túl kicsi kategóriák szűrése következik.

In [12]:
target_names = data_frame['kategoriak'].apply(lambda x: x[0]).unique().tolist()

minimum_questions_for_each_category = 3000

for target in target_names:
    target_size = data_frame[data_frame['kategoriak'].apply(lambda x : x[0]) == target].shape[0]
    if target_size < minimum_questions_for_each_category:
        data_frame = data_frame[data_frame['kategoriak'].apply(lambda x : x[0]) != target]


### Túl rövid kérdések szűrése
A túl rövid kérdések nem túl hasznosak. Az adathalmazban előfordul pár 2 szóból álló HOSSZÚ kérdés. Vegyük például a következő kérdéseket: `Militaryra appalosa?`, `Ivabradine vélemények?`. Ezek a kérdések nem meghatározóak a témájukra nézve.

In [13]:
minimum_words_per_question = 5
data_frame = data_frame[data_frame['hosszu_kerdes'].apply(lambda x: len(x.split())) >= minimum_words_per_question]

## "Főkategóriák" kigyűjtése
Az egyes főkategóriák neveinek kigyűjtése, majd az egyes nevekhez egy azonosító szám rendelése.

In [14]:
def get_target_names(idx, data_frame):
    target_names = data_frame['kategoriak'].apply(lambda x: x[idx]).unique().tolist()
    target_dict =  {value: key for key, value in enumerate(target_names)}
    
    return target_names, target_dict, len(target_names)

In [15]:
target_names, target_dict, num_of_categories = get_target_names(0, data_frame)

print(target_names)
print(target_dict)
print("Kategóriák száma: ", num_of_categories)

['Számítástechnika', 'Egészség', 'Állatok', 'Szórakozás']
{'Számítástechnika': 0, 'Egészség': 1, 'Állatok': 2, 'Szórakozás': 3}
Kategóriák száma:  4


## Tanító adathalmaz előállítása
A tanító adathalmazban minden egyes főkategóriából ugyanannyi kérdésnek kell szerepelnie (így fair). Itt pontosan ez történik `questions_from_each_category` darab kérdés kerül a tanító adathalmazba minden kategóriából, majd az eredményül kapott tömb véletlenszerűen összekeveredik.

#### Shuffle together
A shuffle_together függvény két listát véletlenszerűen kever össze, úgy, hogy az a keverés előtt az egyes listákban azonos indexen szereplő értékek a keverés után is azonos indexen lesznek.

In [16]:
def shuffle_together(list1, list2):
    zipped = list(zip(list1, list2))
    random.shuffle(zipped)
    list1, list2 = zip(*zipped)
    
    return (list(list1), list(list2))

#### Least questions in category
Ez a találó nevű függvény azt akarja kiszámolni, hogy a legkevesebb kérdéssel rendelkező kategóriában mennyi kérdés van. Ezt azért számolom ki, hogy a tanító és tesztelő adathalmazokba ugyanannyi kérdés kerülhessen minden kategóriába.


In [17]:
def least_questions_in_ctg(idx, target_names, data_frame):
    min_amount = float('inf')

    for target_name in target_names:
        amount = len(data_frame[data_frame["kategoriak"].apply(lambda x : x[idx]) == target_name]['kategoriak'])
        min_amount = min(amount, min_amount)
    return min_amount

#### Split datasets

In [18]:
def split_datasets(train_ratio, questions_size, ctg_idx, data_frame, num_of_categories, target_names, target_dict):
    
    train_each_ctg = int(train_ratio * questions_size)
    train_size = train_each_ctg * num_of_categories
    train_questions = []
    train_target = []
    
    test_each_ctg = int((1.0 - train_ratio) * questions_size)
    test_size = test_each_ctg * num_of_categories
    test_questions = []
    test_target = []
    
    for target_name in target_names:
        train_questions += data_frame[data_frame["kategoriak"].apply(lambda x: x[ctg_idx]) == target_name][0:train_each_ctg]["hosszu_kerdes"].to_list()
        train_target += [target_dict[target_name]] * train_each_ctg    
    
        test_questions += data_frame[data_frame["kategoriak"].apply(lambda x: x[ctg_idx]) == target_name][train_each_ctg:train_each_ctg + test_each_ctg]["hosszu_kerdes"].to_list()
        test_target += [target_dict[target_name]] * test_each_ctg
    
    train_questions, train_target = shuffle_together(train_questions, train_target)
    test_questions, test_target = shuffle_together(test_questions, test_target)
    
    return train_each_ctg, train_size, train_questions, train_target,\
        test_each_ctg, test_size, test_questions, test_target

In [19]:
train_ratio = 0.80
test_ratio = 1.0 - train_ratio

min_amount = least_questions_in_ctg(0, target_names, data_frame)
train_each_ctg, train_size, train_questions, train_target, \
    test_each_ctg, test_size, test_questions, test_target = \
        split_datasets(train_ratio, min_amount, 0, data_frame, num_of_categories, target_names, target_dict)

In [20]:

def unicode_to_ascii(data):
    return unidecode.unidecode(re.sub(r"[,.;@#?!&$]+\ *", " ", data).lower()).split()

#### Get vocab

In [21]:
def get_vocab(questions):
    vocab = set()
    
    for idx, question in enumerate(questions):
        words = unicode_to_ascii(question)

        for idx in range(len(words)):
            vocab.add(words[idx])

    vocab_size = len(vocab)

    index_to_word = {}

    for idx, word in enumerate(vocab):
        index_to_word[word] = idx
        
    return vocab, vocab_size, index_to_word

In [22]:
vocab, vocab_size, index_to_word = get_vocab(train_questions)

In [23]:
def get_batch(text, target, i, batch_size, input_size, index_to):

    batches = []
    results = []
    
    texts = text[i * batch_size : (i + 1) * batch_size]
    categories = target[i * batch_size : (i + 1) * batch_size]

    for text in texts:
        layer = np.zeros(input_size, dtype=float)
        words = unicode_to_ascii(text)
        
        for word in words:
            if word in index_to:
                layer[index_to[word]] += 1
            
        batches.append(layer)
        
    for category in categories:
        results.append(category)
     
    return np.array(batches), np.array(results)

In [24]:
learning_rate = 0.01
num_epochs = 2
batch_size = 200

hidden_size = 100
input_size = vocab_size
num_classes = len(target_names)

In [25]:
class BOWClassification(nn.Module):
     def __init__(self, input_size, hidden_size, num_classes):
        super(BOWClassification, self).__init__()
        self.layer_1 = nn.Linear(input_size, hidden_size, bias=True)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.output_layer = nn.Linear(hidden_size, num_classes, bias=True)
 
     def forward(self, x):
        out = self.layer_1(x)
        out = self.relu(out)
        out = self.layer_2(out)
        out = self.relu(out)
        out = self.output_layer(out)
        return out

#### Get net

In [26]:
def get_net(vocab_size, hidden_size, num_classes, learning_rate, num_epochs, train_questions, train_target, batch_size, index_to_word):
    net = BOWClassification(vocab_size, hidden_size, num_classes)

    criterion = nn.CrossEntropyLoss()  
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)  


    for epoch in range(num_epochs):
        if epoch:
            print()
        print("Epoch %d/%d: " % (epoch + 1, num_epochs))
        total_batch = len(train_questions) // batch_size
        for i in range(total_batch):
            batch_x, batch_y = get_batch(train_questions, train_target, i, batch_size, vocab_size, index_to_word)
            questions = Variable(torch.FloatTensor(batch_x))
            themes = Variable(torch.LongTensor(batch_y))

            optimizer.zero_grad()
            outputs = net(questions)
            loss = criterion(outputs, themes)
            loss.backward()
            optimizer.step()

            print("\r[%d/%d] %.2f%%" % (i + 1, total_batch, (i + 1)/ total_batch * 100), end="")
    
    return net

In [None]:
net = get_net(vocab_size, hidden_size, num_classes, learning_rate, num_epochs, train_questions, train_target, batch_size, index_to_word)

In [None]:
def test_net(net, test_q, test_t, batch_s, vocab_s, index2word):
    total_batch = len(test_q) // batch_s

    total_pred = []

    for i in range(total_batch):
        test_batch_x, test_batch_y = get_batch(test_q, test_t, i, batch_s, vocab_s, index2word)
        print("\rTesting... [%d/%d] %.2f%%" % (i + 1, total_batch, (i + 1)/ total_batch * 100), end="")

        questions = Variable(torch.FloatTensor(test_batch_x))
        themes = Variable(torch.FloatTensor(test_batch_y))

        outputs = net(questions)
        _, predicted = torch.max(outputs.data, 1)
        total_pred += predicted.tolist()
        
    return total_pred

In [None]:
total_predicted = test_net(net, test_questions, test_target, batch_size, vocab_size, index_to_word)

## Tesztelés eredménye

### Confusion matrix
Tévesztési mátrix magyarul. A mátrix `i` sorában, `j` oszlopában szereplő érték (ebben az esetben) azt jelenti, hogy a mondat a `i` kategóriájú, de a háló `j` kategóriát ismert fel.

In [None]:
cm = confusion_matrix(test_target[0:len(total_predicted)], total_predicted)
cm_df = pd.DataFrame(cm, index=target_names, columns=target_names)

heatmap = sn.heatmap(cm_df, annot=True, cmap='Reds', fmt='g', annot_kws={"size": 15}, cbar=False)
plt.show()

## Classification report
#### Accuracy
A accuracy érték jelentése: a kérdések mekkora részét sikerült helyesen osztályozni.
#### Precision
A Szórakozás kategóriához tartozó precision érték azt jelenti, hogy az összes Szórakozás kategóriába sorolt kérdés közül mekkora arányban vannak a ténylegesen Szórakozás kategóriájú kérdések.
#### Recall
A recall érték a precision értékhez eléggé hasonló. Az összes ténylegesen Szórakozás kategóriájú kérdés mekkora részét sorolta Szórakozás kategóriába az osztályozó.

In [None]:
class_report = classification_report(test_target[0:len(total_predicted)], total_predicted, target_names=target_names)
print(class_report)

## Alkategória osztályozás
A főkategória osztályozáshoz hasonló módszerrel próbálkozva. A 4 főkategóriát most elkülönítjük egymástól, 1 tanító adathalamzban csak 1 főkategóriához tartozó kérdések lesznek. Ennek megfelelően az eddig látott dolgokból 4 fog kelleni (háló, tévesztési mátrix, tanító-, tesztelő halmaz stb.)

#### Get target names

In [None]:
target_names_sub = {}
target_dict_sub = {}
num_of_ctg_sub = {}

filtered_df = {}

for t in target_names:
    filtered_df[t] = data_frame[data_frame['kategoriak'].apply(lambda x: x[0]) == t]
    filtered_df[t] = filtered_df[t][filtered_df[t]['kategoriak'].apply(lambda x: x[1]) != 'Egyéb kérdések']
    target_names_sub[t], target_dict_sub[t], num_of_ctg_sub[t] = get_target_names(1, filtered_df[t])

#### Split datasets

In [None]:
train_ratio = 0.8

min_amount_sub = {}

train_each_sub = {}
train_questions_sub = {}
train_size_sub = {}
train_target_sub = {}

test_each_sub = {}
test_questions_sub = {}
test_size_sub = {}
test_target_sub = {}

for t in target_names:
    min_amount_sub[t] = least_questions_in_ctg(1, target_names_sub[t], filtered_df[t])
    
    train_each_sub[t], train_size_sub[t], train_questions_sub[t], train_target_sub[t], \
        test_each_sub[t], test_size_sub[t], test_questions_sub[t], test_target_sub[t] = \
            split_datasets(train_ratio, min_amount_sub[t], 1, filtered_df[t], num_of_ctg_sub[t], target_names_sub[t], target_dict_sub[t])


#### Get vocab

In [None]:
vocab_sub = {}
vocab_size_sub = {}
index_to_word_sub = {}

for t in target_names:
    vocab_sub[t], vocab_size_sub[t], index_to_word_sub[t] = get_vocab(train_questions_sub[t])

In [None]:
learning_rate = 0.005
num_epochs = 2
batch_size = 75

hidden_size = 200

In [None]:
net_sub = {}

for t in target_names:
    print(t)
    net_sub[t] = get_net(vocab_size_sub[t], hidden_size, num_of_ctg_sub[t], learning_rate, num_epochs, train_questions_sub[t], train_target_sub[t], batch_size, index_to_word_sub[t])
    print()

In [None]:
total_pred_sub = {}

for t in target_names:
    print(t)
    total_pred_sub[t] = test_net(net_sub[t], test_questions_sub[t], test_target_sub[t], batch_size, vocab_size_sub[t], index_to_word_sub[t])
    print()

In [None]:
for t in target_names:
    cm = confusion_matrix(test_target_sub[t][0:len(total_pred_sub[t])], total_pred_sub[t])
    cm_df = pd.DataFrame(cm, index=target_names_sub[t], columns=target_names_sub[t])

    plt.figure(figsize = (10,8))
    heatmap = sn.heatmap(cm_df, annot=True, cmap='Reds', fmt='g', annot_kws={"size": 15}, cbar=False)
    plt.show()


In [None]:
for t in target_names:
    class_report = classification_report(test_target_sub[t][0:len(total_pred_sub[t])], total_pred_sub[t], target_names=target_names_sub[t])
    print(class_report)

# Embedding
Get hungarian glove file

In [27]:
download_from_url('https://drive.google.com/uc?id=19k2AACA90Qv1BeUBz8H6trCT4QTJ8OjW', root='.')

'/nlp/glove-hu.200k.200d.txt'

In [29]:

glove_file = "glove-hu.200k.200d.txt"
gensim_file = "glove-hu.200k.200d_gensim"

_ = glove2word2vec(glove_file, gensim_file)

  _ = glove2word2vec(glove_file, gensim_file)


In [30]:
embedding = gensim.models.KeyedVectors.load_word2vec_format(gensim_file, binary=False)

## Stemming

In [31]:
tokenizer = RegexpTokenizer(r'\w+')

stemmer = HungarianStemmer()
q = train_questions[0]

nltk.download('stopwords')
stop_words = set(stopwords.words('hungarian'))

# tokenizálás
tokenized_sentence = tokenizer.tokenize(q)

# stop word szűrés
filtered_sentence = [w for w in tokenized_sentence if not w in stop_words]

# lemmatizáció
lemmatizer = hu_core_ud_lg.load()

lemmatizer.remove_pipe('parser')
lemmatizer.remove_pipe('ner')
lemmatizer.add_pipe(lemmatizer.create_pipe('sentencizer'))

doc = lemmatizer(" ".join(filtered_sentence))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
print("Eredeti kérdés:")
print(q)

print("\nTokenizálás utáni kérdés:")
for w in tokenized_sentence:
    print(w, end=" ")

print("\n\nStop Word szűrés utáni kérdés:")
for w in filtered_sentence:
    print(w, end=" ")

print("\n\nLemmatizáció utáni kérdés:")
for w in doc:
    print(w.lemma_.lower(), end=" ")
    
print("\n\nStemming utáni kérdés:")
for w in tokenized_sentence:
    print(stemmer.stem(w), end=" ")

Eredeti kérdés:
Tudnátok írni pár hobbit? Mostanában sokat unatkozok és ki kéne valami jó elfoglaltságot.. Köszönöm :)

Tokenizálás utáni kérdés:
Tudnátok írni pár hobbit Mostanában sokat unatkozok és ki kéne valami jó elfoglaltságot Köszönöm 

Stop Word szűrés utáni kérdés:
Tudnátok írni pár hobbit Mostanában unatkozok kéne elfoglaltságot Köszönöm 

Lemmatizáció utáni kérdés:
tudnát ír pár hobbi mostanában unatkoz kell elfoglaltság köszön 

Stemming utáni kérdés:
tudnát írn pár hobb mostan sok unatkoz és ki kén valam jó elfoglaltság köszönö 

In [33]:
def lemmatize_question(question):
    doc = lemmatizer(question)
    lemmatized = []
    
    for w in doc:
        lemmatized.append(w.lemma_.lower())
    return " ".join(lemmatized)

In [34]:
def stem_question(question):
    stemmed = []
    
    for w in question.split():
        stemmed.append(stemmer.stem(w))
    return " ".join(stemmed)

In [35]:
def create_datasets(questions):
    lemmatized_list = []
    stemmed_list = []
    lemmatized_filtered_list = []
    stemmed_filtered_list = []
    
    start_time = int(time.time() * 1000)
    
    for idx, q in enumerate(questions):
        tokenized = tokenizer.tokenize(q)
        tokenized_q = " ".join(tokenized)
        filtered = [w for w in tokenized if not w in stop_words]

        filtered_q = " ".join(filtered)

        lemmatized_list.append(lemmatize_question(tokenized_q))
        stemmed_list.append(stem_question(tokenized_q))
        
        lemmatized_filtered_list.append(lemmatize_question(filtered_q))
        stemmed_filtered_list.append(stem_question(filtered_q))

        if idx % 100 == 0:
            print("\r%8d / %8d" % (idx, len(questions)), end="")

    print()
    end_time = int(time.time() * 1000)
    print("Creating datasets took: %f seconds" % ((end_time - start_time) / 1000.0))

    return lemmatized_list, stemmed_list, lemmatized_filtered_list, stemmed_filtered_list

In [None]:
lemmatized_train = []
lemmatized_test = []

stemmed_train = []
stemmed_test = []

lemmatized_filtered_train = []
lemmatized_filtered_test = []

stemmed_filtered_train = []
stemmed_filtered_test = []

lemmatized_train, stemmed_train, lemmatized_filtered_train, stemmed_filtered_train = create_datasets(train_questions)
print("Train datasets are ready")
lemmatized_test, stemmed_test, lemmatized_filtered_test, stemmed_filtered_test = create_datasets(test_questions)
print("Test datasets are ready")
    
lemma_trdf = pd.DataFrame(list(zip(train_target, lemmatized_train)), columns =['Target', 'Question']) 
lemma_tedf = pd.DataFrame(list(zip(test_target, lemmatized_test)), columns =['Target', 'Question'])

stem_trdf = pd.DataFrame(list(zip(train_target, stemmed_train)), columns =['Target', 'Question']) 
stem_tedf = pd.DataFrame(list(zip(test_target, stemmed_test)), columns =['Target', 'Question'])

lemmaf_trdf = pd.DataFrame(list(zip(train_target, lemmatized_filtered_train)), columns =['Target', 'Question']) 
lemmaf_tedf = pd.DataFrame(list(zip(test_target, lemmatized_filtered_test)), columns =['Target', 'Question'])

stemf_trdf = pd.DataFrame(list(zip(train_target, stemmed_filtered_train)), columns =['Target', 'Question']) 
stemf_tedf = pd.DataFrame(list(zip(test_target, stemmed_filtered_test)), columns =['Target', 'Question'])

lemma_trdf.to_csv(".csv/lemma_trdf.csv", index=False)
lemma_tedf.to_csv(".csv/lemma_tedf.csv", index=False)

stem_trdf.to_csv(".csv/stem_trdf.csv", index=False)
stem_tedf.to_csv(".csv/stem_tedf.csv", index=False)

lemmaf_trdf.to_csv(".csv/lemmaf_trdf.csv", index=False)
lemmaf_tedf.to_csv(".csv/lemmaf_tedf.csv", index=False)

stemf_trdf.to_csv(".csv/stemf_trdf.csv", index=False)
stemf_tedf.to_csv(".csv/stemf_tedf.csv", index=False)

In [36]:
dataset_tar = download_from_url('https://drive.google.com/uc?id=1LQkdBq9KW0wqgT9NG0k0VpCGqAtCGFkB', root='.csv/')
extracted_files = extract_archive(dataset_tar)

In [37]:
QUESTION = legacy.data.Field(tokenize = 'spacy', tokenizer_language = 'hu_core_ud_lg')
TARGET = legacy.data.LabelField(dtype = torch.long)

In [38]:
fields = [('Target', TARGET),('Question', QUESTION)]

train, test = legacy.data.TabularDataset.splits(
                                        path = '.csv',
                                        train = 'lemma_trdf.csv',
                                        test = 'lemma_tedf.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True
)

In [39]:
train, valid = train.split(random_state = random.seed(SEED))

In [40]:
vec = torchtext.vocab.Vectors('glove-hu.200k.200d_gensim', cache = '.')

QUESTION.build_vocab(train, vectors = vec)  
TARGET.build_vocab(train)

In [41]:
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = legacy.data.BucketIterator.splits((train, valid, test), batch_size = BATCH_SIZE,
                                                                           sort_key = lambda x: len(x.Question),
                                                                           sort_within_batch = False,
                                                                           device = device)

In [42]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.embedding.weight.data.copy_(QUESTION.vocab.vectors)
        self.embedding.weight.requires_grad=True
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)

        
    def forward(self, text):
        
        embedded = self.embedding(text)

        output, hidden = self.lstm(embedded)

        y = self.fc(output[-1])
        
        log_probs = F.log_softmax(y.squeeze(0))
        return log_probs

In [43]:
INPUT_DIM = len(QUESTION.vocab)
EMBEDDING_DIM = 200
HIDDEN_DIM = 100
OUTPUT_DIM = 4

In [44]:
model = LSTMClassifier(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.NLLLoss()

model = model.to(device)
criterion = criterion.to(device)

In [45]:
def class_accuracy(preds, y):
    rounded_preds = preds.argmax(1)
    correct = (rounded_preds == y).float()

    acc = correct.sum() / len(correct)
    return acc

In [50]:
def train_(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:

        optimizer.zero_grad()
                
        predictions = model(batch.Question)

        loss = criterion(predictions, batch.Target)
        
        acc = class_accuracy(predictions, batch.Target)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [51]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    total_predicted = []
    with torch.no_grad():
    
        for batch in iterator:
            predictions = model(batch.Question)
            loss = criterion(predictions, batch.Target)
            
            acc = class_accuracy(predictions, batch.Target)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

            _, predicted = torch.max(predictions.data, 1)
            total_predicted += predicted.tolist()

    return epoch_loss / len(iterator), epoch_acc / len(iterator), total_predicted

In [52]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
N_EPOCHS = 30

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train_(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc, _ = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'LSTM.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

  log_probs = F.log_softmax(y.squeeze(0))


Epoch: 01 | Epoch Time: 0m 21s
	Train Loss: 1.385 | Train Acc: 25.12%
	 Val. Loss: 1.311 |  Val. Acc: 37.98%
Epoch: 02 | Epoch Time: 0m 21s
	Train Loss: 1.309 | Train Acc: 30.72%
	 Val. Loss: 1.426 |  Val. Acc: 28.62%
Epoch: 03 | Epoch Time: 0m 22s
	Train Loss: 1.325 | Train Acc: 29.44%
	 Val. Loss: 0.864 |  Val. Acc: 60.47%
Epoch: 04 | Epoch Time: 0m 22s
	Train Loss: 0.777 | Train Acc: 60.75%
	 Val. Loss: 0.629 |  Val. Acc: 77.06%
Epoch: 05 | Epoch Time: 0m 22s
	Train Loss: 0.254 | Train Acc: 92.45%
	 Val. Loss: 0.406 |  Val. Acc: 88.54%
Epoch: 06 | Epoch Time: 0m 22s
	Train Loss: 0.151 | Train Acc: 95.86%
	 Val. Loss: 0.347 |  Val. Acc: 90.38%
Epoch: 07 | Epoch Time: 0m 22s
	Train Loss: 0.062 | Train Acc: 98.36%
	 Val. Loss: 0.333 |  Val. Acc: 90.27%
Epoch: 08 | Epoch Time: 0m 21s
	Train Loss: 0.035 | Train Acc: 99.17%
	 Val. Loss: 0.299 |  Val. Acc: 90.74%
Epoch: 09 | Epoch Time: 0m 21s
	Train Loss: 0.022 | Train Acc: 99.52%
	 Val. Loss: 0.292 |  Val. Acc: 91.16%


In [None]:
model.load_state_dict(torch.load('LSTM.pt'))

test_loss, test_acc, total_predicted = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

In [None]:
test_target = []

for batch in test_iterator:
    test_target += batch.Target.tolist()

cm = confusion_matrix(test_target, total_predicted)
cm_df = pd.DataFrame(cm, index=target_names, columns=target_names)

heatmap = sn.heatmap(cm_df, annot=True, cmap='Reds', fmt='g', annot_kws={"size": 15}, cbar=False)
plt.show()

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

In [6]:
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

In [7]:
max_input_length = 128

In [8]:
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length-2]
    return tokens

In [9]:
from torchtext.legacy import data

QUESTION = data.Field(batch_first = True,
                        use_vocab = False,
                        tokenize = tokenize_and_cut,
                        preprocessing = tokenizer.convert_tokens_to_ids,
                        init_token = init_token_idx,
                        eos_token = eos_token_idx,
                        pad_token = pad_token_idx,
                        unk_token = unk_token_idx)
TARGET = data.LabelField(dtype = torch.long)

In [10]:
fields = [('Target', TARGET),('Question', QUESTION)]

train_data, test_data = legacy.data.TabularDataset.splits(
                                        path = '.csv',
                                        train = 'lemma_trdf.csv',
                                        test = 'lemma_tedf.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True
)

train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [11]:
TARGET.build_vocab(train_data)

In [12]:
BATCH_SIZE = 8

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.Question),
    sort_within_batch = False,
    device = device)

In [13]:
bert = BertForSequenceClassification.from_pretrained('bert-base-multilingual-uncased', num_labels = 4)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

In [14]:
class BERT(nn.Module):

    def __init__(self, bert):
        super(BERT, self).__init__()
        self.encoder = bert

    def forward(self, text, target):
        loss, text_fea = self.encoder(text, labels=target)[:2]

        return loss, text_fea

In [15]:
def save_checkpoint(save_path, model, valid_loss):

    if save_path == None:
        return
    
    state_dict = {'model_state_dict': model.state_dict(),
                  'valid_loss': valid_loss}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):

    if save_path == None:
        return
    
    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_metrics(load_path):

    if load_path==None:
        return
    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    
    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']

In [None]:
def train(model,
          optimizer,
          criterion = nn.CrossEntropyLoss(),
          train_loader = train_iterator,
          valid_loader = valid_iterator,
          num_epochs = 8,
          eval_every = len(train_iterator) // 2,
          file_path = ".",
          best_valid_loss = float("Inf")):
    
    # initialize running values
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    # training loop
    model.train()
    
    for epoch in range(num_epochs):
        start_time = time.time()

        batch_counter = 0
        
        for batch in train_loader:
            batch_counter += 1
            print("\r%d / %d" % (batch_counter, len(train_loader)), end="")
            output = model(batch.Question, batch.Target)
            loss, _ = output

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            # evaluation step
            if global_step % eval_every == 0:
                model.eval()
                with torch.no_grad():                    

                    # validation loop
                    for valid_batch in valid_loader:
                        output = model(valid_batch.Question, valid_batch.Target)
                        loss, _ = output
                        
                        valid_running_loss += loss.item()

                # evaluation
                average_train_loss = running_loss / eval_every
                average_valid_loss = valid_running_loss / len(valid_loader)
                train_loss_list.append(average_train_loss)
                valid_loss_list.append(average_valid_loss)
                global_steps_list.append(global_step)

                # resetting running values
                running_loss = 0.0                
                valid_running_loss = 0.0
                model.train()

                # print progress
                print('\nEpoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                              average_train_loss, average_valid_loss))
                
                # checkpoint
                if best_valid_loss > average_valid_loss:
                    best_valid_loss = average_valid_loss
                    save_checkpoint(file_path + '/' + 'model.pt', model, best_valid_loss)
                    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    print('\nFinished Training!')

model = BERT(bert).to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-5)

train(model=model, optimizer=optimizer)

3269 / 6539
Epoch [1/8], Step [3269/52312], Train Loss: 0.1643, Valid Loss: 0.1968
Model saved to ==> ./model.pt
Model saved to ==> ./metrics.pt
6538 / 6539
Epoch [1/8], Step [6538/52312], Train Loss: 0.1599, Valid Loss: 0.1757
Model saved to ==> ./model.pt
Model saved to ==> ./metrics.pt
3268 / 6539
Epoch [2/8], Step [9807/52312], Train Loss: 0.1137, Valid Loss: 0.1955
6537 / 6539
Epoch [2/8], Step [13076/52312], Train Loss: 0.1212, Valid Loss: 0.2208
3267 / 6539
Epoch [3/8], Step [16345/52312], Train Loss: 0.0876, Valid Loss: 0.2005
6536 / 6539
Epoch [3/8], Step [19614/52312], Train Loss: 0.0956, Valid Loss: 0.2045
3266 / 6539
Epoch [4/8], Step [22883/52312], Train Loss: 0.0707, Valid Loss: 0.1859
6535 / 6539
Epoch [4/8], Step [26152/52312], Train Loss: 0.0760, Valid Loss: 0.2577
3265 / 6539
Epoch [5/8], Step [29421/52312], Train Loss: 0.0562, Valid Loss: 0.2159
6534 / 6539
Epoch [5/8], Step [32690/52312], Train Loss: 0.0624, Valid Loss: 0.2235
3264 / 6539
Epoch [6/8], Step [35959/52