In [None]:
import sys
import os
import glob
import json
import pandas as pd
%matplotlib inline
import matplotlib
import numpy as np

In [None]:
def find_files(path):
    """Return the paths matching a glob pattern, in sorted order.

    Args:
        path: A glob pattern such as ``'Data/*'``.

    Returns:
        A list of matching path strings, sorted lexicographically. The list
        is empty when nothing matches.
    """
    # glob.glob makes no ordering promise; sorting keeps the load order (and
    # everything derived from it) the same from one run to the next.
    return sorted(glob.glob(path))

In [None]:
def read_json(filename):
    """Parse a JSON file and return its content.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON document (whatever ``json.load`` produces for it).
    """
    # Open explicitly as UTF-8: the platform default encoding is not UTF-8
    # everywhere, and accented text would then be mis-decoded or fail.
    with open(filename, encoding="utf-8") as json_file:
        return json.load(json_file)

### Adat beolvasása JSON fájlból

In [None]:
# Parse every file matched by 'Data/*'; each parsed document becomes one
# element of json_data, in the order find_files returns the paths.
json_data = []

for json_path in find_files('Data/*'):
    print("Reading file:", json_path)
    parsed_document = read_json(json_path)
    json_data.append(parsed_document)

### Dataframe-mé alakítás

In [None]:
# Build one frame per parsed file and concatenate them in a single call;
# growing a frame with concat inside a loop re-copies all rows each time.
frames = [pd.DataFrame(data) for data in json_data]
data_frame = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Reduce each question's category list to its first entry. This is assigned
# on the column itself: the rows yielded by iterrows() may be copies, so
# writing to them is not guaranteed to change data_frame.
if "kategoriak" in data_frame.columns:
    data_frame["kategoriak"] = data_frame["kategoriak"].map(
        lambda categories: categories[0]
    )

### Bepillantás a kérdésekbe

In [None]:
# Show the first two rows of the combined frame as a quick sanity check.
data_frame.head(2)

### Kérdések megszámolása, átlagos hossz számolása

In [None]:
def calculate_avg(data_frame, label="rovid_kerdes"):
    """Return the mean length of the values in one column.

    Args:
        data_frame: Frame holding the column to measure.
        label: Name of the column; ``len()`` is applied to each of its values.

    Returns:
        The average length as a float, or 0.0 when the frame has no rows
        (instead of failing with a division by zero).
    """
    column = data_frame[label]
    if column.empty:
        return 0.0

    # One vectorised pass over the column instead of a Python-level
    # iterrows() loop.
    lengths = column.map(len)
    return float(lengths.sum()) / len(lengths)

### Extrém rövid/hosszú kérdések eldobása

In [None]:
def drop_extreme(data_frame, min_, max_, label="rovid_kerdes"):
    """Keep only the rows whose value in `label` has a length in [min_, max_].

    Both bounds are inclusive. The returned frame keeps the index labels of
    the surviving rows; the input frame is not modified.
    """
    lengths = data_frame[label].map(len)
    within_bounds = (lengths >= min_) & (lengths <= max_)
    return data_frame[within_bounds]

In [None]:
# Mean length of the long questions before any length filtering.
long_questions_avg_len = calculate_avg(data_frame, label="hosszu_kerdes")
print("Average length: %f" % (long_questions_avg_len,))

In [None]:
# Keep only the questions whose length lies between a quarter of and three
# times the current mean, then recompute the mean on what is left.
# NOTE: re-running this cell filters again, relative to the new mean.
shortest_allowed = long_questions_avg_len / 4
longest_allowed = long_questions_avg_len * 3
data_frame = drop_extreme(
    data_frame, shortest_allowed, longest_allowed, label="hosszu_kerdes"
)

long_questions_avg_len = calculate_avg(data_frame, label="hosszu_kerdes")
print("Average length: %f" % (long_questions_avg_len,))

In [None]:
# target_names lists the distinct categories in order of first appearance;
# target_dir maps each category to its position in that list.
target_dir = {}
target_names = []

for category in data_frame["kategoriak"].tolist():
    if category in target_dir:
        continue
    target_dir[category] = len(target_names)
    target_names.append(category)

print(target_names)

In [None]:
import random

questions_from_each_category = 1500
# Fixed seed so the drawn sample and its shuffled order are the same on
# every run of the notebook.
sampling_seed = 42

long_questions = []
target_values = []

for target_name in target_names:
    category_rows = data_frame[data_frame["kategoriak"] == target_name]

    # DataFrame.sample raises when asked for more rows than exist (without
    # replacement), so cap the request at what the category actually has.
    sample_size = min(questions_from_each_category, len(category_rows))
    if sample_size < questions_from_each_category:
        print("Category %r has only %d questions; using all of them."
              % (target_name, sample_size))

    sample = category_rows.sample(n=sample_size, random_state=sampling_seed)
    long_questions += sample["hosszu_kerdes"].tolist()
    # One label per sampled question, so texts and labels stay aligned even
    # when fewer rows were drawn than requested.
    target_values += [target_dir[target_name]] * sample_size

# Shuffle questions and labels together so the pairs stay matched.
zipped = list(zip(long_questions, target_values))
random.Random(sampling_seed).shuffle(zipped)

if zipped:
    long_questions, target_values = (list(column) for column in zip(*zipped))
else:
    # zip(*[]) yields nothing to unpack; keep two empty lists instead.
    long_questions, target_values = [], []

In [None]:
def unicode_to_ascii(data):
    """Split a text into lowercase tokens transliterated by unidecode.

    Each run of the characters ``, . ; @ # ? ! & $`` (plus any spaces right
    after it) is replaced by a single space, the text is lowercased and
    passed through ``unidecode.unidecode``, and the result is split on
    whitespace.

    Returns:
        The list of tokens.
    """
    without_punctuation = re.sub(r"[,.;@#?!&$]+\ *", " ", data)
    transliterated = unidecode.unidecode(without_punctuation.lower())
    return transliterated.split()

In [None]:
import unidecode
import re
from collections import Counter

vocab = Counter()

# Tokenise every question, count its tokens, and store the question back as
# the space-joined token string.
for position in range(len(long_questions)):
    tokens = unicode_to_ascii(long_questions[position])
    vocab.update(tokens)
    long_questions[position] = " ".join(tokens)

total_words = len(vocab)

# Give each vocabulary entry a column index; keys are lowercased.
word_to_index = {token.lower(): position for position, token in enumerate(vocab)}

In [None]:
def get_batch(text, target, i, batch_size):
    """Build the i-th mini-batch as word-count vectors plus their labels.

    Reads the module-level ``total_words`` (vector width) and
    ``word_to_index`` (lowercased word -> column index).

    Args:
        text: Sequence of whitespace-separated token strings.
        target: Sequence of labels, parallel to `text`.
        i: Zero-based batch number.
        batch_size: Number of items per batch.

    Returns:
        A tuple ``(batches, results)``: `batches` is a float array of shape
        ``(n, total_words)`` holding per-word counts, `results` is an array
        of the matching labels. ``n`` is smaller than `batch_size` when the
        slice runs past the end of the input.
    """
    start = i * batch_size
    stop = start + batch_size
    texts = text[start:stop]
    categories = target[start:stop]

    batches = np.zeros((len(texts), total_words), dtype=float)
    for row, question in enumerate(texts):
        for word in question.split():
            column = word_to_index.get(word.lower())
            if column is None:
                # Word is not in the vocabulary: it has no column, so skip
                # it rather than fail with a KeyError.
                continue
            batches[row, column] += 1

    return batches, np.array(list(categories))

In [None]:
# Training hyperparameters.
learning_rate = 0.01  # step size handed to the optimizer
num_epochs = 5  # number of passes over the training data
batch_size = 150  # questions per mini-batch
display_step = 1  # NOTE(review): not referenced by any cell visible here

# Network dimensions.
hidden_size = 100  # width of the hidden layers
input_size = total_words  # one input feature per vocabulary entry
num_classes = len(target_names)  # one output per category

In [None]:
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch

In [None]:
class OurNet(nn.Module):
    """Fully connected classifier with two hidden ReLU layers.

    Args:
        input_size: Number of input features.
        hidden_size: Width of both hidden layers.
        num_classes: Number of output scores per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.layer_1 = nn.Linear(input_size, hidden_size, bias=True)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.output_layer = nn.Linear(hidden_size, num_classes, bias=True)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for `x`."""
        first_hidden = self.relu(self.layer_1(x))
        second_hidden = self.relu(self.layer_2(first_hidden))
        return self.output_layer(second_hidden)

In [None]:
# Smoke test of CrossEntropyLoss: random scores for 2 samples over 5 classes,
# random integer targets in [0, 5), then one backward pass.
loss = nn.CrossEntropyLoss()
input_ = torch.randn(2, 5, requires_grad=True)
target = torch.empty(2, dtype=torch.long).random_(5)
output = loss(input_, target)
output.backward()

In [None]:
net = OurNet(input_size, hidden_size, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

train_text = long_questions
train_target = target_values

# Ceiling division, computed once: a final partial batch is trained on
# instead of being dropped when the data size is not a multiple of batch_size.
total_batch = (len(train_text) + batch_size - 1) // batch_size

for epoch in range(num_epochs):
    for i in range(total_batch):
        batch_x, batch_y = get_batch(train_text, train_target, i, batch_size)
        # Plain tensors take part in autograd directly; no Variable wrapper
        # is needed.
        articles = torch.FloatTensor(batch_x)
        labels = torch.LongTensor(batch_y)

        optimizer.zero_grad()
        outputs = net(articles)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Report the loss of every fourth batch.
        if (i + 1) % 4 == 0:
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, total_batch, loss.item()))

In [None]:
print("Mennyi kérdésed van?")
q_num = int(input())

net.eval()

for q in range(0, q_num):
    test_text = input()

    # Keep only the words the vocabulary knows; get_batch has no column for
    # anything else.
    known_words = [
        word for word in unicode_to_ascii(test_text)
        if word.lower() in word_to_index
    ]
    if not known_words:
        print("A kérdésben nincs egyetlen ismert szó sem.")
        continue

    # get_batch expects a list of token strings. Passing the word list
    # itself would make the [0:1] slice keep only the first word, so the
    # question is re-joined and wrapped in a one-element list.
    test_data = [" ".join(known_words)]
    total_test_data = 1

    batch_x_test, batch_y_test = get_batch(test_data, [0], 0, total_test_data)
    articles = torch.FloatTensor(batch_x_test)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = net(articles)
    predicted = torch.max(outputs, 1)[1]

    print("A kérdés %s témájú .. talán." % target_names[predicted.item()])