In [1]:
import sys
import os
import glob
import json
import pandas as pd
%matplotlib inline
import matplotlib
import numpy as np

In [2]:
def find_files(path):
    """Return the list of filesystem paths matching the glob pattern ``path``.

    The result is exactly what ``glob.glob`` produces, in the order it
    produces it; an empty list means nothing matched.
    """
    matching_paths = glob.glob(path)
    return matching_paths

In [3]:
def read_json(filename):
    """Parse the JSON document stored in ``filename`` and return it.

    The file is opened explicitly as UTF-8 instead of relying on the
    platform's default encoding, so accented characters in the data decode
    the same way on every system.

    Raises whatever ``open`` or ``json.load`` raise for a missing file or
    malformed JSON.
    """
    with open(filename, encoding="utf-8") as json_file:
        return json.load(json_file)

### Adat beolvasása JSON fájlból

In [4]:
# Parse every entry matched by 'Data/*', logging each path as it is read.
json_data = []

for data_path in find_files('Data/*'):
    print("Reading file:", data_path)
    parsed = read_json(data_path)
    json_data.append(parsed)

Reading file: Data/gyakori_szamitastechnika
Reading file: Data/gyakori_egeszseg_20000
Reading file: Data/gyakori_szorakozas_30000
Reading file: Data/gyakori_egeszseg
Reading file: Data/gyakori_allatok_14000


### Dataframe-mé alakítás

In [5]:
# Build one frame per parsed file and concatenate them once; concatenating
# inside the loop re-copies all previously accumulated rows on every pass.
frames = [pd.DataFrame(data) for data in json_data]
data_frame = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Each 'kategoriak' cell is indexed with [0] to keep only its first element.
# Assign the whole column back: writing to the row objects yielded by
# iterrows() is not guaranteed to modify the underlying frame.
if not data_frame.empty:
    data_frame["kategoriak"] = data_frame["kategoriak"].map(
        lambda categories: categories[0]
    )

### Bepillantás a kérdésekbe

In [6]:
# Display the first two rows of the combined frame as a quick sanity check.
data_frame.head(2)

Unnamed: 0,valasz,kategoriak,hosszu_kerdes,rovid_kerdes,keywords
0,Várak régen is voltak. Ha mindhárom tornyot le...,Számítástechnika,Miért lett ilyen sz@r a honfoglaló? Régen tök ...,Miért lett ilyen sz@r a honfoglaló?,"[Honfoglaló, vár]"
1,"Ahogy írták, az stdio az a C-s függvénykönyvtá...",Számítástechnika,C++-ban melyiket érdemesebb használni? Stdio v...,C++-ban melyiket érdemesebb használni? Stdio v...,"[C++, iostream, konzol, Windows, Visual Studio]"


### Kérdések megszámolása, átlagos hossz számolása

In [7]:
def calculate_avg(data_frame, label="rovid_kerdes"):
    """Return the mean ``len()`` of the values in column ``label``.

    Parameters:
        data_frame: frame holding the column to measure.
        label: name of the column whose values are measured with ``len``.

    Returns:
        The average length as a float, or ``0.0`` when the frame has no
        rows (the row-by-row version divided by zero in that case).
    """
    if len(data_frame) == 0:
        return 0.0

    # Measure every value in one pass instead of iterating row by row.
    lengths = data_frame[label].map(len)
    return int(lengths.sum()) / len(lengths)

### Extrém rövid/hosszú kérdések eldobása

In [8]:
def drop_extreme(data_frame, min_, max_, label="rovid_kerdes"):
    """Return the rows whose ``label`` value has a length within [min_, max_].

    Both bounds are inclusive. The input frame is not modified; the
    surviving rows keep their original index labels.
    """
    lengths = data_frame[label].map(len)
    within_bounds = (lengths >= min_) & (lengths <= max_)
    return data_frame[within_bounds]

In [9]:
# Mean character count of the 'hosszu_kerdes' column before any filtering.
long_questions_avg_len = calculate_avg(data_frame, label="hosszu_kerdes")
print("Average length: %f" % (long_questions_avg_len,))

Average length: 346.495816


In [10]:
# Keep only rows whose 'hosszu_kerdes' length lies between a quarter of and
# three times the current average, then recompute the average on what is left.
shortest_allowed = long_questions_avg_len / 4
longest_allowed = long_questions_avg_len * 3

data_frame = drop_extreme(data_frame, shortest_allowed, longest_allowed, "hosszu_kerdes")
long_questions_avg_len = calculate_avg(data_frame, "hosszu_kerdes")

print("Average length: %f" % long_questions_avg_len)

Average length: 326.753950


In [11]:
# Map each distinct category to an integer id in order of first appearance.
# Dicts keep insertion order, so the key list doubles as the id -> name table.
target_dir = {}

for category in data_frame["kategoriak"]:
    target_dir.setdefault(category, len(target_dir))

target_names = list(target_dir)
print(target_names)

['Számítástechnika', 'Egészség', 'Szórakozás', 'Állatok']


In [12]:
import random

# Fixed seed so the balanced sample and its shuffled order are the same on
# every Restart & Run All; without it each run trains on different data.
SEED = 42
questions_from_each_category = 1500

long_questions = []
target_values = []

for target_name in target_names:
    # Draw the same number of questions from every category.
    # NOTE: DataFrame.sample raises ValueError when a category holds fewer
    # rows than requested (sampling is without replacement by default).
    sample = data_frame[data_frame["kategoriak"] == target_name].sample(
        n=questions_from_each_category, random_state=SEED
    )
    sampled_questions = sample["hosszu_kerdes"].tolist()
    long_questions += sampled_questions
    target_values += [target_dir[target_name]] * len(sampled_questions)

# Shuffle questions and labels together so the pairs stay aligned.
zipped = list(zip(long_questions, target_values))
random.Random(SEED).shuffle(zipped)

# Comprehensions instead of zip(*zipped): they also work when nothing was sampled.
long_questions = [question for question, _label in zipped]
target_values = [label for _question, label in zipped]

In [23]:
def unicode_to_ascii(data):
    """Turn the string ``data`` into a list of normalised tokens.

    Every run of the characters ``,.;@#?!&$`` (together with any spaces
    directly following it) is replaced by a single space, the text is
    lower-cased, passed through ``unidecode.unidecode``, and finally split
    on whitespace.
    """
    without_punctuation = re.sub(r"[,.;@#?!&$]+\ *", " ", data)
    transliterated = unidecode.unidecode(without_punctuation.lower())
    return transliterated.split()

In [24]:
import unidecode
import re
from collections import Counter

# Tokenise every question, collect the distinct tokens, and replace each
# question in place with its space-joined tokens.
vocab = set()

for idx, q in enumerate(long_questions):
    words = unicode_to_ascii(q)
    vocab.update(words)
    long_questions[idx] = " ".join(words)

total_words = len(vocab)

# Number the words in sorted order. Iterating the set directly gives an order
# that changes between kernel runs, which would make the word -> index mapping
# (and therefore the meaning of each input feature) differ from run to run.
index_of_word = {word.lower(): idx for idx, word in enumerate(sorted(vocab))}

In [26]:
def get_batch(text, target=[0], i=0, batch_size=1):

    batches = []
    results = []
    
    texts = text[i * batch_size : i * batch_size + batch_size]
    categories = target[i * batch_size : i * batch_size + batch_size]

    for text in texts:
        layer = np.zeros(input_size , dtype=float)
        for word in text.split():
            if word.lower() not in index_of_word:
                index_of_word[word.lower()] = len(index_of_word)
            layer[index_of_word[word.lower()]] += 1
            
        batches.append(layer)
        
    for category in categories:
        results.append(category)
     
    return np.array(batches), np.array(results)

In [27]:
# Optimiser and training-loop settings.
learning_rate = 0.01
batch_size = 150
num_epochs = 5

# Network dimensions. The input vector is wider than the training vocabulary
# by a fixed number of extra positions — presumably headroom for words first
# seen after the vocabulary was built; confirm against get_batch.
extra_input_slots = 100
num_classes = len(target_names)
hidden_size = 100
input_size = total_words + extra_input_slots

In [28]:
from torch.autograd import Variable
import torch.nn as nn
import torch

In [29]:
class Classification(nn.Module):
    """Feed-forward classifier: two ReLU-activated hidden layers and a linear output.

    ``forward`` returns the raw output of the last linear layer, one value per
    class; no softmax is applied inside the module.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        """Create the three linear layers with the given widths."""
        super().__init__()
        self.layer_1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(hidden_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Run ``x`` through both hidden layers and return the output layer's result."""
        hidden = self.relu(self.layer_1(x))
        hidden = self.relu(self.layer_2(hidden))
        return self.output_layer(hidden)

In [30]:
net = Classification(input_size, hidden_size, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

train_text = long_questions
train_target = target_values

# Ceiling division: a trailing partial batch is trained on as well instead of
# being silently dropped (and a dataset smaller than one batch still trains).
total_batch = (len(train_text) + batch_size - 1) // batch_size

for epoch in range(num_epochs):
    for i in range(total_batch):
        batch_x, batch_y = get_batch(train_text, train_target, i, batch_size)
        # Plain tensors are enough; the autograd Variable wrapper is deprecated.
        questions = torch.as_tensor(batch_x, dtype=torch.float32)
        theme = torch.as_tensor(batch_y, dtype=torch.long)

        optimizer.zero_grad()
        outputs = net(questions)
        loss = criterion(outputs, theme)
        loss.backward()
        optimizer.step()

        # Report the loss of every fourth batch.
        if (i + 1) % 4 == 0:
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' % (epoch + 1, num_epochs, i + 1, total_batch, loss.item()))

Epoch [1/5], Step [4/40], Loss: 1.2696
Epoch [1/5], Step [8/40], Loss: 0.9319
Epoch [1/5], Step [12/40], Loss: 0.6414
Epoch [1/5], Step [16/40], Loss: 0.4717
Epoch [1/5], Step [20/40], Loss: 0.3818
Epoch [1/5], Step [24/40], Loss: 0.5796
Epoch [1/5], Step [28/40], Loss: 0.6062
Epoch [1/5], Step [32/40], Loss: 0.4463
Epoch [1/5], Step [36/40], Loss: 0.4505
Epoch [1/5], Step [40/40], Loss: 0.5508
Epoch [2/5], Step [4/40], Loss: 0.0723
Epoch [2/5], Step [8/40], Loss: 0.0337
Epoch [2/5], Step [12/40], Loss: 0.0669
Epoch [2/5], Step [16/40], Loss: 0.0242
Epoch [2/5], Step [20/40], Loss: 0.0107
Epoch [2/5], Step [24/40], Loss: 0.0195
Epoch [2/5], Step [28/40], Loss: 0.0243
Epoch [2/5], Step [32/40], Loss: 0.0345
Epoch [2/5], Step [36/40], Loss: 0.0142
Epoch [2/5], Step [40/40], Loss: 0.0098
Epoch [3/5], Step [4/40], Loss: 0.0004
Epoch [3/5], Step [8/40], Loss: 0.0005
Epoch [3/5], Step [12/40], Loss: 0.0007
Epoch [3/5], Step [16/40], Loss: 0.0006
Epoch [3/5], Step [20/40], Loss: 0.0003
Epoch 

In [None]:
# Ask how many questions will be entered, then classify each one.
print("Mennyi kérdésed van?")
q_num = int(input())

for q in range(0, q_num):
    test_text = input()

    test_data = unicode_to_ascii(test_text)

    # unicode_to_ascii returns a list of tokens, while get_batch expects a list
    # of documents. Passing the token list directly made get_batch treat the
    # first token as the whole question, so re-join the tokens and wrap the
    # question in a one-element list.
    batch_x_test, _ = get_batch([" ".join(test_data)])
    question = torch.as_tensor(batch_x_test, dtype=torch.float32)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = net(question)
    _, predicted = torch.max(outputs, 1)

    print("A kérdés %s témájú .. talán." % target_names[predicted.item()])

Mennyi kérdésed van?
