In [1]:
import re
import json
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [2]:
import nltk
from nltk.corpus import wordnet

In [3]:
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\evaje\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\evaje\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [51]:
# image
# question
# answers
# answer_type
# answerable

# 5614  slika/pitanja
# 56140 odgovora (za svaku sliku 10 odgovora)
# 10822 jedinstvenih odgovora
#  2327 odgovora u slucaju da prihvatamo odgovor ako su barem 2/10 dali takav

In [50]:
in_file = read_data('train')
answers, q_answers = answer_consensus(in_file)

df = adapt_dataset(q_answers)
df.to_csv('data/train_answers.csv', index=False)

print(len(answers))

2327


In [4]:
def read_data(dataset):
    f = open('data/' + dataset + '.json', encoding='utf8')
    data = json.load(f)
    f.close()
    return data

## Pretprocesiranje odgovora

In [5]:
def preprocess_text(text):
    # 1) pretvaranje svih unicode karaktera u ascii (outliers)
    text = unicode_to_ascii(text)
    # 2) izbacivanje belina
    text = remove_blanks(text)
    # 3) izbacivanje specijalnih karaktera
    text = remove_special_characters(text)
    # 4) svodjenje na sva mala slova
    text = to_lower(text)
    return text

def unicode_to_ascii(text):
    return text.encode('ascii', 'ignore').decode('ascii')

def remove_blanks(text):
    without_blanks = text.replace('\\n', ' ').replace('\n', ' ').replace('\t',' ').replace('\\', ' ')
    without_extra_whitespace = re.sub(r'\s+', ' ', without_blanks)
    without_extra_whitespace = without_extra_whitespace.strip()
    return without_extra_whitespace

def remove_special_characters(text):
    return re.sub('[^A-Za-z0-9 ]+', '', text)

def remove_non_numeric(text):
    return re.sub('[^0-9]', '', text)

def to_lower(text):
    return text.lower()

# not very useful
def pos_tag_helper(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

## Kreiranje skupa potencijalnih odgovora

In [44]:
def answer_consensus(data):
    q_answers = {}
    sure_answers = {'unanswerable'}
    for key, value in data['answers'].items():
        answers = value
        # samo one za koje se moze dati odgovor
        if (data['answerable'][key]):
            # svodjenje na samo sigurne odgovore
            confident_answers = [item['answer'] for item in answers if item['answer_confidence'] != 'no']
            
            # pretprocesiranje odgovora
            processed_answers = [preprocess_text(ans) for ans in confident_answers]
            
            # za pitanja koja imaju odgovor broj, dodatno svodimo odgovore samo na broj
            if (data['answer_type'][key] == 'number'):
                processed_answers = [remove_non_numeric(ans) for ans in processed_answers if remove_non_numeric(ans)!='']
            
            # najcesci odgovor koji ima barem 2 pojavljivanja (potvrdjeni)
            counts = Counter(processed_answers)
            most_common = [k for (k, v) in counts.items() if v>=2]
            
            # ako nijedan odgovor nisu dale barem dve osobe -> unanswerable
            if most_common:
                q_answers[key] = most_common
                sure_answers.update(most_common)
            else:
                q_answers[key] = ['unanswerable']
        else:
            # odgovor je samo 'unanswerable'
            q_answers[key] = ['unanswerable']
    return sure_answers, q_answers

## Pitanje-odgovor CSV

In [27]:
# Question, Answer1, Answer2,..., AnswerN
# uzmi sva pitanja i sve odgovore, napravi tabelu, ako se poklapa, stavi jedinicu
def adapt_dataset(q_answers):
    mlb = MultiLabelBinarizer()
    # fit_transform prima listu listi ili listu setova (za svaki key njegovu listu odgovora)
    # binarno popunjava sve odgovore u zavisnosti da li se nalaze u listi za pitanje ili ne
    mlb_result = mlb.fit_transform(list(q_answers.values()))
    # list(mlb.classes_)
    id_series = pd.Series(q_answers.keys())
    df_final = pd.concat([id_series, pd.DataFrame(mlb_result, columns=list(mlb.classes_))], axis=1)
    return df_final

In [28]:
list(q_answers.values())

[['rug', 'unanswerable'],
 ['legs', 'shoes'],
 ['coin'],
 ['toilet paper'],
 ['black', 'grey'],
 ['purple', 'pink', 'black pink'],
 ['black'],
 ['yellow'],
 ['white', 'grey', 'unsuitable'],
 ['unsuitable', 'chicken'],
 ['oranges', 'oranges apples', 'apples oranges'],
 ['red'],
 ['white'],
 ['no'],
 ['brown black'],
 ['white', 'grey'],
 ['cats', 'hairless cats'],
 ['no'],
 ['chair'],
 ['unanswerable'],
 ['green'],
 ['sunglasses'],
 ['tv', 'ceiling fan'],
 ['grey', 'blue', 'unanswerable'],
 ['pomegranate'],
 ['grey', 'unanswerable'],
 ['white 10 feet'],
 ['unanswerable', 'yes', 'yes daytime'],
 ['fan'],
 ['pepper shaker', 'pepper'],
 ['no', 'unanswerable', 'yes'],
 ['pictures'],
 ['unanswerable'],
 ['black'],
 ['unanswerable'],
 ['grey', 'white'],
 ['candy', 'lindt truffle'],
 ['no', 'unsuitable'],
 ['unanswerable'],
 ['pink'],
 ['no'],
 ['man', 'lobby'],
 ['american eagle', 'eagle'],
 ['hand', 'fingers'],
 ['better', 'unanswerable'],
 ['unanswerable', 'no'],
 ['goblet'],
 ['no'],
 ['gre