In [10]:
import re
import json
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [11]:
import nltk
from nltk.corpus import wordnet

In [12]:
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /home/tamara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tamara/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [13]:
# image
# question
# answers
# answer_type
# answerable

# 5614  slika/pitanja
# 56140 odgovora (za svaku sliku 10 odgovora)
# 10822 jedinstvenih odgovora
#  2327 odgovora u slucaju da prihvatamo odgovor ako su barem 2/10 dali takav

In [14]:
in_file = read_data('train')
answers, q_answers = answer_consensus(in_file)

df = adapt_dataset(q_answers)
df.to_csv('data/train_answers.csv', index=False)

print(len(answers))

2327


In [15]:
def read_data(dataset):
    f = open('data/' + dataset + '.json', encoding='utf8')
    data = json.load(f)
    f.close()
    return data

## Pretprocesiranje odgovora

In [16]:
def preprocess_text(text):
    # 1) pretvaranje svih unicode karaktera u ascii (outliers)
    text = unicode_to_ascii(text)
    # 2) izbacivanje belina
    text = remove_blanks(text)
    # 3) izbacivanje specijalnih karaktera
    text = remove_special_characters(text)
    # 4) svodjenje na sva mala slova
    text = to_lower(text)
    return text

def unicode_to_ascii(text):
    return text.encode('ascii', 'ignore').decode('ascii')

def remove_blanks(text):
    without_blanks = text.replace('\\n', ' ').replace('\n', ' ').replace('\t',' ').replace('\\', ' ')
    without_extra_whitespace = re.sub(r'\s+', ' ', without_blanks)
    without_extra_whitespace = without_extra_whitespace.strip()
    return without_extra_whitespace

def remove_special_characters(text):
    return re.sub('[^A-Za-z0-9 ]+', '', text)

def remove_non_numeric(text):
    return re.sub('[^0-9]', '', text)

def to_lower(text):
    return text.lower()

# not very useful
def pos_tag_helper(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

## Kreiranje skupa potencijalnih odgovora

In [17]:
def answer_consensus(data):
    q_answers = {}
    sure_answers = {'unanswerable'}
    for key, value in data['answers'].items():
        answers = value
        # samo one za koje se moze dati odgovor
        if (data['answerable'][key]):
            # svodjenje na samo sigurne odgovore
            confident_answers = [item['answer'] for item in answers if item['answer_confidence'] != 'no']
            
            # pretprocesiranje odgovora
            processed_answers = [preprocess_text(ans) for ans in confident_answers]
            
            # za pitanja koja imaju odgovor broj, dodatno svodimo odgovore samo na broj
            if (data['answer_type'][key] == 'number'):
                processed_answers = [remove_non_numeric(ans) for ans in processed_answers if remove_non_numeric(ans)!='']
            
            # najcesci odgovor koji ima barem 2 pojavljivanja (potvrdjeni)
            counts = Counter(processed_answers)
            most_common = [k for (k, v) in counts.items() if v>=2]
            
            # ako nijedan odgovor nisu dale barem dve osobe -> unanswerable
            if most_common:
                q_answers[key] = most_common
                sure_answers.update(most_common)
            else:
                q_answers[key] = ['unanswerable']
        else:
            # odgovor je samo 'unanswerable'
            q_answers[key] = ['unanswerable']
    return sure_answers, q_answers

## Pitanje-odgovor CSV

In [18]:
# Question, Answer1, Answer2,..., AnswerN
# uzmi sva pitanja i sve odgovore, napravi tabelu, ako se poklapa, stavi jedinicu
def adapt_dataset(q_answers):
    mlb = MultiLabelBinarizer()
    # fit_transform prima listu listi ili listu setova (za svaki key njegovu listu odgovora)
    # binarno popunjava sve odgovore u zavisnosti da li se nalaze u listi za pitanje ili ne
    mlb_result = mlb.fit_transform(list(q_answers.values()))
    # list(mlb.classes_)
    id_series = pd.Series(q_answers.keys())
    df_final = pd.concat([id_series, pd.DataFrame(mlb_result, columns=list(mlb.classes_))], axis=1)
    return df_final

In [19]:
list(q_answers.values())

[['rug', 'unanswerable'],
 ['legs', 'shoes'],
 ['coin'],
 ['toilet paper'],
 ['black', 'grey'],
 ['purple', 'pink', 'black pink'],
 ['black'],
 ['yellow'],
 ['white', 'grey', 'unsuitable'],
 ['unsuitable', 'chicken'],
 ['oranges', 'oranges apples', 'apples oranges'],
 ['red'],
 ['white'],
 ['no'],
 ['brown black'],
 ['white', 'grey'],
 ['cats', 'hairless cats'],
 ['no'],
 ['chair'],
 ['unanswerable'],
 ['green'],
 ['sunglasses'],
 ['tv', 'ceiling fan'],
 ['grey', 'blue', 'unanswerable'],
 ['pomegranate'],
 ['grey', 'unanswerable'],
 ['white 10 feet'],
 ['unanswerable', 'yes', 'yes daytime'],
 ['fan'],
 ['pepper shaker', 'pepper'],
 ['no', 'unanswerable', 'yes'],
 ['pictures'],
 ['unanswerable'],
 ['black'],
 ['unanswerable'],
 ['grey', 'white'],
 ['candy', 'lindt truffle'],
 ['no', 'unsuitable'],
 ['unanswerable'],
 ['pink'],
 ['no'],
 ['man', 'lobby'],
 ['american eagle', 'eagle'],
 ['hand', 'fingers'],
 ['better', 'unanswerable'],
 ['unanswerable', 'no'],
 ['goblet'],
 ['no'],
 ['gre

## Test skup

In [20]:
in_file = read_data('test')
answers, q_answers = answer_consensus(in_file)
list(q_answers.values())

[['blue yellow', 'blue yellow stripes'],
 ['sweet corn', 'corn'],
 ['mug', 'travel mug', 'cup'],
 ['white'],
 ['melatonin'],
 ['flower'],
 ['pink'],
 ['laundry basket'],
 ['vending machine', 'soda machine'],
 ['blue'],
 ['drink'],
 ['apple'],
 ['maxi pad', 'pad'],
 ['unanswerable'],
 ['yellow'],
 ['cell phone', 'cellphone'],
 ['shepherds pie'],
 ['paper towels', 'paper towel'],
 ['unsuitable'],
 ['glass'],
 ['chair'],
 ['phone'],
 ['chess'],
 ['flowers', 'roses'],
 ['on'],
 ['blue black', 'black blue'],
 ['white'],
 ['table', 'remote on table', 'tv remote on table'],
 ['yes'],
 ['purple', 'blue'],
 ['yes', 'no', 'unsuitable'],
 ['unanswerable'],
 ['clouds trees'],
 ['water bottle', 'empty water bottle'],
 ['fireplace'],
 ['richard s holiday atrium'],
 ['yes'],
 ['remote', 'remote control'],
 ['cat litter'],
 ['green fabric', 'pillow'],
 ['floor', 'dog toys'],
 ['black', 'white'],
 ['flowers'],
 ['tan', 'beige'],
 ['blue', 'black'],
 ['house'],
 ['brown', 'pink'],
 ['white black stripes

In [21]:
len(list(q_answers.values()))

624

In [22]:
test_df = adapt_dataset(q_answers)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Columns: 433 entries, 0 to zipper
dtypes: int64(432), object(1)
memory usage: 2.1+ MB


In [23]:
test_df.rename(columns = {0: '0'}, inplace = True)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Columns: 433 entries, 0 to zipper
dtypes: int64(432), object(1)
memory usage: 2.1+ MB


In [24]:
train_df = pd.read_csv('data/train_answers.csv')
train_cols = train_df.columns
test_cols = test_df.columns
drop_columns = list(set(test_cols) - set(train_cols))
add_columns = list(set(train_cols) - set(test_cols))
print(len(drop_columns))
print(len(add_columns))

197
2092


In [25]:
test_df.drop(columns=drop_columns, inplace=True)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Columns: 236 entries, 0 to zipper
dtypes: int64(235), object(1)
memory usage: 1.1+ MB


In [26]:
add_df = pd.DataFrame(columns=add_columns)
add_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Columns: 2092 entries, dessert to advanced creme rinse
dtypes: object(2092)
memory usage: 0.0+ bytes


In [27]:
test_df = test_df.join(add_df)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Columns: 2328 entries, 0 to advanced creme rinse
dtypes: int64(235), object(2093)
memory usage: 11.1+ MB


In [28]:
test_df = test_df.reindex(sorted(test_df.columns), axis=1)
test_df.head()

Unnamed: 0,0,0.1,00 310 539,1,1 08 11,1 3,1 4 ounce,1 cup,1 dollar,1 dollar bill,...,yes green grey,yes grey shirt,yes no,yes on,yes white,yes yes,yogurt,yoplait,yorkie,zipper
0,7824,,,,,,,,,,...,,,,,0,0,,,,0
1,7288,,,,,,,,,,...,,,,,0,0,,,,0
2,6489,,,,,,,,,,...,,,,,0,0,,,,0
3,9376,,,,,,,,,,...,,,,,0,0,,,,0
4,6591,,,,,,,,,,...,,,,,0,0,,,,0


In [29]:
test_df.fillna(0, inplace=True)

In [30]:
test_df.to_csv('data/test_answers.csv', index=False)

In [31]:
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

In [32]:
len(intersection(train_df.columns, test_df.columns))

2328