## Zhlukovanie článkov Wikipédie do kategórií na základe ich vedecko-spoločenskej oblasti

**Vypracoval:** Tomáš Babjak

**Predmet:** Vyhľadávanie informácií

**GitHub:** https://github.com/tomasbabjak/VINF_Wikipedia

#### Potrebne kniznice na importovanie

In [1]:
import regex
import re
import datamuse
import nltk
import json
import string
import time
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
import time

import pandas as pd 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import numpy.linalg as LA

### 1. Vytvoriť testovaciu vzorku dát, na ktorej budeme prvotne projekt realizovať

Read XML file with Wiki articles and parse articles to list:

In [2]:
def read_xml(file_name, n_first_articles):
    """Collect the first ``n_first_articles`` ``<page>`` elements of a dump.

    The file is scanned line by line. Every line from the one containing
    ``<page>`` up to and including the one containing ``</page>`` is joined
    into a single article string.

    The collected articles are also written as JSON to
    ``../data/wiki_{n_first_articles}_before.json``.

    Returns:
        The list of raw article strings, in file order.
    """
    page_open = '<page>'
    page_close = '</page>'

    inside_page = False
    collected = []
    buffer = []

    with open(file_name, encoding="utf8") as dump:
        for row in dump:
            if page_open in row:
                inside_page = True
            if inside_page:
                buffer.append(row)
            if page_close in row:
                inside_page = False
                collected.append(''.join(buffer))
                buffer = []
            # Stop reading as soon as enough articles have been gathered.
            if len(collected) == n_first_articles:
                break

    target = f'../data/wiki_{n_first_articles}_before.json'
    with open(target, 'w') as outfile:
        json.dump(collected, outfile, indent=4)
    return collected

In [16]:
def read_xml_modified(file_name, n_first_articles):
    """Split a dump into JSON batches of ``n_first_articles`` pages each.

    The file is scanned line by line. Every line from the one containing
    ``<page>`` up to and including the one containing ``</page>`` is joined
    into a single article string. Each time ``n_first_articles`` articles
    have been gathered they are written to ``../data/wiki_{k}_before.json``
    (``k`` = 1, 2, ...) and the batch counter is printed.

    The final batch is written as well, even when it holds fewer than
    ``n_first_articles`` pages, so no page of the dump is dropped.

    Returns:
        The last batch if it was only partially filled, otherwise an empty
        list (the same value the function has always returned).
    """
    start_tag = '<page>'
    end_tag = '</page>'

    def dump_batch(batch, number):
        # One JSON file per batch, numbered from 1.
        with open(f'../data/wiki_{number}_before.json', 'w') as outfile:
            json.dump(batch, outfile, indent=4)

    in_page = False
    articles_found = []
    page_lines = []
    counter = 0

    with open(file_name, encoding="utf8") as f:
        for line in f:
            if start_tag in line:
                in_page = True
            if in_page:
                page_lines.append(line)
            if end_tag in line:
                in_page = False
                articles_found.append(''.join(page_lines))
                page_lines = []
                # The batch can only become full right after a page is added.
                if len(articles_found) == n_first_articles:
                    counter += 1
                    print(counter)
                    dump_batch(articles_found, counter)
                    articles_found = []

    # Flush the trailing, partially filled batch instead of losing it.
    if articles_found:
        counter += 1
        print(counter)
        dump_batch(articles_found, counter)
    return articles_found

Extract Title and Text attributes from article and create dictionary from them:

In [4]:
def extract_text(text):
    """Pull the title and body out of every raw ``<page>`` string.

    Args:
        text: iterable of raw page strings.

    Returns:
        One ``{"title": ..., "text": ...}`` dict per page, in input order.
        A field is the empty string when its tag is not found in the page.
    """
    title_pattern = regex.compile(r'<title[^>]*>([^<]+)</title>')
    body_pattern = regex.compile(r'<text[^>]*>([^<]+)</text>')

    def first_capture(pattern, raw_page):
        # Content of the first matching tag, or '' when the tag is absent.
        found = pattern.search(raw_page)
        return found.group(1) if found else ''

    return [
        {"title": first_capture(title_pattern, raw_page),
         "text": first_capture(body_pattern, raw_page)}
        for raw_page in text
    ]

## Najst infobox, anchor texty a wiki kategorie

### 4. Z článkov testovacej sady vyhľadať dôležité pojmy - zamerať sa na Infobox, kde sa nachádzajú dôležité informácie o článku

### 5. Vyhľadať odkazy na iné články Wikipédie (anchor text), ktoré môžu smerovať priamo na oblasť alebo aspoň priblížiť kontext článku

Find the Infobox, anchor texts and Wiki categories in the Text attribute of each article and add them to its dictionary

In [5]:
def find_infobox_anchor(text):
    """Extract infoboxes, piped links and categories from each page, in place.

    For every page dict the three patterns are first searched in
    ``page['text']`` and the ``findall`` results are stored under
    ``'infobox'``, ``'anchors'`` and ``'category_wiki'``. The matched spans
    are then removed from ``page['text']``, one pattern after another.

    Args:
        text: list of page dicts that have a ``'text'`` key.

    Returns:
        The same list object, with every dict updated.
    """
    # The infobox pattern is recursive ((?1)) so that nested {...} stay balanced.
    infobox_pattern = regex.compile(r"(?=\{Infobox )(\{([^{}]|(?1))*\})")
    anchor_pattern = regex.compile(r"\[\[([^\]\[:]+)\|([^\]\[:]+)\]\]")
    category_pattern = regex.compile(r"\[\[Category:([^\]]*\b)")

    for page in text:
        body = page['text']
        # All three searches run on the untouched body, before anything is cut.
        page['infobox'] = infobox_pattern.findall(body)
        page['anchors'] = anchor_pattern.findall(body)
        page['category_wiki'] = category_pattern.findall(body)
        for pattern in (infobox_pattern, anchor_pattern, category_pattern):
            body = pattern.sub('', body)
        page['text'] = body
    return text

Separate Redirect articles from others into two separate lists

In [6]:
def find_redirect(text):
    """Split pages into redirects and regular articles.

    A page counts as a redirect when its ``'text'`` starts with the
    ``#REDIRECT`` keyword followed (possibly after other characters) by a
    ``[[target`` link. The keyword is matched case-insensitively, because
    MediaWiki accepts any capitalisation and dumps mostly use the upper-case
    form.

    Args:
        text: iterable of page dicts that have a ``'text'`` key.

    Returns:
        ``(redirect_pages, article_pages)``, both lists keeping input order.
    """
    redirect_pattern = regex.compile(
        r"^#redirect[^\[]*\[\[([^\]]+)", flags=regex.IGNORECASE
    )
    redirect_pages = []
    article_pages = []
    for page in text:
        if redirect_pattern.search(page['text']):
            redirect_pages.append(page)
        else:
            article_pages.append(page)
    return (redirect_pages, article_pages)

### 2. Vytvoriť zoznam (strom) spoločensko-vedných oblastí, do ktorých budeme jednotlivé stránky zaraďovať, ku každej oblasti nájsť aj slová, ktoré sa s ňou spájajú

Find terms related to our categories with the Datamuse library: for each category, request up to 20 related terms and add the lower-cased category name itself.

In [7]:
# Target categories, grouped by theme for readability.
# The order matters: tfdif_test_cosine pairs similarity i with entry i.
new_categories = [
    'Culture', 'Food', 'Language', 'Literature',
    'Art', 'Dance', 'Film', 'Music', 'Theatre',
    'Architecture', 'Painting', 'Sculpture',
    'Games', 'Sport', 'Recreation',
    'Media', 'Internet',
    'Geography', 'Earth',
    'Health', 'Fitness', 'Exercise', 'Life', 'Medicine',
    'History', 'Education', 'Crime', 'War', 'Transport',
    'Mathematics', 'Logic', 'Statistics',
    'Biology', 'Nature', 'Science',
    'Philosophy', 'Religion', 'Belief', 'Society',
    'Technology', 'Computing', 'Electronics', 'Engineering',
]

### Vytvorit gazeteer pomocou Wiki clankov mojich kategorii

Ku kazdej z mojich kategorii najst clanok wikipedie s rovnakym nazvom a pomocou neho neskor vytvorit gazeteer.

In [8]:
def find_categories_articles(file_name):
    """Find the dump pages whose title equals one of ``new_categories``.

    The dump is read line by line. The title is looked for on the line that
    directly follows a ``<page>`` line; when it names a wanted category, that
    line and every following line up to and including ``</page>`` are kept
    as the raw article. Scanning stops once every category has been found.

    The articles are returned in the order of ``new_categories`` (not in
    dump order), because ``tfdif_test_cosine`` pairs row ``i`` of the trained
    matrix with ``new_categories[i]``.
    NOTE(review): a category with no page in the dump is simply left out,
    which still shifts the rows after it; confirm all categories are found.

    The result is also written to ``../data/wiki_categories.json``.

    Returns:
        List of raw article strings, ordered like ``new_categories``.
    """
    start_tag = '<page>'
    end_tag = '</page>'
    title_regex = r'<title[^>]*>([^<]+)</title>'

    wanted = set(new_categories)
    found = {}
    current_title = None   # title of the wanted page being read, if any
    expect_title = False   # True only for the line right after <page>
    page_lines = []

    with open(file_name, encoding="utf8") as f:
        for line in f:
            if start_tag in line:
                expect_title = True
                current_title = None
                page_lines = []
                continue
            if expect_title:
                expect_title = False
                titles = regex.findall(title_regex, line)
                # A page without a title on this line is skipped, not fatal.
                if titles and titles[0] in wanted:
                    current_title = titles[0]
                    print(current_title)
            if current_title is not None:
                page_lines.append(line)
            if end_tag in line:
                if current_title is not None:
                    # Keep the first page seen for a given title.
                    found.setdefault(current_title, ''.join(page_lines))
                current_title = None
                page_lines = []
            if len(found) == len(wanted):
                break

    articles_found = [found[name] for name in new_categories if name in found]
    with open('../data/wiki_categories.json', 'w') as outfile:
        json.dump(articles_found, outfile)
    return articles_found

### Vytvorit gazeteer pomocou Datamuse kniznice

Ku kazdej z mojich kategorii najst gazeteer pomocou kniznice Datamuse - related words

In [9]:
api = datamuse.Datamuse()


def categories_find_related(categories):
    """Build a word list for every category through the Datamuse API.

    For each category ``api.words(ml=<category>, max=20)`` is called, the
    ``'word'`` entry of every returned item is collected, and the lower-cased
    category name is appended at the end.

    Args:
        categories: iterable of category names.

    Returns:
        One ``{'category': name, 'related_words': [...]}`` dict per category,
        in input order.
    """
    return [
        {
            'category': name,
            'related_words': [hit.get('word') for hit in api.words(ml=name, max=20)]
            + [name.lower()],
        }
        for name in categories
    ]

## Predspracovanie

### 3. Články vhodne predspracovať - stemming, tokenizácia, odstránenie stop slov

In [10]:
# Single-character tokens to drop. Kept as a set of characters so that the
# membership test compares a token against each character individually.
_PUNCTUATION_TOKENS = frozenset("*+'-./:;,|<=>?@[\\]^_`{}~!\"#$%&()\n")


def tokenize_text(text):
    """Tokenize ``text`` with NLTK and lower-case the tokens.

    Tokens that consist of exactly one of the punctuation characters listed
    in ``_PUNCTUATION_TOKENS`` are discarded.

    Args:
        text: the string to tokenize.

    Returns:
        List of lower-cased tokens in their original order.
    """
    return [
        token.lower()
        for token in word_tokenize(text)
        if token not in _PUNCTUATION_TOKENS
    ]

In [11]:
# English stop words, loaded from the NLTK corpus on first use and then reused.
_ENGLISH_STOPWORDS = None


def remove_stops(text):
    """Drop punctuation and English stop words from a token list.

    A token is removed when it occurs as a substring of
    ``string.punctuation`` or when it is an NLTK English stop word.

    Args:
        text: iterable of string tokens.

    Returns:
        List of the remaining tokens, in their original order.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Loading the corpus once avoids re-reading it for every single token.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return [
        token
        for token in text
        if token not in string.punctuation and token not in _ENGLISH_STOPWORDS
    ]

In [12]:
stemmer = PorterStemmer()


def stem_list(llist):
    """Return the Porter stem of every word in ``llist``, keeping the order."""
    return list(map(stemmer.stem, llist))

In [13]:
def preprocess_text(text):
    """Tokenize, remove stop words and stem ``text``.

    Returns an empty list for falsy input (``None``, ``''``, ...); otherwise
    the result of ``tokenize_text`` -> ``remove_stops`` -> ``stem_list``.
    """
    if not text:
        return []
    return stem_list(remove_stops(tokenize_text(text)))

## TF-IDF

In [14]:
def tfidf_train(train_set):
    """Fit a default ``TfidfVectorizer`` on ``train_set``.

    Returns:
        ``(vectorizer, matrix)``: the fitted vectorizer and the result of
        ``fit_transform`` on the training documents.
    """
    model = TfidfVectorizer()
    return model, model.fit_transform(train_set)

def tfdif_test_cosine(query, vectorizer, docs_tfidf):
    """Score ``query`` against every trained document by cosine similarity.

    The query is transformed with ``vectorizer`` and compared with
    ``docs_tfidf``. Similarity ``i`` is labelled with ``new_categories[i]``,
    so the trained documents must be in the same order as that list.

    Returns:
        ``{category: similarity}`` containing only non-zero similarities.
    """
    query_vector = vectorizer.transform([query])
    scores = cosine_similarity(query_vector, docs_tfidf).flatten()
    return {
        category: score
        for score, category in zip(scores, new_categories)
        if score != 0
    }

## Invertovany index
Jednoduchý vlastný invertovaný index pomocou python Hash Mapy - defaultdict. 

Prvým parametrom pri inicializacií Indexu sú všetky články z ktorých sa tvorí tento index a druhým je nejaký stĺpec podľa ktorého sa bude dať vyhľadávať článok.

Pri hľadaní sa v prvom parametri uvádzajú slová pre vyhľadávanie oddelené medzerou a druhým parametrom je buď uvedené slovo "and" alebo "or" podľa toho či chceme aby sa obe tieto slová nachádzali v hľadanom texte alebo aspoň jedno z nich.

In [15]:
from collections import defaultdict


class invertedIndex(object):
    """Minimal inverted index: preprocessed term -> set of document titles."""

    def __init__(self,docs,method):
        """Index ``docs`` by the terms found in their ``method`` field.

        Args:
            docs: iterable of dicts with a ``'title'`` key.
            method: name of the dict field to index. The field is iterated
                and every element is passed through ``preprocess_text``.
        """
        self.docSets = defaultdict(set)
        for doc in docs:
            index = doc.get('title')
            for fragment in doc.get(method):
                for term in preprocess_text(fragment):
                    self.docSets[term].add(index)

    def search(self, term, andor):
        """Return the titles matching the words in ``term``.

        Args:
            term: query string; it is preprocessed the same way as the index.
            andor: ``'and'`` to require every query word, ``'or'`` to accept
                any of them. Any other value yields an empty result.

        Returns:
            A new set of titles (never the index's internal set).
        """
        result = None
        for token in preprocess_text(term):
            # .get() keeps lookups from inserting empty entries into the index.
            postings = self.docSets.get(token, set())
            if andor == 'and':
                # Once the intersection is empty it must stay empty, so the
                # first term is tracked with None rather than by emptiness.
                result = set(postings) if result is None else result & postings
            elif andor == 'or':
                result = set(postings) if result is None else result | postings
        return result if result is not None else set()

## Spustenie

Pre všetky spustenia je potrebné nájsť Wiki články kategórií:

In [None]:
# Scan the dump for the pages whose titles appear in new_categories.
categories_articles = find_categories_articles('../data/enwiki-latest-pages-articles.xml')

Predspracovat slova pre gazeteer - WIKI clanky

In [22]:
# Parse the raw category pages, then tokenize every part used for the gazetteer.
categories_articles = find_infobox_anchor(extract_text(categories_articles))
for art in categories_articles:
    infobox_hits = art.get('infobox')
    # Each anchor is a tuple of strings; flatten all of them into one string.
    anchor_words = ' '.join(' '.join(tups) for tups in art.get('anchors'))

    art['text_tokens'] = preprocess_text(art.get('text'))
    art['category_wiki_tokens'] = preprocess_text(' '.join(art.get('category_wiki')))
    # Only the first infobox match is used.
    art['infobox_tokens'] = preprocess_text(' '.join(infobox_hits[0])) if infobox_hits else []
    art['anchors_tokens'] = preprocess_text(anchor_words)

Predspracovat slova pre gazeteer - DATAMUSE

In [21]:
# Fetch the Datamuse words of every category and preprocess them.
cats_with_words = categories_find_related(new_categories)
for entry in cats_with_words:
    joined_words = ' '.join(entry.get('related_words'))
    entry['related_tokens'] = preprocess_text(joined_words)

####  TF-IDF a kosinusova podobnost

Natrenovať Term frequency - Inverse document frequency na datasete gazeteeru, teda slov jednotlivych kategorii.

Tento model vyhodnotime na slovach textu clankov, infoboxov, kategorii a anchor textoch a vypocitame kosinusovu podobnost s kategoriami gazeteeru.

#### Trenovanie TF-IDF na gazeteere z DATAMUSE

In [23]:
# One training document per category: its preprocessed Datamuse words joined by spaces.
train_set = [' '.join(cat.get('related_tokens')) for cat in cats_with_words]

# Fit the TF-IDF model on the Datamuse gazetteer.
vectorizer_datamuse, trained_model_datamuse = tfidf_train(train_set)

#### Trenovanie TF-IDF na gazeteere z WIKI clankov kategorii

In [24]:
# One training document per category article and per article part.
train_set_text, train_set_category, train_set_infobox, train_set_anchors = (
    [' '.join(cat.get(field)) for cat in categories_articles]
    for field in ('text_tokens', 'category_wiki_tokens', 'infobox_tokens', 'anchors_tokens')
)

# wiki0 = anchors, wiki1 = infobox, wiki2 = categories, wiki3 = article text
vectorizer_wiki0, trained_model_wiki0 = tfidf_train(train_set_anchors)
vectorizer_wiki1, trained_model_wiki1 = tfidf_train(train_set_infobox)
vectorizer_wiki2, trained_model_wiki2 = tfidf_train(train_set_category)
vectorizer_wiki3, trained_model_wiki3 = tfidf_train(train_set_text)

## Spustenie na testovacej vzorke dát - 30 článkov

Načítanie článkov, parsovanie potrebných častí článku a oddelenie redirectov od článkov.

In [None]:
# read_xml stops after the first 50 pages. read_xml_modified would instead
# walk the whole dump, write a JSON file per 50 pages and return only the
# trailing remainder, which is not a usable test sample.
sample_pages = read_xml('../data/enwiki-latest-pages-articles.xml', 50)
redirects, articles_test = find_redirect(find_infobox_anchor(extract_text(sample_pages)))

Predspracovanie slov textu clanku, kategorii, infoboxov a anchor textov

In [None]:
for art in articles_test:
    infobox_hits = art.get('infobox')
    # Each anchor is a tuple of strings; flatten all of them into one string.
    anchor_words = ' '.join(' '.join(tups) for tups in art.get('anchors'))

    # Preprocess the words of the article text:
    art['text_tokens'] = preprocess_text(art.get('text'))
    # Preprocess the words of the Wiki categories:
    art['category_wiki_tokens'] = preprocess_text(' '.join(art.get('category_wiki')))
    # Preprocess the words of the first infobox, if the article has one:
    art['infobox_tokens'] = preprocess_text(' '.join(infobox_hits[0])) if infobox_hits else []
    # Preprocess the words of the anchor texts:
    art['anchors_tokens'] = preprocess_text(anchor_words)

#### Kosinusova podobnost s testovacimi clankami Wiki - DATAMUSE TFIDF

In [None]:
# Score every article part against the Datamuse-trained TF-IDF model.
for art in articles_test:
    for sims_key, tokens_key in (
        ('anchor_sims', 'anchors_tokens'),
        ('categories_sims', 'category_wiki_tokens'),
        ('infobox_sims', 'infobox_tokens'),
        ('text_sims', 'text_tokens'),
    ):
        scores = tfdif_test_cosine(' '.join(art.get(tokens_key)), vectorizer_datamuse, trained_model_datamuse)
        # Store the categories ordered from the most to the least similar.
        art[sims_key] = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

#### Kosinusova podobnost s testovacimi clankami - WIKI Infobox TFIDF

In [None]:
# Score every article part against the TF-IDF model trained on Wiki infoboxes.
for art in articles_test:
    for sims_key, tokens_key in (
        ('anchor_sims_info', 'anchors_tokens'),
        ('categories_sims_info', 'category_wiki_tokens'),
        ('infobox_sims_info', 'infobox_tokens'),
        ('text_sims_info', 'text_tokens'),
    ):
        scores = tfdif_test_cosine(' '.join(art.get(tokens_key)), vectorizer_wiki1, trained_model_wiki1)
        # Store the categories ordered from the most to the least similar.
        art[sims_key] = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

#### Kosinusova podobnost  s testovacimi clankami - WIKI kategorie TFIDF

In [None]:
# Score every article part against the TF-IDF model trained on Wiki categories.
for art in articles_test:
    for sims_key, tokens_key in (
        ('anchor_sims_cat', 'anchors_tokens'),
        ('categories_sims_cat', 'category_wiki_tokens'),
        ('infobox_sims_cat', 'infobox_tokens'),
        ('text_sims_cat', 'text_tokens'),
    ):
        scores = tfdif_test_cosine(' '.join(art.get(tokens_key)), vectorizer_wiki2, trained_model_wiki2)
        # Store the categories ordered from the most to the least similar.
        art[sims_key] = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

#### Kosinusova podobnost  s testovacimi clankami - WIKI text TFIDF

In [None]:
# Score every article part against the TF-IDF model trained on Wiki article text.
for art in articles_test:
    for sims_key, tokens_key in (
        ('anchor_sims_text', 'anchors_tokens'),
        ('categories_sims_text', 'category_wiki_tokens'),
        ('infobox_sims_text', 'infobox_tokens'),
        ('text_sims_text', 'text_tokens'),
    ):
        scores = tfdif_test_cosine(' '.join(art.get(tokens_key)), vectorizer_wiki3, trained_model_wiki3)
        # Store the categories ordered from the most to the least similar.
        art[sims_key] = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

## Spustenie na všetkých dátach

Spustenie na všetkých dátach neodporúčame spúšťať ak nemáme cluster alebo stroj ktorý môže bežať veľmi dlho alebo je veľmi výkonný, tento proces môže trvať aj niekoľko dní.

In [None]:
# Split the whole dump into JSON batches of 100000 pages (../data/wiki_<k>_before.json).
read_xml_modified('../data/enwiki-latest-pages-articles.xml', 100000)

In [None]:
# Parse every raw batch, drop the redirects and overwrite the batch file in place.
for i in range(1,207):
    batch_path = f'../data/wiki_{i}_before.json'
    with open(batch_path, 'r+') as json_file:
        print(i)
        raw_pages = json.load(json_file)
    parsed_pages = find_infobox_anchor(extract_text(raw_pages))
    redirects, articles_all = find_redirect(parsed_pages)
    with open(batch_path, 'w+') as json_file:
        json.dump(articles_all, json_file, indent=4)

In [None]:
# Replace the raw fields of every article by their preprocessed tokens, batch by batch.
for i in range(1,207):
    start_time = time.time()
    batch_path = f'../data/wiki_{i}_before.json'
    with open(batch_path, 'r+') as json_file:
        articles_all = json.load(json_file)
    for index, art in enumerate(articles_all):
        # Progress marker every 10000 articles.
        if index % 10000 == 1:
            print(index)
        # pop() removes the raw field so the rewritten file only keeps tokens.
        art['text_tokens'] = preprocess_text(art.pop('text',''))
        art['category_wiki_tokens'] = preprocess_text(' '.join(art.pop('category_wiki','')))
        if art.get('infobox'):
            art['infobox_tokens'] = preprocess_text(' '.join(art.pop('infobox','')[0]))
        else:
            art['infobox_tokens'] = []
        art['anchors_tokens'] = preprocess_text(' '.join(' '.join(tups) for tups in art.pop('anchors','')))
    print("--- %s minutes ---" % ((time.time() - start_time) / 60))
    with open(batch_path, 'w+') as json_file:
        json.dump(articles_all, json_file, indent=4)

#### Kosinusova podobnost s Kategorickymi clankami Wiki - DATAMUSE TFIDF

Narozdiel od testovacich, vsetky clanky sme skusali iba na najuspesnejsom testovanom modeli, teda pomocou gazeteera DATAMUSE.

Podobnost s textom je zakomentovana kvoli jeho vyssej casovej narocnosti.

In [None]:
# Score batches 16..63 against the Datamuse model and write the results to *_after.json.
for i in range(16,64):
    start_time = time.time()
    with open(f'../data/wiki_{i}_before.json') as json_file:
        more_articles = json.load(json_file)

    for art in more_articles:
        for sims_key, tokens_key in (
            ('anchor_sims', 'anchors_tokens'),
            ('categories_sims', 'category_wiki_tokens'),
            ('infobox_sims', 'infobox_tokens'),
            # ('text_sims', 'text_tokens') is left out on purpose: too slow on full text.
        ):
            scores = tfdif_test_cosine(' '.join(art.get(tokens_key)), vectorizer_datamuse, trained_model_datamuse)
            # Store the categories ordered from the most to the least similar.
            art[sims_key] = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

    with open(f'../data/wiki_{i}_after.json', 'w') as outfile:
        json.dump(more_articles, outfile, indent=4)
    print("--- %s minutes ---" % ((time.time() - start_time) / 60))

## Testovanie a vyhodnotenie

Ako funguje testovanie a vyhodnocovanie?

Nadpis testovania hovorí o gazeteere, na ktorom bol trénovaný TFIDF model.

Pomocou tohto modelu sa vyhodnocuje kosínusová podobnosť kategórií ku všetkým 4 častiam článku.

Následne pre každý z testovaných článkov zistím koľko anotovaných kategórií bolo priradených anotátorom.

Pre každú zo 4 častí článku zoberiem práve toľko zistených kategórií, koľko bolo anotovaných a spravím zjednotenie týchto kategórií -> stlpec ALL.

Pre každú zo 4 častí + zjednotenia vyhodnotím úspešnosť nájdených kategórií vzhľadom k anotovaným kategóriám v percentách.

Následne vypočítam priemer zo všetkých článkov a ďalšie štatistiky.

### DATAMUSE gazeteer

#### Anchor text +  Kategorie Wiki + Infobox + Text clanku

In [173]:
from more_itertools import take
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import recall_score, precision_score, f1_score

# Evaluate the Datamuse-trained model on the annotated test articles.
with open('../data/test_30_tested.json') as json_file:
    articles_test = json.load(json_file)

titles = [art.get('title') for art in articles_test]

# Report column -> key of the ranked similarity dict stored on each article.
sims_keys = {
    'Anchor': 'anchor_sims',
    'Categories': 'categories_sims',
    'Infobox': 'infobox_sims',
    'Text': 'text_sims',
}
# One score list per column; it grows with the data instead of assuming 30 articles.
scores = {column: [] for column in [*sims_keys, 'All']}

for art in articles_test:
    annotated = art.get('annotated_categories')
    len_annotated = len(annotated)

    # For every article part keep as many top categories as were annotated.
    top_ranked = {column: take(len_annotated, art.get(key)) for column, key in sims_keys.items()}
    # "All" is the union of the four top lists.
    art['extended_sims'] = list({cat for ranked in top_ranked.values() for cat in ranked})

    # An annotated category is a hit when it is ranked in the top list and its
    # stored similarity is not NaN.
    hits = {
        column: sum(
            1 for cat in annotated
            if cat in top_ranked[column] and not pd.isna(art.get(key).get(cat))
        )
        for column, key in sims_keys.items()
    }
    hits['All'] = sum(1 for cat in annotated if cat in art['extended_sims'])

    for column, column_scores in scores.items():
        # Without annotations the percentage is undefined: store NaN instead of dividing by zero.
        percent = hits[column] / len_annotated * 100 if len_annotated else float('nan')
        column_scores.append(float("{:.2f}".format(percent)))

df = pd.DataFrame(scores, columns = ['Anchor','Categories','Infobox','Text','All'], index=titles)

print(df)
print (df.describe())
print('\nAverage of All column: ', float("{:.2f}".format(df['All'].mean())))

# Multi-label precision/recall/F1 of the union ("All") against the annotations.
A=[art['annotated_categories'] for art in articles_test]
B=[art['extended_sims'] for art in articles_test]

multi = MultiLabelBinarizer()

y_true = multi.fit(A).transform(A)
y_pred = multi.transform(B)

print('Precision: ',precision_score(y_true, y_pred,average='weighted',zero_division=1))
print('Recall: ',recall_score(y_true, y_pred, average='weighted',zero_division=1))
print('F1:' ,f1_score(y_true, y_pred, average='weighted'))

                                          Anchor  Categories  Infobox    Text  \
Anarchism                                  25.00       75.00     0.00   50.00   
Autism                                     66.67        0.00    33.33    0.00   
Albedo                                      0.00        0.00     0.00    0.00   
A                                          50.00        0.00    50.00    0.00   
Alabama                                     0.00        0.00     0.00    0.00   
Achilles                                    0.00       66.67     0.00   66.67   
Abraham Lincoln                            40.00       40.00    20.00   20.00   
Aristotle                                  57.14       57.14    57.14   42.86   
An American in Paris                       50.00       25.00     0.00   50.00   
Academy Award for Best Production Design   66.67       66.67    66.67   66.67   
Academy Awards                             66.67       66.67    66.67   66.67   
Actrius                     



### WIKI kategorie gazeteer

#### Anchor text +  Kategorie Wiki + Infobox + Text clanku

In [164]:
# Evaluate the model trained on Wiki categories on the annotated test articles.
with open('../data/test_30_tested.json') as json_file:
    articles_test = json.load(json_file)

titles = [art.get('title') for art in articles_test]

# Report column -> key of the ranked similarity dict stored on each article.
sims_keys = {
    'Anchor': 'anchor_sims_cat',
    'Categories': 'categories_sims_cat',
    'Infobox': 'infobox_sims_cat',
    'Text': 'text_sims_cat',
}
# One score list per column; it grows with the data instead of assuming 30 articles.
scores = {column: [] for column in [*sims_keys, 'All']}

for art in articles_test:
    annotated = art.get('annotated_categories')
    len_annotated = len(annotated)

    # For every article part keep as many top categories as were annotated.
    top_ranked = {column: take(len_annotated, art.get(key)) for column, key in sims_keys.items()}
    # "All" is the union of the four top lists.
    art['extended_sims_cat'] = list({cat for ranked in top_ranked.values() for cat in ranked})

    # An annotated category is a hit when it is ranked in the top list and its
    # stored similarity is not NaN.
    hits = {
        column: sum(
            1 for cat in annotated
            if cat in top_ranked[column] and not pd.isna(art.get(key).get(cat))
        )
        for column, key in sims_keys.items()
    }
    hits['All'] = sum(1 for cat in annotated if cat in art['extended_sims_cat'])

    for column, column_scores in scores.items():
        # Without annotations the percentage is undefined: store NaN instead of dividing by zero.
        percent = hits[column] / len_annotated * 100 if len_annotated else float('nan')
        column_scores.append(float("{:.2f}".format(percent)))

df = pd.DataFrame(scores, columns = ['Anchor','Categories','Infobox','Text','All'], index=titles)

print(df)
print (df.describe())
print('\nAverage of All column: ', float("{:.2f}".format(df['All'].mean())))

# Multi-label precision/recall/F1 of the union ("All") against the annotations.
A=[art['annotated_categories'] for art in articles_test]
B=[art['extended_sims_cat'] for art in articles_test]

multi = MultiLabelBinarizer()

y_true = multi.fit(A).transform(A)
y_pred = multi.transform(B)

print('Precision: ',precision_score(y_true, y_pred,average='weighted',zero_division=1))
print('Recall: ',recall_score(y_true, y_pred, average='weighted',zero_division=1))
print('F1:' ,f1_score(y_true, y_pred, average='weighted'))

                                          Anchor  Categories  Infobox   Text  \
Anarchism                                   0.00       25.00     0.00  25.00   
Autism                                      0.00        0.00     0.00   0.00   
Albedo                                      0.00        0.00     0.00   0.00   
A                                           0.00        0.00     0.00   0.00   
Alabama                                     0.00        0.00     0.00   0.00   
Achilles                                    0.00        0.00     0.00   0.00   
Abraham Lincoln                             0.00        0.00     0.00   0.00   
Aristotle                                  28.57        0.00    28.57  14.29   
An American in Paris                        0.00        0.00     0.00   0.00   
Academy Award for Best Production Design    0.00        0.00     0.00   0.00   
Academy Awards                              0.00        0.00     0.00   0.00   
Actrius                                 



### WIKI Infobox gazeteer

#### Anchor text +  Kategorie Wiki + Infobox + Text clanku

In [165]:
# Evaluate the model trained on Wiki infoboxes on the annotated test articles.
with open('../data/test_30_tested.json') as json_file:
    articles_test = json.load(json_file)

titles = [art.get('title') for art in articles_test]

# Report column -> key of the ranked similarity dict stored on each article.
sims_keys = {
    'Anchor': 'anchor_sims_info',
    'Categories': 'categories_sims_info',
    'Infobox': 'infobox_sims_info',
    'Text': 'text_sims_info',
}
# One score list per column; it grows with the data instead of assuming 30 articles.
scores = {column: [] for column in [*sims_keys, 'All']}

for art in articles_test:
    annotated = art.get('annotated_categories')
    len_annotated = len(annotated)

    # For every article part keep as many top categories as were annotated.
    top_ranked = {column: take(len_annotated, art.get(key)) for column, key in sims_keys.items()}
    # "All" is the union of the four top lists.
    art['extended_sims_info'] = list({cat for ranked in top_ranked.values() for cat in ranked})

    # An annotated category is a hit when it is ranked in the top list and its
    # stored similarity is not NaN.
    hits = {
        column: sum(
            1 for cat in annotated
            if cat in top_ranked[column] and not pd.isna(art.get(key).get(cat))
        )
        for column, key in sims_keys.items()
    }
    hits['All'] = sum(1 for cat in annotated if cat in art['extended_sims_info'])

    for column, column_scores in scores.items():
        # Without annotations the percentage is undefined: store NaN instead of dividing by zero.
        percent = hits[column] / len_annotated * 100 if len_annotated else float('nan')
        column_scores.append(float("{:.2f}".format(percent)))

df = pd.DataFrame(scores, columns = ['Anchor','Categories','Infobox','Text','All'], index=titles)

print(df)
print (df.describe())
print('\nAverage of All column: ', float("{:.2f}".format(df['All'].mean())))

# Multi-label precision/recall/F1 of the union ("All") against the annotations.
A=[art['annotated_categories'] for art in articles_test]
B=[art['extended_sims_info'] for art in articles_test]

multi = MultiLabelBinarizer()

y_true = multi.fit(A).transform(A)
y_pred = multi.transform(B)

print('Precision: ',precision_score(y_true, y_pred,average='weighted',zero_division=1))
print('Recall: ',recall_score(y_true, y_pred, average='weighted',zero_division=1))
print('F1:' ,f1_score(y_true, y_pred, average='weighted'))

                                          Anchor  Categories  Infobox    Text  \
Anarchism                                   0.00         0.0     0.00   50.00   
Autism                                      0.00         0.0    33.33    0.00   
Albedo                                      0.00         0.0     0.00    0.00   
A                                           0.00         0.0    50.00    0.00   
Alabama                                     0.00         0.0     0.00    0.00   
Achilles                                    0.00         0.0     0.00   66.67   
Abraham Lincoln                             0.00         0.0    20.00   20.00   
Aristotle                                   0.00         0.0    57.14   42.86   
An American in Paris                       25.00         0.0     0.00   50.00   
Academy Award for Best Production Design   33.33         0.0    66.67   66.67   
Academy Awards                             33.33         0.0    66.67   66.67   
Actrius                     



### WIKI Text gazetteer

#### Anchor text +  Kategorie Wiki + Infobox + Text clanku

In [166]:
# Evaluate the text-gazetteer similarities on the annotated test articles.
# For each article, N = number of annotated categories; a source "hits" an
# annotated category when that category is among the first N keys of the
# source's similarity dict and its stored value is not NaN.
# NOTE(review): the *_sims_text dicts are presumably ordered by descending
# similarity, which is what makes "first N keys" mean "top N" - confirm.
with open('../data/test_30_tested.json') as json_file:
    articles_test = json.load(json_file)


def _top_n_hits(sims, wanted, n):
    """Count the entries of ``wanted`` found among the first ``n`` keys of ``sims``.

    A key whose stored value is NaN/None is not counted.
    """
    top_keys = set(list(sims)[:n])
    return sum(1 for cat in wanted if cat in top_keys and not pd.isna(sims.get(cat)))


def _percentage(hits, total):
    """Return ``hits / total`` as a percentage rounded to two decimals.

    Yields 0.0 when ``total`` is 0, i.e. for an article without annotations,
    instead of raising ZeroDivisionError.
    """
    if not total:
        return 0.0
    return float("{:.2f}".format(hits / total * 100))


# Article field holding each source's similarity dict, keyed by result column.
source_keys = {
    'Anchor': 'anchor_sims_text',
    'Categories': 'categories_sims_text',
    'Infobox': 'infobox_sims_text',
    'Text': 'text_sims_text',
}

# Size the result lists from the data instead of a fixed 30.
n_articles = len(articles_test)
titles = [art.get('title') for art in articles_test]
anchors = [0] * n_articles
categories = [0] * n_articles
infoboxes = [0] * n_articles
texts = [0] * n_articles
extended = [0] * n_articles

cars = {
    'Anchor': anchors,
    'Categories': categories,
    'Infobox': infoboxes,
    'Text': texts,
    'All': extended
}

for counter, art in enumerate(articles_test):
    annotated = art.get('annotated_categories')
    len_annotated = len(annotated)

    # Combined prediction: whatever take() yields for each source, de-duplicated.
    # NOTE(review): take() is defined elsewhere; presumably it returns the
    # first len_annotated items of the iterable as a list - confirm.
    n_anchors = []
    for key in source_keys.values():
        n_anchors.extend(take(len_annotated, art.get(key).keys()))
    art['extended_sims_text'] = list(set(n_anchors))

    # Per-source share of annotated categories found among the first N keys.
    for column, key in source_keys.items():
        hits = _top_n_hits(art.get(key), annotated, len_annotated)
        cars[column][counter] = _percentage(hits, len_annotated)

    # Share of annotated categories covered by the combined prediction.
    combined = set(n_anchors)
    hits_all = sum(1 for cat in annotated if cat in combined)
    extended[counter] = _percentage(hits_all, len_annotated)

df = pd.DataFrame(cars, columns = ['Anchor','Categories','Infobox','Text','All'], index=titles)

print(df)
print(df.describe())
print('\nAverage of All column: ', float("{:.2f}".format(df['All'].mean())))

# Multi-label precision/recall/F1 of the combined prediction against the
# annotations; the binarizer is fitted on the annotated labels only.
A=[art['annotated_categories'] for art in articles_test]
B=[art['extended_sims_text'] for art in articles_test]

multi = MultiLabelBinarizer()

y_true = multi.fit(A).transform(A)
y_pred = multi.transform(B)

print('Precision: ',precision_score(y_true, y_pred,average='weighted',zero_division=1))
print('Recall: ',recall_score(y_true, y_pred, average='weighted',zero_division=1))
print('F1:' ,f1_score(y_true, y_pred, average='weighted'))

                                          Anchor  Categories  Infobox   Text  \
Anarchism                                   0.00       25.00     0.00   0.00   
Autism                                      0.00        0.00     0.00   0.00   
Albedo                                      0.00        0.00     0.00   0.00   
A                                           0.00        0.00     0.00   0.00   
Alabama                                     0.00        0.00     0.00   0.00   
Achilles                                    0.00        0.00     0.00   0.00   
Abraham Lincoln                            20.00       40.00     0.00   0.00   
Aristotle                                  14.29       14.29    14.29  14.29   
An American in Paris                        0.00        0.00     0.00  25.00   
Academy Award for Best Production Design    0.00        0.00    33.33  33.33   
Academy Awards                              0.00        0.00     0.00   0.00   
Actrius                                 



### Inverted index - hladanie podla textu

In [179]:
# Build an inverted index over the 'text_tokens' field of the test articles
# and query it for "arts FILM" in "and" mode.
# NOTE(review): invertedIndex is defined elsewhere in the notebook; "and"
# presumably requires every query term to match - confirm.
i=invertedIndex(articles_test, 'text_tokens')
#print(i)

print(i.search("arts FILM", "and"))

{'Achilles', 'Alien', 'Academy Awards', 'Animation', 'Alchemy', 'Ayn Rand', 'Alabama', 'Academy Award for Best Production Design', 'Algeria', 'Aristotle', 'An American in Paris', 'Anthropology'}


### Invertovany index - hladanie podla kategorie (infobox)

In [180]:
# Build an inverted index over the 'extended_sims_info' field (the combined
# infobox-gazetteer categories) and run the same "arts FILM" query.
# NOTE(review): invertedIndex is defined elsewhere in the notebook; "and"
# presumably requires every query term to match - confirm.
j=invertedIndex(articles_test, 'extended_sims_info')
#print(j)

print(j.search("arts FILM", "and"))

{'Alien', 'Academy Awards', 'Animation', 'Actrius', 'Academy Award for Best Production Design'}


### Invertovany index - hladanie podla kategorie (DATAMUSE)

In [181]:
# Build an inverted index over the 'extended_sims' field and run the same
# "arts FILM" query.
# NOTE(review): per the section heading these are the Datamuse-based
# categories; invertedIndex is defined elsewhere in the notebook - confirm.
k=invertedIndex(articles_test, 'extended_sims')
#print(k)

print(k.search("arts FILM", "and"))

{'Academy Award for Best Production Design'}


## Testovanie na vsetkych datach

Z Wikipedia dumpu (XML súbor enwiki-latest-pages-articles.xml) sme stiahli všetky články a rozdelili ich do 207 súborov, pričom každý obsahoval 100 000 záznamov.

Pre nedostatok času a časovo náročné operácie prešla celým testovaním iba menšia časť článkov: konkrétne 47 týchto súborov prešlo celým cyklom od predspracovania po testovanie na vytvorených modeloch.

In [4]:
# Sum the number of articles stored in the chunk files wiki_16 .. wiki_63
# (range(16, 64) covers 48 file indices) and print the total.
counter = 0
for i in range(16, 64):
    chunk_path = f'../data/wiki_{i}_before.json'
    with open(chunk_path) as json_file:
        more_articles = json.load(json_file)
    counter += len(more_articles)
print('Spracovanych plnohodnotnych clankov: ',counter)

Spracovanych plnohodnotnych clankov:  2547414


## Presna zhoda

### 6. Z tela článku vyhľadať najčastejšie používané termy a tie, ktoré boli identifikované v kroku 2

Find exact match words or expressions with categorised words

In [None]:
# Find exact-match words or expressions with categorised words

def find_exact_match(articles, categories):
    """Tag each article with the categories whose related words occur in it.

    For every article three lists are (re)created in place:
    ``categories_exact_text``, ``categories_exact_anchors`` and
    ``categories_exact_infobox``. Each receives one
    ``{'category': ..., 'related_words': [...]}`` entry per category that has
    at least one related word occurring as a whole word (case-insensitively)
    in the article's ``text``, stringified ``anchors`` or stringified
    ``infobox`` respectively.

    Args:
        articles: list of dicts with the keys ``text``, ``anchors`` and
            ``infobox``.
        categories: list of dicts with the keys ``category`` and
            ``related_words`` (a missing/empty word list matches nothing).

    Returns:
        The same ``articles`` list, with every article mutated in place.
    """
    # Compile one whole-word, case-insensitive pattern per related word, once
    # for all articles. re.escape keeps words such as "c++" literal, and the
    # lookarounds (unlike the former \W+...\W+) also match a word sitting at
    # the very start or end of the searched string.
    compiled = []
    for category in categories:
        patterns = [
            (word, re.compile(rf'(?<!\w){re.escape(word)}(?!\w)', re.IGNORECASE))
            for word in category.get('related_words') or []
            if word
        ]
        compiled.append((category.get('category'), patterns))

    for article in articles:
        # Stringify list fields once per article rather than once per word.
        haystacks = {
            'categories_exact_text': article['text'],
            'categories_exact_anchors': str(article['anchors']).strip('[]'),
            'categories_exact_infobox': str(article['infobox']).strip('[]'),
        }
        for field, haystack in haystacks.items():
            article[field] = []
            for name, patterns in compiled:
                found = [word for word, pattern in patterns if pattern.search(haystack)]
                if found:
                    article[field].append({'category': name, 'related_words': found})
    return articles

In [176]:
def save_articles(articles, file_name):
    """Write ``articles`` as indented JSON to ``../data/<file_name>.json``.

    An existing file with the same name is overwritten.
    """
    target_path = f'../data/{file_name}.json'
    with open(target_path, 'w') as handle:
        json.dump(articles, handle, indent=4)

In [177]:
# Persist the evaluated test articles to ../data/test_30testedd.json.
# NOTE(review): the name differs from the 'test_30_tested' file loaded by the
# evaluation cells - presumably chosen so that file is not overwritten; confirm.
#exact_match = find_exact_match(articles, cats_with_words)
save_articles(articles_test, 'test_30testedd')

### Vyskusat PySpark

In [None]:
# Initialise findspark before importing pyspark.
# NOTE(review): findspark.init() presumably makes the local Spark installation
# importable and findspark.find() presumably returns its location; the first
# find() result is discarded, the last one is the cell's displayed value - confirm.
import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()

In [None]:
# Start (or reuse) a local Spark context and wrap it in a SparkSession.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
conf = SparkConf().setAppName('SparkApp').setMaster("local")
# getOrCreate reuses an already running context, so re-running this cell does
# not fail with "Cannot run multiple SparkContexts at once". It also uses the
# SparkContext name imported above instead of relying on a separate
# `import pyspark`.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

In [None]:
# Time the setup of the Spark inputs: distribute the articles as an RDD and
# wrap remove_stop_words in a UDF. No transformation of the articles runs in
# this cell (the map/collect line is commented out).
# `udf` is imported here because no cell above in this section imports it.
from pyspark.sql.functions import udf

tic = time.perf_counter()
numeric_val = sc.parallelize(articles)
square_udf_int = udf(lambda z: remove_stop_words(z))
#numeric_val.map(lambda x: remove_stop_words(x)).collect()
toc = time.perf_counter()
print(f"Performed in {toc - tic:0.4f} seconds")

In [None]:
# Apply remove_stop_words to every article of the RDD and time the run.
# The plain Python function is mapped directly: a pyspark.sql UDF builds
# Column expressions for DataFrames and cannot be called on RDD elements.
tic = time.perf_counter()
numeric_val.map(remove_stop_words).collect()
toc = time.perf_counter()
print(f"Performed in {toc - tic:0.4f} seconds")

In [None]:
def square(x):
    """Return ``x`` raised to the second power."""
    return pow(x, 2)

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# Spark SQL UDF wrapping square(), declared to return an integer column.
square_udf_int = udf(square, returnType=IntegerType())

In [None]:
# A UDF operates on DataFrame columns, not on Python lists, so apply it to a
# one-column DataFrame holding the sample values and print the result.
spark.createDataFrame([(1,), (2,), (3,)], ['x']).select(square_udf_int('x')).show()