## Zhlukovanie článkov Wikipédie do kategórií na základe ich vedecko-spoločenskej oblasti

**Vypracoval:** Tomáš Babjak

**Predmet:** Vyhľadávanie informácii

**GitHub:** https://github.com/tomasbabjak/VINF_Wikipedia

Imports

In [169]:
import regex
import re
import datamuse
import nltk
import json
import string
import time
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
from sklearn.feature_extraction.text import TfidfVectorizer

### 1. Vytvoriť testovaciu vzorku dát, na ktorej budeme prvotne projekt realizovať

Read XML file with Wiki articles and parse articles to list:

In [175]:
def read_xml(file_name, n_first_articles):
    
    start_tag = f'<page>'
    end_tag = f'</page>'
    
    start_found = False
    articles_found = []
    lines = ''
    
    with open(file_name, encoding="utf8") as f:
        for line in f:
            if start_tag in line:
                start_found = True
            if start_found:
                lines += line
            if end_tag in line:
                start_found = False
                articles_found.append(lines)
                lines = ''
            if len(articles_found) == n_first_articles:
                break
    with open(f'../data/wiki_{n_first_articles}_before.json', 'w') as outfile:
        json.dump(articles_found, outfile)
    return articles_found

Extract Title and Text attributes from article and create dictionary from them:

In [171]:
def extract_text(text):
    title_regex = r'<title[^>]*>([^<]+)</title>'
    text_regex = r'<text[^>]*>([^<]+)</text>'
    pages = []
    for page in text:
        title = regex.findall(title_regex, page)
        text = regex.findall(text_regex, page)
        pages.append({"title": title[0] if title else '',
                      "text": text[0] if text else ''})
    return pages

## Najst infobox, anchor texty a wiki kategorie

### 4. Z článkov testovacej sady vyhľadať dôležité pojmy - zamerať sa na Infobox, kde sa nachádzajú dôležité informácie o článku

### 5. Vyhľadať odkazy na iné články Wikipédie (anchor text), ktoré môžu smerovať priamo na oblasť alebo aspoň priblížiť kontext článku

Find Infobox and Achor texts from Text attribute of article and add them to dictionary

In [172]:
def find_infobox_anchor(text):
    regex_infobox = r"(?=\{Infobox )(\{([^{}]|(?1))*\})"
    regex_anchor = r"\[\[([^\]\[:]+)\|([^\]\[:]+)\]\]"
    regex_category = r"\[\[Category:([^\]]*\b)"
    for page in text:
        page['infobox'] = regex.findall(regex_infobox, page['text'])
        page['anchors'] = regex.findall(regex_anchor, page['text'])
        page['category_wiki'] = regex.findall(regex_category, page['text'])
        page['text'] = regex.sub(regex_infobox, '', page['text'])
        page['text'] = regex.sub(regex_anchor, '', page['text'])
        page['text'] = regex.sub(regex_category, '', page['text'])
    return text

Separate Redirect articles from others into two separate lists

In [173]:
def find_redirect(text):
    regex_redirect = r"^#REDIRECT[^\[]*\[\[([^\]]+)"
    redirect_pages = []
    article_pages = []
    for page in text:
        if regex.findall(regex_redirect, page['text']):
            redirect_pages.append(page)
        else:
            article_pages.append(page)
    return (redirect_pages, article_pages)

In [176]:
redirects, articles_test = find_redirect(find_infobox_anchor(extract_text(read_xml('../data/enwiki-latest-pages-articles.xml', 100))))

### 2. Vytvoriť zoznam (strom) spoločensko-vedných oblastí, do ktorých budeme jednotlivé stránky zaraďovať, ku každej oblasti nájsť aj slová, ktoré sa s ňou spájajú

Find terms related to our categories with Datamuse library. Split words of each category and find 100 terms related to them

In [None]:
categories = [
    'Culture, literature and the arts',
    'Geography - places and states',
    'Medicine - health and fitness',
    'History and events',
    'Mathematics and logic',
    'Nature and physics',
    'Technology and computing',
    'Philosophy and thinking',
    'Religion and belief',
    'Society, politics and people'
]

new_categories = [
    'Culture',
    'Food',
    'Language',
    'Literature',
    'Art',
    'Dance',
    'Film',
    'Music',
    'Theatre',
    'Architecture',
    'Painting',
    'Sculpture',
    'Games',
    'Sport',
    'Recreation',
    'Media',
    'Internet',
    'Geography',
    'Earth',
    'Health',
    'Fitness',
    'Exercise',
    'Life',
    'Medicine',
    'History',
    'Education',
    'Crime',
    'War',
    'Transport',
    'Mathematics',
    'Logic',
    'Statistics',
    'Biology',
    'Nature',
    'Science',
    'Philosophy',
    'Religion',
    'Belief',
    'Society',
    'Technology',
    'Computing',
    'Electronics',
    'Engineering']

### Vytvorit gazeteer pomocou Wiki clankov mojich kategorii

Ku kazdej z mojich kategorii najst clanok wikipedie s rovnakym nazvom a pomocou neho neskor vytvorit gazeteer.

In [None]:
def find_categories_articles(file_name):
    start_tag = f'<page>'
    end_tag = f'</page>'
    title_regex = r'<title[^>]*>([^<]+)</title>'

    start_found = False
    reading = False
    start_just = False
    articles_found = []
    lines = ''
    
    try:
        with open(file_name, encoding="utf8") as f:
            for line in f:
                if start_tag in line:
                    start_found = True
                    start_just = True
                    continue
                if start_just:
                    category = regex.findall(title_regex, line)
                    if category[0] in new_categories:
                        print(category[0])
                        reading = True
                    start_just = False
                if start_found and reading:
                    lines += line
                if end_tag in line:
                    start_found = False
                    reading = False
                    if category[0] in new_categories:
                        articles_found.append(lines)
                    category = ''
                    lines = ''
                if len(articles_found) == len(new_categories):
                    break
        with open(f'../data/wiki_categories.json', 'w') as outfile:
            json.dump(articles_found, outfile)
        return articles_found
    except:
          print("An exception occurred")
    finally:
        with open(f'../data/wiki_categories.json', 'w') as outfile:
            json.dump(articles_found, outfile)    
        return articles_found

In [None]:
categories_articles = find_categories_articles('../data/enwiki-latest-pages-articles.xml')

### Vytvorit gazeteer pomocou Datamuse kniznice

Ku kazdej z mojich kategorii najst gazeteer pomocou kniznice Datamuse - related words

In [None]:
api = datamuse.Datamuse()
        
def categories_find_related(categories):
    cats_with_words = []

    for c in categories:
        api_words = api.words(ml=c, max=20)
        result = list(map(lambda x: x.get('word'), api_words))
        result.append(c.lower())
        cats_with_words.append({'category':c,'related_words':result})

    return cats_with_words

## Predspracovanie

### 3. Články vhodne predspracovať - stemming, tokenizácia, odstránenie stop slov

In [None]:
def tokenize_text(text):
    text_tokens = word_tokenize(text)
    text_tokens = [token.lower() for token in text_tokens if token not in ["*+'-./:;,|<=>?@[\]^_`{}~!\"#$%&()\n"]]
    return text_tokens

In [None]:
def remove_stops(text):
    tokens_without_stops = list(filter(lambda x: (x not in string.punctuation) and (x not in stopwords.words('english')),text))
    return tokens_without_stops

In [None]:
def stem_list(llist):
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in llist]

In [None]:
def preprocess_text(text):
    if not text:
        return []
    else:
        text = tokenize_text(text)
        text = remove_stops(text)
        text = stem_list(text)
        return text

### Predspracovat slova textu clanku, kategorii, infoboxov a anchor textov

In [177]:
for art in articles_test:
    # Predspracovat slova textu clanku:
    art['text_tokens'] = preprocess_text(art.get('text'))
    # Predspracovat slova z kategorii:
    art['category_wiki_tokens'] = preprocess_text(' '.join(art.get('category_wiki')))
    # Predspracovat slova z infoboxov:
    if art.get('infobox'):
        art['infobox_tokens'] = preprocess_text(' '.join(art.get('infobox')[0]))
    else:
        art['infobox_tokens'] = []
    # Predspracovat slova z anchor textov:
    art['anchors_tokens'] = preprocess_text(' '.join([' '.join(tups) for tups in art.get('anchors')]))

In [None]:
dataset = []
for a in articles:
    dataset.extend(a.get('text_tokens'))

### Predspracovat slova pre gazeteer - DATAMUSE

In [45]:
#cats_with_words = categories_find_related(new_categories)
for cat in cats_with_words:
    cat['related_tokens'] = preprocess_text(' '.join(cat.get('related_words')))

### Predspracovat slova pre gazeteer - WIKI clanky

In [62]:
# categories_articles = find_categories_articles('../data/enwiki-latest-pages-articles.xml')
with open('../data/wiki_categories.json') as json_file:
    categories_articles = json.load(json_file)

categories_articles = find_infobox_anchor(extract_text(categories_articles))
for art in categories_articles:
    art['text_tokens'] = preprocess_text(art.get('text'))# num = 0
    art['category_wiki_tokens'] = preprocess_text(' '.join(art.get('category_wiki')))
    if art.get('infobox'):
        art['infobox_tokens'] = preprocess_text(' '.join(art.get('infobox')[0]))
    else:
        art['infobox_tokens'] = []
    art['anchors_tokens'] = preprocess_text(' '.join([' '.join(tups) for tups in art.get('anchors')]))

## TF-IDF

In [36]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer 
import numpy as np
import numpy.linalg as LA

In [165]:
def tfidf_train(train_set):
    vectorizer = TfidfVectorizer()
    trainVectorizerArray = vectorizer.fit_transform(train_set).toarray()
    return vectorizer, trainVectorizerArray

def tfdif_test_cosine(test_set, vectorizer, trainVectorizerArray):
    if not test_set[0]:
        return {}
    
    testVectorizerArray = vectorizer.transform(test_set).toarray()
    cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
    
    categories_sims = {}
    for vector, category in zip(trainVectorizerArray, new_categories):
        for testV in testVectorizerArray:
            cosine = cx(vector, testV)
            if cosine != 0:
                categories_sims[category] = cosine
    return categories_sims

## Kosinusova podobnost TF-IDF

Natrenovana Term frequency - Inverse document frequency na datasete mojho gazeteeru, teda slov jednotlivych kategorii.

Tento model vyhodnotime na slovach textu clankov, infoboxov, kategorii a anchor textoch a vypocitame kosinusovu podobnost s kategoriami gazeteeru.

### Trenovanie TF-IDF na gazeteere z DATAMUSE

In [154]:
train_set = [' '.join(cat.get('related_tokens')) for cat in cats_with_words]

vectorizer_datamuse, trained_model_datamuse  = tfidf_train(train_set)

### Trenovanie TF-IDF na gazeteere z WIKI clankov kategorii

In [203]:
train_set_text = [' '.join(cat.get('text_tokens')) for cat in categories_articles]
train_set_category = [' '.join(cat.get('category_wiki_tokens')) for cat in categories_articles]
# train_set_infobox = [' '.join(cat.get('infobox_tokens')) for cat in categories_articles]
# train_set_anchors = [' '.join(cat.get('anchors_tokens')) for cat in categories_articles]

vectorizer_wiki, trained_model_wiki = tfidf_train(train_set_text)
vectorizer_wiki2, trained_model_wiki2 = tfidf_train(train_set_category)

### Kosinusova podobnost s Kategorickymi clankami Wiki - DATAMUSE TFIDF

### Anchor text - kosinusova podobnost

In [195]:
for art in articles:
    art['anchor_sims'] = tfdif_test_cosine([' '.join(art.get('anchors_tokens'))], vectorizer_datamuse, trained_model_datamuse)
    art['anchor_sims'] = {k: v for k, v in sorted(art.get('anchor_sims').items(), key = lambda item: item[1], reverse=True)}    
    print(art.get('title'))
    print(art.get('anchor_sims'))

Art
{'Art': 0.3, 'Painting': 0.255, 'Theatre': 0.236, 'Culture': 0.176, 'Sculpture': 0.149, 'Literature': 0.13, 'Architecture': 0.105, 'Philosophy': 0.1, 'Media': 0.042, 'Belief': 0.04, 'Logic': 0.039, 'Nature': 0.031, 'History': 0.03, 'Engineering': 0.029, 'Earth': 0.026, 'Science': 0.026, 'Religion': 0.026, 'Computing': 0.023, 'Life': 0.021, 'Language': 0.02, 'Society': 0.014, 'Dance': 0.01, 'Exercise': 0.008, 'Recreation': 0.007, 'Medicine': 0.007, 'Education': 0.005}
Computing
{'Computing': 0.643, 'Technology': 0.359, 'Internet': 0.212, 'Engineering': 0.093, 'Language': 0.082, 'Science': 0.072, 'Education': 0.057, 'Logic': 0.05, 'Architecture': 0.037, 'Statistics': 0.037, 'Media': 0.034, 'Biology': 0.032, 'Geography': 0.031, 'Art': 0.018, 'Culture': 0.011, 'Mathematics': 0.009, 'Crime': 0.007, 'Society': 0.007, 'Literature': 0.006, 'Theatre': 0.006}
Crime
{'Crime': 0.328, 'Society': 0.144, 'Culture': 0.074, 'Life': 0.049, 'Nature': 0.047, 'War': 0.041, 'Architecture': 0.039, 'Langu

### Kategorie Wiki - kosinusova podobnost

In [196]:
for art in articles:
    art['categories_sims'] = tfdif_test_cosine([' '.join(art.get('category_wiki_tokens'))], vectorizer_datamuse, trained_model_datamuse)
    art['categories_sims'] = {k: v for k, v in sorted(art.get('categories_sims').items(), key = lambda item: item[1], reverse=True)}        
    print(art.get('title'))
    print(art.get('categories_sims'))

Art
{'Theatre': 0.283, 'Art': 0.242, 'Culture': 0.147, 'Sculpture': 0.144, 'Painting': 0.14, 'Literature': 0.138}
Computing
{'Computing': 0.702, 'Technology': 0.316, 'Internet': 0.14}
Crime
{'Crime': 0.527}
Dance
{'Theatre': 0.352, 'Recreation': 0.17, 'Dance': 0.169, 'Music': 0.141, 'Art': 0.088, 'Film': 0.061, 'Culture': 0.054, 'Sculpture': 0.053, 'Painting': 0.051, 'Literature': 0.05}
Earth
{'Nature': 0.372, 'Earth': 0.174, 'Logic': 0.087}
Engineering
{'Engineering': 0.446, 'Technology': 0.421, 'Architecture': 0.118, 'Biology': 0.091, 'Philosophy': 0.069, 'Belief': 0.067, 'Culture': 0.065, 'Logic': 0.053, 'Science': 0.048, 'Mathematics': 0.044}
Education
{'Education': 0.715, 'Culture': 0.176, 'Science': 0.146}
Electronics
{'Electronics': 0.485, 'Internet': 0.281, 'Computing': 0.14}
Food
{'Food': 0.186}
Games
{}
Internet
{'Media': 0.279, 'Technology': 0.184, 'Engineering': 0.146, 'Computing': 0.143, 'Culture': 0.105, 'Science': 0.102, 'Transport': 0.088, 'Logic': 0.084, 'Literature': 

### Infobox - kosinusova podobnost

In [197]:
for art in articles:
    art['infobox_sims'] = tfdif_test_cosine([' '.join(art.get('infobox_tokens'))], vectorizer_datamuse, trained_model_datamuse)
    art['infobox_sims'] = {k: v for k, v in sorted(art.get('infobox_sims').items(), key = lambda item: item[1], reverse=True)}    
    print(art.get('title'))
    print(art.get('infobox_sims'))

Art
{}
Computing
{}
Crime
{}
Dance
{}
Earth
{'Earth': 0.185, 'Electronics': 0.158, 'Logic': 0.048, 'Internet': 0.045, 'Nature': 0.042, 'Culture': 0.029, 'Media': 0.015, 'Geography': 0.014, 'Film': 0.013, 'Biology': 0.008, 'Society': 0.008, 'Science': 0.007, 'Technology': 0.007}
Engineering
{}
Education
{}
Electronics
{}
Food
{}
Games
{}
Internet
{}
Language
{}
Life
{}
Mathematics
{}
Music
{'Music': 0.366, 'Dance': 0.203, 'Culture': 0.178, 'Theatre': 0.119, 'Medicine': 0.102, 'Literature': 0.077, 'History': 0.063, 'Art': 0.055, 'Philosophy': 0.052, 'Sculpture': 0.033, 'Painting': 0.032}
Medicine
{'Medicine': 0.624, 'Sculpture': 0.153, 'Health': 0.143, 'Music': 0.133, 'Logic': 0.064}
Nature
{}
Recreation
{}
Religion
{}
Statistics
{}
Science
{}
Sculpture
{}
Technology
{}
War
{}
Society
{}
Media
{}
Health
{}
Belief
{}
Fitness
{}
Exercise
{}
Biology
{}
History
{}
Philosophy
{}
Transport
{}
Painting
{}
Literature
{}
Geography
{}
Culture
{}
Theatre
{}
Architecture
{}
Film
{}
Sport
{}
Logic
{}

###  Text clanku - kosinusova podobnost

In [198]:
for art in articles:
    art['text_sims'] = tfdif_test_cosine([' '.join(art.get('text_tokens'))], vectorizer_datamuse, trained_model_datamuse)
    art['text_sims'] = {k: v for k, v in sorted(art.get('text_sims').items(), key = lambda item: item[1], reverse=True)}
    print(art.get('title'))
    print(art.get('text_sims'))

Art
{'Art': 0.336, 'Literature': 0.285, 'Theatre': 0.273, 'Culture': 0.181, 'Painting': 0.18, 'Sculpture': 0.16, 'Exercise': 0.096, 'Nature': 0.071, 'Media': 0.067, 'Philosophy': 0.064, 'Logic': 0.045, 'Internet': 0.042, 'Society': 0.041, 'Science': 0.039, 'History': 0.038, 'Language': 0.037, 'Technology': 0.031, 'Music': 0.029, 'Earth': 0.027, 'Architecture': 0.023, 'Life': 0.023, 'Fitness': 0.021, 'Religion': 0.02, 'Belief': 0.02, 'Engineering': 0.02, 'Education': 0.017, 'Film': 0.012, 'Computing': 0.012, 'Dance': 0.011, 'Recreation': 0.011, 'Statistics': 0.011, 'Biology': 0.01, 'Health': 0.008, 'Medicine': 0.008, 'War': 0.008, 'Electronics': 0.008, 'Games': 0.007, 'Mathematics': 0.005, 'Transport': 0.004, 'Food': 0.003, 'Sport': 0.002, 'Geography': 0.001, 'Crime': 0.001}
Computing
{'Computing': 0.623, 'Technology': 0.387, 'Internet': 0.232, 'Engineering': 0.152, 'Logic': 0.108, 'Science': 0.102, 'Literature': 0.083, 'Architecture': 0.064, 'Statistics': 0.06, 'Exercise': 0.056, 'Medi

Music
{'Music': 0.547, 'Dance': 0.217, 'Medicine': 0.124, 'Culture': 0.068, 'Literature': 0.059, 'Theatre': 0.054, 'Exercise': 0.052, 'History': 0.052, 'Education': 0.033, 'Internet': 0.032, 'Games': 0.031, 'Philosophy': 0.029, 'Biology': 0.027, 'Media': 0.026, 'Statistics': 0.025, 'Nature': 0.025, 'Science': 0.025, 'Society': 0.022, 'Life': 0.021, 'Architecture': 0.02, 'Technology': 0.02, 'Art': 0.019, 'Sport': 0.018, 'Electronics': 0.017, 'Computing': 0.016, 'Logic': 0.014, 'Religion': 0.013, 'Engineering': 0.012, 'Language': 0.011, 'Painting': 0.011, 'Sculpture': 0.011, 'Earth': 0.01, 'Health': 0.01, 'Film': 0.009, 'Recreation': 0.008, 'Belief': 0.008, 'Mathematics': 0.007, 'Fitness': 0.006, 'War': 0.003, 'Transport': 0.002, 'Geography': 0.001, 'Crime': 0.001}
Medicine
{'Medicine': 0.696, 'Health': 0.292, 'Literature': 0.112, 'Exercise': 0.111, 'Music': 0.105, 'Science': 0.103, 'Biology': 0.079, 'Technology': 0.078, 'Fitness': 0.074, 'Internet': 0.069, 'Education': 0.055, 'History':

History
{'History': 0.375, 'Life': 0.167, 'Literature': 0.121, 'Science': 0.077, 'Culture': 0.059, 'Nature': 0.052, 'Society': 0.051, 'Education': 0.041, 'Exercise': 0.039, 'Philosophy': 0.039, 'Earth': 0.032, 'Technology': 0.032, 'Internet': 0.03, 'Logic': 0.027, 'Media': 0.026, 'War': 0.026, 'Biology': 0.023, 'Language': 0.022, 'Engineering': 0.019, 'Statistics': 0.017, 'Electronics': 0.012, 'Geography': 0.011, 'Mathematics': 0.011, 'Art': 0.01, 'Belief': 0.01, 'Theatre': 0.009, 'Religion': 0.009, 'Computing': 0.008, 'Fitness': 0.007, 'Painting': 0.005, 'Sculpture': 0.005, 'Health': 0.005, 'Transport': 0.005, 'Film': 0.004, 'Sport': 0.004, 'Architecture': 0.003, 'Recreation': 0.003, 'Medicine': 0.003, 'Crime': 0.003, 'Games': 0.002, 'Dance': 0.001}
Philosophy
{'Philosophy': 0.325, 'Logic': 0.202, 'Literature': 0.189, 'Culture': 0.171, 'Belief': 0.148, 'Nature': 0.124, 'Science': 0.083, 'Education': 0.082, 'Religion': 0.066, 'Internet': 0.058, 'History': 0.058, 'Life': 0.047, 'Technol

### Kosinusova podobnost s testovacimi clankami - DATAMUSE TFIDF

### Anchor text - kosinusova podobnost

In [194]:
for art in articles_test:
    art['anchor_sims'] = tfdif_test_cosine([' '.join(art.get('anchors_tokens'))], vectorizer_datamuse, trained_model_datamuse)
    art['anchor_sims'] = {k: v for k, v in sorted(art.get('anchor_sims').items(), key = lambda item: item[1], reverse=True)}    
    print(art.get('title'))
    print(art.get('anchor_sims'))

Anarchism
{'Nature': 0.196, 'War': 0.144, 'Philosophy': 0.136, 'Science': 0.112, 'Language': 0.108, 'Culture': 0.095, 'Media': 0.091, 'Logic': 0.072, 'Education': 0.066, 'Society': 0.063, 'Crime': 0.045, 'Statistics': 0.041, 'Belief': 0.032, 'Religion': 0.03, 'Literature': 0.027, 'Biology': 0.027, 'Computing': 0.022, 'Electronics': 0.016, 'History': 0.013, 'Life': 0.012}
Autism
{'Medicine': 0.349, 'Health': 0.224, 'Fitness': 0.127, 'Society': 0.122, 'Philosophy': 0.107, 'Culture': 0.103, 'Architecture': 0.102, 'Language': 0.093, 'Logic': 0.075, 'Belief': 0.066, 'Literature': 0.064, 'Music': 0.058, 'Life': 0.057, 'Statistics': 0.051, 'Education': 0.035, 'Engineering': 0.031, 'Exercise': 0.028, 'Food': 0.019, 'Dance': 0.017, 'Theatre': 0.017, 'Nature': 0.017, 'Technology': 0.016, 'Earth': 0.014, 'Science': 0.014, 'Media': 0.013, 'Biology': 0.012}
Albedo
{'Geography': 0.278, 'Nature': 0.169, 'Earth': 0.142, 'Logic': 0.142, 'Science': 0.072, 'Dance': 0.044, 'Literature': 0.04}
A
{'Language

### Kategorie Wiki - kosinusova podobnost

In [193]:
for art in articles_test:
    art['categories_sims'] = tfdif_test_cosine([' '.join(art.get('category_wiki_tokens'))], vectorizer_datamuse, trained_model_datamuse)
    art['categories_sims'] = {k: v for k, v in sorted(art.get('categories_sims').items(), key = lambda item: item[1], reverse=True)}    
    print(art.get('title'))
    print(art.get('categories_sims'))

Anarchism
{'Philosophy': 0.246, 'Society': 0.224, 'Culture': 0.135, 'Science': 0.054, 'Logic': 0.044, 'Literature': 0.042}
Autism
{'Language': 0.239, 'Media': 0.144, 'Philosophy': 0.102, 'Culture': 0.095, 'Society': 0.081, 'Logic': 0.078}
Albedo
{'Earth': 0.234, 'Transport': 0.094, 'Language': 0.079, 'Geography': 0.071}
A
{'Culture': nan, 'Food': nan, 'Language': nan, 'Literature': nan, 'Art': nan, 'Dance': nan, 'Film': nan, 'Music': nan, 'Theatre': nan, 'Architecture': nan, 'Painting': nan, 'Sculpture': nan, 'Games': nan, 'Sport': nan, 'Recreation': nan, 'Media': nan, 'Internet': nan, 'Geography': nan, 'Earth': nan, 'Health': nan, 'Fitness': nan, 'Exercise': nan, 'Life': nan, 'Medicine': nan, 'History': nan, 'Education': nan, 'Crime': nan, 'War': nan, 'Transport': nan, 'Mathematics': nan, 'Logic': nan, 'Statistics': nan, 'Biology': nan, 'Nature': nan, 'Science': nan, 'Philosophy': nan, 'Religion': nan, 'Belief': nan, 'Society': nan, 'Technology': nan, 'Computing': nan, 'Electronics': 

  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)


### Infobox - kosinusova podobnost

In [192]:
for art in articles_test:
    art['infobox_sims'] = tfdif_test_cosine([' '.join(art.get('infobox_tokens'))], vectorizer_datamuse, trained_model_datamuse)
    art['infobox_sims'] = {k: v for k, v in sorted(art.get('infobox_sims').items(), key = lambda item: item[1], reverse=True)}
    print(art.get('title'))
    print(art.get('infobox_sims'))

Anarchism
{}
Autism
{'Medicine': 0.233, 'Literature': 0.157, 'Media': 0.135, 'Society': 0.119, 'Internet': 0.097, 'Exercise': 0.091, 'Life': 0.081, 'Health': 0.08, 'Language': 0.078, 'Fitness': 0.057, 'Mathematics': 0.019, 'Nature': 0.019, 'History': 0.018, 'Technology': 0.018, 'Science': 0.016}
Albedo
{}
A
{'Statistics': 0.24, 'Language': 0.219}
Alabama
{'Nature': 0.237, 'War': 0.15, 'Internet': 0.112, 'Language': 0.103, 'History': 0.042, 'Geography': 0.038, 'Exercise': 0.025, 'Statistics': 0.025, 'Theatre': 0.024, 'Painting': 0.024, 'Media': 0.024, 'Music': 0.021, 'Life': 0.014, 'Dance': 0.013}
Achilles
{}
Abraham Lincoln
{'War': 0.3, 'Nature': 0.214, 'Literature': 0.053, 'Crime': 0.049, 'Life': 0.045, 'Theatre': 0.044, 'Painting': 0.044, 'Film': 0.038, 'Sport': 0.027, 'Media': 0.022, 'History': 0.021}
Aristotle
{'Philosophy': 0.286, 'Logic': 0.242, 'Biology': 0.163, 'Culture': 0.147, 'Belief': 0.113, 'Science': 0.084, 'Electronics': 0.084, 'Music': 0.08, 'Education': 0.075, 'Technol

###  Text clanku - kosinusova podobnost

In [191]:
for art in articles_test:
    art['text_sims'] = tfdif_test_cosine([' '.join(art.get('text_tokens'))], vectorizer_datamuse, trained_model_datamuse)
    art['text_sims'] = {k: v for k, v in sorted(art.get('text_sims').items(), key = lambda item: item[1], reverse=True)}
    print(art.get('title'))
    print(art.get('text_sims'))

Anarchism
{'Literature': 0.355, 'Philosophy': 0.166, 'Nature': 0.149, 'Society': 0.144, 'Education': 0.125, 'Media': 0.101, 'Science': 0.088, 'Culture': 0.087, 'War': 0.078, 'Logic': 0.075, 'History': 0.061, 'Exercise': 0.058, 'Internet': 0.056, 'Life': 0.05, 'Belief': 0.048, 'Earth': 0.037, 'Art': 0.035, 'Language': 0.031, 'Theatre': 0.026, 'Statistics': 0.026, 'Religion': 0.024, 'Electronics': 0.024, 'Biology': 0.023, 'Painting': 0.018, 'Games': 0.014, 'Sculpture': 0.013, 'Technology': 0.013, 'Film': 0.012, 'Fitness': 0.012, 'Engineering': 0.012, 'Recreation': 0.011, 'Music': 0.01, 'Crime': 0.01, 'Architecture': 0.009, 'Sport': 0.009, 'Medicine': 0.007, 'Computing': 0.006, 'Dance': 0.005, 'Health': 0.005, 'Geography': 0.004, 'Mathematics': 0.004, 'Transport': 0.003, 'Food': 0.002}
Autism
{'Literature': 0.196, 'Media': 0.173, 'Internet': 0.09, 'Society': 0.071, 'Medicine': 0.054, 'Health': 0.044, 'Language': 0.043, 'Exercise': 0.041, 'Science': 0.04, 'Fitness': 0.035, 'Nature': 0.035,

Algeria
{'Internet': 0.246, 'Literature': 0.124, 'Nature': 0.076, 'Media': 0.059, 'War': 0.059, 'Culture': 0.047, 'History': 0.046, 'Exercise': 0.039, 'Education': 0.036, 'Earth': 0.035, 'Society': 0.03, 'Life': 0.028, 'Statistics': 0.027, 'Science': 0.027, 'Language': 0.024, 'Electronics': 0.024, 'Film': 0.021, 'Religion': 0.021, 'Technology': 0.021, 'Theatre': 0.019, 'Music': 0.018, 'Biology': 0.018, 'Engineering': 0.018, 'Philosophy': 0.017, 'Medicine': 0.015, 'Art': 0.014, 'Geography': 0.013, 'Painting': 0.012, 'Sport': 0.012, 'Health': 0.012, 'Transport': 0.011, 'Sculpture': 0.01, 'Games': 0.009, 'Logic': 0.008, 'Dance': 0.007, 'Architecture': 0.007, 'Food': 0.005, 'Fitness': 0.005, 'Mathematics': 0.005, 'Recreation': 0.004, 'Crime': 0.003, 'Computing': 0.003, 'Belief': 0.002}
List of Atlas Shrugged characters
{'Literature': 0.257, 'Exercise': 0.157, 'Society': 0.146, 'Engineering': 0.131, 'Nature': 0.112, 'Life': 0.11, 'Electronics': 0.109, 'Philosophy': 0.094, 'Logic': 0.092, 'B

### Kosinusova podobnost s  s testovacimi clankami - WIKI clanky TFIDF

### Anchor text - kosinusova podobnost

In [199]:
for art in articles_test:
    art['anchor_sims'] = tfdif_test_cosine([' '.join(art.get('anchors_tokens'))], vectorizer_wiki, trained_model_wiki)
    art['anchor_sims'] = {k: v for k, v in sorted(art.get('anchor_sims').items(), key = lambda item: item[1], reverse=True)}    
    print(art.get('title'))
    print(art.get('anchor_sims'))

Anarchism
{'Biology': 0.04, 'Earth': 0.039, 'Belief': 0.035, 'History': 0.033, 'Medicine': 0.03, 'Statistics': 0.029, 'Language': 0.027, 'Film': 0.025, 'Health': 0.021, 'Fitness': 0.021, 'War': 0.019, 'Culture': 0.016, 'Media': 0.016, 'Life': 0.016, 'Science': 0.016, 'Technology': 0.015, 'Religion': 0.014, 'Internet': 0.013, 'Sport': 0.012, 'Education': 0.012, 'Logic': 0.012, 'Dance': 0.011, 'Painting': 0.011, 'Sculpture': 0.011, 'Geography': 0.011, 'Exercise': 0.011, 'Crime': 0.011, 'Literature': 0.01, 'Theatre': 0.01, 'Games': 0.01, 'Philosophy': 0.01, 'Engineering': 0.01, 'Transport': 0.009, 'Nature': 0.009, 'Society': 0.009, 'Food': 0.008, 'Recreation': 0.007, 'Computing': 0.007, 'Electronics': 0.007, 'Art': 0.005, 'Music': 0.005, 'Mathematics': 0.005}
Autism
{'Media': 0.105, 'Crime': 0.059, 'Sculpture': 0.04, 'Logic': 0.04, 'Health': 0.037, 'Belief': 0.036, 'Culture': 0.035, 'Theatre': 0.035, 'Mathematics': 0.033, 'Film': 0.028, 'Fitness': 0.026, 'War': 0.025, 'Games': 0.024, 'His

### Kategorie Wiki - kosinusova podobnost

In [200]:
for art in articles_test:
    art['categories_sims'] = tfdif_test_cosine([' '.join(art.get('category_wiki_tokens'))], vectorizer_wiki, trained_model_wiki)
    art['categories_sims'] = {k: v for k, v in sorted(art.get('categories_sims').items(), key = lambda item: item[1], reverse=True)}    
    print(art.get('title'))
    print(art.get('categories_sims'))

Anarchism
{'History': 0.086, 'Belief': 0.078, 'Statistics': 0.043, 'Medicine': 0.03, 'Culture': 0.025, 'Earth': 0.025, 'Biology': 0.024, 'Fitness': 0.018, 'Language': 0.017, 'Film': 0.017, 'Life': 0.017, 'War': 0.017, 'Science': 0.015, 'Religion': 0.015, 'Literature': 0.014, 'Painting': 0.014, 'Education': 0.014, 'Sculpture': 0.012, 'Society': 0.012, 'Sport': 0.01, 'Philosophy': 0.01, 'Technology': 0.01, 'Health': 0.009, 'Exercise': 0.009, 'Crime': 0.009, 'Electronics': 0.009, 'Dance': 0.008, 'Theatre': 0.008, 'Recreation': 0.008, 'Geography': 0.008, 'Media': 0.007, 'Internet': 0.007, 'Engineering': 0.007, 'Computing': 0.006, 'Logic': 0.005, 'Art': 0.004, 'Mathematics': 0.004, 'Nature': 0.004, 'Music': 0.002, 'Games': 0.002, 'Food': 0.001}
Autism
{'Media': 0.043, 'Crime': 0.037, 'Sculpture': 0.015, 'Education': 0.015, 'Culture': 0.012, 'Film': 0.011, 'Painting': 0.01, 'War': 0.01, 'Earth': 0.009, 'Logic': 0.009, 'Theatre': 0.008, 'Mathematics': 0.008, 'Recreation': 0.007, 'Geography': 

### Infobox - kosinusova podobnost

In [201]:
for art in articles_test:
    art['infobox_sims'] = tfdif_test_cosine([' '.join(art.get('infobox_tokens'))], vectorizer_wiki, trained_model_wiki)
    art['infobox_sims'] = {k: v for k, v in sorted(art.get('infobox_sims').items(), key = lambda item: item[1], reverse=True)}
    print(art.get('title'))
    print(art.get('infobox_sims'))

Anarchism
{}
Autism
{'Art': 0.49, 'Games': 0.449, 'Fitness': 0.443, 'Mathematics': 0.431, 'Culture': 0.429, 'Life': 0.408, 'Medicine': 0.397, 'Earth': 0.391, 'Film': 0.386, 'Biology': 0.386, 'Internet': 0.379, 'Logic': 0.371, 'Philosophy': 0.366, 'Dance': 0.362, 'Media': 0.361, 'Statistics': 0.352, 'Crime': 0.349, 'Painting': 0.336, 'Food': 0.333, 'Belief': 0.332, 'Sport': 0.31, 'Sculpture': 0.309, 'Geography': 0.304, 'War': 0.293, 'Theatre': 0.288, 'Electronics': 0.272, 'Health': 0.27, 'Music': 0.263, 'Engineering': 0.257, 'Exercise': 0.253, 'Technology': 0.251, 'Language': 0.25, 'Science': 0.207, 'Literature': 0.205, 'Nature': 0.204, 'Recreation': 0.176, 'History': 0.161, 'Religion': 0.118, 'Society': 0.098, 'Computing': 0.083, 'Education': 0.014, 'Transport': 0.012}
Albedo
{}
A
{'Art': 0.329, 'Mathematics': 0.295, 'Culture': 0.271, 'Medicine': 0.264, 'Games': 0.261, 'Fitness': 0.26, 'Earth': 0.247, 'Life': 0.244, 'Internet': 0.237, 'Biology': 0.234, 'Film': 0.233, 'Dance': 0.229, 'L

###  Text clanku - kosinusova podobnost

In [202]:
for art in articles_test:
    art['text_sims'] = tfdif_test_cosine([' '.join(art.get('text_tokens'))], vectorizer_wiki, trained_model_wiki)
    art['text_sims'] = {k: v for k, v in sorted(art.get('text_sims').items(), key = lambda item: item[1], reverse=True)}
    print(art.get('title'))
    print(art.get('text_sims'))

Anarchism
{'Fitness': 0.295, 'Society': 0.283, 'Philosophy': 0.282, 'Biology': 0.273, 'Life': 0.268, 'Belief': 0.26, 'Games': 0.246, 'Logic': 0.229, 'Film': 0.228, 'Religion': 0.225, 'Culture': 0.223, 'Earth': 0.215, 'Medicine': 0.212, 'Sport': 0.207, 'Internet': 0.203, 'Statistics': 0.2, 'Sculpture': 0.198, 'Art': 0.192, 'Dance': 0.185, 'Painting': 0.18, 'Food': 0.171, 'History': 0.171, 'War': 0.168, 'Music': 0.161, 'Media': 0.151, 'Mathematics': 0.147, 'Engineering': 0.144, 'Crime': 0.134, 'Health': 0.132, 'Geography': 0.13, 'Language': 0.127, 'Electronics': 0.125, 'Technology': 0.115, 'Theatre': 0.112, 'Science': 0.103, 'Literature': 0.102, 'Recreation': 0.101, 'Computing': 0.099, 'Exercise': 0.094, 'Nature': 0.094, 'Education': 0.021, 'Transport': 0.02, 'Architecture': 0.001}
Autism
{'Mathematics': 0.448, 'Art': 0.44, 'Games': 0.439, 'Fitness': 0.425, 'Life': 0.39, 'Culture': 0.389, 'Film': 0.379, 'Internet': 0.369, 'Logic': 0.362, 'Medicine': 0.36, 'Earth': 0.354, 'Media': 0.35, '

Altruism
{'Fitness': 0.514, 'Games': 0.492, 'Art': 0.48, 'Life': 0.458, 'Mathematics': 0.452, 'Biology': 0.448, 'Culture': 0.44, 'Earth': 0.44, 'Belief': 0.418, 'Medicine': 0.415, 'Internet': 0.414, 'Philosophy': 0.413, 'Film': 0.408, 'Logic': 0.408, 'Statistics': 0.392, 'Dance': 0.38, 'Painting': 0.366, 'Sport': 0.366, 'Crime': 0.36, 'War': 0.36, 'Food': 0.359, 'Sculpture': 0.342, 'Media': 0.334, 'Geography': 0.318, 'Engineering': 0.302, 'Health': 0.295, 'Music': 0.294, 'Theatre': 0.291, 'Language': 0.289, 'Electronics': 0.278, 'Technology': 0.273, 'History': 0.261, 'Exercise': 0.26, 'Literature': 0.227, 'Science': 0.225, 'Recreation': 0.218, 'Nature': 0.215, 'Religion': 0.182, 'Society': 0.172, 'Computing': 0.153, 'Transport': 0.06, 'Education': 0.026, 'Architecture': 0.004}
Ayn Rand
{'Fitness': 0.42, 'Art': 0.409, 'Games': 0.401, 'Biology': 0.385, 'Culture': 0.384, 'Life': 0.382, 'Philosophy': 0.365, 'Earth': 0.358, 'Medicine': 0.354, 'Mathematics': 0.343, 'Internet': 0.337, 'Film':

### Kosinusova podobnost  s testovacimi clankami - WIKI kategorie TFIDF

### Anchor text - kosinusova podobnost

In [204]:
for art in articles_test:
    art['anchor_sims_cat'] = tfdif_test_cosine([' '.join(art.get('anchors_tokens'))], vectorizer_wiki2, trained_model_wiki2)
    art['anchor_sims_cat'] = {k: v for k, v in sorted(art.get('anchor_sims').items(), key = lambda item: item[1], reverse=True)}    
    print(art.get('title'))
    print(art.get('anchor_sims_cat'))

Anarchism
{'Biology': 0.04, 'Earth': 0.039, 'Belief': 0.035, 'History': 0.033, 'Medicine': 0.03, 'Statistics': 0.029, 'Language': 0.027, 'Film': 0.025, 'Health': 0.021, 'Fitness': 0.021, 'War': 0.019, 'Culture': 0.016, 'Media': 0.016, 'Life': 0.016, 'Science': 0.016, 'Technology': 0.015, 'Religion': 0.014, 'Internet': 0.013, 'Sport': 0.012, 'Education': 0.012, 'Logic': 0.012, 'Dance': 0.011, 'Painting': 0.011, 'Sculpture': 0.011, 'Geography': 0.011, 'Exercise': 0.011, 'Crime': 0.011, 'Literature': 0.01, 'Theatre': 0.01, 'Games': 0.01, 'Philosophy': 0.01, 'Engineering': 0.01, 'Transport': 0.009, 'Nature': 0.009, 'Society': 0.009, 'Food': 0.008, 'Recreation': 0.007, 'Computing': 0.007, 'Electronics': 0.007, 'Art': 0.005, 'Music': 0.005, 'Mathematics': 0.005}
Autism
{'Media': 0.105, 'Crime': 0.059, 'Sculpture': 0.04, 'Logic': 0.04, 'Health': 0.037, 'Belief': 0.036, 'Culture': 0.035, 'Theatre': 0.035, 'Mathematics': 0.033, 'Film': 0.028, 'Fitness': 0.026, 'War': 0.025, 'Games': 0.024, 'His

  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b :

### Kategorie Wiki - kosinusova podobnost

In [205]:
for art in articles_test:
    art['categories_sims_cat'] = tfdif_test_cosine([' '.join(art.get('category_wiki_tokens'))], vectorizer_wiki2, trained_model_wiki2)
    art['categories_sims_cat'] = {k: v for k, v in sorted(art.get('categories_sims').items(), key = lambda item: item[1], reverse=True)}    
    print(art.get('title'))
    print(art.get('categories_sims_cat'))

Anarchism
{'History': 0.086, 'Belief': 0.078, 'Statistics': 0.043, 'Medicine': 0.03, 'Culture': 0.025, 'Earth': 0.025, 'Biology': 0.024, 'Fitness': 0.018, 'Language': 0.017, 'Film': 0.017, 'Life': 0.017, 'War': 0.017, 'Science': 0.015, 'Religion': 0.015, 'Literature': 0.014, 'Painting': 0.014, 'Education': 0.014, 'Sculpture': 0.012, 'Society': 0.012, 'Sport': 0.01, 'Philosophy': 0.01, 'Technology': 0.01, 'Health': 0.009, 'Exercise': 0.009, 'Crime': 0.009, 'Electronics': 0.009, 'Dance': 0.008, 'Theatre': 0.008, 'Recreation': 0.008, 'Geography': 0.008, 'Media': 0.007, 'Internet': 0.007, 'Engineering': 0.007, 'Computing': 0.006, 'Logic': 0.005, 'Art': 0.004, 'Mathematics': 0.004, 'Nature': 0.004, 'Music': 0.002, 'Games': 0.002, 'Food': 0.001}
Autism
{'Media': 0.043, 'Crime': 0.037, 'Sculpture': 0.015, 'Education': 0.015, 'Culture': 0.012, 'Film': 0.011, 'Painting': 0.01, 'War': 0.01, 'Earth': 0.009, 'Logic': 0.009, 'Theatre': 0.008, 'Mathematics': 0.008, 'Recreation': 0.007, 'Geography': 

  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b :

### Infobox - kosinusova podobnost

In [206]:
for art in articles_test:
    art['infobox_sims_cat'] = tfdif_test_cosine([' '.join(art.get('infobox_tokens'))], vectorizer_wiki2, trained_model_wiki2)
    art['infobox_sims_cat'] = {k: v for k, v in sorted(art.get('infobox_sims').items(), key = lambda item: item[1], reverse=True)}
    print(art.get('title'))
    print(art.get('infobox_sims_cat'))

Anarchism
{}
Autism
{'Art': 0.49, 'Games': 0.449, 'Fitness': 0.443, 'Mathematics': 0.431, 'Culture': 0.429, 'Life': 0.408, 'Medicine': 0.397, 'Earth': 0.391, 'Film': 0.386, 'Biology': 0.386, 'Internet': 0.379, 'Logic': 0.371, 'Philosophy': 0.366, 'Dance': 0.362, 'Media': 0.361, 'Statistics': 0.352, 'Crime': 0.349, 'Painting': 0.336, 'Food': 0.333, 'Belief': 0.332, 'Sport': 0.31, 'Sculpture': 0.309, 'Geography': 0.304, 'War': 0.293, 'Theatre': 0.288, 'Electronics': 0.272, 'Health': 0.27, 'Music': 0.263, 'Engineering': 0.257, 'Exercise': 0.253, 'Technology': 0.251, 'Language': 0.25, 'Science': 0.207, 'Literature': 0.205, 'Nature': 0.204, 'Recreation': 0.176, 'History': 0.161, 'Religion': 0.118, 'Society': 0.098, 'Computing': 0.083, 'Education': 0.014, 'Transport': 0.012}
Albedo
{}
A
{'Art': 0.329, 'Mathematics': 0.295, 'Culture': 0.271, 'Medicine': 0.264, 'Games': 0.261, 'Fitness': 0.26, 'Earth': 0.247, 'Life': 0.244, 'Internet': 0.237, 'Biology': 0.234, 'Film': 0.233, 'Dance': 0.229, 'L

  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b :

###  Text clanku - kosinusova podobnost

In [207]:
for art in articles_test:
    art['text_sims_cat'] = tfdif_test_cosine([' '.join(art.get('text_tokens'))], vectorizer_wiki2, trained_model_wiki2)
    art['text_sims_cat'] = {k: v for k, v in sorted(art.get('text_sims').items(), key = lambda item: item[1], reverse=True)}
    print(art.get('title'))
    print(art.get('text_sims_cat'))

  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b :

Anarchism
{'Fitness': 0.295, 'Society': 0.283, 'Philosophy': 0.282, 'Biology': 0.273, 'Life': 0.268, 'Belief': 0.26, 'Games': 0.246, 'Logic': 0.229, 'Film': 0.228, 'Religion': 0.225, 'Culture': 0.223, 'Earth': 0.215, 'Medicine': 0.212, 'Sport': 0.207, 'Internet': 0.203, 'Statistics': 0.2, 'Sculpture': 0.198, 'Art': 0.192, 'Dance': 0.185, 'Painting': 0.18, 'Food': 0.171, 'History': 0.171, 'War': 0.168, 'Music': 0.161, 'Media': 0.151, 'Mathematics': 0.147, 'Engineering': 0.144, 'Crime': 0.134, 'Health': 0.132, 'Geography': 0.13, 'Language': 0.127, 'Electronics': 0.125, 'Technology': 0.115, 'Theatre': 0.112, 'Science': 0.103, 'Literature': 0.102, 'Recreation': 0.101, 'Computing': 0.099, 'Exercise': 0.094, 'Nature': 0.094, 'Education': 0.021, 'Transport': 0.02, 'Architecture': 0.001}
Autism
{'Mathematics': 0.448, 'Art': 0.44, 'Games': 0.439, 'Fitness': 0.425, 'Life': 0.39, 'Culture': 0.389, 'Film': 0.379, 'Internet': 0.369, 'Logic': 0.362, 'Medicine': 0.36, 'Earth': 0.354, 'Media': 0.35, '

Alain Connes
{'Sport': 0.345, 'Fitness': 0.317, 'Games': 0.316, 'Art': 0.297, 'Life': 0.29, 'Internet': 0.285, 'Culture': 0.278, 'Logic': 0.277, 'Painting': 0.266, 'Biology': 0.264, 'Medicine': 0.26, 'Film': 0.259, 'Philosophy': 0.254, 'Dance': 0.253, 'Earth': 0.242, 'Food': 0.237, 'Belief': 0.232, 'Statistics': 0.222, 'Mathematics': 0.215, 'Crime': 0.203, 'Geography': 0.191, 'Media': 0.189, 'Theatre': 0.187, 'Music': 0.183, 'Electronics': 0.183, 'War': 0.181, 'Engineering': 0.176, 'Sculpture': 0.174, 'Health': 0.173, 'History': 0.163, 'Technology': 0.163, 'Language': 0.156, 'Religion': 0.148, 'Exercise': 0.147, 'Nature': 0.132, 'Science': 0.131, 'Literature': 0.121, 'Recreation': 0.109, 'Society': 0.094, 'Computing': 0.074, 'Transport': 0.024, 'Education': 0.012}
Allan Dwan
{'Computing': 0.298, 'Fitness': 0.245, 'Games': 0.243, 'Life': 0.236, 'Art': 0.226, 'Internet': 0.22, 'Culture': 0.213, 'Philosophy': 0.208, 'Medicine': 0.207, 'Biology': 0.206, 'Painting': 0.204, 'Logic': 0.199, '

  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
  cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)


## Presna zhoda

### 6. Z tela článku vyhľadať najčastejšie používané termy a tie, ktoré boli identifikované v kroku 2

Find exact match words or expressions with categorised words

In [35]:
Find exact match words or expressions with categorised words

def find_exact_match(articles, categories):
    for article in articles:
        article['categories_exact_text'] = []
        article['categories_exact_anchors'] = []
        article['categories_exact_infobox'] = []
        for category in categories:
            related_words = category.get('related_words')
            found_text = []
            found_anchors = []
            found_infobox = []
            found_text = list(filter(lambda word: re.findall(rf'\W+({word})\W+', article['text'], re.IGNORECASE), related_words))
            found_anchors = list(filter(lambda word: re.findall(rf'\W+({word})\W+', str(article['anchors']).strip('[]'), re.IGNORECASE), related_words))
            found_infobox = list(filter(lambda word: re.findall(rf'\W+({word})\W+', str(article['infobox']).strip('[]'), re.IGNORECASE), related_words))
            if found_text:
                article['categories_exact_text'].append({'category':category.get('category'),'related_words':found_text})
            if found_anchors:
                article['categories_exact_anchors'].append({'category':category.get('category'),'related_words':found_anchors})
            if found_infobox:
                article['categories_exact_infobox'].append({'category':category.get('category'),'related_words':found_infobox})
    return articles

In [None]:
def save_articles(articles, file_name):
    with open(f'../data/{file_name}.json', 'w') as outfile:
        json.dump(articles, outfile, indent=4)

In [None]:
exact_match = find_exact_match(articles, cats_with_words)
save_articles(exact_match, 'wiki_100_exact_match')

### Vyskusat PySpark

In [None]:
import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
conf = SparkConf().setAppName('SparkApp').setMaster("local")
sc = pyspark.SparkContext(conf = conf)
spark = SparkSession(sc)

In [None]:
tic = time.perf_counter()
numeric_val = sc.parallelize(articles)
square_udf_int = udf(lambda z: remove_stop_words(z))
#numeric_val.map(lambda x: remove_stop_words(x)).collect()
toc = time.perf_counter()
print(f"Performed in {toc - tic:0.4f} seconds")

In [None]:
tic = time.perf_counter()
numeric_val.map(lambda x: square_udf_int(x)).collect()
toc = time.perf_counter()
print(f"Performed in {toc - tic:0.4f} seconds")

In [None]:
def square(x):
    return x**2

In [None]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf
square_udf_int = udf(lambda z: square(z), IntegerType())

In [None]:
square_udf_int([1,2,3])