# 1. Introduction

In [92]:
import numpy as np
import os
import json
import numpy.random
from collections import Counter
from itertools import product

In [None]:
# Names of the corpus folders handled by this notebook.
# NOTE(review): none of the visible cells read `in_folders`; the driver cells
# pass the folder names as string literals — confirm whether it is still needed.
in_folders = ['art_201811', 'sports']

In [27]:
# Percentile of the per-record category counts that get_all_theme passes to
# np.percentile; categories counted below that value are not kept as themes.
SMALL_PERCENTILE = 90
# NOTE(review): LARGE_PERCENTILE is not referenced by any visible cell — confirm
# whether it is still needed.
LARGE_PERCENTILE = 99

In [69]:
def get_data(in_folder, max_number, number_count, file_pattern, zfill_number):
    """Load a random sample of raw JSON posts from ``<cwd>/<in_folder>``.

    ``number_count`` distinct file numbers are drawn without replacement from
    ``1..max_number``. Each number is zero-padded to ``zfill_number`` digits
    and appended to ``file_pattern`` to form ``<file_pattern><number>.json``.

    Returns:
        A list of dicts, in the order the numbers were drawn, with the keys
        ``data`` (the file's ``text``), ``title``, ``site_section`` (read from
        the file's ``thread`` object) and ``published``.
    """
    base_dir = os.path.join(os.getcwd(), in_folder)
    candidates = np.arange(1, max_number + 1)
    sampled_numbers = numpy.random.choice(candidates, size=number_count, replace=False)

    posts = []
    for number in sampled_numbers:
        padded = str(number).zfill(zfill_number)
        path = os.path.join(base_dir, file_pattern + '%s.json' % (padded))
        with open(path, 'r', encoding='utf8') as handle:
            raw = json.load(handle)
        # Keep only the fields the rest of the pipeline uses.
        posts.append({
            'data': raw['text'],
            'title': raw['title'],
            'site_section': raw['thread']['site_section'],
            'published': raw['published'],
        })
    return posts

In [78]:
def get_dictionaries(in_folder):
    """Read back every annotated record stored in ``<cwd>/<in_folder>/json``.

    Only files whose name ends in ``.json`` are loaded. Records come back in
    the order ``os.listdir`` yields the files, which is not guaranteed to be
    sorted.

    Returns:
        A list of dicts with the keys ``data``, ``title``, ``site_section``,
        ``published`` and ``categories`` copied from each file.
    """
    json_dir = os.path.join(os.getcwd(), in_folder, 'json')
    wanted_keys = ('data', 'title', 'site_section', 'published', 'categories')

    records = []
    for entry in os.listdir(json_dir):
        if not entry.endswith(".json"):
            continue
        with open(os.path.join(json_dir, entry), 'r', encoding='utf8') as handle:
            stored = json.load(handle)
        records.append({key: stored[key] for key in wanted_keys})
    return records

In [28]:
from nltk.corpus import wordnet

In [18]:
from nltk.corpus import stopwords

# You will have to download the set of stop words the first time
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
# Load stop words
# English stop-word list; remove_stop_words filters tokens against it.
stop_words = stopwords.words('english')
# NLTK resource presumably needed by the tokenizers imported just below.
nltk.download('punkt')

from nltk.tokenize import sent_tokenize, word_tokenize
# 'names' backs english_names below; the tagger resource is presumably the
# model behind the nltk.pos_tag call in get_nouns.
nltk.download('names')
nltk.download('averaged_perceptron_tagger')

# Names corpus as returned by NLTK; remove_names filters tokens against it.
english_names = nltk.corpus.names.words()

def remove_names(words, names=None):
    """Return ``words`` without the tokens that appear in a list of names.

    Args:
        words: iterable of string tokens.
        names: optional iterable of names to drop. When omitted, the
            module-level ``english_names`` list is used.

    Returns:
        A new list with the remaining tokens in their original order.
        Matching is exact and case-sensitive.
    """
    # Hash the names once per call: testing every token against the raw list
    # costs O(len(words) * len(names)), a set lookup is O(1) per token.
    known_names = set(english_names if names is None else names)
    return [w for w in words if w not in known_names]

def remove_stop_words(words, stop_list=None):
    """Return ``words`` without the tokens that appear in a stop-word list.

    Args:
        words: iterable of string tokens.
        stop_list: optional iterable of stop words to drop. When omitted, the
            module-level ``stop_words`` list is used.

    Returns:
        A new list with the remaining tokens in their original order.
        Matching is exact and case-sensitive, so a capitalised token is kept
        when the list only holds its lower-case form.
    """
    # One set build per call replaces a linear list scan per token.
    blocked = set(stop_words if stop_list is None else stop_list)
    return [w for w in words if w not in blocked]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\TL\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\TL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [20]:
# -*- coding: utf-8 -*-
import re
# Regex fragments used by split_into_sentences. They are raw strings so that
# "\s" reaches the regex engine as written instead of being an invalid Python
# string escape.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"

def split_into_sentences(text):
    """Split ``text`` into sentences with a rule-based heuristic.

    Periods that do not end a sentence (titles such as "Mr.", single-letter
    initials, dotted acronyms, company suffixes, web domains, "Ph.D.") are
    masked with a ``<prd>`` placeholder. The remaining ``.``, ``?`` and ``!``
    are tagged with ``<stop>`` and the text is split on that tag.

    Args:
        text: the string to split; newlines are treated as spaces.

    Returns:
        A list of stripped sentence strings. A trailing fragment that has no
        closing punctuation is returned as the last sentence.
    """
    # Padding lets the space-anchored patterns match at both ends of the text.
    text = " " + text + "  "
    text = text.replace("\n", " ")
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    if "Ph.D" in text: text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym followed by a typical sentence opener does end the sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)
    # Move terminators outside closing quotes so the quote stays with its sentence.
    if "”" in text: text = text.replace(".”", "”.")
    if "\"" in text: text = text.replace(".\"", "\".")
    if "!" in text: text = text.replace("!\"", "\"!")
    if "?" in text: text = text.replace("?\"", "\"?")
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    sentences = [s.strip() for s in text.split("<stop>")]
    # After a final terminator only the padding is left; drop that empty tail.
    # A non-empty tail is a last sentence without closing punctuation and is
    # kept (it used to be discarded unconditionally).
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]
    return sentences

In [21]:
def get_nouns(data):
    """Return the ``(word, tag)`` pairs of ``data`` that are tagged as nouns.

    ``data`` is passed to ``nltk.pos_tag``; only pairs whose tag is ``NN`` or
    ``NNS`` are kept, in their original order.
    """
    noun_tags = ('NN', 'NNS')
    return [pair for pair in nltk.pos_tag(data) if pair[1] in noun_tags]

In [22]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import wikipediaapi

In [24]:
def get_wikipedia(keyword):
    """Look up ``keyword`` on the English-language Wikipedia.

    Returns:
        ``(summary, categories)`` taken from the page object when the page
        exists, otherwise ``None``.
    """
    # NOTE(review): recent Wikipedia-API releases appear to expect a
    # user_agent argument here — confirm against the installed version.
    page = wikipediaapi.Wikipedia('en').page(keyword)
    if not page.exists():
        return None
    return (page.summary, page.categories)

In [64]:
import re
def filter_categories(cats):
    """Drop maintenance and calendar categories and strip the ``Category:`` prefix.

    Args:
        cats: iterable of category titles. A dict keyed by title (as passed
            from ``get_wikipedia``) or a plain list of titles both work; only
            the titles are read and ``cats`` is never modified.

    Returns:
        A list of the surviving titles, in input order, with every
        ``"Category:"`` occurrence removed.
    """
    # A title is discarded when it contains any of these substrings
    # (case-sensitive match).
    noise_markers = ['Wiki','pages needing','accuracy disputes','redlink','category link','Pages containing'
             ,'Articles ','Vague or ambiguous','Good articles','Articles lack','Wikipedia pages'
             ,' symbols','Wikipedia articles','Pages with','CS','All Wikipedia articles'
             ,'Use American','Use British English','disambiguation','Disambiguation','-language sources'
             ,'dmy dates','mdy dates','Webarchive template','All articles','Articles containing','Articles with',
             'Pages using','link template','time', 'calendar', 'Calendar', 'date', 'month','day of', 'Year', 'Units of'
             ,'Customary units', 'Integer','Punctuation', 'All self-contradictory articles', 'Weeks']

    # A single filtering pass replaces the former copy-then-pop approach,
    # which only worked when `cats` was a dict.
    return [
        cat.replace("Category:", "")
        for cat in cats
        if not any(marker in cat for marker in noise_markers)
    ]

In [65]:
def get_all_categories(data_array):
    """Flatten the category lists of every record into one list.

    Args:
        data_array: iterable of records whose ``'categories'`` entry is a list
            of lists of category names.

    Returns:
        One flat list with every category name in encounter order; duplicates
        are kept so that callers can count occurrences.
    """
    # The former `set_categories` accumulator was filled but never read.
    return [
        name
        for record in data_array
        for noun_categories in record['categories']
        for name in noun_categories
    ]


def remove_stop_categries(all_categories):
    """Return ``all_categories`` without maintenance and calendar categories.

    Args:
        all_categories: iterable of category names; it is not modified.

    Returns:
        A new list with every name that contains none of the marker
        substrings (case-sensitive), in input order and with duplicates kept.
    """
    noise_markers = ['Wiki','pages needing','accuracy disputes','redlink','category link','Pages containing'
             ,'Articles ','Vague or ambiguous','Good articles','Articles lack','Wikipedia pages'
             ,' symbols','Wikipedia articles','Pages with','CS','All Wikipedia articles'
             ,'Use American','Use British English','disambiguation','Disambiguation','-language sources'
             ,'dmy dates','mdy dates','Webarchive template','All articles','Articles containing','Articles with'
             ,'Pages using','link template','time', 'calendar', 'Calendar', 'date', 'month','day of', 'Year', 'Units of'
             ,'Customary units', 'Integer','Punctuation', 'All self-contradictory articles', 'Weeks'
            ]

    # One filtering pass gives the same result as calling list.remove() once
    # per matching occurrence, without the quadratic cost.
    return [
        cat
        for cat in all_categories
        if not any(marker in cat for marker in noise_markers)
    ]


In [45]:
def check_create_folder(directory):
    """Create ``directory`` (and any missing parents) if it does not exist yet.

    Calling it for a directory that already exists is a no-op.
    """
    # exist_ok=True removes the window between a separate existence check and
    # the creation in which another process could create the directory.
    os.makedirs(directory, exist_ok=True)

In [61]:
def add_categories(data_array, in_folder):
    """Annotate every record with the Wikipedia categories of its nouns.

    For each record the ``'data'`` text is split into sentences, tokenised,
    stripped of names and stop words, and reduced to its nouns. Every noun
    that has a Wikipedia page contributes one filtered category list to the
    record's ``'categories'`` entry. Each record is then written to
    ``<cwd>/<in_folder>/json/data_array<index>.json``.

    Args:
        data_array: list of record dicts with a ``'data'`` text entry; the
            records are modified in place.
        in_folder: corpus folder name, relative to the working directory.

    Returns:
        The same ``data_array`` object.
    """
    out_dir = os.path.join(os.getcwd(), in_folder, 'json')
    # Loop-invariant, so done once up front (the folder is now also created
    # when data_array is empty).
    check_create_folder(out_dir)

    # Per-call memo: noun -> filtered category list, or None when the noun has
    # no page. Avoids repeating the network lookup for a noun that recurs.
    lookups = {}

    for count, data in enumerate(data_array):
        print(count)  # progress indicator
        data['categories'] = []
        for sentence in split_into_sentences(data['data']):
            words = word_tokenize(sentence)
            words = remove_names(words)
            words = remove_stop_words(words)
            for noun, _tag in get_nouns(words):
                if noun not in lookups:
                    wiki = get_wikipedia(noun)
                    lookups[noun] = filter_categories(wiki[1]) if wiki else None
                cats = lookups[noun]
                if cats is not None:
                    # Copy so that records never share one list object.
                    data['categories'].append(list(cats))
        with open(os.path.join(out_dir, 'data_array%s.json' % (str(count))), 'w') as fout:
            json.dump(data, fout)
    return data_array

In [37]:
def prune_data_array(data_array, keep_levels=3):
    """Trim a record's ``'class'`` list to its leading count levels.

    ``data_array['class']`` is a list of single-entry ``{name: count}`` dicts.
    The list is walked in order; each time a count falls below the lowest
    count seen so far, a new level starts. When the level after the
    ``keep_levels``-th one would start, the list is cut just before it.

    Args:
        data_array: one record dict (despite the name); modified in place.
        keep_levels: number of count levels to keep. The default of 3 matches
            the previously hard-coded behaviour.

    Returns:
        The same record object, with ``'class'`` possibly shortened.
    """
    themes = data_array['class']
    # An empty list used to raise IndexError; there is nothing to prune.
    if not themes:
        return data_array

    def _count(theme):
        # Each theme is a one-entry dict; its only value is the count.
        return list(theme.values())[0]

    floor = _count(themes[0])
    drops = 0
    kept = []
    for theme in themes:
        count = _count(theme)
        if count < floor:
            floor = count
            drops += 1
            if drops >= keep_levels:
                data_array['class'] = kept
                return data_array
        kept.append(theme)
    # Fewer than keep_levels drops: the list is left untouched.
    return data_array

In [88]:
def get_all_theme(data_array, in_folder):
    """Attach the dominant categories ("themes") to every record.

    For each record the category names are flattened, filtered with
    ``remove_stop_categries`` and counted. Categories whose count reaches the
    ``SMALL_PERCENTILE``-th percentile of that record's counts are stored,
    most frequent first, as single-entry ``{name: count}`` dicts under
    ``record['class']``; a record without categories gets an empty list.
    Each record is written to
    ``<cwd>/<in_folder>/json_theme/data_classified<index>.json``.

    Args:
        data_array: list of record dicts with a ``'categories'`` entry; the
            records are modified in place.
        in_folder: corpus folder name, relative to the working directory.

    Returns:
        The same ``data_array`` object.
    """
    out_dir = os.path.join(os.getcwd(), in_folder, 'json_theme')
    # Loop-invariant, so done once up front.
    check_create_folder(out_dir)

    for i, record in enumerate(data_array):
        categories = remove_stop_categries(get_all_categories([record]))
        counts = Counter(categories)

        themes = []
        if counts:
            threshold = np.percentile(list(counts.values()), SMALL_PERCENTILE)
            # most_common() sorts by descending count and keeps first-seen
            # order among ties, so the walk can stop at the first category
            # below the threshold.
            for name, count in counts.most_common():
                if count < threshold:
                    break
                print(name, count)
                themes.append({name: count})
        record['class'] = themes

        with open(os.path.join(out_dir, 'data_classified%s.json' % (str(i))), 'w') as fout:
            json.dump(record, fout)
    return data_array

In [94]:
def get_classification_results(data_array, in_folder, keyword):
    """Score every themed record against ``keyword`` with Wu-Palmer similarity.

    Records with an empty ``'class'`` list are skipped. The others are pruned
    with ``prune_data_array``, and the WordNet noun synsets of their theme
    names are compared pairwise with the noun synsets of ``keyword``. The
    highest and lowest similarity are stored, their mean is printed, and the
    result is written to ``<cwd>/<in_folder>/3level_theme/data_array<index>.json``.
    Records whose theme names have no noun synset are skipped as well.

    Args:
        data_array: list of record dicts with ``'class'`` and ``'data'``
            entries; pruned in place.
        in_folder: corpus folder name, relative to the working directory.
        keyword: word each record's themes are compared with.

    Returns:
        A list of ``{'data', 'best', 'worst'}`` dicts, one per scored record.

    Raises:
        ValueError: if a record needs scoring but ``keyword`` has no noun
            synset in WordNet.
    """
    out_dir = os.path.join(os.getcwd(), in_folder, '3level_theme')
    # Same for every record, so looked up once.
    keyword_synsets = set(wordnet.synsets(keyword, pos=wordnet.NOUN))

    allsyns_array = []
    for i in range(len(data_array)):
        if len(data_array[i]['class']) == 0:
            continue
        data_array[i] = prune_data_array(data_array[i])
        record = data_array[i]
        theme_words = [list(item.keys())[0] for item in record['class']]

        theme_synsets = set(ss for word in theme_words for ss in wordnet.synsets(word, pos=wordnet.NOUN))
        if not theme_synsets:
            continue
        if not keyword_synsets:
            # Previously surfaced as "max() arg is an empty sequence".
            raise ValueError("keyword %r has no noun synsets in WordNet" % (keyword,))

        # Each pair is scored once; wup_similarity may return None, which
        # counts as 0.
        scores = [wordnet.wup_similarity(s1, s2) or 0
                  for s1, s2 in product(theme_synsets, keyword_synsets)]
        best = max(scores)
        worst = min(scores)
        print((best + worst) / 2)

        data = {'data': record['data'], 'best': best, 'worst': worst}
        allsyns_array.append(data)
        check_create_folder(out_dir)
        with open(os.path.join(out_dir, 'data_array%s.json' % (str(i))), 'w') as fout:
            json.dump(data, fout)
    return allsyns_array

In [41]:
#data_array_business = get_data('business', 14794, 1000, 'news_00', 5)

# 100 numbers drawn without replacement from 1..100: every file
# post_001.json .. post_100.json of art_201811 is loaded, in random order.
data_array_art = get_data('art_201811', 100, 100, 'post_', 3)

len(data_array_art)

100

In [57]:
# Adds a 'categories' entry to every art record (one Wikipedia lookup per noun)
# and writes each record to art_201811/json/data_array<index>.json.
data_array_art = add_categories(data_array_art, 'art_201811')

0
1
2


In [79]:
# Reloads the annotated records from art_201811/json, replacing the in-memory
# list, so the lookup cell above does not have to be re-run.
data_array_art = get_dictionaries('art_201811')

In [90]:
len(data_array_art)

100

In [89]:
# Stores each record's dominant categories under 'class', prints them with
# their counts, and writes the records to art_201811/json_theme.
data_array_art = get_all_theme(data_array_art, 'art_201811')

All set index articles 25
Surnames 20
Chemical elements 14
Native element minerals 14
Post-transition metals 14
Source attribution 14
Tin 14
Given names 11
Vietnamese-language surnames 10
All stub articles 9
Greek letters 7
Chinese-language surnames 7
Vietnamese musical instruments 6
Egyptian gods 6
Solar gods 6
Underworld gods 6
Incomplete lists from February 2011 5
Vietnamese names 5
Coastal construction 4
Freight transport 4
Port infrastructure 4
Wharves 4
Crops 3
Fodder 3
Scandals 3
Currencies of Vietnam 3
Modern obsolete currencies 3
Money stubs 3
Airborne warfare 3
Military transport 3
Parachuting 3
1990s LGBT-related films 3
1990s biographical films 3
1990s drama films 3
1998 in American television 3
1998 television films 3
American LGBT-related films 3
American biographical films 3
American drama films 3
American films 3
American mockumentary films 3
Biographical television films 3
Drama films based on actual events 3
Drama television films 3
Female bisexuality in film 3
Films 

2018 singles 6
2018 songs 6
Song articles with missing songwriters 6
Legal terminology 6
Psychology 6
Survey methodology 6
Symptom stubs 6
Healthcare occupations 5
Identity 5
Role status 5
Role theory 5
Sociological terminology 5
Group theory 5
Mathematical objects 5
Numbers 5
Genetic disorders by system 5
Drugs 5
Concepts in metaphysics 5
Cloning 5
Genes 5
Genetics 5
Polymorphism (biology) 5
Medical signs 5
All stub articles 10
All set index articles 9
Surnames 8
German words and phrases 5
1861 establishments in Massachusetts 4
Educational institutions established in 1861 4
Engineering universities and colleges in Massachusetts 4
Instances of Infobox university using image size 4
Land-grant universities and colleges 4
Massachusetts Institute of Technology 4
Rugby league stadiums in the United States 4
Science and technology in Massachusetts 4
Technological universities in the United States 4
Universities and colleges in Cambridge, Massachusetts 4
V-12 Navy College Training Program 4
G

Subsidiary motions 3
Audiology 3
Auditory system 3
Definition 3
Lexicography 3
Mathematical terminology 3
Ontology 3
All set index articles 50
Surnames 46
1861 establishments in Massachusetts 32
Educational institutions established in 1861 32
Engineering universities and colleges in Massachusetts 32
Instances of Infobox university using image size 32
Land-grant universities and colleges 32
Massachusetts Institute of Technology 32
Rugby league stadiums in the United States 32
Science and technology in Massachusetts 32
Technological universities in the United States 32
Universities and colleges in Cambridge, Massachusetts 32
V-12 Navy College Training Program 32
All stub articles 30
Dispute resolution 22
Ethics 22
Human rights abuses 22
Violence 22
Violence against men 22
War 22
Hungarian-language surnames 20
German words and phrases 13
Austrian nobility 12
Danish nobility 12
Finnish nobility 12
German nobility 12
Swedish nobility 12
Dutch culture 9
Profanity by language 9
German-languag

Planning 2
Parts of clothing 2
Stereotypes 1
Concepts in ethics 1
Humans 1
People 1
Personal life 1
Personhood 1
Self 1
Intention 1
Management 1
Planning 1
Painting materials 1
Woven fabrics 1
Aesthetics 1
Arts 1
Thought experiments 1
Visual arts 1
Currency 1
Economic anthropology 1
Emergence 1
Monetary economics 1
Money 1
Publishing 9
Media industry 8
Titles 6
Comics 6
Narrative forms 6
Industry 5
Media formats 3
Aesthetics 2
Earth 2
World 2
Superheroes 2
Books 2
Documents 2
Paper products 2
Legal entities 2
Graphemes 11
ISO basic Latin letters 11
Vowel letters 8
All stub articles 6
Communes of the Territoire de Belfort 4
Territoire de Belfort geography stubs 4
Latin words and phrases 4
Names of God 4
Consiglieri 4
Organized crime members by role 4
Differential operators 4
Mathematical notation 4
Vector calculus 4
Types of country subdivisions 4
Aesthetics 3
Arts 3
Thought experiments 3
Visual arts 3
Populated places in East Attica 2
Vari-Voula-Vouliagmeni 2
1822 births 2
1907 deaths 

Day 5
Kitchen 4
Sales 4
Human communication 3
Language 3
Nonverbal communication 3
Blogs 3
Diaries 3
Journalism 3
New media 3
Non-fiction genres 3
Web 2.0 3
Words coined in the 1990s 3
Employment 3
Occupations 3
Concepts in metaphysics 3
Concepts in physics 3
Physical quantities 3
SI base quantities 3
Time 3
UCUM base quantities 3
Communication 3
Oral communication 3
Crimes 7
Assault 6
Human behavior 6
Crime 5
Dispute resolution 5
Ethics 5
Social conflict 5
Violence 5
Occupations 3
Virtue 3
Hatred 3
Concepts in metaphysics 2
Information 2
Information science 2
Artificial intelligence 2
Educational psychology 2
Neuropsychological assessment 2
Problem solving 2
Psychology articles needing expert attention 2
Silence 2
Sound 2
Bullying 2
Discrimination 2
Prejudices 2
Sexuality and gender-related prejudices 2
Legal professions 2
Criminal law 2
Universities and colleges 2
Area 3
Architects 2
Architecture occupations 2
Professional certification in architecture 2
Business 2
Entrepreneurship 2

In [95]:
# Scores the themes of every art record against the keyword 'technology';
# each printed number is the mean of that record's best and worst similarity.
allsyns_array_art = get_classification_results(data_array_art, 'art_201811', 'technology')

0.18888888888888888
0.13596491228070173
0.42105263157894735
0.4673202614379085
0.18596491228070175
0.4766081871345029
0.4415204678362573
0.3219814241486068
0.4552429667519181
0.5
0.1736111111111111
0.2583333333333333
0.47794117647058826
0.4375
0.1213235294117647
0.21929824561403508
0.2583333333333333
0.3277777777777778
0.16666666666666666
0.19166666666666665
0.2583333333333333
0.5138888888888888
0.4901315789473684
0.4875
0.2583333333333333
0.4415204678362573
0.44166666666666665
0.4791666666666667
0.3026315789473684
0.45263157894736844
0.5
0.5
0.4901315789473684
0.43236714975845414
0.29084967320261434
0.4444444444444444
0.21929824561403508
0.4415204678362573
0.5138888888888888
0.2583333333333333
0.21637426900584794
0.4444444444444444
0.4879227053140096
0.2583333333333333
0.4809782608695652
0.38888888888888884
0.4875
0.2828282828282828
0.21666666666666667
0.4963235294117647
0.4901315789473684
0.4809782608695652
0.3402777777777778
0.2583333333333333
0.4963235294117647
0.3603238866396761
0

In [None]:
#data_array_business = get_data('business', 14794, 1000, 'news_00', 5)
# Random sample of 100 files out of sports/news_0000001.json .. news_0087156.json
# (the number is zero-padded to 5 digits and appended to 'news_00').
data_array_sport = get_data('sports', 87156, 100, 'news_00', 5)

In [58]:

# Adds a 'categories' entry to every sports record (one Wikipedia lookup per
# noun) and writes each record to sports/json/data_array<index>.json.
data_array_sport = add_categories(data_array_sport, 'sports')

0
1
2


In [97]:
# Reloads the annotated records from sports/json, replacing the in-memory list.
data_array_sport =  get_dictionaries('sports')

In [100]:
# Stores each record's dominant categories under 'class', prints them with
# their counts, and writes the records to sports/json_theme.
data_array_sport = get_all_theme(data_array_sport, 'sports')

Constitutional state types 1
Forms of government 1
Government 1
Government institutions 1
Political terminology 1
Chants 1
Hindu philosophical concepts 1
Indian poetics 1
Mantras 1
Meditation 1
Mysticism 1
Puja (Hinduism) 1
Sanskrit words and phrases 1
Spiritual practice 1
2000s in video gaming 1
2000s toys 1
2001 in video gaming 1
2010s in video gaming 1
2010s toys 1
Computer-related introductions in 2001 1
Eighth-generation video game consoles 1
Home video game consoles 1
Microsoft franchises 1
Microsoft video game consoles 1
Products introduced in 2001 1
Seventh-generation video game consoles 1
Sixth-generation video game consoles 1
Xbox 1
Days 1
Cars 1
German inventions 1
Wheeled vehicles 1
Software requirements 1
Doors 1
Types of gates 1
Companies 1
Legal entities 1
Atmospheric radiation 1
Climate forcing 1
IARC Group 1 carcinogens 1
Light sources 1
Solar energy 1
Sun 1
Days 1
1876 introductions 1
American inventions 1
Canadian inventions 1
Discovery and invention controversies 1


Economic ideologies 2
Accuracy disputes from September 2018 1
Anti-capitalism 1
Anti-fascism 1
Economic systems 1
Left-wing politics 1
Political ideologies 1
Political movements 1
Socialism 1
Capitalism 1
Economic liberalism 1
Production economics 1
Profit 1
Social philosophy 1
Subscription required using via 1
Aristotle 1
Concepts in metaphysics 1
Essentialism 1
Madhyamaka 1
Modal logic 1
Philosophy of life 1
Days 1
Cars 2
German inventions 2
Wheeled vehicles 2
City 2
Days 1
Comedy 2
Business process 2
Business terms 2
Cleanup tagged articles with a reason field from March 2017 2
Distribution (marketing) 2
Forecasting 2
Sales 2
Sales occupations 2
Blood 1
Hematology 1
Tissues (biology) 1
Time 1
Elections 2
Elections 2
Office and administrative support occupations 2
Companies 1
Legal entities 1
Days 1
Communication 1
News 1
Radio terminology 1
Sociology of knowledge 1
Television terminology 1
Documents 1
Technical communication 1
Aerial bombs 1
Bombs 1
Chinese inventions 1
Days 2
Chine

Graphemes 1
ISO basic Latin letters 1
Vowel letters 1
Night 2
Parts of a day 2
Spoken articles 2
Time in astronomy 2
Giants 1
Mythic humanoids 1
Mythological monsters 1
Elections 11
Parliamentary procedure 2
Voting 2
Political science terminology 2
Crimes 2
Deception 2
Financial crimes 2
Fraud 2
Property crimes 2
Tort law 2
Forms of government 2
Parliamentary procedure 5
Meetings 4
Public records 3
Leadership 3
Management 3
Management occupations 3
Organizational theory 3
Humans 2
People 2
Cleanup tagged articles with a reason field from February 2016 2
Vehicles 2
Legal terminology 2
Ownership 2
Property 2
Documents 6
Law 4
Educational stages 3
Higher education 3
Types of university or college 3
Universities and colleges 3
Youth 3
Books 3
Media formats 3
Paper products 3
Technical communication 3
Law enforcement 9
Legal professions 9
Criminal law 8
Government occupations 8
Positions of authority 6
Local government 5
Prosecution 5
Prosecutors 5
Occupations 5
Lawyers 4
Legal ethics 4
Amb

In [101]:
# Scores the themes of every sports record against the keyword 'sports';
# each printed number is the mean of that record's best and worst similarity.
allsyns_array_sport = get_classification_results(data_array_sport, 'sports', 'sports')

0.3713235294117647
0.4055727554179567
0.40294117647058825
0.4055727554179567
0.4226190476190476
0.38596491228070173
0.4305555555555556
0.4617647058823529
0.55
0.5222222222222223
0.4617647058823529
0.4196078431372549
0.5192982456140351
0.45263157894736844
0.4375
0.41544117647058826
0.4196078431372549
0.44166666666666665
0.2932692307692308
0.3713450292397661
0.41544117647058826
0.4454545454545455
0.39140271493212675
0.411764705882353
0.3055555555555556
0.27485380116959063
0.3482142857142857
0.4617647058823529
0.48120300751879697
0.44166666666666665
0.4126984126984127
0.5254901960784314
0.411764705882353
0.3713235294117647
0.4055727554179567
0.4875
0.39215686274509803
0.4196078431372549
0.3026315789473684
0.4196078431372549
0.532967032967033
0.39215686274509803
0.40294117647058825
0.3713235294117647
0.3713235294117647
0.41544117647058826
0.40977443609022557
0.45263157894736844
0.38888888888888884
0.411764705882353
0.43724696356275305
0.4305555555555556
0.24632352941176472
0.44545454545454

In [108]:
### what is the typical result for clearly correlated words?
# Reference point for reading the scores above: the same best / worst / mean
# Wu-Palmer computation, applied to the noun synsets of one fixed word pair.
# NOTE(review): 'computer' vs 'compost' does not look like a "clearly
# correlated" pair — confirm which words were intended here.
allsyns1 = set(wordnet.synsets('computer', pos=wordnet.NOUN))
allsyns2 = set(wordnet.synsets('compost', pos=wordnet.NOUN))
# Tuples are (score, synset, synset); a None similarity counts as 0.
best = max((wordnet.wup_similarity(s1, s2) or 0, s1, s2) for s1, s2 in 
    product(allsyns1, allsyns2))
worst = min((wordnet.wup_similarity(s1, s2) or 0, s1, s2) for s1, s2 in 
    product(allsyns1, allsyns2))
print(best[0], worst[0],(best[0]+worst[0])/2)

0.3076923076923077 0.25 0.27884615384615385


In [103]:
# NOTE(review): the output saved below (0.571, andiron / bark) does not match
# the best score of 0.3077 printed by the cell above, and this cell's execution
# count (103) precedes that cell's (108) — the output is stale; re-run to refresh.
best

(0.5714285714285714, Synset('andiron.n.01'), Synset('bark.n.03'))