In [20]:
import concurrent.futures
import re
import requests
import random
from collections import defaultdict
import spacy
from spacy import displacy
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from gensim import corpora
from gensim import models
from gensim import similarities
from gensim.corpora import Dictionary


def get_wikipedia_article(wiki_article, timeout=10):
    '''
    Method that gets a Wikipedia page. 
    '''
    wiki_page_url = 'https://en.wikipedia.org/wiki/'+wiki_article
    response = requests.get(url=wiki_page_url, timeout=timeout)
    article = response.text
    return article


def get_wikipedia_articles(wiki_articles):
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        futures = []
        for article in wiki_articles:
            futures.append(executor.submit(get_wikipedia_article,
                                           wiki_article=article)
                          )
        articles = []
        for future in concurrent.futures.as_completed(futures):
            articles.append(future.result())
    return articles

wiki_articles = [
    # Movies:
    'Rocky_IV', 
    'The_Terminator', 
    'Memento_(film)', 
    'Braveheart',
    'Heat_(1995_film)',
    'Iron_Man_3',
    'Inside_Out_(2015_film)',
    'Rambo_III',
    'Space_Jam',
    # Greek Athletes
    'Vassilis_Spanoulis',
    'Eleftherios_Petrounias',
    'Konstantinos_Kenteris',
    'Stefanos_Tsitsipas',
    'Nikos_Galis',
    'Vasilis_Hatzipanagis',
    'Giannis_Antetokounmpo',
    #Global Athletes and Actors
    'Michael_Jordan', # Michael Jordan , except for famous athlete, was an actor in Space Jam , so it is interesting to see the outcome of our algorithms :-)
    #Actors
    'Al_Pacino',
    'Robert_De_Niro',
    'Mel_Gibson',
    'Robert_Downey_Jr.',
    'Arnold_Schwarzenegger'
    ]

In [2]:
#Original Corpus from the requested articles
corpus = get_wikipedia_articles(wiki_articles)

In [3]:
# Functions to clean article

def clean_html(raw_html):
    cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    cleantext = re.sub(cleanr, ' ', raw_html)
    return cleantext

def clean_punctuation_marks(text):
    return re.sub(r'[^\w\s]',' ', text)

def clean_return_and_tabs(text):
    return re.sub(r'[\n, \t]',' ', text)

def clean_article(data):
    cdata = clean_html(data)
    cdata = clean_punctuation_marks(cdata)
    cdata = clean_return_and_tabs(cdata)
    return cdata

In [4]:
ordered_articles = []
for c in corpus:
    title_pos = c.find('<title>')+7
    title = c[title_pos:title_pos+50].split(' - ')[0].lower()
    ordered_articles.append(title)

corpus = [clean_article(a) for a in corpus]

In [5]:
print('Our corpus has', len(corpus), 'documents')

Our corpus has 22 documents


In [6]:
documents = corpus

In [7]:
#Helper class to store subjects
class Subjects():

    def __init__(self,document_id,text):
        self.document_id = document_id
        self.text = text

In [8]:
#Helper class to store name entities
class NameEntities():

    def __init__(self,document_id,text):
        self.document_id = document_id
        self.text = text

In [9]:
#Helper class to add values to dictionary
class dictionaryManager(dict):
  
    def __init__(self):
        self = dict()
          
    def add(self, key, value):
        self[key] = value

In [10]:
def extract_sentences(text):
    text = re.split('[.?]', text)
    clean_sent = []
    for sent in text:
        clean_sent.append(sent)
    return clean_sent

In [11]:
#Load spacy library for English
nlp_english = spacy.load('en_core_web_sm')

#Get words stoplist for english language
stoplist = stopwords.words('english')

In [12]:
#Get for each article the subjects and store them in a collection with an id as key holder for each document and of course the subjects.
document_subjects = []
subjects=[]
counter=0

for document in documents:
    counter+=1
    words = nltk.tokenize.word_tokenize(document)
    fdist = nltk.FreqDist(words)
    doc = nlp_english(document)
    for token in doc:
        if (re.findall("_SP", token.tag_)):
            continue
        if (token.dep_ == "nsubj"):
            if str(token.text) not in subjects:
                subjects.append(str(token.text).lower().replace(" ", ""))
                document_subjects.append(Subjects(counter,token.text))
            

In [13]:
#Iterate through documents collection, get sentence for each one of them and for each sentence store subjects in a new collection
#Store name entities in a dictionary
name_entities = []
entities=[]
sentences=dictionaryManager()
sentence_clear=[]
subjects=[]
counter=0
article_id=0
for article in documents:
    article_id+=1
    doc = nlp_english(article)
    sentence = next(doc.sents) 
    sentence_spans = list(doc.sents)
    for s in sentence_spans:
        if len(s) > 1:
            counter+=100
            sentences.add(counter,clean_article(s.text))
            cleared_text = clean_article(s.text)
            sentence_clear.append(cleared_text)
            if len(s.ents) > 0:
                name_entities.append(NameEntities(article_id,s.ents))
                for e in s.ents:
                    entity = str(e).lower().replace(" ", "")
                    if entity not in entities:
                        entities.append(entity)
            

In [30]:
#Create a new collection which is the new intersection between the subjects list and name entities ones.

common_name_entities = set(subjects) & set(entities)
common_name_entities_list = list(common_name_entities)

In [14]:
#Create new corpus as bag of words in order to calculate the TF-IDF and LDA

texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [
    [token for token in text if frequency[token] > 1 and len(token) > 1]
    for text in texts
]

dictionary = corpora.Dictionary(texts)
corpus2 = [dictionary.doc2bow(text) for text in texts]

In [15]:
#Train TF-IDF

tfidf = models.TfidfModel(corpus2)

matrix = similarities.MatrixSimilarity(tfidf[corpus2])

wiki_article = 0
for article_sims in zip(ordered_articles, list(matrix)):
    print('Article',ordered_articles[wiki_article],':')
    sorted_articles = sorted(zip(ordered_articles, article_sims[1]), key= lambda x: -x[1])[1:10]
    for sa in sorted_articles:
        print('\t* ', sa[0], ':', sa[1])
    wiki_article += 1

Article vasilis hatzipanagis :
	*  nikos galis : 0.05930094
	*  vassilis spanoulis : 0.0392687
	*  rocky iv : 0.03643353
	*  michael jordan : 0.025617478
	*  giannis antetokounmpo : 0.02204236
	*  rambo iii : 0.020701978
	*  konstantinos kenteris : 0.016912537
	*  stefanos tsitsipas : 0.016590396
	*  space jam : 0.010364338
Article eleftherios petrounias :
	*  konstantinos kenteris : 0.16971952
	*  nikos galis : 0.03894057
	*  stefanos tsitsipas : 0.018807884
	*  michael jordan : 0.018679997
	*  vassilis spanoulis : 0.018407414
	*  vasilis hatzipanagis : 0.008787708
	*  iron man 3 : 0.007922967
	*  al pacino : 0.007455821
	*  the terminator : 0.007215864
Article heat (1995 film) :
	*  robert de niro : 0.12506604
	*  al pacino : 0.11436206
	*  memento (film) : 0.049268425
	*  space jam : 0.044434436
	*  iron man 3 : 0.042685
	*  braveheart : 0.039162166
	*  inside out (2015 film) : 0.038378444
	*  the terminator : 0.03593237
	*  rambo iii : 0.03493269
Article rocky iv :
	*  rambo iii : 

In [None]:
#Michael Jordan has bigger similarity to Giannis Antetokounmpo in comparison with the movie that played him self called Space Jam. That's logical, he is first an athelete and secondly an actor!!

In [16]:
#LDA
NUM_TOPICS = 20

lda_model = models.LdaModel(corpus2, id2word=dictionary, num_topics=NUM_TOPICS, passes=5)

lda_model.get_topics()[0]

for t in range(NUM_TOPICS):
    print('TOPIC', t,':')
    print(lda_model.show_topic(t))

TOPIC 0 :
[('retrieved', 0.0055432143), ('mw', 0.0054155895), ('archived', 0.0041476595), ('output', 0.0038850955), ('parser', 0.003427323), ('film', 0.003093446), ('may', 0.0030912515), ('original', 0.0028050216), ('2020', 0.0026860766), ('jordan', 0.0023277428)]
TOPIC 1 :
[('retrieved', 0.006094558), ('mw', 0.00456165), ('film', 0.004269564), ('original', 0.0037337008), ('archived', 0.0036826336), ('output', 0.0032380384), ('parser', 0.0028901114), ('may', 0.002555513), ('2020', 0.0025270437), ('2018', 0.002200967)]
TOPIC 2 :
[('mw', 0.006326621), ('retrieved', 0.0058673504), ('original', 0.004248489), ('parser', 0.004223608), ('archived', 0.00395002), ('film', 0.0035955107), ('output', 0.003380074), ('jordan', 0.002955665), ('2020', 0.0021982626), ('2010', 0.0021694552)]
TOPIC 3 :
[('retrieved', 0.005632358), ('mw', 0.0040267003), ('parser', 0.0038793124), ('output', 0.003665599), ('film', 0.003631018), ('archived', 0.0035872697), ('original', 0.0028912462), ('may', 0.002249602), ('

In [18]:
sentences_fixed = sentence_clear[70:]

In [19]:
sentences_cleaned=[]
for sent in sentences_fixed:
    sentences_cleaned.append(extract_sentences(sent))

In [23]:
random.shuffle(sentences_fixed)

In [24]:
#Visualize model, highlight dependencies

a = 0
for sentence in sentences_fixed:
    a+=1
    doc = nlp_english(sentence)
    displacy.render(doc, style='dep', jupyter=True, options={'distance': 95})
    if a==10:
        break

In [25]:
# Export SVG files of dependency parses
from pathlib import Path
import os

a = 0
current_folder = os.getcwd() + "/images/"

for sentence in sentences_fixed:
    a+=1
    doc = nlp_english(sentence)
    svg = displacy.render(doc, style="dep", jupyter=False)
    file_name = current_folder + "/" + str(a) + ".svg"

    if not os.path.exists(current_folder):
        os.makedirs(current_folder)
    output_path = Path(file_name)
    output_path.open("w", encoding="utf-8").write(svg)
    if a==10:
        break
    

In [26]:
#Test model for other sentences (human brain)

doc = 'The human brain is the central organ of the human nervous system, and with the spinal cord makes up the central nervous system. The brain consists of the cerebrum, the brainstem and the cerebellum. It controls most of the activities of the body, processing, integrating, and coordinating the information it receives from the sense organs, and making decisions as to the instructions sent to the rest of the body. The brain is contained in, and protected by, the skull bones of the head.'.split()

doc_vector = lda_model.id2word.doc2bow(doc)
doc_topics = lda_model[doc_vector]
print('Here is the predicted weight of each topic:')
doc_topics

Here is the predicted weight of each topic:


[(9, 0.94982547)]

In [29]:
#Test model for other sentences (classic guitar)

doc = 'The classical guitar (also known as the classic guitar) is a member of the guitar family used in classical music. An acoustic wooden string instrument with strings made of gut or nylon, it is a precursor of the modern acoustic and electric guitars, both of which use metal strings. Classical guitars are derived from the Spanish vihuela and gittern in the fifteenth and sixteenth century, which later evolved into the seventeenth and eighteenth-century Baroque guitar and later the modern classical guitar in the mid-nineteenth century.'.split()

doc_vector = lda_model.id2word.doc2bow(doc)
doc_topics = lda_model[doc_vector]
print('Here is the predicted weight of each topic:')
doc_topics

Here is the predicted weight of each topic:


[(9, 0.8795259), (10, 0.07533508)]