In [1]:
import pandas as pd
import xmltodict, nltk
import json
import re, string, ast
import numpy as np

from gensim import models
from gensim.corpora import Dictionary, MmCorpus
from nltk import download, tokenize, word_tokenize, pos_tag 
from nltk.corpus import stopwords
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/matt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Read the whole Wikipedia dump into memory and parse it into nested dicts/lists.
dump_path = 'enwiki-20200101-pages-articles-multistream1.xml-p10p30302'
with open(dump_path, encoding='utf8') as file:
    doc = xmltodict.parse(file.read())

In [4]:
# Wiki markup removed before links are extracted:
# {{~}}, [[File:~]], <!-- ~ -->, <ref ~ />, <ref ~</ref>, <br~>
_MARKUP_RE = re.compile(
    r'({{(.*?)}})|(\[\[File:(.*?)\n)|(\<\!\-\-(.*?)\-\-\>)|(\<ref(.*?)\/\>)|(\<ref(.*?)\<\/ref\>)|(\<br(\s?\/?)\>)',
    re.DOTALL)
# An internal link [[target]] or [[target|label]]; group 2 is the text between the brackets.
_LINK_RE = re.compile(r'(\[\[(.*?)\]\])')
# A link (possibly spanning lines) or a newline.
_LINK_OR_NEWLINE_RE = re.compile(r'(\[\[(.*?)\]\])|(\n)', re.DOTALL)
# NOTE(review): the original pattern was a non-raw string, so its '\b' was a literal
# backspace character (\x08), not a regex word boundary. That is spelled out here so
# the behaviour stays exactly the same: in practice only the newline alternative ever
# matches. Confirm what "keeping the links first" was meant to do before changing it.
_SENTENCE_PRE_RE = re.compile(r'(?<=^\[\[\x08).*(?=\x08\|(.*?)\]\])|(\n)', re.DOTALL)


def _parse_page(page):
    """Convert one parsed <page> element into a row dict for df_text.

    Redirect pages keep only the redirect target in 'wiki_link' (redirect='T').
    Regular pages (redirect='F') get:
      * 'text'      - markup-free text with links removed, followed by the link
                      labels appended as ', label' fragments;
      * 'wiki_link' - the link targets as ', target' fragments;
      * 'sentences' - markup-free text with links and newlines replaced by spaces.
    """
    # A text node without a '#text' entry is treated as an empty page body.
    raw = page['revision']['text'].get('#text') or ''
    record = {'title': page['title'], 'text': '', 'wiki_link': '',
              'redirect': 'F', 'sentences': ''}

    if 'redirect' in page:
        record['redirect'] = 'T'
        # only keeping redirecting link; a redirect without any [[...]] keeps ''
        target = _LINK_RE.search(raw)
        if target is not None:
            record['wiki_link'] = re.sub(r'\[*\]*', '', target.group(1)).strip()
        return record

    txt = _MARKUP_RE.sub('', raw)

    # separating internal links
    links = _LINK_RE.findall(txt)
    text_w = _LINK_OR_NEWLINE_RE.sub(' ', txt)
    text_s = _LINK_OR_NEWLINE_RE.sub(' ', _SENTENCE_PRE_RE.sub(' ', txt))

    targets, labels = [], []
    for _, inner in links:
        if '|' in inner:
            # [[target|label]]: the target goes to wiki_link, the label to the text
            parts = inner.split('|')
            targets.append(parts[0])
            labels.append(parts[1])
        else:
            targets.append(inner)
            labels.append(inner)

    # Every fragment is prefixed with ', ' (preprocess_link strips the leading one later).
    record['wiki_link'] = ''.join(', ' + t for t in targets)
    record['text'] = text_w + ''.join(', ' + l for l in labels)
    record['sentences'] = text_s
    return record


# Build the frame once from a list of rows: appending row by row copies the whole
# frame on every page, and DataFrame.append no longer exists in pandas 2.x.
df_text = pd.DataFrame(
    [_parse_page(page) for page in doc['mediawiki']['page']],
    columns=['title', 'text', 'wiki_link', 'redirect', 'sentences'],
)
    

<hr>

### Section I used to create the random-sampling Google Sheet

<hr>

In [9]:
# Work on the first SAMPLE_SIZE pages only. The explicit copy makes df_text an
# independent frame, so the column assignments in the following cells never write
# into a slice of the full table.
SAMPLE_SIZE = 2000
df_text = df_text.head(SAMPLE_SIZE).copy()

In [10]:
def preprocess_sentence(doc):
    """Split one article's text into a list of sentences with NLTK's sentence tokenizer."""
    sentences = tokenize.sent_tokenize(doc)
    return sentences

In [11]:
# Replace each article's cleaned text with its list of sentences.
# Not idempotent: after this runs the column holds lists rather than strings, so
# re-running the cell would pass lists to preprocess_sentence.
df_text['sentences'] = df_text['sentences'].apply(preprocess_sentence)

In [12]:
# Fetch the NLTK resources used below (the captured output shows both were already up to date).
nltk.download('stopwords')
nltk.download('punkt')  # also downloaded in the first cell
stop_words = stopwords.words('english')  # English stopword list read by preprocess_word

[nltk_data] Downloading package stopwords to /home/matt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/matt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
def preprocess_word(doc):
    """Lower-case and tokenize `doc`, returning only the meaningful word tokens.

    A token is dropped when it is an English stopword (module-level `stop_words`),
    is not purely alphabetic (numbers, punctuation), or is one of the literal
    tokens 'n' / 'br' (presumably leftovers of escaped newlines and <br> tags -
    TODO confirm).

    Parameters
    ----------
    doc : str
        Raw article text.

    Returns
    -------
    list of str
        The kept tokens, in their original order.
    """
    # One set lookup per token replaces the linear scan of the stopword list and
    # the repeated count()/remove() passes over the token list.
    excluded = set(stop_words) | {'n', 'br'}
    return [
        token
        for token in word_tokenize(doc.lower())
        if token.isalpha() and token not in excluded
    ]

In [14]:
# Turn each article's text into its list of filtered word tokens.
# Not idempotent: preprocess_word calls .lower() on its input, so re-running this
# cell on the already-tokenised lists raises AttributeError.
df_text['text'] = df_text['text'].apply(preprocess_word)

In [15]:
def preprocess_link(doc):
    """Split the ', '-joined link string of one article into a list of link targets.

    The string built during page parsing starts with a ', ' separator, which is
    dropped before splitting. An article without any link yields an empty list
    (previously it produced a list holding one empty string).

    Note: targets that themselves contain ', ' cannot be told apart from the
    separator and are split as well.
    """
    if doc.startswith(', '):
        doc = doc[2:]
    if not doc:
        return []
    return doc.split(', ')

In [16]:
# Turn each article's ', '-joined link string into a list of link targets.
# Not idempotent: preprocess_link calls .startswith() on its input, so re-running
# this cell on the resulting lists raises AttributeError.
df_text['wiki_link'] = df_text['wiki_link'].apply(preprocess_link)

# Text Modelling for each topic

In [17]:
# Preview the first rows after preprocessing.
df_text.head(5)

Unnamed: 0,title,text,wiki_link,redirect,sentences
0,AccessibleComputing,[],[Computer accessibility],T,[]
1,Anarchism,"[rejects, deemed, unjust, advocates, replaceme...","[Anti-authoritarianism, Political philosophy, ...",F,[ '''Anarchism''' is an and that ...
2,AfghanistanHistory,[],[History of Afghanistan],T,[]
3,AfghanistanGeography,[],[Geography of Afghanistan],T,[]
4,AfghanistanPeople,[],[Demographics of Afghanistan],T,[]


In [18]:
# Inspect the token lists (pandas truncates the display of the 2000 rows).
df_text['text']

0                                                      []
1       [rejects, deemed, unjust, advocates, replaceme...
2                                                      []
3                                                      []
4                                                      []
                              ...                        
1995    [american, english, term, seven, brightest, st...
1996    [settled, nbsp, bc, münir, karaloğlu, city, ea...
1997    [capital, billion, gini, hdi, decrease, bs, cc...
1998    [united, states, international, date, line, we...
1999    [gônoprojatontri, bangladesh, flag, national, ...
Name: text, Length: 2000, dtype: object

In [19]:
def filter_words(text):
    '''Tokenize `text` and keep the words tagged as nouns, adjectives or adverbs.

    A word is kept when the first two characters of its part-of-speech tag are
    'NN', 'JJ' or 'RB'. The kept words are returned as one space-joined string,
    in their original order.
    '''
    kept_prefixes = ('NN', 'JJ', 'RB')
    tagged = pos_tag(word_tokenize(text))
    kept_words = [word for word, tag in tagged if tag[:2] in kept_prefixes]
    return ' '.join(kept_words)

In [20]:
# Picks out nouns, adverbs, adjectives and removes unwanted characters of each
# sentence for each article; the result is one space-joined token string per article.

# string.punctuation must be escaped before it is placed in a character class:
# unescaped, its '\]' is read as an escaped bracket and backslashes are never removed.
punct_re = re.compile('[' + re.escape(string.punctuation) + ']')

final_ls = []
# Iterate over the column values directly instead of looking rows up by label.
for sentences in df_text['sentences']:
    cleaned = [
        # split()/join collapses the whitespace left behind by removed punctuation
        ' '.join(punct_re.sub('', filter_words(sentence)).split())
        for sentence in sentences
    ]
    final_ls.append(' '.join(cleaned))

new_df = pd.DataFrame({'tokens': final_ls})
new_df

Unnamed: 0,tokens
0,
1,Anarchism unjust replacement societies volunta...
2,
3,
4,
...,...
1995,American English term brightest stars Plough B...
1996,subdivisiontype subdivisionname subdivisiontyp...
1997,largestcity capital languagestype languages Ot...
1998,archipelago areakm2 areasqmi lengthkm widthkm ...


<hr>

# Matt's territory of LDAvis and topic distribution example with cached LDA

In [21]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import pandas as pd
import sys
import gensim
from gensim import corpora
from gensim.models import LdaModel
import pyLDAvis.gensim
import gensim
pyLDAvis.enable_notebook()

# NOTE: this rebinds the name `stopwords` from the object imported via
# `from nltk.corpus import stopwords` to a plain set. Any later call such as
# stopwords.words(...) (e.g. re-running the stop_words cell) fails until that
# import is executed again.
stopwords = set(stopwords.words('english'))
punctuation = set(string.punctuation) 
lemmatize = WordNetLemmatizer()
import warnings
# Silences every warning for the rest of the session, not just a specific category.
warnings.filterwarnings("ignore")


In [26]:
# Token lists of every article, then only the articles that have at least one token.
text = df_text['text'].dropna()
has_tokens = text.apply(len) > 0
text_full = text[has_tokens]


In [27]:
# Build the gensim vocabulary from the non-empty articles only.
dictionary = corpora.Dictionary(text_full)
# Convert each non-empty article to its bag-of-words form using that vocabulary.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in text_full]
# Persist the corpus to 'corpus.mm'; the next cell reads it back for pyLDAvis.
corpora.MmCorpus.serialize('corpus.mm', doc_term_matrix)
# Alias for the LDA model class, instantiated in the next cell.
Lda = gensim.models.ldamodel.LdaModel


In [28]:
# Train a 15-topic LDA model on the non-empty articles. LDA training is stochastic,
# so the seed is fixed: without it the topics (and the topic columns written to the
# CSV further down) change on every run.
ldamodel = Lda(doc_term_matrix, num_topics=15, id2word=dictionary, passes=15,
               random_state=42)
# Reload the serialized corpus and export the interactive pyLDAvis view as HTML.
c = gensim.corpora.MmCorpus('corpus.mm')
ldavisdata = pyLDAvis.gensim.prepare(ldamodel, c, dictionary)
pyLDAvis.save_html(ldavisdata, 'wiki_topics_15.html')

And now for scoring every article against the trained topics

In [31]:
import pickle

In [32]:
# Cache the vocabulary on disk; the context manager closes the file even if
# pickling raises.
with open('dictionary.pkl', 'wb') as file:
    pickle.dump(dictionary, file)

In [33]:
# Cache the trained LDA model on disk; the context manager closes the file even if
# pickling raises.
with open('ldamodel.pkl', 'wb') as file:
    pickle.dump(ldamodel, file)

In [38]:
# Bag-of-words for EVERY article (empty ones included), encoded with the same
# dictionary the model was trained on. A second, independently built Dictionary is
# not guaranteed to assign the token ids the model expects.
doc_term_matrix = [dictionary.doc2bow(tokens) for tokens in text]
corpora.MmCorpus.serialize('corpus.mm', doc_term_matrix)

# Kept as aliases so that code relying on these names keeps working.
id2word = dictionary
mm = doc_term_matrix

# One row of topic weights per article. Topics the model does not report for a
# document are simply absent from its dict and therefore show up as NaN.
# The rows carry the index of `text`, so the join below stays aligned with df_text
# even if dropna() removed some rows.
topics = pd.DataFrame([dict(ldamodel[bow]) for bow in doc_term_matrix], index=text.index)
df_text = df_text.join(topics)

In [43]:
# Preview the articles with their per-topic weight columns appended.
df_text.head(5)

Unnamed: 0,title,text,wiki_link,redirect,sentences,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,AccessibleComputing,[],[Computer accessibility],T,[],0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667
1,Anarchism,"[rejects, deemed, unjust, advocates, replaceme...","[Anti-authoritarianism, Political philosophy, ...",F,[ '''Anarchism''' is an and that ...,,0.725207,,,,,,,,,0.269706,,,,
2,AfghanistanHistory,[],[History of Afghanistan],T,[],0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667
3,AfghanistanGeography,[],[Geography of Afghanistan],T,[],0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667
4,AfghanistanPeople,[],[Demographics of Afghanistan],T,[],0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667


In [44]:
# Export the scored sample. NOTE(review): the list-valued columns (text, wiki_link,
# sentences) are presumably written as their string representation - confirm how
# the consumer of this CSV parses them.
df_text.to_csv('LDA_Topics_Sample.csv')

<hr>