In [1]:
import csv
import json
import re, string

import pandas as pd
import xmltodict

In [2]:
# Load the January 2020 English-Wikipedia clickstream dump (tab-separated, no header row).
# quoting=csv.QUOTE_NONE: the file is raw TSV, not quoted CSV. With pandas' default
# quoting, a page title that contains '"' would have its quotes stripped or would
# swallow the following fields, silently corrupting titles used for later joins.
df_click = pd.read_csv(
    'Datasets/clickstream-enwiki-2020-01.tsv',
    delimiter='\t',
    encoding='utf-8',
    names=['referer', 'resource', 'path', 'count'],
    quoting=csv.QUOTE_NONE,
)

In [3]:
# Preview the first ten clickstream rows to sanity-check the column names assigned at load time.
df_click.head(10)

Unnamed: 0,referer,resource,path,count
0,Eddie_Albert,The_Dude_Goes_West,link,17
1,other-empty,The_Dude_Goes_West,external,112
2,Gale_Storm,The_Dude_Goes_West,link,15
3,other-external,Gianluca_Scamacca,external,18
4,other-internal,Gianluca_Scamacca,external,111
5,Ascoli_Calcio_1898_F.C.,Gianluca_Scamacca,link,87
6,other-empty,Gianluca_Scamacca,external,311
7,2019–20_Coppa_Italia,Gianluca_Scamacca,link,333
8,2018_UEFA_European_Under-19_Championship,Gianluca_Scamacca,link,23
9,other-search,Gianluca_Scamacca,external,2638


In [2]:
# Read the whole multistream index file into memory as a single string.
# NOTE(review): only the first 300 characters are inspected in the visible cells;
# confirm whether the full contents are needed before keeping all of it in memory.
with open('Datasets/enwiki-20200101-pages-articles-multistream-index1.txt-p10p30302', encoding='utf8') as file:
    data_index = file.read()

In [2]:
# Parse the article dump into nested dicts/lists with xmltodict.
# The entire file is read as one string and handed to the parser; the raw XML
# string itself is not kept (see the disabled line below).
with open('Datasets/enwiki-20200101-pages-articles-multistream1.xml-p10p30302', encoding='utf8') as file:
    # Disabled: keeping the raw XML text in `data_text` as well.
    #data_text = file.read()
    doc = xmltodict.parse(file.read())

In [4]:
# Peek at the first 300 characters of the index to see its line format.
print(data_index[:300])

615:10:AccessibleComputing
615:12:Anarchism
615:13:AfghanistanHistory
615:14:AfghanistanGeography
615:15:AfghanistanPeople
615:18:AfghanistanCommunications
615:19:AfghanistanTransportations
615:20:AfghanistanMilitary
615:21:AfghanistanTransnationalIssues
615:23:AssistiveTechnology
615:24:AmoeboidTax


In [5]:
# Disabled: `data_text` is only assigned in a commented-out line of the XML-loading
# cell, so running this on a fresh kernel would raise NameError. The output shown
# under this cell comes from an earlier run.
#print(data_text[:1000000])

<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">
  <siteinfo>
    <sitename>Wikipedia</sitename>
    <dbname>enwiki</dbname>
    <base>https://en.wikipedia.org/wiki/Main_Page</base>
    <generator>MediaWiki 1.35.0-wmf.11</generator>
    <case>first-letter</case>
    <namespaces>
      <namespace key="-2" case="first-letter">Media</namespace>
      <namespace key="-1" case="first-letter">Special</namespace>
      <namespace key="0" case="first-letter" />
      <namespace key="1" case="first-letter">Talk</namespace>
      <namespace key="2" case="first-letter">User</namespace>
      <namespace key="3" case="first-letter">User talk</namespace>
      <namespace key="4" case="first-letter">Wikipedia</namespace>
      <namespace key="5" case="first-letter">Wikipedia talk</namespace>
      <n

In [44]:
# Inspect one parsed page record (index 11) to see the structure xmltodict produced.
print(doc['mediawiki']['page'][11])



In [87]:
# Build df_text: one row per page of the parsed dump with the columns
#   title     - page title
#   text      - cleaned article text followed by ', <label>' for every internal link
#   wiki_link - ', <target>' for every internal link (redirect pages: the bare target)
#   redirect  - 'T' for redirect pages, 'F' otherwise
#
# Patterns are raw strings (non-raw '\[' is an invalid escape sequence in Python)
# and are compiled once instead of on every page.
_WIKI_LINK_RE = re.compile(r'(\[\[(.*?)\]\])')
_LINK_OR_NEWLINE_RE = re.compile(r'(\[\[(.*?)\]\])|(\n)', re.DOTALL)
# Matches {{~}}, [[File:~]], <!-- ~ -->, <ref ~ />, <ref ~</ref>, <br~>
_MARKUP_RE = re.compile(
    r'({{(.*?)}})|(\[\[File:(.*?)\n)|(\<\!\-\-(.*?)\-\-\>)|(\<ref(.*?)\/\>)|(\<ref(.*?)\<\/ref\>)|(\<br(\s?\/?)\>)',
    re.DOTALL,
)
_BRACKETS_RE = re.compile(r'[\[\]]')


def _page_wikitext(page):
    """Return the raw wikitext of one parsed page, or '' when the page has none.

    xmltodict only adds a '#text' key when the <text> element has content, so
    an empty page would otherwise raise KeyError.
    """
    text_node = page['revision']['text']
    if isinstance(text_node, dict):
        return text_node.get('#text') or ''
    return text_node or ''


def _page_to_record(page):
    """Convert one parsed page into a row dict for df_text."""
    raw = _page_wikitext(page)
    text = ''
    wiki_link = ''
    redirect = 'F'

    if 'redirect' in page:
        # Redirect pages: keep only the target of the first [[...]] link.
        redirect = 'T'
        match = _WIKI_LINK_RE.search(raw)
        if match is not None:  # a redirect without a [[link]] keeps an empty target
            wiki_link = _BRACKETS_RE.sub('', match.group(1)).strip()
    else:
        stripped = _MARKUP_RE.sub('', raw)

        # Separate internal links: '[[target|label]]' or '[[target]]'.
        targets = []
        labels = []
        for _, inner in _WIKI_LINK_RE.findall(stripped):
            parts = inner.split('|')
            targets.append(parts[0])
            # Without a pipe, the target doubles as the visible label.
            labels.append(parts[1] if len(parts) > 1 else parts[0])

        # Links and newlines in the body become spaces; labels are appended after it.
        # NOTE: entries are ', '-delimited, so a title that itself contains ', '
        # is indistinguishable from two entries once preprocess_link splits it.
        text = _LINK_OR_NEWLINE_RE.sub(' ', stripped) + ''.join(', ' + label for label in labels)
        wiki_link = ''.join(', ' + target for target in targets)

    return {'title': page['title'], 'text': text, 'wiki_link': wiki_link, 'redirect': redirect}


_pages = doc['mediawiki']['page']
if isinstance(_pages, dict):
    # xmltodict yields a single dict, not a list, when the dump holds exactly one page.
    _pages = [_pages]

# Build the frame once from a list of records: DataFrame.append inside the loop
# was quadratic and no longer exists in pandas >= 2.0.
df_text = pd.DataFrame(
    [_page_to_record(page) for page in _pages],
    columns=['title', 'text', 'wiki_link', 'redirect'],
)
    

In [47]:
import nltk
from nltk import download
from nltk import word_tokenize
from nltk.corpus import stopwords

In [50]:
# Fetch the NLTK resources used below: the stopword corpus and the 'punkt'
# tokenizer models that word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')
# English stopword list, consumed by preprocess().
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\young\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\young\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [88]:
def preprocess(doc):
    """Turn raw article text into a list of lowercase alphabetic tokens.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens are
    then dropped when they are in the module-level ``stop_words``, are not
    purely alphabetic (numbers, punctuation), or are the stray tokens ``'n'``
    and ``'br'`` (presumably remnants of newline escapes and ``<br>`` tags).
    The order of the surviving tokens is preserved.

    Args:
        doc: Article text as a ``str``.

    Returns:
        list[str]: The filtered tokens in their original order.
    """
    # One pass with set membership replaces a list scan of stop_words per token
    # plus repeated list.count()/list.remove() sweeps, which were quadratic on
    # long articles. The resulting token list is identical.
    excluded = set(stop_words)
    excluded.update(('n', 'br'))
    return [w for w in word_tokenize(doc.lower())
            if w.isalpha() and w not in excluded]

In [89]:
# Replace each article's cleaned text string with its token list.
# NOTE: the column is overwritten in place, so re-running this cell fails
# (preprocess calls .lower(), which a list does not have).
df_text['text'] = df_text['text'].apply(preprocess)

In [94]:
def preprocess_link(doc):
    """Split a ', '-delimited link string into a list of link targets.

    A leading ``', '`` separator is removed before splitting, so
    ``', A, B'`` becomes ``['A', 'B']``; a string with no separator is
    returned as a one-element list.

    Args:
        doc: Link targets joined by ``', '``, optionally starting with ``', '``.

    Returns:
        list[str]: The individual targets; ``[]`` when there are none.
    """
    if doc.startswith(', '):
        doc = doc[2:]
    if not doc:
        # ''.split(', ') would give [''], i.e. one bogus empty link for a page
        # that has no links at all.
        return []
    return doc.split(', ')

In [91]:
# Replace each ', '-joined link string with a list of link targets.
# NOTE: the column is overwritten in place, so re-running this cell fails
# (preprocess_link calls .startswith(), which a list does not have).
df_text['wiki_link'] = df_text['wiki_link'].apply(preprocess_link)

In [92]:
# Preview the processed table: token lists in `text`, link-target lists in `wiki_link`.
df_text.head(15)

Unnamed: 0,title,text,wiki_link,redirect
0,AccessibleComputing,[],[Computer accessibility],T
1,Anarchism,"[rejects, deemed, unjust, advocates, replaceme...","[Anti-authoritarianism, Political philosophy, ...",F
2,AfghanistanHistory,[],[History of Afghanistan],T
3,AfghanistanGeography,[],[Geography of Afghanistan],T
4,AfghanistanPeople,[],[Demographics of Afghanistan],T
5,AfghanistanCommunications,[],[Communications in Afghanistan],T
6,AfghanistanTransportations,[],[Transport in Afghanistan],T
7,AfghanistanMilitary,[],[Afghan Armed Forces],T
8,AfghanistanTransnationalIssues,[],[Foreign relations of Afghanistan],T
9,AssistiveTechnology,[],[Assistive_technology],T


In [93]:
# Save the processed table to CSV. The list-valued columns are written in their
# string form, and a long article can exceed Excel's per-cell limit (32,767
# characters), so the text appears to overflow when the file is opened there.
# Prefer working with df_text directly rather than round-tripping through this CSV.
df_text.to_csv (r'Results/article_text1.csv', index = False, header=True)

In [7]:
# Read the whole pageviews file into memory as a single string.
# NOTE(review): only the first 300 characters are inspected in the visible cells;
# confirm whether the full contents are needed before keeping all of it in memory.
with open('Datasets/pageviews-20200101-000000', encoding='utf8') as file:
    data_view = file.read()

In [8]:
# Peek at the first 300 characters of the pageviews file to see its line format.
print(data_view[:300])

aa Main_Page 8 0
aa Special:Log/block 1 0
aa User:Teles 1 0
aa Wikipedia 1 0
aa.b Special:Contributions/Teles 1 0
aa.b Special:ListUsers 4 0
aa.b Special:Log/block 1 0
aa.b User:Sir_Lestaty_de_Lioncourt 2 0
aa.d User:Teles 1 0
ab 1357 1 0
ab 165 1 0
ab 1880 1 0
ab 1906 1 0
ab 1987 1 0
ab 2020 1 0
ab
