## Data

In [15]:
import os
import re
import bz2
import html

dataset = []
# iterate to all the folders and get the title and file name from all the index.html files

# read the file from index files
file = open(os.getcwd()+"/wiki_00.txt",  'r', encoding="utf-8")
file = bz2.open("wiki_00.bz2", "rb")
text = html.unescape(file.read().strip().decode("utf-8"))
#text = re.sub("\n"," ",text)
#text = re.sub(" +"," ",text)
file.close()
#print(text)

# use simple regular expression to retrieve the article id, url, title and body.
article_id = re.findall('<doc id="(.*)" url', text)
article_url = re.findall('url="(.*)" title', text)
article_title = re.findall('title="(.*)">', text)
article_body =[]
article_body =[]
print(article_id)
print(article_url[:3])
print(article_title)
text = re.sub("\n"," ",text)
text = re.sub(" +"," ",text)
text = re.sub("\'","'",text) # issue created by previous substitutions
for title in article_title:
    #regex = title+'">\n'+title+'\n\n([\s\S]*)\n\n\n\n\n\n\n\n\n</doc>'
    regex = title+'"> '+title+'(.*) </doc>'
    body = re.findall(regex, text)
    #print(body)
    article_body.append(body)
articles =[]
for article in article_body:
    body = article[0].split('</doc>', 1)[0]
    #print(body + ' ENDEND')
    articles.append(body)
    
print(len(article_id),len(article_url),len(article_title),len(article_body))

print(articles[len(articles)-1])

for j in range(len(article_id)):
    dataset.append((article_id,article_url,article_title,article_body))     
        
N = len(dataset)

['12', '25', '39', '290', '303', '305']
['https://en.wikipedia.org/wiki?curid=12', 'https://en.wikipedia.org/wiki?curid=25', 'https://en.wikipedia.org/wiki?curid=39']
['Anarchism', 'Autism', 'Albedo', 'A', 'Alabama', 'Achilles']
6 6 6 6
 In Greek mythology, Achilles or Achilleus ( ; , "Achilleus" ) was a Greek hero of the Trojan War and the central character and the greatest warrior of Homer's "Iliad". His mother was the immortal Nereid Thetis, and his father, the mortal Peleus, was the king of the Myrmidons. Achilles' most notable feat during the Trojan War was the slaying of the Trojan hero Hector outside the gates of Troy. Although the death of Achilles is not presented in the "Iliad", other sources concur that he was killed near the end of the Trojan War by Paris, who shot him in the heel with an arrow. Later legends (beginning with a poem by Statius in the 1st century AD) state that Achilles was invulnerable in all of his body except for his heel because, when his mother Thetis di

## Clean

In [25]:
import numpy as np

def convert_lower_case(data):
    return np.char.lower(data)

from nltk.corpus import stopwords
from nltk import word_tokenize
#nltk.download('stopwords')

def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    # iterate over all the stop words and not append to the list if it’s a stop word
    new_text = ""
    for word in words:
        if word not in stop_words and len(word) > 1: # remove stop words and single characters
            new_text = new_text + " " + word
    return new_text

def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n'"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ') # remove every occurence of this symbol
        data = np.char.replace(data, "  ", " ") # remove extra spaces
    data = np.char.replace(data, ',', '') #remove comma seperately at last?
    return data

def remove_apostrophe(data):
    return np.char.replace(data, "'", "") #seperate?

from nltk.stem.porter import *
from nltk.stem import PorterStemmer

def stemming(data): # reduce words to its stem
    stemmer= PorterStemmer() # rule-based stemmer, identifies and removes the suffix or affix of a word
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text
# A better efficient way to proceed is to first lemmatise and then stem

import num2words

def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w),lang='en')
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)
    data = stemming(data) #needed again as we need to stem the words
    data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

In [26]:
processed_text = []
processed_title = []
for i in dataset:
    processed_text.append(word_tokenize(str(preprocess(i[3]))))
    processed_title.append(word_tokenize(str(preprocess(i[2]))))

## Model

In [36]:
DF = {}
# iterate through all the words in all the documents and store the document id’s for each word.
for i in range(N):
    tokens = processed_text[i] # body of the document
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}

    tokens = processed_title[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}
            
for i in DF:
    DF[i] = len(DF[i]) # unique words, we don’t actually need the list of docs, we just need the count

total_vocab_size = len(DF)
total_vocab = [x for x in DF]

#getter
def doc_freq(word):
    try:
        print(word)
        #print(DF[word])
        return DF[word]
    except:
        return 0

In [40]:
# let’s use dictionary with (document, token) pair as key and any TF-IDF score as the value
# tf_idf dictionary is for body, we will use the same logic for to build a dictionary tf_idf_title for the words in title.
from collections import Counter

# Calculate TF-IDF for Body for all docs
doc = 0
tf_idf = {}
#iterate over all documents
for i in range(N):  
    tokens = processed_text[i]
    counter = Counter(tokens + processed_title[i])  
    words_count = len(tokens + processed_title[i])
    # Counter can give us the frequency of the tokens, calculate tf and idf and finally store as a (doc, token) pair in tf_idf.
    for token in np.unique(tokens):      
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1.)/(df+1.)) #numerator is added 1 to avoid negative values
        tf_idf[doc, token] = tf*idf
    doc += 1
# Calculate TF-IDF for title for all docs   
doc = 0
tf_idf_title = {}
for i in range(N):
    tokens = processed_title[i]
    counter = Counter(tokens + processed_text[i])
    words_count = len(tokens + processed_text[i])
    for token in np.unique(tokens):
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1)/(df+1)) #numerator is added 1 to avoid negative values
        tf_idf_title[doc, token] = tf*idf
    doc += 1

02
0.0
04
0.0
05
0.0
09
0.0
10
0.0
100
0.0
1000
0.0
10000
0.0
100000
0.0
101821
0.0
104991
0.0
105
0.0
106280
0.0
108
0.0
108000
0.0
108135
0.0
10th
0.0
11
0.0
113
0.0
116250
0.0
12
0.0
120000
0.0
120155
0.0
121054
0.0
121110
0.0
121800
0.0
12410
0.0
125
0.0
12770
0.0
12th
0.0
13
0.0
130
0.0
136
0.0
1380121
0.0
13th
0.0
14
0.0
1400
0.0
143000
0.0
147
0.0
1496
0.0
15
0.0
150647
0.0
1540
0.0
154040
0.0
157th
0.0
15th
0.0
160
0.0
1600
0.0
1617
0.0
162000
0.0
1642
0.0
165
0.0
167160
0.0
16th
0.0
17
0.0
170
0.0
1702
0.0
1711
0.0
1747
0.0
1760
0.0
1763
0.0
1767
0.0
1770
0.0
1783
0.0
1793
0.0
1798
0.0
17th
0.0
18
0.0
180
0.0
1800
0.0
18000
0.0
1804
0.0
1810
0.0
181000
0.0
1812
0.0
1813
0.0
1817
0.0
1817–1862
0.0
1819
0.0
1820
0.0
1823
0.0
1825
0.0
1826
0.0
1830
0.0
1840
0.0
1842
0.0
1844
0.0
1846
0.0
1847
0.0
1849
0.0
185
0.0
1850
0.0
1851
0.0
1860
0.0
1861
0.0
1864
0.0
1865
0.0
1867
0.0
1868
0.0
1871
0.0
1872
0.0
1874
0.0
1875
0.0
1889
0.0
1890
0.0
1891
0.0
1896
0.0
18th
0.0
18–30
0.0
1900
0

0.0
blood
0.0
blu
0.0
blue
0.0
board
0.0
boast
0.0
bob
0.0
bodi
0.0
bodkin
0.0
bohemian
0.0
boll
0.0
bolshevik
0.0
bond
0.0
bone
0.0
book
0.0
border
0.0
borgu
0.0
born
0.0
borrow
0.0
borysthen
0.0
boss
0.0
botch
0.0
boundari
0.0
bow
0.0
bowl
0.0
box
0.0
boy
0.0
boycott
0.0
boyd
0.0
bradley
0.0
brain
0.0
branch
0.0
brand
0.0
brasfield
0.0
brassier
0.0
bravest
0.0
brazil
0.0
brdf
0.0
breakdown
0.0
breath
0.0
bridg
0.0
bridgeport
0.0
bright
0.0
brightest
0.0
bring
0.0
brisei
0.0
briseu
0.0
britain
0.0
british
0.0
broad
0.0
broadband
0.0
broader
0.0
bromin
0.0
bronz
0.0
brook
0.0
brookley
0.0
brother
0.0
brought
0.0
brown
0.0
bryant–denni
0.0
brégier
0.0
bu
0.0
bubbl
0.0
buddharaksa
0.0
buddhism
0.0
buddhist
0.0
budget
0.0
bug
0.0
build
0.0
built
0.0
bulgar
0.0
burden
0.0
bureau
0.0
bureaucrat
0.0
buri
0.0
burial
0.0
burlap
0.0
burn
0.0
burnt
0.0
bush
0.0
busi
0.0
busiest
0.0
butcher
0.0
button
0.0
bypass
0.0
byrn
0.0
byzantin
0.0
byzantinu
0.0
cabinet
0.0
cadmu
0.0
cadr
0.0
cahaba
0.0
cah

du
0.0
dual
0.0
ducat
0.0
due
0.0
duel
0.0
dump
0.0
duplic
0.0
durat
0.0
dust
0.0
dvd
0.0
dwell
0.0
dynam
0.0
dysfunct
0.0
dysmorph
0.0
dysphoria
0.0
dysregul
0.0
dĩo
0.0
earli
0.0
earlier
0.0
earliest
0.0
earn
0.0
earth
0.0
earthworm
0.0
easi
0.0
east
0.0
eastern
0.0
eat
0.0
echo
0.0
echolalia
0.0
eclect
0.0
eco
0.0
ecolog
0.0
econom
0.0
economi
0.0
edg
0.0
educ
0.0
edward
0.0
ef5
0.0
effect
0.0
efficaci
0.0
effort
0.0
egalitarian
0.0
egg
0.0
ego
0.0
egoism
0.0
egoist
0.0
egyptian
0.0
eibi
0.0
eight
0.0
eighth
0.0
either
0.0
eiu
0.0
elabor
0.0
elantra
0.0
elbow
0.0
elder
0.0
elea
0.0
eleat
0.0
elect
0.0
elector
0.0
electr
0.0
electra
0.0
electromagnet
0.0
electron
0.0
element
0.0
elementari
0.0
elev
0.0
eleven
0.0
eli
0.0
elig
0.0
elimin
0.0
elisabeth
0.0
elit
0.0
ellul
0.0
elmor
0.0
elva
0.0
elysian
0.0
elysium
0.0
em
0.0
emancip
0.0
embark
0.0
embodi
0.0
embrac
0.0
emerald
0.0
emerg
0.0
emigr
0.0
emil
0.0
emma
0.0
emot
0.0
emperor
0.0
empha
0.0
emphasi
0.0
empir
0.0
employ
0.0
empow

incid
0.0
includ
0.0
incom
0.0
incompat
0.0
incorpor
0.0
incorrect
0.0
incr
0.0
increa
0.0
increasingli
0.0
incumb
0.0
indefinit
0.0
independ
0.0
index
0.0
indian
0.0
indic
0.0
indigen
0.0
indirect
0.0
individu
0.0
individualist
0.0
indo
0.0
induc
0.0
industri
0.0
indycar
0.0
inequ
0.0
inevit
0.0
infanc
0.0
infant
0.0
infantil
0.0
infect
0.0
infecti
0.0
infer
0.0
infest
0.0
inflamm
0.0
inflammatori
0.0
inflat
0.0
inflict
0.0
influenc
0.0
influenti
0.0
influx
0.0
inform
0.0
infrastructur
0.0
ing
0.0
inhabit
0.0
inherit
0.0
inhibit
0.0
initi
0.0
injuri
0.0
inland
0.0
innuendo
0.0
inoxum
0.0
inscrib
0.0
inscript
0.0
insist
0.0
insol
0.0
inspir
0.0
instal
0.0
instanc
0.0
instantli
0.0
instead
0.0
institut
0.0
instruct
0.0
instrument
0.0
insula
0.0
insular
0.0
insur
0.0
insurg
0.0
insurrect
0.0
insurrectionari
0.0
intak
0.0
integr
0.0
intellectu
0.0
inten
0.0
intend
0.0
intent
0.0
interact
0.0
intercept
0.0
interchang
0.0
interest
0.0
interf
0.0
intergovern
0.0
intergraph
0.0
intermedi
0.0


philosoph
0.0
philosophi
0.0
phoenician
0.0
phoenix
0.0
phonem
0.0
phonet
0.0
phosphoru
0.0
photiu
0.0
photometri
0.0
photometria
0.0
photosynthesi
0.0
photovolta
0.0
phoutthavihan
0.0
phrygiu
0.0
phthalat
0.0
phthia
0.0
physi
0.0
physic
0.0
physician
0.0
piano
0.0
pictogram
0.0
pictur
0.0
piec
0.0
piedmont
0.0
pierr
0.0
pin
0.0
pindar
0.0
pine
0.0
pioneer
0.0
pipe
0.0
pistol
0.0
pitch
0.0
place
0.0
placebo
0.0
placement
0.0
plagu
0.0
plain
0.0
plaisir
0.0
plan
0.0
planet
0.0
planetari
0.0
plant
0.0
plantat
0.0
planter
0.0
plastic
0.0
plateau
0.0
platform
0.0
platformist
0.0
plato
0.0
play
0.0
plead
0.0
plenti
0.0
plini
0.0
plu
0.0
plum
0.0
plunder
0.0
plural
0.0
pocket
0.0
podárkē
0.0
poem
0.0
poet
0.0
point
0.0
poison
0.0
polar
0.0
pole
0.0
polic
0.0
polici
0.0
polit
0.0
politician
0.0
polixena
0.0
poll
0.0
pollut
0.0
polyamori
0.0
polyxena
0.0
pomponiu
0.0
pompou
0.0
pontu
0.0
pontárchē
0.0
poor
0.0
poorer
0.0
poorli
0.0
pope
0.0
popul
0.0
popular
0.0
populari
0.0
popularli
0.0
port

0.0
spoke
0.0
sponsor
0.0
spontan
0.0
spook
0.0
sporad
0.0
sport
0.0
spracklen
0.0
spread
0.0
spring
0.0
spurt
0.0
squar
0.0
squat
0.0
sr
0.0
ssab
0.0
ssri
0.0
st
0.0
stabil
0.0
stabl
0.0
stadium
0.0
stage
0.0
stainless
0.0
stake
0.0
stalin
0.0
stanc
0.0
stand
0.0
standard
0.0
star
0.0
start
0.0
state
0.0
statehood
0.0
stateless
0.0
statement
0.0
statewid
0.0
state—along
0.0
station
0.0
statism
0.0
statist
0.0
statiu
0.0
statu
0.0
statuari
0.0
stay
0.0
steadi
0.0
steal
0.0
steel
0.0
steelmak
0.0
steep
0.0
stela
0.0
stele
0.0
stem
0.0
step
0.0
stephanu
0.0
stephen
0.0
stereotyp
0.0
steve
0.0
stigma
0.0
still
0.0
stimul
0.0
stimuli
0.0
stirner
0.0
stirnerist
0.0
stoic
0.0
stone
0.0
stop
0.0
stop—th
0.0
storey
0.0
stori
0.0
storm
0.0
strabo
0.0
strain
0.0
strand
0.0
strategi
0.0
stream
0.0
strengthen
0.0
stress
0.0
stretch
0.0
strike
0.0
strikingli
0.0
stroke
0.0
strong
0.0
strongest
0.0
stronghold
0.0
strongli
0.0
strove
0.0
struck
0.0
structur
0.0
struggl
0.0
student
0.0
studi
0.0
style

0.0
05
0.0
09
0.0
10
0.0
100
0.0
1000
0.0
10000
0.0
100000
0.0
101821
0.0
104991
0.0
105
0.0
106280
0.0
108
0.0
108000
0.0
108135
0.0
10th
0.0
11
0.0
113
0.0
116250
0.0
12
0.0
120000
0.0
120155
0.0
121054
0.0
121110
0.0
121800
0.0
12410
0.0
125
0.0
12770
0.0
12th
0.0
13
0.0
130
0.0
136
0.0
1380121
0.0
13th
0.0
14
0.0
1400
0.0
143000
0.0
147
0.0
1496
0.0
15
0.0
150647
0.0
1540
0.0
154040
0.0
157th
0.0
15th
0.0
160
0.0
1600
0.0
1617
0.0
162000
0.0
1642
0.0
165
0.0
167160
0.0
16th
0.0
17
0.0
170
0.0
1702
0.0
1711
0.0
1747
0.0
1760
0.0
1763
0.0
1767
0.0
1770
0.0
1783
0.0
1793
0.0
1798
0.0
17th
0.0
18
0.0
180
0.0
1800
0.0
18000
0.0
1804
0.0
1810
0.0
181000
0.0
1812
0.0
1813
0.0
1817
0.0
1817–1862
0.0
1819
0.0
1820
0.0
1823
0.0
1825
0.0
1826
0.0
1830
0.0
1840
0.0
1842
0.0
1844
0.0
1846
0.0
1847
0.0
1849
0.0
185
0.0
1850
0.0
1851
0.0
1860
0.0
1861
0.0
1864
0.0
1865
0.0
1867
0.0
1868
0.0
1871
0.0
1872
0.0
1874
0.0
1875
0.0
1889
0.0
1890
0.0
1891
0.0
1896
0.0
18th
0.0
18–30
0.0
1900
0.0
1901
0.

boundari
0.0
bow
0.0
bowl
0.0
box
0.0
boy
0.0
boycott
0.0
boyd
0.0
bradley
0.0
brain
0.0
branch
0.0
brand
0.0
brasfield
0.0
brassier
0.0
bravest
0.0
brazil
0.0
brdf
0.0
breakdown
0.0
breath
0.0
bridg
0.0
bridgeport
0.0
bright
0.0
brightest
0.0
bring
0.0
brisei
0.0
briseu
0.0
britain
0.0
british
0.0
broad
0.0
broadband
0.0
broader
0.0
bromin
0.0
bronz
0.0
brook
0.0
brookley
0.0
brother
0.0
brought
0.0
brown
0.0
bryant–denni
0.0
brégier
0.0
bu
0.0
bubbl
0.0
buddharaksa
0.0
buddhism
0.0
buddhist
0.0
budget
0.0
bug
0.0
build
0.0
built
0.0
bulgar
0.0
burden
0.0
bureau
0.0
bureaucrat
0.0
buri
0.0
burial
0.0
burlap
0.0
burn
0.0
burnt
0.0
bush
0.0
busi
0.0
busiest
0.0
butcher
0.0
button
0.0
bypass
0.0
byrn
0.0
byzantin
0.0
byzantinu
0.0
cabinet
0.0
cadmu
0.0
cadr
0.0
cahaba
0.0
cahokia
0.0
calcha
0.0
calcium
0.0
calcul
0.0
call
0.0
calvert
0.0
camaraderi
0.0
cambodia
0.0
came
0.0
camellia
0.0
camp
0.0
campaign
0.0
campu
0.0
canada
0.0
candid
0.0
canebrak
0.0
canon
0.0
canopi
0.0
canyon
0.0
cap

entiti
0.0
entitl
0.0
entrust
0.0
envi
0.0
environ
0.0
eo
0.0
epc
0.0
epic
0.0
epicentr
0.0
epigenet
0.0
epirot
0.0
epiru
0.0
episod
0.0
epitheta
0.0
epitom
0.0
equal
0.0
equat
0.0
equiv
0.0
era
0.0
erect
0.0
eri
0.0
ernest
0.0
erod
0.0
erot
0.0
errico
0.0
error
0.0
escap
0.0
especi
0.0
espou
0.0
essay
0.0
essenc
0.0
essenti
0.0
establish
0.0
estat
0.0
estim
0.0
estuari
0.0
et
0.0
etc
0.0
ethic
0.0
ethiopia
0.0
ethnic
0.0
etruscan
0.0
etymolog
0.0
eudoru
0.0
eugen
0.0
euripid
0.0
europ
0.0
european
0.0
euxin
0.0
euxinu
0.0
evalu
0.0
evangel
0.0
evapotranspir
0.0
eve
0.0
even
0.0
event
0.0
eventu
0.0
evergreen
0.0
everi
0.0
everlast
0.0
everybodi
0.0
everyday
0.0
everyon
0.0
evid
0.0
evil
0.0
evok
0.0
evolut
0.0
evolutionari
0.0
evolv
0.0
exacerb
0.0
exact
0.0
examin
0.0
exampl
0.0
excav
0.0
exceed
0.0
exceedingli
0.0
except
0.0
exchang
0.0
exci
0.0
exclu
0.0
exclud
0.0
exclusive—although
0.0
execut
0.0
exemplifi
0.0
exempt
0.0
exerci
0.0
exet
0.0
exhibit
0.0
exhum
0.0
exil
0.0
exist
0.

0.0
necessari
0.0
necessarili
0.0
need
0.0
needl
0.0
neg
0.0
negat
0.0
negoti
0.0
neighbor
0.0
neighbour
0.0
neill
0.0
neopagan
0.0
neoptolemu
0.0
nereid
0.0
nereu
0.0
nerv
0.0
nervou
0.0
nestor
0.0
net
0.0
netherland
0.0
network
0.0
neural
0.0
neurodevelop
0.0
neurogenet
0.0
neuroimag
0.0
neuroinflamm
0.0
neurolog
0.0
neuropsycholog
0.0
neuropsychologist
0.0
neurotyp
0.0
neutral
0.0
never
0.0
new
0.0
newer
0.0
newli
0.0
news
0.0
newspap
0.0
next
0.0
nh
0.0
nicknam
0.0
nicola
0.0
nihilist
0.0
nine
0.0
nineteen
0.0
nineteenth
0.0
ninth
0.0
niob
0.0
nippon
0.0
noi
0.0
nomo
0.0
non
0.0
none
0.0
nonverb
0.0
nonviol
0.0
norm
0.0
normal
0.0
north
0.0
northeast
0.0
northeastern
0.0
northern
0.0
northernmost
0.0
northwest
0.0
north–south
0.0
notabl
0.0
notat
0.0
note
0.0
notetak
0.0
notic
0.0
notion
0.0
notorieti
0.0
novat
0.0
novel
0.0
novemb
0.0
nowaday
0.0
npp
0.0
nuclear
0.0
nuclei
0.0
nucleu
0.0
nucor
0.0
nudist
0.0
number
0.0
numer
0.0
nurseri
0.0
nutrit
0.0
nâso
0.0
nêso
0.0
nêsoi
0.0
o

restrain
0.0
restraint
0.0
restrict
0.0
result
0.0
retail
0.0
retain
0.0
retard
0.0
retel
0.0
retold
0.0
retribut
0.0
retrospect
0.0
rett
0.0
return
0.0
reu
0.0
reveal
0.0
reveng
0.0
revenu
0.0
rever
0.0
revi
0.0
review
0.0
reviv
0.0
revolt
0.0
revolut
0.0
revolutionari
0.0
reward
0.0
rewrit
0.0
reynold
0.0
rhyme
0.0
rhythmic
0.0
rich
0.0
richard
0.0
ride
0.0
ridg
0.0
ridgelin
0.0
right
0.0
rightli
0.0
riley
0.0
ring
0.0
riot
0.0
rise
0.0
risk
0.0
risperidon
0.0
ritual
0.0
ritualist
0.0
rival
0.0
river
0.0
rivièr
0.0
road
0.0
robert
0.0
robi
0.0
rock
0.0
rocker
0.0
rocket
0.0
rocki
0.0
rocking—to
0.0
rodrigo
0.0
roger
0.0
rojava
0.0
role
0.0
rolex
0.0
roman
0.0
romanc
0.0
roof
0.0
rooftop
0.0
room
0.0
root
0.0
rosenwald
0.0
roughli
0.0
rout
0.0
routin
0.0
rubber
0.0
rubella
0.0
rudolf
0.0
ruin
0.0
rule
0.0
ruler
0.0
run
0.0
rural
0.0
rush
0.0
russel
0.0
russia
0.0
russian
0.0
rust
0.0
ruthlessli
0.0
ryner
0.0
saanich
0.0
sabotag
0.0
sac
0.0
sack
0.0
sacrif
0.0
sacrileg
0.0
safe
0.0
saf

windblown
0.0
windom
0.0
wing
0.0
winter
0.0
wish
0.0
wit
0.0
withdraw
0.0
withdrawn
0.0
withdrew
0.0
within
0.0
without
0.0
woman
0.0
women
0.0
wonder
0.0
wood
0.0
woodpeck
0.0
wor
0.0
word
0.0
wore
0.0
work
0.0
workabl
0.0
worker
0.0
workforc
0.0
workingmen
0.0
workplac
0.0
world
0.0
worldview
0.0
worldwid
0.0
worsen
0.0
worship
0.0
worst
0.0
would
0.0
wound
0.0
wrath
0.0
write
0.0
writer
0.0
written
0.0
wrongli
0.0
wrote
0.0
wto
0.0
xa011
0.0
xa0120
0.0
xa065
0.0
xa070
0.0
xa0700
0.0
xa0bc–ad
0.0
xa0billion
0.0
xa0day
0.0
xa0feet
0.0
xa0km
0.0
xa0m
0.0
xa0mil
0.0
xa0million
0.0
xa0perc
0.0
xa0roman
0.0
xa0year
0.0
xa0°c
0.0
xa0°f
0.0
xa0–
0.0
xa0—
0.0
yazoo
0.0
year
0.0
yearli
0.0
yellow
0.0
yellowhamm
0.0
yet
0.0
yield
0.0
yokohama
0.0
york
0.0
young
0.0
younger
0.0
youngest
0.0
youth
0.0
zapatista
0.0
zenith
0.0
zeno
0.0
zero
0.0
zerzan
0.0
zeu
0.0
zhuangzi
0.0
zisli
0.0
zmiinyi
0.0
zone
0.0
zoroastrian
0.0
érotiqu
0.0
ākhpdeó
0.0
āk̂pedió
0.0
ōkú
0.0
αὐτός
0.0
δρόμος
0.0
δῖος
0.0

0.0
eat
0.0
echo
0.0
echolalia
0.0
eclect
0.0
eco
0.0
ecolog
0.0
econom
0.0
economi
0.0
edg
0.0
educ
0.0
edward
0.0
ef5
0.0
effect
0.0
efficaci
0.0
effort
0.0
egalitarian
0.0
egg
0.0
ego
0.0
egoism
0.0
egoist
0.0
egyptian
0.0
eibi
0.0
eight
0.0
eighth
0.0
either
0.0
eiu
0.0
elabor
0.0
elantra
0.0
elbow
0.0
elder
0.0
elea
0.0
eleat
0.0
elect
0.0
elector
0.0
electr
0.0
electra
0.0
electromagnet
0.0
electron
0.0
element
0.0
elementari
0.0
elev
0.0
eleven
0.0
eli
0.0
elig
0.0
elimin
0.0
elisabeth
0.0
elit
0.0
ellul
0.0
elmor
0.0
elva
0.0
elysian
0.0
elysium
0.0
em
0.0
emancip
0.0
embark
0.0
embodi
0.0
embrac
0.0
emerald
0.0
emerg
0.0
emigr
0.0
emil
0.0
emma
0.0
emot
0.0
emperor
0.0
empha
0.0
emphasi
0.0
empir
0.0
employ
0.0
empow
0.0
empress
0.0
emul
0.0
en
0.0
enabl
0.0
enact
0.0
enceladu
0.0
encompass
0.0
encount
0.0
encourag
0.0
end
0.0
endang
0.0
enemi
0.0
energi
0.0
enforc
0.0
enfranchi
0.0
engag
0.0
engel
0.0
engin
0.0
england
0.0
english
0.0
enhanc
0.0
enjoy
0.0
enlighten
0.0
enorm


onward
0.0
open
0.0
oper
0.0
opera
0.0
oppo
0.0
oppon
0.0
opportun
0.0
opposit
0.0
oppress
0.0
opt
0.0
optic
0.0
optim
0.0
option
0.0
optometri
0.0
oracl
0.0
orang
0.0
orb
0.0
order
0.0
ordin
0.0
orest
0.0
org
0.0
organ
0.0
organi
0.0
organisationalist
0.0
orient
0.0
origin
0.0
orion
0.0
orlean
0.0
orthodox
0.0
orthographi
0.0
oscar
0.0
osteopath
0.0
otherwi
0.0
ought
0.0
outbreak
0.0
outcom
0.0
outdat
0.0
outer
0.0
outlaw
0.0
outlin
0.0
outokumpu
0.0
output
0.0
outsid
0.0
outweigh
0.0
overcom
0.0
overdiagnosi
0.0
overestim
0.0
overlap
0.0
overload
0.0
overlook
0.0
overrid
0.0
overs
0.0
overseen
0.0
oversight
0.0
overt
0.0
overtli
0.0
overturn
0.0
ovid
0.0
owe
0.0
owner
0.0
ownership
0.0
ox
0.0
oxú
0.0
pace
0.0
pacif
0.0
pacifist
0.0
packag
0.0
paddl
0.0
paeonian
0.0
paid
0.0
paiderasteia
0.0
pain
0.0
paint
0.0
pair
0.0
palac
0.0
palat
0.0
pale
0.0
paleo
0.0
palmer
0.0
pamphlet
0.0
pamphylian
0.0
panel
0.0
panhellen
0.0
panic
0.0
paper
0.0
par
0.0
paradox
0.0
parallel
0.0
paramet
0.0
p

0.0
sclerosi
0.0
scope
0.0
scot
0.0
scrape
0.0
screen
0.0
screener
0.0
script
0.0
scythia
0.0
scythian
0.0
se
0.0
sea
0.0
seaport
0.0
season
0.0
seat
0.0
seattl
0.0
secc
0.0
secess
0.0
second
0.0
secondari
0.0
secret
0.0
secretari
0.0
sect
0.0
sectarian
0.0
section
0.0
secular
0.0
secur
0.0
see
0.0
seek
0.0
seem
0.0
seen
0.0
segment
0.0
segreg
0.0
seiz
0.0
select
0.0
self
0.0
sell
0.0
selma
0.0
semi
0.0
semicur
0.0
semistructur
0.0
semit
0.0
sen
0.0
senat
0.0
send
0.0
senior
0.0
sensat
0.0
sensit
0.0
sensor
0.0
sensori
0.0
sent
0.0
sentenc
0.0
sentiment
0.0
separ
0.0
sephard
0.0
septemb
0.0
sequenc
0.0
sequestr
0.0
seri
0.0
serif
0.0
seriou
0.0
serv
0.0
servic
0.0
servisfirst
0.0
session
0.0
set
0.0
settl
0.0
settlement
0.0
settler
0.0
seven
0.0
seventeen
0.0
seventh
0.0
sever
0.0
sewel
0.0
sex
0.0
sexual
0.0
sha
0.0
shade
0.0
shadow
0.0
shakespear
0.0
shall
0.0
shape
0.0
share
0.0
sharp
0.0
sharpen
0.0
sharpli
0.0
shatter
0.0
sheep
0.0
sheet
0.0
shelbi
0.0
sheriff
0.0
shield
0.0
shift

0.0
vase
0.0
vatican
0.0
vega
0.0
vegan
0.0
veget
0.0
vehicl
0.0
vener
0.0
venu
0.0
verac
0.0
verbal
0.0
verbo
0.0
verg
0.0
versa
0.0
version
0.0
versu
0.0
vertic
0.0
veterinari
0.0
veto
0.0
via
0.0
viabl
0.0
vice
0.0
victori
0.0
vienna
0.0
vietnam
0.0
view
0.0
viewer
0.0
vigor
0.0
vigour
0.0
violenc
0.0
violent
0.0
virgil
0.0
viri
0.0
virtu
0.0
visibl
0.0
visigoth
0.0
vision
0.0
visit
0.0
visual
0.0
vitamin
0.0
vocabulari
0.0
vocal
0.0
vocat
0.0
voic
0.0
void
0.0
volin
0.0
voltairin
0.0
voluntar
0.0
voluntari
0.0
vote
0.0
voter
0.0
votiv
0.0
vow
0.0
vowel
0.0
voyag
0.0
vs
0.0
vulner
0.0
wachovia
0.0
wage
0.0
wait
0.0
walden
0.0
walk
0.0
wall
0.0
wallac
0.0
wander
0.0
want
0.0
war
0.0
warm
0.0
warmer
0.0
warmth
0.0
warn
0.0
warrant
0.0
warren
0.0
warrior
0.0
washington
0.0
wast
0.0
wat
0.0
water
0.0
waterway
0.0
wave
0.0
wavelength
0.0
wavi
0.0
way
0.0
weak
0.0
weaken
0.0
wealth
0.0
weapon
0.0
wear
0.0
weather
0.0
wed
0.0
weed
0.0
week
0.0
weevil
0.0
wef
0.0
weigh
0.0
weight
0.0
weitl


dionysiu
0.0
dioxid
0.0
dip
0.0
direct
0.0
directli
0.0
dirti
0.0
disabl
0.0
disagr
0.0
disappear
0.0
disast
0.0
disc
0.0
disciplin
0.0
disco
0.0
discount
0.0
discov
0.0
discrimin
0.0
disea
0.0
disenfranchi
0.0
disfranchi
0.0
disgui
0.0
dishonour
0.0
disintegr
0.0
disk
0.0
dismemb
0.0
dismiss
0.0
disobedi
0.0
disord
0.0
dispar
0.0
disparag
0.0
display
0.0
dispo
0.0
disproven
0.0
disput
0.0
disrupt
0.0
dissemin
0.0
distanc
0.0
distinct
0.0
distinctli
0.0
distinguish
0.0
distract
0.0
distraught
0.0
distress
0.0
distribut
0.0
district
0.0
disturb
0.0
diver
0.0
diverg
0.0
diversifi
0.0
divi
0.0
divid
0.0
divin
0.0
dixi
0.0
djarilgatch
0.0
dna
0.0
dnieper
0.0
doc
0.0
doctor
0.0
doctrin
0.0
document
0.0
dog
0.0
dollar
0.0
domest
0.0
domin
0.0
donald
0.0
donat
0.0
done
0.0
doom
0.0
dorothi
0.0
dothan
0.0
doubl
0.0
doubt
0.0
doug
0.0
downtown
0.0
dozen
0.0
draft
0.0
drag
0.0
dragonfli
0.0
drama
0.0
dramat
0.0
draw
0.0
dream
0.0
drew
0.0
dri
0.0
drive
0.0
driven
0.0
dromo
0.0
drool
0.0
drop
0.0

0.0
jame
0.0
januari
0.0
japan
0.0
jean
0.0
jefferson
0.0
jemison
0.0
jeremiah
0.0
jew
0.0
jewelleri
0.0
jewish
0.0
jim
0.0
job
0.0
johann
0.0
john
0.0
join
0.0
joint
0.0
jointli
0.0
jone
0.0
jordan
0.0
joseph
0.0
josiah
0.0
jove
0.0
joy
0.0
jpss
0.0
judaism
0.0
judg
0.0
judici
0.0
juli
0.0
june
0.0
jura
0.0
juri
0.0
jurisdict
0.0
justic
0.0
juxtaposit
0.0
jvc
0.0
kanner
0.0
karkinit
0.0
karl
0.0
kay
0.0
keen
0.0
keep
0.0
kentucki
0.0
kept
0.0
key
0.0
ki
0.0
kiliya
0.0
kill
0.0
kilomet
0.0
kind
0.0
king
0.0
kingdom
0.0
klan
0.0
klux
0.0
knight
0.0
know
0.0
knowledg
0.0
known
0.0
koasati
0.0
koeberl
0.0
koppen
0.0
kosa
0.0
kritzikli
0.0
kronstadt
0.0
kropotkin
0.0
ku
0.0
kuwaiti
0.0
la
0.0
label
0.0
labor
0.0
labour
0.0
lack
0.0
laconia
0.0
ladd–peebl
0.0
lag
0.0
lake
0.0
lambert
0.0
lambertian
0.0
land
0.0
landauer
0.0
landau–kleffn
0.0
landform
0.0
languag
0.0
lao
0.0
lapidari
0.0
larg
0.0
larger
0.0
largest
0.0
last
0.0
late
0.0
later
0.0
latin
0.0
latino
0.0
latitud
0.0
latter
0.0
l

0.0
program
0.0
programm
0.0
progress
0.0
prohibit
0.0
project
0.0
proletariat
0.0
prolif
0.0
prometheu
0.0
promi
0.0
promin
0.0
promot
0.0
prone
0.0
pronoun
0.0
pronounc
0.0
pronunci
0.0
propaganda
0.0
propagandist
0.0
proper
0.0
properti
0.0
propertiu
0.0
propheci
0.0
prophet
0.0
propo
0.0
propon
0.0
proport
0.0
proselyt
0.0
prospect
0.0
prosper
0.0
prostat
0.0
protect
0.0
protector
0.0
protest
0.0
protestor
0.0
proto
0.0
proudhon
0.0
prove
0.0
proven
0.0
provi
0.0
provid
0.0
provinc
0.0
proxim
0.0
psalm
0.0
psychiatr
0.0
psychiatri
0.0
psychiatrist
0.0
psychoact
0.0
psycholog
0.0
psychologist
0.0
psychopath
0.0
psychosoci
0.0
pteridophyt
0.0
ptolemi
0.0
public
0.0
publicli
0.0
publish
0.0
punctur
0.0
punish
0.0
punk
0.0
pupil
0.0
purcha
0.0
pursu
0.0
pursuit
0.0
purview
0.0
push
0.0
put
0.0
pyre
0.0
pyrrha
0.0
pyrrhu
0.0
pythia
0.0
póda
0.0
pṓd
0.0
qua
0.0
qualiti
0.0
quantif
0.0
quantifi
0.0
quantiti
0.0
quarrel
0.0
quarter
0.0
queen
0.0
queer
0.0
question
0.0
questionnair
0.0
quic

tuber
0.0
tucker
0.0
tumulo
0.0
tumulu
0.0
turn
0.0
turner
0.0
tuscaloosa
0.0
tuskeg
0.0
twenti
0.0
twentieth
0.0
twice
0.0
twin
0.0
two
0.0
type
0.0
typic
0.0
tyrant
0.0
tzu
0.0
uab
0.0
uk
0.0
ukrain
0.0
ukrainian
0.0
ultim
0.0
ultraviolet
0.0
unabl
0.0
unaffect
0.0
unanim
0.0
unavoid
0.0
uncertain
0.0
unchang
0.0
uncial
0.0
unclear
0.0
uncommon
0.0
unconfirm
0.0
unconv
0.0
undef
0.0
underdiagno
0.0
underdiagnosi
0.0
underestim
0.0
underfund
0.0
undergird
0.0
undergradu
0.0
underground
0.0
underinvest
0.0
underrepr
0.0
understand
0.0
understood
0.0
underway
0.0
underworld
0.0
undesir
0.0
undocu
0.0
une
0.0
unemploy
0.0
unequ
0.0
unev
0.0
uneven
0.0
unfea
0.0
unfortun
0.0
unhind
0.0
unidentifi
0.0
uniform
0.0
unincorpor
0.0
uninhabit
0.0
union
0.0
uniqu
0.0
unit
0.0
unitarian
0.0
uniti
0.0
univ
0.0
unjust
0.0
unknown
0.0
unless
0.0
unlik
0.0
unnecessari
0.0
unoffici
0.0
unrealist
0.0
unreimbur
0.0
unrel
0.0
unresolv
0.0
unround
0.0
unschool
0.0
unselect
0.0
unsupport
0.0
untim
0.0
unus

0.0
coercion
0.0
coerciv
0.0
coexist
0.0
cognit
0.0
cohe
0.0
coher
0.0
coin
0.0
cold
0.0
colder
0.0
collap
0.0
collar
0.0
collect
0.0
collectiv
0.0
collectivi
0.0
collectivist
0.0
collector
0.0
colleg
0.0
colon
0.0
coloni
0.0
colonist
0.0
colonn
0.0
color
0.0
coloss
0.0
colour
0.0
columbia
0.0
columbu
0.0
com
0.0
combin
0.0
come
0.0
comet
0.0
comfort
0.0
command
0.0
comment
0.0
commerci
0.0
commiss
0.0
commit
0.0
committ
0.0
common
0.0
commonli
0.0
commonplac
0.0
commun
0.0
communard
0.0
communist
0.0
compact
0.0
companhia
0.0
compani
0.0
companion
0.0
compar
0.0
comparison
0.0
compass
0.0
compat
0.0
compen
0.0
compet
0.0
compil
0.0
complet
0.0
complex
0.0
compli
0.0
complianc
0.0
complic
0.0
compo
0.0
compon
0.0
composit
0.0
compound
0.0
comprehen
0.0
comprehend
0.0
compri
0.0
compromi
0.0
compul
0.0
comput
0.0
comrad
0.0
concentr
0.0
concept
0.0
conceptu
0.0
concern
0.0
conclu
0.0
conclud
0.0
concur
0.0
concuss
0.0
conden
0.0
condit
0.0
conduc
0.0
conduct
0.0
conecuh
0.0
confed
0.0
c

0.0
grown
0.0
growth
0.0
grunya
0.0
gubernatori
0.0
gui
0.0
guid
0.0
guidelin
0.0
guido
0.0
gulf
0.0
gut
0.0
gut–brain
0.0
guy
0.0
guàrdia
0.0
habitat
0.0
hack
0.0
hade
0.0
hadley
0.0
hagu
0.0
hail
0.0
hair
0.0
haleyvil
0.0
half
0.0
halicarnassu
0.0
hamper
0.0
han
0.0
hand
0.0
handl
0.0
handsom
0.0
handwrit
0.0
handwritten
0.0
hapk
0.0
happen
0.0
happiest
0.0
haralson
0.0
harass
0.0
harbert
0.0
hard
0.0
harder
0.0
hardli
0.0
hardship
0.0
hare
0.0
harm
0.0
harmonica
0.0
harp
0.0
harsh
0.0
hawaiian
0.0
head
0.0
headquart
0.0
heal
0.0
health
0.0
healthcar
0.0
hear
0.0
heart
0.0
heat
0.0
heavi
0.0
heavili
0.0
hebrew
0.0
hector
0.0
hecuba
0.0
hedonist
0.0
hedreen
0.0
heel
0.0
height
0.0
heinrich
0.0
held
0.0
helen
0.0
hellen
0.0
hellespont
0.0
help
0.0
hemisph
0.0
henc
0.0
hennaci
0.0
henri
0.0
hephaest
0.0
hephaestu
0.0
hera
0.0
heracl
0.0
herb
0.0
herit
0.0
heritag
0.0
herm
0.0
hernando
0.0
hero
0.0
herodotu
0.0
heroic
0.0
hesperu
0.0
heterogen
0.0
heteronorm
0.0
heterosexu
0.0
hfa
0.0
hi

optim
0.0
option
0.0
optometri
0.0
oracl
0.0
orang
0.0
orb
0.0
order
0.0
ordin
0.0
orest
0.0
org
0.0
organ
0.0
organi
0.0
organisationalist
0.0
orient
0.0
origin
0.0
orion
0.0
orlean
0.0
orthodox
0.0
orthographi
0.0
oscar
0.0
osteopath
0.0
otherwi
0.0
ought
0.0
outbreak
0.0
outcom
0.0
outdat
0.0
outer
0.0
outlaw
0.0
outlin
0.0
outokumpu
0.0
output
0.0
outsid
0.0
outweigh
0.0
overcom
0.0
overdiagnosi
0.0
overestim
0.0
overlap
0.0
overload
0.0
overlook
0.0
overrid
0.0
overs
0.0
overseen
0.0
oversight
0.0
overt
0.0
overtli
0.0
overturn
0.0
ovid
0.0
owe
0.0
owner
0.0
ownership
0.0
ox
0.0
oxú
0.0
pace
0.0
pacif
0.0
pacifist
0.0
packag
0.0
paddl
0.0
paeonian
0.0
paid
0.0
paiderasteia
0.0
pain
0.0
paint
0.0
pair
0.0
palac
0.0
palat
0.0
pale
0.0
paleo
0.0
palmer
0.0
pamphlet
0.0
pamphylian
0.0
panel
0.0
panhellen
0.0
panic
0.0
paper
0.0
par
0.0
paradox
0.0
parallel
0.0
paramet
0.0
parent
0.0
pari
0.0
park
0.0
parker
0.0
parkway
0.0
parliamentari
0.0
parmenid
0.0
parol
0.0
part
0.0
parthia
0.0


stalin
0.0
stanc
0.0
stand
0.0
standard
0.0
star
0.0
start
0.0
state
0.0
statehood
0.0
stateless
0.0
statement
0.0
statewid
0.0
state—along
0.0
station
0.0
statism
0.0
statist
0.0
statiu
0.0
statu
0.0
statuari
0.0
stay
0.0
steadi
0.0
steal
0.0
steel
0.0
steelmak
0.0
steep
0.0
stela
0.0
stele
0.0
stem
0.0
step
0.0
stephanu
0.0
stephen
0.0
stereotyp
0.0
steve
0.0
stigma
0.0
still
0.0
stimul
0.0
stimuli
0.0
stirner
0.0
stirnerist
0.0
stoic
0.0
stone
0.0
stop
0.0
stop—th
0.0
storey
0.0
stori
0.0
storm
0.0
strabo
0.0
strain
0.0
strand
0.0
strategi
0.0
stream
0.0
strengthen
0.0
stress
0.0
stretch
0.0
strike
0.0
strikingli
0.0
stroke
0.0
strong
0.0
strongest
0.0
stronghold
0.0
strongli
0.0
strove
0.0
struck
0.0
structur
0.0
struggl
0.0
student
0.0
studi
0.0
style
0.0
styx
0.0
su
0.0
sub
0.0
subdivi
0.0
subgroup
0.0
subject
0.0
submit
0.0
suboptim
0.0
subsequ
0.0
subset
0.0
subsidiari
0.0
subsist
0.0
substanc
0.0
substanti
0.0
subtrop
0.0
subtyp
0.0
suburban
0.0
succ
0.0
succeed
0.0
success
0.

0.0
121110
0.0
121800
0.0
12410
0.0
125
0.0
12770
0.0
12th
0.0
13
0.0
130
0.0
136
0.0
1380121
0.0
13th
0.0
14
0.0
1400
0.0
143000
0.0
147
0.0
1496
0.0
15
0.0
150647
0.0
1540
0.0
154040
0.0
157th
0.0
15th
0.0
160
0.0
1600
0.0
1617
0.0
162000
0.0
1642
0.0
165
0.0
167160
0.0
16th
0.0
17
0.0
170
0.0
1702
0.0
1711
0.0
1747
0.0
1760
0.0
1763
0.0
1767
0.0
1770
0.0
1783
0.0
1793
0.0
1798
0.0
17th
0.0
18
0.0
180
0.0
1800
0.0
18000
0.0
1804
0.0
1810
0.0
181000
0.0
1812
0.0
1813
0.0
1817
0.0
1817–1862
0.0
1819
0.0
1820
0.0
1823
0.0
1825
0.0
1826
0.0
1830
0.0
1840
0.0
1842
0.0
1844
0.0
1846
0.0
1847
0.0
1849
0.0
185
0.0
1850
0.0
1851
0.0
1860
0.0
1861
0.0
1864
0.0
1865
0.0
1867
0.0
1868
0.0
1871
0.0
1872
0.0
1874
0.0
1875
0.0
1889
0.0
1890
0.0
1891
0.0
1896
0.0
18th
0.0
18–30
0.0
1900
0.0
1901
0.0
1903
0.0
1910
0.0
1911
0.0
1913
0.0
1915
0.0
1917
0.0
1920
0.0
1921
0.0
1925
0.0
1926
0.0
1928
0.0
1929
0.0
1936
0.0
1937
0.0
1938
0.0
1940
0.0
1941
0.0
1943
0.0
1945
0.0
1950
0.0
1954
0.0
1955–56
0.0
19

disciplin
0.0
disco
0.0
discount
0.0
discov
0.0
discrimin
0.0
disea
0.0
disenfranchi
0.0
disfranchi
0.0
disgui
0.0
dishonour
0.0
disintegr
0.0
disk
0.0
dismemb
0.0
dismiss
0.0
disobedi
0.0
disord
0.0
dispar
0.0
disparag
0.0
display
0.0
dispo
0.0
disproven
0.0
disput
0.0
disrupt
0.0
dissemin
0.0
distanc
0.0
distinct
0.0
distinctli
0.0
distinguish
0.0
distract
0.0
distraught
0.0
distress
0.0
distribut
0.0
district
0.0
disturb
0.0
diver
0.0
diverg
0.0
diversifi
0.0
divi
0.0
divid
0.0
divin
0.0
dixi
0.0
djarilgatch
0.0
dna
0.0
dnieper
0.0
doc
0.0
doctor
0.0
doctrin
0.0
document
0.0
dog
0.0
dollar
0.0
domest
0.0
domin
0.0
donald
0.0
donat
0.0
done
0.0
doom
0.0
dorothi
0.0
dothan
0.0
doubl
0.0
doubt
0.0
doug
0.0
downtown
0.0
dozen
0.0
draft
0.0
drag
0.0
dragonfli
0.0
drama
0.0
dramat
0.0
draw
0.0
dream
0.0
drew
0.0
dri
0.0
drive
0.0
driven
0.0
dromo
0.0
drool
0.0
drop
0.0
droplet
0.0
drown
0.0
drug
0.0
drómo
0.0
dsm
0.0
dsp
0.0
du
0.0
dual
0.0
ducat
0.0
due
0.0
duel
0.0
dump
0.0
duplic
0.0
d

0.0
neurogenet
0.0
neuroimag
0.0
neuroinflamm
0.0
neurolog
0.0
neuropsycholog
0.0
neuropsychologist
0.0
neurotyp
0.0
neutral
0.0
never
0.0
new
0.0
newer
0.0
newli
0.0
news
0.0
newspap
0.0
next
0.0
nh
0.0
nicknam
0.0
nicola
0.0
nihilist
0.0
nine
0.0
nineteen
0.0
nineteenth
0.0
ninth
0.0
niob
0.0
nippon
0.0
noi
0.0
nomo
0.0
non
0.0
none
0.0
nonverb
0.0
nonviol
0.0
norm
0.0
normal
0.0
north
0.0
northeast
0.0
northeastern
0.0
northern
0.0
northernmost
0.0
northwest
0.0
north–south
0.0
notabl
0.0
notat
0.0
note
0.0
notetak
0.0
notic
0.0
notion
0.0
notorieti
0.0
novat
0.0
novel
0.0
novemb
0.0
nowaday
0.0
npp
0.0
nuclear
0.0
nuclei
0.0
nucleu
0.0
nucor
0.0
nudist
0.0
number
0.0
numer
0.0
nurseri
0.0
nutrit
0.0
nâso
0.0
nêso
0.0
nêsoi
0.0
obe
0.0
obey
0.0
object
0.0
oblig
0.0
obliqu
0.0
oblivi
0.0
obscur
0.0
observ
0.0
obstacl
0.0
obtain
0.0
occas
0.0
occup
0.0
occupi
0.0
occur
0.0
occurr
0.0
ocean
0.0
oceanu
0.0
octob
0.0
odd
0.0
odysseu
0.0
odyssey
0.0
offer
0.0
offic
0.0
offici
0.0
offset
0

0.0
sky
0.0
skyro
0.0
slant
0.0
slave
0.0
slaveri
0.0
slay
0.0
sleep
0.0
sleev
0.0
slight
0.0
slightli
0.0
slow
0.0
slowli
0.0
small
0.0
smaller
0.0
smile
0.0
smoke
0.0
smooth
0.0
smyrna
0.0
snake
0.0
snatch
0.0
snow
0.0
snowfal
0.0
snowmelt
0.0
snowpack
0.0
snowstorm
0.0
snow–temperatur
0.0
social
0.0
socialist
0.0
societ
0.0
societi
0.0
socioeconom
0.0
socrat
0.0
soil
0.0
solair
0.0
solar
0.0
sold
0.0
soldier
0.0
sole
0.0
solid
0.0
solidar
0.0
solut
0.0
solv
0.0
solvent
0.0
someon
0.0
someth
0.0
sometim
0.0
somewhat
0.0
son
0.0
soon
0.0
soot
0.0
sophocl
0.0
sorghum
0.0
sorrow
0.0
sort
0.0
soto
0.0
sought
0.0
soul
0.0
soulless
0.0
sound
0.0
sourc
0.0
south
0.0
southeast
0.0
southeastern
0.0
southern
0.0
southernmost
0.0
southtrust
0.0
sovereign
0.0
soviet
0.0
soybean
0.0
space
0.0
spain
0.0
spanish
0.0
spar
0.0
spare
0.0
sparta
0.0
speak
0.0
spear
0.0
speci
0.0
special
0.0
specialist
0.0
specif
0.0
specifi
0.0
spectral
0.0
spectrum
0.0
specul
0.0
specular
0.0
specularli
0.0
speech
0.0

In [29]:
alpha = 0.3
# Merging the TF-IDF according to weights
# multiply the Body TF-IDF with alpha
for i in tf_idf:
    tf_idf[i] *= alpha
# Iterate Title IF-IDF for every (doc, token)
# if token is in body, replace the Body(doc, token) value with the value in Title(doc, token)
for i in tf_idf_title:
    tf_idf[i] = tf_idf_title[i]

In [30]:
# vectorize documents
D = np.zeros((N, total_vocab_size))
for i in tf_idf:
    try:
        ind = total_vocab.index(i[1]) # generate a index for each token
        D[i[0]][ind] = tf_idf[i] # document vectors
    except:
        pass

## Ranking

In [31]:
# theoretical concept: add tf_idf values of the tokens that are in query for every document.
# Iterate over all values in the dictionary and check if the value is present in the token.
# As our dictionary is a (document, token) key, when we find a token which is in query we will
# add the document id to another dictionary along with the tf-idf value
def matching_score(k, query):
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))
    print("\nQuery:", query)
    print("")
    print(tokens)
    
    query_weights = {}
    for key in tf_idf:
        if key[1] in tokens:
            try:
                query_weights[key[0]] += tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    print(query_weights)
    # take the top k documents
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)
    print("")
    ranking = []
    for i in query_weights[:k]:
        print(i)
        ranking.append(i[0])
    print(ranking)
    for i in ranking:
        print(i, dataset[i][0])

In [32]:
import math
# vectorize query
def gen_vector(tokens):
    Q = np.zeros((len(total_vocab)))
    counter = Counter(tokens)
    words_count = len(tokens)
    query_weights = {}
    for token in np.unique(tokens):
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = math.log((N+1)/(df+1))
        try:
            ind = total_vocab.index(token)
            Q[ind] = tf*idf
        except:
            pass
    return Q

def cosine_similarity(k, query):
    print(">>Cosine Similarity")
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))
    print("\nQuery:", query)
    print(tokens)   
    query_vector = gen_vector(tokens)
    d_cosines = []
    for d in D:
        s = np.dot(query_vector, d)/(np.linalg.norm(query_vector)*np.linalg.norm(d))
        d_cosines.append(s)
    # take the top k documents
    ranking = np.array(d_cosines).argsort()[-k:][::-1].tolist()  
    print("") 
    print(ranking)
    for i in ranking:
        print(i, article_title[i])

## Test

In [33]:
query = "What is Anarchism?"
matching_score(10,query)
print("")
cosine_similarity(10,query)


Query: What is Anarchism?

['anarch']
{0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0}

(0, 0.0)
(1, 0.0)
(2, 0.0)
(3, 0.0)
(4, 0.0)
(5, 0.0)
[0, 1, 2, 3, 4, 5]
0 ['12', '25', '39', '290', '303', '305']
1 ['12', '25', '39', '290', '303', '305']
2 ['12', '25', '39', '290', '303', '305']
3 ['12', '25', '39', '290', '303', '305']
4 ['12', '25', '39', '290', '303', '305']
5 ['12', '25', '39', '290', '303', '305']

>>Cosine Similarity

Query: What is Anarchism?
['anarch']
anarch
6

[5, 4, 3, 2, 1, 0]
5 Achilles
4 Alabama
3 A
2 Albedo
1 Autism
0 Anarchism


