We will first retrieve a number of topics from the Wikipedia category tree, using the Wikimedia API. The code writes out the Wikipedia categories that contain at least 200 documents (skipping names that contain a four-digit number, such as a year), creating the file wiki_categories.txt. (It takes a minute or two, so be patient!)

In [31]:
import requests
import re

# Download the names of Wikipedia categories with at least 200 members and
# write them, one per line, to wiki_categories.txt on the mounted Drive.

# Reuse a single HTTP session for all the paginated API calls.
S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

# "acmin" keeps only categories with >= 200 members; "aclimit" asks for
# 500 category names per request.
PARAMS = {
    "action": "query",
    "format": "json",
    "list": "allcategories",
    "acmin": 200,
    "aclimit": 500
}

from google.colab import drive
drive.mount('/content/drive')

# Category names containing a run of four digits (typically a year) are
# skipped. Compiled once instead of on every category name.
FOUR_DIGITS = re.compile("[0-9]{4}")

# The context manager closes the output file even if a request fails midway.
with open("./drive/MyDrive/Colab-Notebooks/Search-Engine/wiki_categories.txt",'w') as f:
    # At most 100 requests are issued; the loop stops earlier when the API
    # reports that there is nothing left to continue from.
    for i in range(100):
        R = S.get(url=URL, params=PARAMS)
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        R.raise_for_status()
        DATA = R.json()

        CATEGORIES = DATA["query"]["allcategories"]

        for cat in CATEGORIES:
            cat_name = cat["*"]
            if not FOUR_DIGITS.search(cat_name):
                f.write(cat_name+'\n')

        if "continue" not in DATA:
            break
        # Resume the listing from the category name the API handed back.
        PARAMS["acfrom"] = DATA["continue"]["accontinue"]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Open your wiki_categories.txt file and scroll through it. Select 10 categories from it. Make sure you don't select categories which look like Wikipedia-internal labels (e.g. 'Wikipedia books', 'Pages with script errors', etc.). Copy and paste those category names into a file named tmp_cat.txt in the top directory of the repo. We will now retrieve Wikipedia pages for our categories. We first need a list of page titles for each category. This will create a file titles.txt for each category in your data/ directory, containing 100 Wikipedia titles for each of the categories in your tmp_cat.txt file.

In [32]:
import requests
import os

def read_categories():
    """Return the lines of tmp_cat.txt as a list, without line endings."""
    path = "./drive/MyDrive/Colab-Notebooks/Search-Engine/tmp_cat.txt"
    with open(path,'r') as handle:
        return handle.read().splitlines()

# For every category listed in tmp_cat.txt, fetch one batch of category
# members from the API and write "<pageid> <title>" lines to titles.txt in
# that category's folder.
S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

categories = read_categories()
print(categories)


for cat in categories:
    #cat_dir = "data/categories/"+cat.replace(' ','_')
    cat_dir = "./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/"+cat.replace(' ','_')
    # makedirs also creates the parent "Categories" folder on a first run,
    # where a plain mkdir would raise FileNotFoundError.
    os.makedirs(cat_dir, exist_ok=True)

    PARAMS = {
        "action": "query",
        "list": "categorymembers",
        "format": "json",
        "cmtitle": "Category:"+cat,
        "cmlimit": "100"
    }

    # The context manager closes titles.txt even if a request fails.
    with open(os.path.join(cat_dir,"titles.txt"),'w') as title_file:
        for i in range(1):    #increase 1 to more to get additional data
            R = S.get(url=URL, params=PARAMS)
            # Fail loudly on an HTTP error instead of parsing an error page.
            R.raise_for_status()
            DATA = R.json()

            PAGES = DATA["query"]["categorymembers"]

            for page in PAGES:
                title = page["title"]
                ID = str(page["pageid"])
                # Sub-categories are listed as members too; keep pages only.
                if not title.startswith("Category:"):
                    title_file.write(ID+' '+title+'\n')

            if "continue" in DATA:
                PARAMS["cmcontinue"] = DATA["continue"]["cmcontinue"]
            else:
                break

['Brazilian telenovelas', 'Cold War films', 'Fauna of Zimbabwe', 'Freeware games', 'International law', 'Landscape painters', 'Members of the Chinese Academy of Sciences', 'Musicals based on novels', 'Particle physics', '21st-century women mathematicians']


We now need to retrieve the text of those documents. To speed things up and in order not to use too much hard disk space, we will just retrieve the intro text of each document rather than the whole page. This will create a file linear.txt for each category in your data/ directory, containing the introductory text of each Wikipedia page listed in titles.txt files. 

In [33]:
import requests
import os

def read_titles(filename):
    """Read page IDs and titles from a titles file.

    Each non-blank line is split on whitespace: the first field is taken as
    the page ID and the remaining fields, re-joined with single spaces, as
    the title. Blank lines are skipped.

    Args:
        filename: path of the file to read.

    Returns:
        A tuple ``(IDs, titles)`` of two parallel lists of strings.
    """
    IDs = []
    titles = []
    # The context manager closes the file even if a line cannot be parsed.
    with open(filename,'r') as f:
        for l in f:
            fields = l.split()
            if not fields:
                # Blank line: nothing to record.
                continue
            IDs.append(fields[0])
            titles.append(' '.join(fields[1:]))
    return IDs,titles


def read_categories():
    """Load tmp_cat.txt and return its lines as a list of strings."""
    with open("./drive/MyDrive/Colab-Notebooks/Search-Engine/tmp_cat.txt",'r') as cat_file:
        content = cat_file.read()
    return content.splitlines()



# For every category, fetch the plain-text intro of each page listed in its
# titles.txt and write the results to linear.txt as <doc ...> ... </doc>
# records.
S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"


categories = read_categories()

for cat in categories:
    print("Processing category",cat)
    #cat_dir = "data/categories/"+cat.replace(' ','_')
    cat_dir = "./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/"+cat.replace(' ','_')
    title_file = os.path.join(cat_dir,"titles.txt")
    IDs, titles = read_titles(title_file)

    # The context manager closes linear.txt even if a request fails.
    with open(os.path.join(cat_dir,"linear.txt"),'w') as content_file:
        for page_id, title in zip(IDs, titles):
            PARAMS = {
                "action": "query",
                "prop": "extracts",
                "format": "json",
                "exintro": True,
                "explaintext": True,
                "redirects": True,
                "titles": title
            }

            R = S.get(url=URL, params=PARAMS)
            # Fail loudly on an HTTP error instead of parsing an error page.
            R.raise_for_status()
            DATA = R.json()

            # Tolerate a response without a "query"/"pages" section.
            PAGES = DATA.get("query", {}).get("pages", {})

            for page in PAGES.values():
                extract = page.get("extract")
                if extract is None:
                    # No extract came back for this page: skip it rather
                    # than abort the whole run with a KeyError.
                    continue
                content_file.write("<doc id=\""+page_id+"\" title=\""+title+"\">\n")
                content_file.write(extract+'\n')
                content_file.write("</doc>\n\n")

Processing category Brazilian telenovelas
Processing category Cold War films
Processing category Fauna of Zimbabwe
Processing category Freeware games
Processing category International law
Processing category Landscape painters
Processing category Members of the Chinese Academy of Sciences
Processing category Musicals based on novels
Processing category Particle physics
Processing category 21st-century women mathematicians


Let's now convert our raw texts into sets of features. We will do this using a vocabulary of character ngrams.

In [34]:
#USAGE: python3 ngrams.py <ngram size>

import sys
from os import listdir
from os.path import isfile, isdir, join
    

# Count character n-grams (n = 6) in every category's linear.txt and write
# them, most frequent first, to linear.<n>.ngrams as "<ngram>\t<count>".
d = './drive/MyDrive/Colab-Notebooks/Search-Engine/Categories'
catdirs = [join(d,o) for o in listdir(d) if isdir(join(d,o))]
n = 6
print(catdirs)
for cat in catdirs:
    ngrams = {}
    # Context managers close both files even if an error occurs midway.
    with open(join(cat,'linear.txt'),'r') as f:
        for l in f:
            # Skip the <doc ...> / </doc> marker lines; only text is counted.
            if "<doc id" in l or "</doc" in l:
                continue
            l = l.rstrip('\n').lower()
            for i in range(len(l)-n+1):
                ngram = l[i:i+n]
                ngrams[ngram] = ngrams.get(ngram, 0) + 1

    with open(join(cat,"linear."+str(n)+".ngrams"),'w') as ngramfile:
        for k in sorted(ngrams, key=ngrams.get, reverse=True):
            ngramfile.write(k+'\t'+str(ngrams[k])+'\n')


['./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Brazilian_telenovelas', './drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Cold_War_films', './drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Fauna_of_Zimbabwe', './drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Freeware_games', './drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/International_law', './drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Landscape_painters', './drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Members_of_the_Chinese_Academy_of_Sciences', './drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Musicals_based_on_novels', './drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Particle_physics', './drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/21st-century_women_mathematicians']


In [35]:
import sys
import string
from math import log
from os import listdir
from os.path import isfile, isdir, join


# Root folder holding one sub-directory per category.
d = './drive/MyDrive/Colab-Notebooks/Search-Engine/Categories'
# Full paths of the entries of d that are directories.
catdirs = list(filter(isdir, map(lambda o: join(d,o), listdir(d))))

def contain_punctuation(s):
    """Return True if s contains a space or any string.punctuation character."""
    for c in string.punctuation + ' ':
        if c in s:
            return True
    return False

def normalise_tfs(tfs,total):
    """Divide every value of tfs by total, in place, and return tfs."""
    for key in tfs:
        # Only existing keys are reassigned, so the dict size never changes
        # during iteration.
        tfs[key] = tfs[key] / total
    return tfs

def log_idfs(idfs,num_cats):
    """Replace each value v of idfs with log(num_cats / v), in place, and return idfs."""
    for ngram in idfs:
        count = idfs[ngram]
        idfs[ngram] = log(num_cats / count)
    return idfs

# Build tf-idf weights per category from the .ngrams files, select a shared
# vocabulary, and write tf_idfs.txt (per category) and vocab_file.txt.
cat_tfs = {}
cat_tf_idfs = {}
idfs = {}

for cat in catdirs:
    tfs = {}
    sum_freqs = 0
    #print("Processing",filename,"...")
    ngram_files = [join(cat,f) for f in listdir(cat) if isfile(join(cat, f)) and '.ngrams' in f]
    for ngram_file in ngram_files:
        # The context manager closes the file even if a line fails to parse.
        with open(ngram_file,'r') as f:
            for l in f:
                l = l.rstrip()
                # The count is the text after the LAST tab; an n-gram may
                # itself contain tabs.
                ngram, _, freq = l.rpartition('\t')
                freq = int(freq)
                tfs[ngram] = freq
                sum_freqs+=freq
                # Before the log is applied, idfs[ngram] counts the lines on
                # which this n-gram was seen across all .ngrams files.
                idfs[ngram] = idfs.get(ngram, 0) + 1

    tfs = normalise_tfs(tfs,sum_freqs)
    cat_tfs[cat] = tfs

    #for k in sorted(idfs, key=tfs.get, reverse=True)[:10]:
    #    print(k,idfs[k])

idfs = log_idfs(idfs, len(catdirs))

vocab=[]
# Set mirror of vocab so the "already selected?" test is O(1) instead of a
# scan of the growing list.
vocab_seen = set()

for cat in catdirs:
    tfs = cat_tfs[cat]
    tf_idfs = {ngram: tf * idfs[ngram] for ngram, tf in tfs.items()}
    cat_tf_idfs[cat] = tf_idfs

    c = 0
    for k in sorted(tf_idfs, key=tf_idfs.get, reverse=True):
        #only keep top 100 dimensions per category. Also, we won't keep ngrams with spaces
        # c counts only n-grams newly added for this category.
        if c == 100:
            break
        if k not in vocab_seen and not contain_punctuation(k):
            vocab.append(k)
            vocab_seen.add(k)
            c+=1

print("VOCAB SIZE:",len(vocab))

# Sort once; the same order is used for every tf_idfs.txt and for the vocab file.
sorted_vocab = sorted(vocab)

#Write tf-idfs for each category
for cat in catdirs:
    tf_idfs = cat_tf_idfs[cat]
    with open(join(cat,'tf_idfs.txt'),'w') as f:
        for ngram in sorted_vocab:
            # n-grams absent from this category get weight 0.0
            f.write(ngram+' '+str(tf_idfs.get(ngram, 0.0))+'\n')


with open("./drive/MyDrive/Colab-Notebooks/Search-Engine/vocab_file.txt",'w') as vocab_file:
    for ngram in sorted_vocab:
        vocab_file.write(ngram+'\n')

VOCAB SIZE: 1000


In [36]:
#USAGE: python3 mk_cat_vectors.py

import sys
import numpy as np
from os import listdir
from os.path import isfile, isdir, join
    

def read_vocab2():
    """Read vocab_file.txt and build index <-> n-gram lookup tables.

    Each line, stripped of trailing whitespace, is one vocabulary entry; its
    index is its zero-based line number.

    Returns:
        A tuple ``(i_to_ngrams, ngrams_to_i)``: a dict mapping index to
        n-gram and a dict mapping n-gram to index.
    """
    i_to_ngrams = {}
    ngrams_to_i = {}
    # The context manager closes the file once it has been read.
    with open('./drive/MyDrive/Colab-Notebooks/Search-Engine/vocab_file.txt','r') as f:
        for c, l in enumerate(f):
            l = l.rstrip()
            i_to_ngrams[c] = l
            # Store the reverse mapping entry (previously the dict itself was
            # overwritten with the integer index).
            ngrams_to_i[l] = c
    return i_to_ngrams, ngrams_to_i

def read_vocab():
    """Return the lines of vocab_file.txt as a list, without line endings."""
    vocab_path = './drive/MyDrive/Colab-Notebooks/Search-Engine/vocab_file.txt'
    with open(vocab_path,'r') as handle:
        return handle.read().splitlines()

# Turn each category's tf_idfs.txt into one dense vector (one dimension per
# vocabulary entry) and write all vectors to category_vectors.txt as
# "<category path> <v0> <v1> ...".
d = './drive/MyDrive/Colab-Notebooks/Search-Engine/Categories'
catdirs = [join(d,o) for o in listdir(d) if isdir(join(d,o))]
vocab = read_vocab()

# Position of each n-gram in the vocabulary, built once so every lookup is
# O(1). setdefault keeps the FIRST position if an entry were repeated, which
# is what list.index would have returned.
vocab_pos = {}
for pos, ngram in enumerate(vocab):
    vocab_pos.setdefault(ngram, pos)

# Context managers close both the output and each input file on any error.
with open('./drive/MyDrive/Colab-Notebooks/Search-Engine/category_vectors.txt','w') as vector_file:
    for cat in catdirs:
        print(cat)
        vec = np.zeros(len(vocab))
        with open(join(cat,'tf_idfs.txt'),'r') as f:
            for l in f:
                fields = l.split()
                # The last field is the weight; everything before it is the n-gram.
                ngram = ' '.join(fields[:-1])
                tf_idf = float(fields[-1])
                vec[vocab_pos[ngram]] = tf_idf

        vector_file.write(cat+' '+' '.join([str(v) for v in vec])+'\n')


./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Brazilian_telenovelas
./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Cold_War_films
./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Fauna_of_Zimbabwe
./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Freeware_games
./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/International_law
./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Landscape_painters
./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Members_of_the_Chinese_Academy_of_Sciences
./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Musicals_based_on_novels
./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Particle_physics
./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/21st-century_women_mathematicians


In [38]:
#USAGE: python3 classify.py <query_file>
import sys
import numpy as np
from math import sqrt


def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors v1 and v2.

    Computed as dot(v1, v2) / (|v1| * |v2|). If either vector has zero
    norm the cosine is undefined; 0.0 is returned in that case so that
    callers that sort by similarity never see NaN.

    Args:
        v1: first vector (anything np.dot accepts).
        v2: second vector, same length as v1.

    Returns:
        The cosine similarity, or 0.0 when either vector is all zeros.
    """
    num = np.dot(v1, v2)
    den = sqrt(np.dot(v1, v1)) * sqrt(np.dot(v2, v2))
    if den == 0:
        # e.g. a query that shares no n-gram with the vocabulary
        return 0.0
    return num / den

def read_vocab():
    """Load vocab_file.txt and return its lines as a list of strings."""
    with open('./drive/MyDrive/Colab-Notebooks/Search-Engine/vocab_file.txt','r') as vocab_file:
        text = vocab_file.read()
    return text.splitlines()

def read_queries(query_file):
    """Return the lines of query_file as a list, without line endings."""
    with open(query_file) as handle:
        return handle.read().splitlines()

def read_category_vectors():
    """Load the category vectors from category_vectors.txt.

    Each non-blank line is split on whitespace: the first field is the
    category name and the remaining fields are parsed as floats. Blank lines
    are skipped.

    Returns:
        A dict mapping category name to a numpy array of floats.
    """
    vectors = {}
    # The context manager closes the file even if a value fails to parse.
    with open('./drive/MyDrive/Colab-Notebooks/Search-Engine/category_vectors.txt','r') as f:
        for l in f:
            fields = l.split()
            if not fields:
                # Blank line: nothing to load.
                continue
            cat = fields[0]
            vectors[cat] = np.array([float(v) for v in fields[1:]])
    return vectors

def get_ngrams(l,n):
    """Count the character n-grams of length n in the lower-cased string l.

    Returns a dict mapping each n-gram to its number of occurrences; the
    dict is empty when l is shorter than n.
    """
    text = l.lower()
    counts = {}
    for start in range(len(text) - n + 1):
        gram = text[start:start + n]
        counts[gram] = counts.get(gram, 0) + 1
    return counts

def normalise_tfs(tfs,total):
    """Scale every value of tfs by 1/total in place and return the same dict."""
    for term in list(tfs):
        tfs[term] = tfs[term] / total
    return tfs

def mk_vector(vocab,tfs):
    """Build a dense vector over vocab from a term -> weight mapping.

    Args:
        vocab: sequence of vocabulary terms; a term's position in the
            sequence is its dimension in the vector.
        tfs: dict mapping terms to numeric weights. Terms that are not in
            vocab are ignored.

    Returns:
        A numpy array of length len(vocab), zero everywhere except at the
        positions of the terms found in tfs.
    """
    vec = np.zeros(len(vocab))
    # Index the vocabulary once so each term is an O(1) lookup instead of two
    # linear scans (membership test + list.index). setdefault keeps the first
    # position of a repeated term, matching list.index.
    positions = {}
    for pos, term in enumerate(vocab):
        positions.setdefault(term, pos)
    for t,f in tfs.items():
        pos = positions.get(t)
        if pos is not None:
            vec[pos] = f
    return vec

# Rank the categories for every query in query_file.txt by cosine similarity
# between the category vectors and the query's character n-gram vector.
vocab = read_vocab()
print(len(vocab))
vectors = read_category_vectors()
#queries = read_queries(sys.argv[1])
# read_queries closes the file and strips line endings, so no trailing
# newline leaks into the printed query or into its n-grams.
queries = read_queries('./drive/MyDrive/Colab-Notebooks/Search-Engine/query_file.txt')

for q in queries:
    if not q.strip():
        # Blank line in the query file: nothing to classify.
        continue
    print("\nQUERY:",q)
    ngrams = {}
    cosines = {}
    # Collect the query's n-grams of sizes 4, 5 and 6 into a single dict.
    for i in range(4,7):
        ngrams.update(get_ngrams(q,i))
    qvec = mk_vector(vocab,ngrams)
    for cat,vec in vectors.items():
        cosines[cat] = cosine_similarity(vec,qvec)
    # Best match first.
    for cat in sorted(cosines, key=cosines.get, reverse=True):
        print(cat,cosines[cat])

1000

QUERY: animals of zimbabwe

./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Fauna_of_Zimbabwe 0.0962172268166385
./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Cold_War_films 0.027295703295014084
./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Brazilian_telenovelas 0.0
./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Freeware_games 0.0
./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/International_law 0.0
./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Landscape_painters 0.0
./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Members_of_the_Chinese_Academy_of_Sciences 0.0
./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Musicals_based_on_novels 0.0
./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/Particle_physics 0.0
./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/21st-century_women_mathematicians 0.0

QUERY: famous mathematicians

./drive/MyDrive/Colab-Notebooks/Search-Engine/Categories/21st-centu