Imports

In [4]:
from collections import OrderedDict
from bs4 import BeautifulSoup as bs

import numpy as np
import pandas as pd

import json
import gzip
import regex as re

import porterAlgo # Porter Stemming Algo

import spacy #python -m spacy download en_core_web_sm

class int64_encoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.int64):
            return int(obj)
        return json.JSONEncoder.default(self, obj)
        
#nlp = spacy.load('en_core_web_sm')

Stop Words

- Using Cacm Stopwords txt file
- https://github.com/AOikonomidis/collection-retrieval/blob/master/solr-config/cacm_configs/stopwords.txt

In [5]:
stopWords = []
with open("cacm_stopwords.txt","r") as stopWord:
    stopWords = [word.replace("\n","") for word in stopWord]


PorterStemming

- Used porterAlgo.py to stem words (removes stop words, numbers, etc.)
- Left numbers as is
- removeNone(word) -> removes the word 'None' from the list results.  
- http://tartarus.org/~martin/PorterStemmer/

In [6]:
def porterStemming(uncleaned_word):
    porterStemming = porterAlgo.PorterStemmer()

    if uncleaned_word.isnumeric(): 
        return uncleaned_word # keep numbers as is
    
    word = re.sub('[^a-zA-Z]','',uncleaned_word) # remove special chars
    word_cleaned = porterStemming.stem(word,0,len(word)-1).lower().replace("\n","") #lemmatize token
    
    if word_cleaned not in stopWords and len(word_cleaned)>2: # remove stopwords
        return word_cleaned
    else: return 'None'
    
def removeNone(word):
    bad = ['None']
    if word in bad:
        return False
    return True

#porterStemming("pythonung78$_^(   )")

Dictionary File 

- Gets the document frequency for each term in the collection

In [22]:
#Term : Document Frequency
documentFrequency = {}

def docFrequency(docSet):
    for word in docSet:
        if word in documentFrequency: documentFrequency[word]+=1
        else: documentFrequency[word]=1

Context Window
- How do we want to deal with the context windows?
- We only need one which is good
- Should be able to splice the doc tokens to get 5 to the left 5 to the right
- What do we do if the word is the first/last in the doc?

In [None]:
def getWindow(size, index, doc):
  if index < size:
    window = doc[:index + size + 1]
  else:
    window = doc[index-size:index + size + 1]
  return window

Posting List 

- Updates postingListDict {word: {id: [doc freq, postings]}}
- len(indexes) = term frequency
- https://btechgeeks.com/python-how-to-find-all-indexes-of-an-item-in-a-list/#Using_Numpy

In [16]:
postingListDict = {}

def postingsList(set,id,doc):
    for word in set:
        indexes = np.where(np.array(doc) ==  word)[0] # finding all occurences of word
        indexes = list(indexes)
        contextWindow = " ".join(getWindow(5, indexes[0], doc))
        dict = {str(id):[len(indexes),indexes,contextWindow]}
        if word in postingListDict: # update main postingListDictionary
            postingListDict[word][str(id)] = [len(indexes), indexes,contextWindow]
        else:
            postingListDict[word]=dict


Process HTML 

- Deals with HTML within doc['contents']
- Strips HTML code of all tags
- contents_pretified = contents.prettify()
- Stems content and returns it with id

In [9]:
def processContents(line):
     data=json.loads(line) # Load JSON
     id = data['id']
     title = data['title']

     contents = bs(data['contents'],"html.parser") #Deal with HTML content 

     contents_stripped=" ".join(contents.stripped_strings) # remove html tags from content
     contents_stripped = contents_stripped.split(" ")

     porterStem = list(map(porterStemming,contents_stripped)) # stem content
     porterStem = list(filter(removeNone,porterStem))

     return [id,porterStem]

Main

- Run through gzip file (212651 files in gzip)
- Start updating dictionary file and posting list

In [26]:

path  = "C:/Users/deand/OneDrive/Documents/CPS 842/trec_corpus_5000.jsonl.gz"
count = 0 # temporary

contentsPorterStemmed = []
with gzip.open(path,'rb') as file:

    for wiki in file:
        count+=1

        contents = processContents(wiki)
        wordsSet = set(contents[1])
        postingsList(wordsSet,contents[0],contents[1]) # Update Positing List
        docFrequency(wordsSet) # Update documentFrequency
       
        contentsPorterStemmed.append(contents)
    
        if count==30:break


In [28]:
postingListDict = OrderedDict(sorted(postingListDict.items())) #sorting dictionary
documentFrequency = OrderedDict(sorted(documentFrequency.items())) #sorting dictionary
postingListDict

OrderedDict([('0',
              {'593630': [1, [332]],
               1872229: [1, array([2674], dtype=int64)]}),
             ('00',
              {'1512303': [1, [1142]],
               1512303: [1, array([1142], dtype=int64)]}),
             ('007',
              {'1512303': [50,
                [106,
                 156,
                 200,
                 204,
                 210,
                 252,
                 259,
                 272,
                 288,
                 344,
                 349,
                 370,
                 656,
                 803,
                 863,
                 911,
                 984,
                 1068,
                 1116,
                 1159,
                 1193,
                 1197,
                 1209,
                 1326,
                 1419,
                 1425,
                 1582,
                 1596,
                 1722,
                 1796,
                 1875,
                 19