# Create Index Files

Imports

In [7]:
from collections import OrderedDict
from bs4 import BeautifulSoup as bs

import numpy as np
import pandas as pd

import json
import gzip
import regex as re

import porterAlgo # Porter Stemming Algo

class int64_encoder(json.JSONEncoder):
    """JSON encoder that serialises NumPy integer scalars as plain Python ints.

    The standard encoder raises ``TypeError`` for NumPy scalar types such as
    the values produced by ``np.where``. Every ``np.integer`` subclass is
    accepted (not only ``np.int64``) so the encoder keeps working when an
    index array uses a different integer width.
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        The encoder calls this only for objects it cannot serialise natively.
        Anything that is not a NumPy integer is delegated to the base class,
        which raises ``TypeError``.
        """
        if isinstance(obj, np.integer):
            return int(obj)
        return super().default(obj)

import spacy #python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

Stop Words

- Using Cacm Stopwords txt file
- https://github.com/AOikonomidis/collection-retrieval/blob/master/solr-config/cacm_configs/stopwords.txt

In [9]:
# Build the stop-word list: every line of the text file becomes one entry.
stopWords = []
with open("cacm_stopwords.txt","r") as stopWord:
    for raw_line in stopWord:
        stopWords.append(raw_line.replace("\n",""))  # drop the line terminator


PorterStemming

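- `porterStemming(word)` cleans each token: strips non-letter characters, lowercases, and drops stop words and tokens shorter than three letters (these come back as the placeholder 'None')
- Purely numeric tokens are left as is
- The actual porterAlgo.py stemming call is currently commented out in the code below
- removeNone(word) -> filter predicate that removes the placeholder 'None' from the list of results.  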
- http://tartarus.org/~martin/PorterStemmer/

In [15]:
def porterStemming(uncleaned_word):
    """Normalise a single raw token for indexing.

    Purely numeric tokens are returned unchanged. Every other token has all
    characters outside a-z/A-Z removed and is lowercased; it is kept only if
    it is longer than two characters and not in the module-level ``stopWords``
    list.

    Stemming itself is currently switched off: the call
    ``porterAlgo.PorterStemmer().stem(word, 0, len(word) - 1)`` had been
    commented out, so no stemmer is constructed here any more. The previous
    code built a new, unused stemmer for every token and bound it to a local
    that shadowed this function's own name.

    Args:
        uncleaned_word: Raw token (str).

    Returns:
        The cleaned token, or the string ``'None'`` as a sentinel for tokens
        that should be discarded (``removeNone`` filters these out).
    """
    if uncleaned_word.isnumeric():
        return uncleaned_word  # keep numbers as is

    # Removing everything that is not an ASCII letter also removes newlines
    # and punctuation, so no separate newline replacement is needed.
    word_cleaned = re.sub('[^a-zA-Z]', '', uncleaned_word).lower()

    if len(word_cleaned) > 2 and word_cleaned not in stopWords:
        return word_cleaned
    return 'None'
    
def removeNone(word):
    """Filter predicate: keep every token except the placeholder 'None'.

    Returns False for the string 'None' (so ``filter`` drops it) and True for
    anything else.
    """
    sentinels = ('None',)
    return word not in sentinels

#porterStemming("pythonung78$_^(   )")

Dictionary File 

- Gets the document frequency for each term in the collection

In [108]:
# Maps each term to its document frequency.
documentFrequency = {}

def docFrequency(docSet):
    """Add one to the counter of every term yielded by ``docSet``.

    Counters live in the module-level ``documentFrequency`` dict. For the
    result to be a true document frequency, the caller has to pass each
    document's terms de-duplicated, because every yielded item is counted.
    """
    for term in docSet:
        documentFrequency[term] = documentFrequency.get(term, 0) + 1

Context Window
- How do we want to deal with the context windows?
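- We only need one context window per term per document (the code uses the term's first occurrence)
- Slice the doc tokens to get up to 5 to the left and 5 to the right of the term
- If the word is at or near the start/end of the doc, the window is simply cut short at the document boundary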

In [109]:
def getWindow(size, index, doc):
    """Return the slice of ``doc`` surrounding position ``index``.

    The window holds up to ``size`` items on each side of ``doc[index]`` and
    is cut short at either end of ``doc`` instead of wrapping around.
    """
    start = max(index - size, 0)  # never let the left edge go negative
    stop = index + size + 1
    return doc[start:stop]

Posting List 

- Updates postingListDict {word: {id: [term freq, postings (positions), context window]}}
- len(indexes) = term frequency
- #https://btechgeeks.com/python-how-to-find-all-indexes-of-an-item-in-a-list/#Using_Numpy

In [110]:
# term -> {document id (as str) -> [term frequency, positions, context window]}
postingListDict = {}

def postingsList(set, id, doc):
    """Record the postings of one document in ``postingListDict``.

    For every term in ``set`` that occurs in ``doc`` the entry
    ``postingListDict[term][str(id)]`` is set to
    ``[term frequency, list of positions, context string]``, where the context
    is the ``getWindow`` slice of 5 tokens either side of the first occurrence,
    joined with spaces.

    Args:
        set: Iterable of terms to index (presumably the distinct tokens of
            ``doc``; terms that do not occur in ``doc`` are skipped).
        id: Document identifier; stored as ``str(id)``.
        doc: Ordered list of the document's tokens.
    """
    # One pass over the document collects every term's positions, instead of
    # rebuilding a NumPy array and rescanning the whole document per term.
    # Positions are plain ints, so they serialise to JSON without help.
    positions = {}
    for position, token in enumerate(doc):
        positions.setdefault(token, []).append(position)

    doc_key = str(id)
    for word in set:
        indexes = positions.get(word)
        if not indexes:
            continue  # term absent from this document: nothing to record
        context = " ".join(getWindow(5, indexes[0], doc))
        postingListDict.setdefault(word, {})[doc_key] = [len(indexes), indexes, context]


Process HTML 

- Deals with HTML within doc['contents']
- Strips HTML code of all tags
- contents_pretified = contents.prettify()
- Cleans the content tokens with porterStemming and returns them together with the id and title

In [30]:
# document id -> title, filled in as documents are processed
docTitleDict = {}

def processContents(line):
    """Parse one JSON-lines record and return its cleaned tokens.

    The record is read for the keys ``id``, ``title`` and ``contents``;
    ``contents`` is treated as HTML. As a side effect the title is stored in
    the module-level ``docTitleDict`` under the document id.

    Args:
        line: One line of the corpus file (str or bytes accepted by
            ``json.loads``).

    Returns:
        ``[id, title, tokens]`` where ``tokens`` is the list of tokens that
        survive ``porterStemming`` / ``removeNone``.
    """
    data = json.loads(line)  # Load JSON
    doc_id = data['id']
    title = data['title']
    docTitleDict[doc_id] = title

    contents = bs(data['contents'], "html.parser")  # Deal with HTML content

    # Joining the stripped strings removes the HTML tags. split() with no
    # argument then breaks on *any* run of whitespace: text nodes can still
    # contain newlines or tabs internally, and splitting on a single space
    # would leave e.g. "video\ngames" as one token that the letter-only
    # cleanup glues into the bogus word "videogames".
    raw_tokens = " ".join(contents.stripped_strings).split()

    tokens = list(filter(removeNone, map(porterStemming, raw_tokens)))

    return [doc_id, title, tokens]

Main

- Run through gzip file (212651 files in gzip)
- Start updating dictionary file and posting list

In [39]:

from http.client import CONTINUE


# Machine-specific corpus locations; the second assignment overrides the first.
path = r"C:\Users\jkyle\Desktop\CPS842\trec_corpus_5000.jsonl.gz"
path = r"C:\Users\deand\OneDrive\Documents\CPS 842\A1\trec_corpus_5000.jsonl.gz"

count = 0  # temporary: number of documents read so far

with gzip.open(path,'rb') as file:
    for wiki in file:
        count += 1

        # Clean the document, then run the cleaned text through spaCy.
        contents = processContents(wiki)
        doc = nlp(" ".join(contents[2]))
        vocab = {}
        print("\n",contents[1],"~~~~~~~~~~~~~~~~~~~~~~~~~~~")

        # Nouns and proper nouns, as [text, tag] pairs.
        tags = [[token.text, token.tag_] for token in doc if token.tag_ in ("NNP", "NN")]
        print(tags)

        # Count the named entities, ignoring plain numbers and dates.
        for ent in doc.ents:
            if ent.label_ in ("CARDINAL", "DATE"):
                continue
            vocab[ent.text] = vocab.get(ent.text, 0) + 1
            print(ent.text, ent.start_char, ent.end_char, ent.label_)
        print("--->","\n",vocab,"\n","~~~~~~~~~~~~~~~~~~~~~~~~~~~")

        # Stop after the first ten documents.
        if count == 10:
            break

        # if count % 1000 == 0:
        #     print(count)



 James Bond in video games ~~~~~~~~~~~~~~~~~~~~~~~~~~~
james bond 0 10 PERSON
cpc 336 339 ORG
apple 340 345 ORG
macintosh msx 346 359 PERSON
bbc micro 390 399 ORG
amiga atari dos 400 415 PERSON
ian 815 818 NORP
british 837 844 NORP
james bond 851 861 PERSON
james bond 966 976 PERSON
james bond 1306 1316 PERSON
james bond 1390 1400 PERSON
world gbc 1457 1466 EVENT
russia 1603 1609 GPE
ian 1981 1984 NORP
james bond 2468 2478 PERSON
oddjob 2577 2583 GPE
eurocom 3038 3045 ORG
eurocom 3685 3692 ORG
john cleese 4119 4130 PERSON
roger moore 4220 4231 PERSON
russia 4715 4721 GPE
james bond 5358 5368 PERSON
quantum 5402 5409 ORG
eurocom 5501 5508 ORG
eurocom 5711 5718 ORG
daniel craig 5812 5824 PERSON
fiftieth 6475 6483 ORDINAL
eurocom 6860 6867 ORG
kevin bruner 7235 7247 PERSON
james bond 7326 7336 PERSON
s 7445 7446 ORG
james bond 7958 7968 PERSON
james bond 8211 8221 PERSON
microsoft 8649 8658 ORG
james bond 8914 8924 PERSON
desmond llewelyn 9178 9194 PERSON
japanese 9762 9770 NORP
sotoshi 

: 

Sort And Save Index Files

In [113]:
# Rebuild each index as an OrderedDict whose keys are in ascending order.
postingListDict = OrderedDict((term, postingListDict[term]) for term in sorted(postingListDict))
documentFrequency = OrderedDict((term, documentFrequency[term]) for term in sorted(documentFrequency))
docTitleDict = OrderedDict((doc_id, docTitleDict[doc_id]) for doc_id in sorted(docTitleDict))

# Write one JSON file per index. Only the posting list is written with the
# custom encoder; the other two use the default one (cls=None).
for out_name, payload, encoder in (
    ('postingListDict.json', postingListDict, int64_encoder),
    ('documentFrequency.json', documentFrequency, None),
    ('docTitleDict.json', docTitleDict, None),
):
    with open(out_name, 'w') as f:
        json.dump(payload, f, cls=encoder)


# Query

Import / Load

In [114]:
import time
# NOTE(review): this rebinds the name `porterStemming`, which the indexing half
# of the notebook defines as the token-cleaning function. Re-running the
# indexing cells after this one will pick up the stemmer object instead and
# will likely fail until the function's cell is run again.
porterStemming = porterAlgo.PorterStemmer()

# Load the three index files back in as plain dicts.
with open("postingListDict.json", "r") as index_file:
    postings = json.load(index_file)

with open("documentFrequency.json", "r") as index_file:
    freqs = json.load(index_file)

with open("docTitleDict.json", "r") as index_file:
    titles = json.load(index_file)


Search and display results

In [115]:
def search(base_query):
    """Look up a one-word query in the loaded index and print the matches.

    The query is stemmed with the module-level ``porterStemming`` stemmer and
    lowercased, then looked up in ``freqs`` (document frequency) and
    ``postings`` (per-document entries). Matches are printed in descending
    order of term frequency, with titles taken from ``titles``.

    Args:
        base_query: The query exactly as typed by the user.

    Returns:
        None. Results are written to stdout.
    """
    # Parse Query. Surrounding whitespace (a trailing space or newline) is
    # removed *before* stemming: left in place it becomes the last character
    # the stemmer sees, which blocks suffix stripping and the index lookup.
    term = base_query.strip()
    if term:
        query = porterStemming.stem(term, 0, len(term) - 1).lower().replace("\n", "")
    else:
        query = ""

    # Doc Frequency
    doc_freq = freqs.get(query, 0)

    # Get docs where query term is found. Returns an empty dict if not
    docs = postings.get(query, {})

    # Get doc information to return to user:
    # [id, title, term frequency, postings, context summary]
    results = [
        [doc_id, titles[doc_id], entry[0], entry[1], entry[2]]
        for doc_id, entry in docs.items()
    ]

    # Sort results based on term frequency (highest first)
    results.sort(key=lambda result: result[2], reverse=True)

    # Display results
    print("=======================================")
    print(f"Showing results for query: {base_query}")
    print(f"Total Hits (Doc Frequency): {doc_freq}")
    for rank, (doc_id, title, term_freq, positions, summary) in enumerate(results, start=1):
        print("=======================================")
        print(f"Result {rank}/{doc_freq}:")
        print("=======================================")
        print(f"id: {doc_id}")
        print(f"title: {title}")
        print(f"term frequency in doc: {term_freq}")
        print(f"postings: {positions}")
        print(f"summary: {summary}")
    print("=======================================")


Main

In [116]:
# Interactive query loop: keep answering one-word queries until the user
# types the sentinel "ZZEND", timing each search.
query = input("Enter a one word query!")
query_count = 0
total_times = 0
while query != "ZZEND":
  query_count += 1
  start_time = time.time()
  search(query)
  exec_time = time.time() - start_time
  total_times += exec_time
  print(f"The query took {exec_time} seconds.\n\n\n")
  query = input("Enter a one word query!")

print("=======================================")
print("ENDING")
print("=======================================")
# Guard the average: if the very first input is the sentinel, no query was
# timed and dividing by query_count would raise ZeroDivisionError.
if query_count:
  print(f"Average query time: {total_times/query_count}")
else:
  print("Average query time: n/a (no queries were run)")
print("=======================================")


Showing results for query: ok
Total Hits (Doc Frequency): 0
The query took 5.602836608886719e-05 seconds.



Showing results for query: bonds
Total Hits (Doc Frequency): 46
Result 1/46:
id: 1512303
title: James Bond in video games
term frequency in doc: 107
postings: [1, 11, 110, 122, 138, 155, 162, 167, 186, 190, 203, 216, 221, 225, 231, 241, 254, 283, 287, 317, 337, 369, 382, 484, 521, 599, 632, 678, 743, 760, 822, 856, 917, 993, 1026, 1033, 1094, 1119, 1132, 1145, 1158, 1249, 1293, 1311, 1322, 1359, 1373, 1429, 1437, 1447, 1461, 1485, 1535, 1594, 1669, 1829, 1850, 1864, 1872, 1877, 1896, 1902, 1907, 1922, 1929, 1932, 1959, 1964, 1985, 1990, 1996, 2002, 2007, 2013, 2037, 2048, 2092, 2101, 2138, 2152, 2193, 2230, 2244, 2259, 2356, 2388, 2401, 2422, 2450, 2459, 2467, 2527, 2537, 2549, 2554, 2557, 2563, 2582, 2586, 2591, 2599, 2647, 2655, 2675, 2679, 2686, 2709]
summary: jame bond video game video game franchis
Result 2/46:
id: 159370
title: Rosamund Pike
term frequency in doc: 11
posti