## Task 1

In [4]:
import os, glob
import re
import string
from stemming.porter2 import stem



In [79]:
def process_line(line:str):
    term_frequency = {}
    line = line.strip()
    
     # remove p tags
    line = line.replace("<p>","").replace("</p>", "")
    
    # remove digits and punctuations
    line = line.translate(str.maketrans("", "", string.digits))
    line = line.translate(str.maketrans(string.punctuation, " "* len(string.punctuation)))

    # remove extra white spaces
    line = re.sub(r"\s+", " ", line)
    for term in line.split():
        term = stem(term.lower())
        if len(term) > 2 and (term not in stop_words):
            if term in term_frequency:
                term_frequency[term] += 1
            else:
                term_frequency[term] = 1
    return term_frequency

def index_docs(folder_path:str, stop_words: list[str]):
    Index = {}
    for file_path in glob.glob(f"{folder_path}/*.xml"):
        word_count = 0
        doc_id = ""
        term_frequency = {}
        with open(file_path, "r") as file:
            text_start = False
            for line in file:
                # remove \n and spaces
                line = line.strip()

                # obtain itemid
                if not doc_id:
                    if line.startswith("<newsitem "):
                        for part in line.split():
                            if part.startswith("itemid="):
                                doc_id = part.split("=")[1].strip("\"")

                # look for the content of <text></text>
                if line.startswith("<text>"):
                    text_start = True
                    continue
                elif line.startswith("</text>"):
                    text_start = False
                    break

                if text_start:
                    # remove p tags
                    line = line.replace("<p>","").replace("</p>", "")
                    
                    # remove digits and punctuations
                    line = line.translate(str.maketrans("", "", string.digits))
                    line = line.translate(str.maketrans(string.punctuation, " "* len(string.punctuation)))

                    # remove extra white spaces
                    line = re.sub(r"\s+", " ", line)
                    for term in line.split():
                        word_count += 1
                        term = term.lower()
                        term = stem(term)
                        if len(term) > 2 and (term not in stop_words):
                            if term in Index:
                                if doc_id in Index[term]:
                                    Index[term][doc_id] += 1
                                else:
                                    Index[term][doc_id] = 1
                            
                            else:
                                Index[term] = {str(doc_id): 1}


    return Index


In [18]:
print(os.getcwd())
stopwords_f = open('../week4/prac/common-english-words.txt', 'r') # wk3
stop_words = stopwords_f.read().split(',')

/workspaces/cab420-workspace/work/search-engine-technology/week6


In [19]:

Index = index_docs("data", stop_words)

In [67]:
def doc_at_a_time(I:dict, Q:dict):
    """
    @param I: { term: {doc_id:freq} }
    @param Q: {term: num_occurrences}
    """
    L = {} # Inverted List {query_term: {doc_id:num_occurrences} }
    R = {} # {doc_id : relevance}

    # Obtain the inverted list based on the Query
    for (term, freq) in I.items():
        if term in Q:
            L[term] = freq
        for doc_id in freq.keys():
            R[doc_id] = 0

    for doc_id in R:
        for term, doc_freq in L.items():
            # if the query term exist in the doc
            if doc_id in doc_freq:
                # the number of times that the term occurs in this doc
                num_occurrences_in_doc = doc_freq[doc_id]
                R[doc_id] += num_occurrences_in_doc * Q[term]
    return R
            


        

    


In [68]:
Query = {'formula':1, 'one':1}
Index = index_docs("data", stop_words)
print(doc_at_a_time(Index, Query))

{'809495': 1, '741299': 2, '809481': 0, '6146': 0}


In [69]:
def term_at_a_time(I, Q):
    """
    @param I: { term: {doc_id:freq} }
    @param Q: {term: num_occurrences}
    """
    L = {} # Inverted List {query_term: {doc_id:num_occurrences} }
    R = {} # {doc_id : relevance}

    # Obtain the inverted list based on the Query
    for (term, freq) in I.items():
        if term in Q:
            L[term] = freq
        for doc_id in freq.keys():
            R[doc_id] = 0


    for term in L:
        for doc_id in R:
            # if the term exist in the doc
            doc_freq = L[term]
            if doc_id in doc_freq:
                # num of times that the term occurs in this doc
                num_occurrences_in_doc = doc_freq[doc_id]
                R[doc_id] += num_occurrences_in_doc * Q[term]

    return R


## Task 4
Main function

In [89]:
Query = process_line("war")
Index = index_docs("data", stop_words)

result1 = doc_at_a_time(Index, Query)
result2 = term_at_a_time(Index, Query)

x1 = sorted(result1.items(), key=lambda x: x[1],reverse=True)
x2 = sorted(result2.items(), key=lambda x: x[1],reverse=True)
print(x1)
print(x2)


[('809495', 1), ('741299', 0), ('809481', 0), ('6146', 0)]
[('809495', 1), ('741299', 0), ('809481', 0), ('6146', 0)]


In [78]:
Index

{'pro': {'809495': 4},
 'iranian': {'809495': 1},
 'hizbollah': {'809495': 7},
 'fighter': {'809495': 1},
 'lebanon': {'809495': 11},
 'rain': {'809495': 1, '741299': 1},
 'score': {'809495': 1},
 'katyusha': {'809495': 4},
 'rocket': {'809495': 5},
 'northern': {'809495': 3},
 'israel': {'809495': 13},
 'tuesday': {'809495': 3, '809481': 1},
 'prompt': {'809495': 1},
 'threat': {'809495': 1},
 'prime': {'809495': 1},
 'minist': {'809495': 2, '6146': 1},
 'benjamin': {'809495': 1},
 'netanyahu': {'809495': 5},
 'tough': {'809495': 2},
 'isra': {'809495': 9},
 'respons': {'809495': 3},
 'quot': {'809495': 13, '741299': 4, '6146': 2},
 'quiet': {'809495': 3},
 'side': {'809495': 4},
 'border': {'809495': 2},
 'told': {'809495': 2},
 'report': {'809495': 2},
 'stand': {'809495': 2, '741299': 1},
 'damag': {'809495': 1},
 'hous': {'809495': 1},
 'dure': {'809495': 1, '741299': 1, '6146': 1},
 'tour': {'809495': 2},
 'town': {'809495': 3},
 'kiryat': {'809495': 1},
 'shmona': {'809495': 1},

{'hello': 1,
 'mate': 1,
 'nice': 1,
 'meet': 1,
 'whi': 1,
 'compani': 1,
 'doesnot': 1,
 'pay': 1,
 'more': 1,
 'salari': 1}