In [None]:
# Mount Google Drive at /content/drive so the corpus folders used below are readable.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Inverted Index** - A mapping from terms (words) to the list of document IDs containing those terms.

# **Question 01** - **Part A**

In [None]:
import os
def build_inverted_index(folder_path):
    """Build an inverted index (term -> list of document IDs) from a folder of text files.

    Args:
        folder_path: Directory to index. Every regular file directly inside it
            is read as UTF-8 text; sub-directories are skipped.

    Returns:
        dict mapping each whitespace-delimited token (case-sensitive, punctuation
        kept) to the list of document IDs that contain it. A document ID is the
        part of the filename before its first '.'. Files are visited in sorted
        filename order, so the result is deterministic; each ID appears at most
        once per term.

    Raises:
        OSError: if the folder or one of its files cannot be read.
        UnicodeDecodeError: if a file is not valid UTF-8.
    """
    inverted_index = {}  # term -> list of document IDs (the posting list)
    seen_docs = {}  # term -> set of document IDs, for O(1) duplicate checks

    # os.listdir returns entries in arbitrary order; sorting keeps output reproducible.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            # e.g. a '.ipynb_checkpoints' directory, which open() cannot read
            continue
        doc_id = filename.split('.')[0]  # filename up to the first '.'

        # 'utf-8-sig' drops a leading byte-order mark, which would otherwise be
        # glued to the first token ('\ufeffword'); it reads BOM-less files unchanged.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            words = file.read().split()

        for word in words:
            docs_for_word = seen_docs.setdefault(word, set())
            if doc_id not in docs_for_word:
                docs_for_word.add(doc_id)
                inverted_index.setdefault(word, []).append(doc_id)

    return inverted_index

In [None]:
# NOTE: `folder_path` is reassigned further down for the positional-index corpus,
# so re-run this cell before any later cell that lists the inverted-index folder.
folder_path = '/content/drive/MyDrive/IRWA/inverted' # Path to folder containing text files
inverted_index = build_inverted_index(folder_path) # Build inverted index: word → list of document IDs

In [None]:
inverted_index # Display the inverted index (a cell's last expression is rendered as its output)

{'new': ['Doc3', 'Doc4', 'Doc2'],
 'approach': ['Doc3'],
 'for': ['Doc3', 'Doc4', 'Doc1'],
 'treatment': ['Doc3'],
 'of': ['Doc3'],
 'schizophrenia': ['Doc3', 'Doc4', 'Doc1', 'Doc2'],
 'hopes': ['Doc4'],
 'patients': ['Doc4'],
 'breakthrough': ['Doc1'],
 'drug': ['Doc1', 'Doc2']}

# **Part B** - **I**: `schizophrenia AND drug`

In [None]:
def AND_op(list1, list2):
    """Boolean AND: return the set of items present in both posting lists."""
    return set(list1) & set(list2)

In [None]:
# Fetch the posting lists by direct dictionary lookup instead of scanning every key.
# A term that is absent from the index yields an empty posting list, so the names
# are always defined and a stale value from an earlier run is never reused.
List1 = inverted_index.get('schizophrenia', [])  # documents containing 'schizophrenia'
List2 = inverted_index.get('drug', [])  # documents containing 'drug'

In [None]:
# Query: schizophrenia AND drug. The result is a set, so its print order is not fixed.
print(AND_op(List1,List2))

{'Doc1', 'Doc2'}


# **II**: `for AND NOT (drug OR approach)`

In [None]:
def OR_op(list1, list2):
    """Boolean OR: return the set of items present in either posting list."""
    return set(list1) | set(list2)

In [None]:
def NOT_op(a, b):
    """Boolean NOT relative to a universe: return the set of items of `b` missing from `a`.

    Note the argument order: `a` is the collection to exclude and `b` is the
    collection to keep from (typically the full list of document IDs).
    """
    excluded = set(a)
    return {item for item in b if item not in excluded}

In [None]:
# Universe of document IDs, used as the reference set for NOT queries.
# Each ID is the filename up to its first '.', matching how the index names documents.
# Only regular files count: a sub-directory (e.g. '.ipynb_checkpoints') would otherwise
# add a phantom ID that no index entry can ever match. Sorting makes the listing
# independent of the arbitrary order returned by os.listdir.
fileList = sorted(
    name.split(".")[0]
    for name in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, name))
)
fileList

['Doc3', 'Doc4', 'Doc1', 'Doc2']

In [None]:
# Fetch the posting lists by direct dictionary lookup instead of scanning every key.
# A term that is absent from the index yields an empty posting list, so the names
# are always defined and a stale value from an earlier run is never reused.
List3 = inverted_index.get('drug', [])  # documents containing 'drug'
List4 = inverted_index.get('approach', [])  # documents containing 'approach'
List5 = inverted_index.get('for', [])  # documents containing 'for'

In [None]:
# Query: for AND NOT (drug OR approach), evaluated inside-out.
List6 = OR_op(List3, List4) # Documents containing 'drug' OR 'approach' (union)
List7 = NOT_op(List6,fileList) # Complement against the full document list: documents with neither 'drug' nor 'approach'
List8 = AND_op(List5,List7) # Documents that contain 'for' AND contain neither 'drug' nor 'approach'
print(List8) # Print the final set of document IDs

{'Doc4'}


# **Positional Index** -  An inverted index that also stores the positions of each term in each document.

In [None]:
import os
def build_positional_index(folder_path):
    """Build a positional index (term -> {doc_id: [positions]}) from a folder of text files.

    Tokens are the raw whitespace-delimited words of each file: case and
    punctuation are preserved.

    Args:
        folder_path: Directory to index. Every regular file directly inside it
            is read as UTF-8 text; sub-directories are skipped.

    Returns:
        dict mapping each token to a dict of document ID -> list of 1-based
        positions at which the token occurs in that document, in ascending order.
        A document ID is the part of the filename before its first '.'. Files are
        visited in sorted filename order, so the result is deterministic.

    Raises:
        OSError: if the folder or one of its files cannot be read.
        UnicodeDecodeError: if a file is not valid UTF-8.
    """
    positional_index = {}  # word -> {doc_id: [positions]}

    # os.listdir returns entries in arbitrary order; sorting keeps output reproducible.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            # e.g. a '.ipynb_checkpoints' directory, which open() cannot read
            continue
        doc_id = filename.split('.')[0]  # filename up to the first '.'

        # 'utf-8-sig' drops a leading byte-order mark; with plain 'utf-8' the first
        # token of a BOM-prefixed file is indexed as '\ufeffword' instead of 'word'.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            words = file.read().split()

        for position, word in enumerate(words, start=1):  # positions are 1-based
            positional_index.setdefault(word, {}).setdefault(doc_id, []).append(position)

    return positional_index

In [None]:
# Point `folder_path` at the positional-index corpus (this replaces the earlier
# inverted-index path) and build the index: word -> {doc_id: [positions]}.
folder_path = '/content/drive/MyDrive/IRWA/positional'
positional_index = build_positional_index(folder_path)

In [None]:
# Display the positional index built from the raw (case- and punctuation-preserving) tokens.
positional_index

{'\ufeffRemarks': {'doc_1': [1], 'doc_3': [1]},
 'circus': {'doc_1': [2]},
 'Announcing': {'doc_1': [3]},
 'Candidacy': {'doc_1': [4]},
 'for': {'doc_1': [5],
  'doc_3': [13, 26, 30, 51, 201, 254],
  'doc_2': [93, 138, 142, 260, 262, 265]},
 'President': {'doc_1': [6], 'doc_3': [219, 302]},
 'in': {'doc_1': [7, 42, 118, 148, 201],
  'doc_3': [10, 295, 305, 308, 355],
  'doc_2': [7, 106, 126, 216, 242, 294, 300]},
 'New': {'doc_1': [8, 46], 'doc_2': [48]},
 'York': {'doc_1': [9], 'doc_2': [49]},
 'City': {'doc_1': [10], 'doc_2': [50]},
 'Trump:': {'doc_1': [11]},
 'Wow.': {'doc_1': [12]},
 'Whoa.': {'doc_1': [13]},
 'That': {'doc_1': [14], 'doc_3': [124]},
 'is': {'doc_1': [15, 57, 117],
  'doc_3': [69],
  'doc_2': [229, 258, 274, 328, 333, 348]},
 'some': {'doc_1': [16, 70]},
 'group': {'doc_1': [17]},
 'of': {'doc_1': [18, 71],
  'doc_3': [76, 114, 129, 145, 218, 311, 347],
  'doc_2': [28, 60, 98, 103, 115, 128, 199, 222, 268, 283]},
 'people.': {'doc_1': [19]},
 'Thousands.So': {'doc

In [None]:
import os
import string

def build_positional_index(folder_path):
    """Build a normalized positional index (term -> {doc_id: [positions]}).

    This definition replaces the earlier `build_positional_index` in this
    notebook; it differs by normalizing the text before tokenizing: the content
    is lowercased and every character in `string.punctuation` is deleted.
    Punctuation is deleted rather than replaced by a space, so words joined only
    by punctuation (e.g. 'Thousands.So') become a single token.

    Args:
        folder_path: Directory to index. Every regular file directly inside it
            is read as UTF-8 text; sub-directories are skipped.

    Returns:
        dict mapping each normalized token to a dict of document ID -> list of
        1-based positions (counted over the normalized token stream), in
        ascending order. A document ID is the part of the filename before its
        first '.'. Files are visited in sorted filename order, so the result is
        deterministic.

    Raises:
        OSError: if the folder or one of its files cannot be read.
        UnicodeDecodeError: if a file is not valid UTF-8.
    """
    positional_index = {}  # word -> {doc_id: [positions]}

    # The translation table is the same for every file, so build it once.
    translator = str.maketrans('', '', string.punctuation)

    # os.listdir returns entries in arbitrary order; sorting keeps output reproducible.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            # e.g. a '.ipynb_checkpoints' directory, which open() cannot read
            continue
        doc_id = filename.split('.')[0]  # document ID: filename up to the first '.'

        # 'utf-8-sig' drops a leading byte-order mark. '\ufeff' is not in
        # string.punctuation, so with plain 'utf-8' it would stay attached to the
        # first token of the file.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            content = file.read()

        # Lowercase and remove punctuation, then split on whitespace
        words = content.lower().translate(translator).split()

        for position, word in enumerate(words, start=1):  # positions are 1-based
            positional_index.setdefault(word, {}).setdefault(doc_id, []).append(position)

    return positional_index

In [None]:
# Rebuild the positional index with the normalizing `build_positional_index`
# defined in the preceding cell (lowercased, punctuation removed).
folder_path = '/content/drive/MyDrive/IRWA/positional'
positional_index = build_positional_index(folder_path)

In [None]:
# Display the rebuilt positional index.
positional_index

{'Remarks': {'doc_1': [1], 'doc_2': [1], 'doc_3': [1]},
 'circus': {'doc_1': [2]},
 'Announcing': {'doc_1': [3]},
 'Candidacy': {'doc_1': [4]},
 'for': {'doc_1': [5],
  'doc_2': [93, 138, 142, 260, 262, 265],
  'doc_3': [13, 26, 30, 51, 201, 254]},
 'President': {'doc_1': [6], 'doc_3': [219, 302]},
 'in': {'doc_1': [7, 42, 118, 148, 201],
  'doc_2': [7, 106, 126, 216, 242, 294, 300],
  'doc_3': [10, 295, 305, 308, 355]},
 'New': {'doc_1': [8, 46], 'doc_2': [48]},
 'York': {'doc_1': [9], 'doc_2': [49]},
 'City': {'doc_1': [10], 'doc_2': [50]},
 'Trump:': {'doc_1': [11]},
 'Wow.': {'doc_1': [12]},
 'Whoa.': {'doc_1': [13]},
 'That': {'doc_1': [14], 'doc_3': [124]},
 'is': {'doc_1': [15, 57, 117],
  'doc_2': [229, 258, 274, 328, 333, 348],
  'doc_3': [69]},
 'some': {'doc_1': [16, 70]},
 'group': {'doc_1': [17]},
 'of': {'doc_1': [18, 71],
  'doc_2': [28, 60, 98, 103, 115, 128, 199, 222, 268, 283],
  'doc_3': [76, 114, 129, 145, 218, 311, 347]},
 'people.': {'doc_1': [19]},
 'Thousands.So