In [2]:
# Import necessary libraries
import pandas as pd
import json
import string 

# Load the CSV dataset
df = pd.read_csv("semi_strut.csv")

# Display the first few rows of the dataset
df.head()

Unnamed: 0,Document ID,Content
0,1,"{\n ""title"": ""Introduction to Python"",\n ""..."
1,2,"{\n ""title"": ""Data Analysis with Pandas"",\n ..."
2,3,"{\n ""title"": ""Web Development with Flask"",\n..."
3,4,"{\n ""title"": ""Machine Learning with Scikit-L..."
4,5,"{\n ""title"": ""Data Visualization with Matplo..."


In [4]:
# Tokenization function to extract terms from the JSON-like content
# Remember to exact both 
# 1 .Extract terms from various fields (title, author)
def tokenize_content(content):
    content_dict = json.loads(content)
    terms = []
    
    # Extract terms from various fields (title, author)
    terms.extend(content_dict.get("title", "").split())
    terms.extend(content_dict.get("author", "").split())
    terms.extend(content_dict.get('keywords', []))

    #  Extract terms from sections' titles and content
    sections = content_dict.get('sections', [])
    for section in sections:
        terms.extend(content_dict.get('title', '').split())
        terms.extend(content_dict.get('content', '').split())

    return terms
tokenize_content(df["Content"][0])
# 2. apply to all row in panda df , by create new column "Terms"
df["Terms"] = df['Content'].apply(tokenize_content)

# Display the DataFrame with the extracted terms
print(df[['Document ID', 'Terms']])


   Document ID                                              Terms
0            1  [Introduction, to, Python, John, Doe, Python, ...
1            2  [Data, Analysis, with, Pandas, Jane, Smith, Py...
2            3  [Web, Development, with, Flask, Mike, Johnson,...
3            4  [Machine, Learning, with, Scikit-Learn, Emily,...
4            5  [Data, Visualization, with, Matplotlib, Robert...


In [7]:
# 4. Implement a preprocessing function that converts terms to lowercase, removes punctuation, and removes common stop words.
    # Create another new column "Terms_preprocessed"
def preprocess_terms(terms):
    # Define a set of common stop words
    stop_words = set([
        "a", "an", "the", "and", "is", "in", "it", "to", "of", "for", "on", "with", "as"
    ])
    
    # Remove punctuation and convert to lowercase
    terms = [term.lower().strip(string.punctuation) for term in terms]
    
    # Remove stop words
    terms = [term for term in terms if term not in stop_words]
    
    return terms

df['Terms_prep'] = df['Terms'].apply(preprocess_terms)
df

Unnamed: 0,Document ID,Content,Terms,Terms_preprocessed,Terms_prep
0,1,"{\n ""title"": ""Introduction to Python"",\n ""...","[Introduction, to, Python, John, Doe, Python, ...","[introduction, python, john, doe, python, prog...","[introduction, python, john, doe, python, prog..."
1,2,"{\n ""title"": ""Data Analysis with Pandas"",\n ...","[Data, Analysis, with, Pandas, Jane, Smith, Py...","[data, analysis, pandas, jane, smith, python, ...","[data, analysis, pandas, jane, smith, python, ..."
2,3,"{\n ""title"": ""Web Development with Flask"",\n...","[Web, Development, with, Flask, Mike, Johnson,...","[web, development, flask, mike, johnson, pytho...","[web, development, flask, mike, johnson, pytho..."
3,4,"{\n ""title"": ""Machine Learning with Scikit-L...","[Machine, Learning, with, Scikit-Learn, Emily,...","[machine, learning, scikit-learn, emily, davis...","[machine, learning, scikit-learn, emily, davis..."
4,5,"{\n ""title"": ""Data Visualization with Matplo...","[Data, Visualization, with, Matplotlib, Robert...","[data, visualization, matplotlib, robert, clar...","[data, visualization, matplotlib, robert, clar..."


In [8]:
# Initialize an empty inverted index dictionary
# Build the inverted index
# Initialize an empty inverted index dictionary
inverted_index = {}

# Build the inverted index
for index, row in df.iterrows():
    document_id = row["Document ID"]
    terms = row["Terms_prep"]
    
    # Update the inverted index with terms and document IDs
    for term in terms:
        if term not in inverted_index:
            inverted_index[term] = set()
        inverted_index[term].add(document_id)

# Display the inverted index
inverted_index

{'introduction': {1},
 'python': {1, 2, 3, 4, 5},
 'john': {1},
 'doe': {1},
 'programming': {1},
 'beginner': {1},
 'data': {2, 5},
 'analysis': {2},
 'pandas': {2},
 'jane': {2},
 'smith': {2},
 'data analysis': {2},
 'web': {3},
 'development': {3},
 'flask': {3},
 'mike': {3},
 'johnson': {3},
 'web development': {3},
 'machine': {4},
 'learning': {4},
 'scikit-learn': {4},
 'emily': {4},
 'davis': {4},
 'machine learning': {4},
 'visualization': {5},
 'matplotlib': {5},
 'robert': {5},
 'clark': {5},
 'data visualization': {5}}

In [9]:
# perform boolean operations on postings lists for Boolean search operations
# 1. "Python" OR "Pandas"
def or_postings(posting1, posting2):
    p1 = 0
    p2 = 0
    result = list()
    while p1 < len(posting1) and p2 < len(posting2):
        if posting1[p1] == posting2[p2]:
            result.append(posting1[p1])
            p1 += 1
            p2 += 1
        elif posting1[p1] > posting2[p2]:
            result.append(posting2[p2])
            p2 += 1
        else:
            result.append(posting1[p1])
            p1 += 1
    while p1 < len(posting1):
        result.append(posting1[p1])
        p1 += 1
    while p2 < len(posting2):
        result.append(posting2[p2])
        p2 += 1
    return result
# 2. "Python" AND "data"
def and_postings(posting1, posting2):
    p1 = 0
    p2 = 0
    result = list()
    while p1 < len(posting1) and p2 < len(posting2):
        if posting1[p1] == posting2[p2]:
            result.append(posting1[p1])
            p1 += 1
            p2 += 1
        elif posting1[p1] > posting2[p2]:
            p2 += 1
        else:
            p1 += 1
    return result

# display 
pl_1 = list(inverted_index['python'])
pl_2 = list(inverted_index['pandas'])
print(or_postings(pl_1, pl_2))
print(and_postings(pl_1, pl_2))

[1, 2, 3, 4, 5]
[2]
