In [1]:
from google.colab import drive

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): no later cell visible in this notebook reads from the mount —
# confirm whether this cell (and the import above it) is still needed.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Toy corpus: eight short English sentences, one per variable.
# NOTE(review): the same sentences are declared again in the `documents` dict of the
# next cell, and that dict is what the pipeline consumes; these eight variables are
# not referenced again in the visible notebook. doc7 here has two spaces after "GPT",
# while the dict entry has one.
doc1 = 'The global average jet fuel price last week fell 1.5% compared to the week before'
doc2 = 'Deep learning is a subset of machine learning'
doc3 = 'Artificial intelligence and machine learning are related fields'
doc4 = 'Neural networks are used in deep learning'
doc5 = 'Support vector machines are a type of machine learning algorithm'
doc6 = 'Nearly 100 years machine learning has been used in human life'
doc7 = 'Chat GPT  is an artificial intelligence model based on Natural Language Processing'
doc8 = 'Aviation fuels are petroleum-based fuels, or petroleum and synthetic fuel blends'

In [6]:
import re
import numpy as np
import pandas as pd
from collections import Counter
import math

# Corpus: document id -> raw sentence (insertion order doc1..doc8 is the row order used later)
documents = dict(
    doc1='The global average jet fuel price last week fell 1.5% compared to the week before',
    doc2='Deep learning is a subset of machine learning',
    doc3='Artificial intelligence and machine learning are related fields',
    doc4='Neural networks are used in deep learning',
    doc5='Support vector machines are a type of machine learning algorithm',
    doc6='Nearly 100 years machine learning has been used in human life',
    doc7='Chat GPT is an artificial intelligence model based on Natural Language Processing',
    doc8='Aviation fuels are petroleum-based fuels, or petroleum and synthetic fuel blends',
)

# Small hand-picked English stopword set; tokens found here are dropped during preprocessing
stopwords = set([
    'the', 'a', 'an', 'and', 'in', 'on', 'at', 'to', 'for', 'of',
    'are', 'is', 'be', 'been', 'was', 'were',
    'that', 'this', 'those', 'these',
    'or', 'by', 'before', 'after', 'as', 'with', 'from',
])

# 1. Verbose text preprocessing: case folding -> tokenizing -> stopword removal
def preprocess_text_detailed(text):
    """Preprocess one piece of text and print the result of every stage.

    Stages:
      1. lowercase the text,
      2. extract maximal runs of ASCII letters/digits as tokens (so "1.5%" yields
         "1" and "5", and "petroleum-based" yields "petroleum" and "based"),
      3. drop tokens that appear in the module-level `stopwords` set.

    Parameters:
    text: Raw input string.

    Returns:
    List of remaining tokens, in their original order (duplicates kept).
    """
    print("Original text:", text)

    lowered = text.lower()
    print("After case folding:", lowered)

    words = re.findall(r'\b[a-zA-Z0-9]+\b', lowered)
    print("After tokenizing:", words)

    kept = []
    for word in words:
        if word in stopwords:
            continue
        kept.append(word)
    print("After stopword removal:", kept)
    print("-" * 80)

    return kept

# Run the verbose preprocessing over the whole corpus, keeping one token list per document
print("DETAILED PREPROCESSING STEPS FOR EACH DOCUMENT")
print("=" * 80)
preprocessed_docs = {}
for doc_id in documents:
    print(f"\nProcessing {doc_id}:")
    preprocessed_docs[doc_id] = preprocess_text_detailed(documents[doc_id])

# Vocabulary = sorted list of every distinct token that survived preprocessing
vocabulary = sorted(set().union(*preprocessed_docs.values()))

print("\nVOCABULARY")
print("=" * 80)
print(vocabulary)

# Raw term counts per document
print("\nWORD FREQUENCY IN EACH DOCUMENT")
print("=" * 80)
word_freq = {doc_id: Counter(tokens) for doc_id, tokens in preprocessed_docs.items()}
for doc_id, counts in word_freq.items():
    print(f"{doc_id}: {dict(counts)}")

# Document x term incidence matrix: 1 where the term occurs in the document, else 0
binary_matrix = pd.DataFrame(0, index=list(documents), columns=vocabulary)
for doc_id, counts in word_freq.items():
    for token in counts:  # Counter keys are the document's distinct tokens
        binary_matrix.loc[doc_id, token] = 1

print("\nBINARY MATRIX (WORD PRESENCE)")
print("=" * 80)
print(binary_matrix)

# Term Frequency (TF): occurrences of a term divided by the document's token count
tf_matrix = pd.DataFrame(0.0, index=list(documents), columns=vocabulary)
for doc_id, counter in word_freq.items():
    doc_length = len(preprocessed_docs[doc_id])  # tokens left after stopword removal
    for word, count in counter.items():
        tf_matrix.loc[doc_id, word] = count / doc_length

print("\nTERM FREQUENCY (TF) MATRIX")
print("=" * 80)
print(tf_matrix)

# Document Frequency (DF): number of documents containing each term
# (column sums of the incidence matrix)
df_values = binary_matrix.sum()

print("\nDOCUMENT FREQUENCY (DF)")
print("=" * 80)
print(df_values)

# Calculate Inverse Document Frequency (IDF): idf(t) = 1 + log10(N / df(t))
#
# Computed in one vectorized pass and kept as float64 throughout. The earlier
# approach of filling a data-less `pd.Series(index=vocabulary)` relied on that
# constructor's default dtype, which is version-dependent (pandas 1.x warns that
# the default will change from float64 to object).
num_docs = len(documents)
doc_freq = df_values.reindex(vocabulary).astype(float)
# Terms with df == 0 (shouldn't happen for a vocabulary built from the corpus) are
# masked to NaN so they cannot cause a divide-by-zero, then fall back to an IDF of 1.
idf_values = (1 + np.log10(num_docs / doc_freq.where(doc_freq > 0))).fillna(1.0)

print("\nINVERSE DOCUMENT FREQUENCY (IDF) USING 1+log10(N/df)")
print("=" * 80)
print(idf_values)

# TF-IDF: scale every term column of the TF matrix by that term's IDF weight
tfidf_matrix = tf_matrix.copy()
for term, idf_weight in idf_values.items():
    tfidf_matrix[term] = tf_matrix[term] * idf_weight

print("\nTF-IDF MATRIX")
print("=" * 80)
print(tfidf_matrix)

# Query used for the Rocchio demonstration; it goes through the same preprocessing as the corpus
query = "machine learning"
print("\nQUERY PREPROCESSING")
print("=" * 80)
preprocessed_query = preprocess_text_detailed(query)

# Binary query vector over the vocabulary: 1.0 for query terms, 0.0 elsewhere
# (query terms that are not in the vocabulary are ignored)
query_terms = set(preprocessed_query)
query_vector = pd.Series(
    [1.0 if term in query_terms else 0.0 for term in vocabulary],
    index=vocabulary,
    dtype=float,
)

print("\nORIGINAL QUERY VECTOR")
print("=" * 80)
print(query_vector)

# Relevance score between two vectors: cosine of the angle between them
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of `vec1` and `vec2`.

    Parameters:
    vec1, vec2: Equal-length numeric vectors (anything `np.dot` / `np.linalg.norm` accept).

    Returns:
    dot(vec1, vec2) / (|vec1| * |vec2|), or 0 when either vector has zero length,
    for which the cosine is undefined.
    """
    numerator = np.dot(vec1, vec2)
    length_1 = np.linalg.norm(vec1)
    length_2 = np.linalg.norm(vec2)

    if length_1 != 0 and length_2 != 0:
        return numerator / (length_1 * length_2)

    return 0

# Score every document against the original query
initial_scores = {
    doc_id: cosine_similarity(query_vector, tfidf_matrix.loc[doc_id])
    for doc_id in documents
}

# Documents ordered from best to worst initial score (ties keep corpus order)
sorted_docs = sorted(initial_scores.items(), key=lambda x: x[1], reverse=True)

print("\nINITIAL RELEVANCE SCORES (COSINE SIMILARITY)")
print("=" * 80)
for doc_id, score in sorted_docs:
    print(f"{doc_id}: {score:.4f}")

# Pseudo-relevance feedback: treat the top 3 documents as relevant, the rest as irrelevant
ranked_ids = [doc_id for doc_id, _ in sorted_docs]
relevant_docs = ranked_ids[:3]
irrelevant_docs = ranked_ids[3:]

print("\nRELEVANT DOCUMENTS FOR QUERY EXPANSION")
print("=" * 80)
print(relevant_docs)

print("\nIRRELEVANT DOCUMENTS FOR QUERY EXPANSION")
print("=" * 80)
print(irrelevant_docs)

# Implement Rocchio algorithm with query expansion according to lecture
def rocchio_algorithm(original_query, relevant_docs, irrelevant_docs, alpha=1.0, beta=0.75, gamma=0.15,
                      doc_vectors=None):
    """
    Rocchio algorithm for query expansion based on lecture material

    Computes
        q' = alpha * q + beta * centroid(relevant) - gamma * centroid(irrelevant)
    and replaces any negative term weight in q' with 0.

    Parameters:
    original_query: Original query vector (pandas Series indexed by term)
    relevant_docs: Sequence of relevant document IDs (may be empty)
    irrelevant_docs: Sequence of non-relevant document IDs (may be empty)
    alpha: Weight for original query (default 1.0)
    beta: Weight for relevant documents (default 0.75)
    gamma: Weight for non-relevant documents (default 0.15)
    doc_vectors: Optional DataFrame of document vectors (rows = document IDs,
        columns = terms). When omitted, the notebook-level `tfidf_matrix` is
        used, which is the behavior existing callers rely on.

    Returns:
    Modified query vector (a new Series; `original_query` is not modified)

    Raises:
    KeyError: if a document ID is not a row label of the document-vector table.
    """
    def _centroid(doc_ids):
        # Look the table up lazily so a call with no feedback documents never
        # needs any document vectors at all.
        vectors = tfidf_matrix if doc_vectors is None else doc_vectors
        return vectors.loc[list(doc_ids)].mean(axis=0)

    # Start with weighted original query (the multiplication yields a new Series)
    modified_query = alpha * original_query

    # Add contribution from relevant documents (centroid of relevant docs).
    # len() rather than truthiness so arrays / Index objects are accepted too.
    if len(relevant_docs) > 0:
        modified_query = modified_query + beta * _centroid(relevant_docs)

    # Subtract contribution from irrelevant documents (centroid of non-relevant docs)
    if len(irrelevant_docs) > 0:
        modified_query = modified_query - gamma * _centroid(irrelevant_docs)

    # Set negative weights to 0 as mentioned in the lecture
    return modified_query.clip(lower=0)

# Expand the query with Rocchio feedback
# (function defaults are the lecture values: α = 1.0, β = 0.75, γ = 0.15)
expanded_query = rocchio_algorithm(query_vector, relevant_docs, irrelevant_docs)

print("\nEXPANDED QUERY VECTOR (AFTER ROCCHIO)")
print("=" * 80)
print(expanded_query)

# Ten heaviest terms of the expanded query; only strictly positive weights are printed
expanded_terms = expanded_query.sort_values(ascending=False).iloc[:10]
print("\nTOP 10 TERMS IN EXPANDED QUERY")
print("=" * 80)
for term, weight in expanded_terms[expanded_terms > 0].items():
    print(f"{term}: {weight:.4f}")

# Re-score every document against the expanded query
final_scores = {
    doc_id: cosine_similarity(expanded_query, tfidf_matrix.loc[doc_id])
    for doc_id in documents
}

print("\nFINAL RELEVANCE SCORES WITH ROCCHIO EXPANSION")
print("=" * 80)
for doc_id, score in sorted(final_scores.items(), key=lambda x: x[1], reverse=True):
    print(f"{doc_id}: {score:.4f}")

# Side-by-side ranking before and after expansion
print("\nCOMPARISON OF RANKINGS")
print("=" * 80)
print("Original Ranking vs. Expanded Query Ranking")
original_ranking = sorted(initial_scores, key=initial_scores.get, reverse=True)
expanded_ranking = sorted(final_scores, key=final_scores.get, reverse=True)

for rank, (before, after) in enumerate(zip(original_ranking, expanded_ranking), start=1):
    print(f"Rank {rank}: {before} → {after}")

DETAILED PREPROCESSING STEPS FOR EACH DOCUMENT

Processing doc1:
Original text: The global average jet fuel price last week fell 1.5% compared to the week before
After case folding: the global average jet fuel price last week fell 1.5% compared to the week before
After tokenizing: ['the', 'global', 'average', 'jet', 'fuel', 'price', 'last', 'week', 'fell', '1', '5', 'compared', 'to', 'the', 'week', 'before']
After stopword removal: ['global', 'average', 'jet', 'fuel', 'price', 'last', 'week', 'fell', '1', '5', 'compared', 'week']
--------------------------------------------------------------------------------

Processing doc2:
Original text: Deep learning is a subset of machine learning
After case folding: deep learning is a subset of machine learning
After tokenizing: ['deep', 'learning', 'is', 'a', 'subset', 'of', 'machine', 'learning']
After stopword removal: ['deep', 'learning', 'subset', 'machine', 'learning']
-----------------------------------------------------------------------