In [None]:
from collections import defaultdict

def exercise_one(documents):
    """Build a sorted inverted index from a collection of documents.

    Each document's text is lower-cased and split on whitespace; no other
    normalisation (punctuation stripping, stemming, ...) is applied.

    Args:
        documents: Mapping of document ID to the document's text.

    Returns:
        A dict mapping each term to the sorted list of IDs of the documents
        containing it. Terms are inserted in sorted order, so iterating the
        dict yields them in that order.
    """
    # A set per term de-duplicates repeated occurrences in O(1) instead of
    # rescanning the posting list for every token.
    postings_by_term = defaultdict(set)

    for doc_id, text in documents.items():
        for term in text.lower().split():
            postings_by_term[term].add(doc_id)

    return {term: sorted(postings_by_term[term]) for term in sorted(postings_by_term)}

# Sample corpus for exercise one; document IDs 1-4 are assigned in listing order.
documents = dict(enumerate(
    (
        "new home sales top forecasts",
        "home sales rise in july",
        "increase in home sales in july",
        "july new home sales rise",
    ),
    start=1,
))

# Build the term -> posting-list mapping for the sample corpus.
index = exercise_one(documents)

def print_index(index):
    """Print an inverted index to stdout, one ``term: postings`` line per term.

    Args:
        index: Mapping of term to posting list; entries are printed in the
            mapping's own iteration order.
    """
    # Header and underline, each on its own line.
    print("Inverted Index:", "==============", sep="\n")
    for term in index:
        print(f"{term}: {index[term]}")

# Print the inverted index built from the sample documents.
print_index(index)

Inverted Index:
forecasts: [1]
home: [1, 2, 3, 4]
in: [2, 3]
increase: [3]
july: [2, 3, 4]
new: [1, 4]
rise: [2, 4]
sales: [1, 2, 3, 4]
top: [1]


In [None]:
from collections import defaultdict

def exercise_two(documents):
    """Build a term-document incidence matrix and an inverted index.

    Each document's text is lower-cased and split on whitespace.

    Args:
        documents: Mapping of document ID to text. IDs only need to be
            mutually comparable; they do not have to be the integers 1..N.

    Returns:
        A tuple ``(terms, matrix, inverted_index)`` where

        * ``terms`` is the sorted list of distinct terms,
        * ``matrix[i][j]`` is 1 if ``terms[i]`` occurs in the j-th document
          (documents taken in sorted ID order) and 0 otherwise,
        * ``inverted_index`` maps each term, in sorted order, to the sorted
          list of IDs of the documents containing it.
    """
    # Columns follow sorted document-ID order. For IDs 1..N this is exactly
    # column ``doc_id - 1``, but it no longer breaks (or silently writes to
    # the wrong column via a negative index) for any other ID scheme.
    doc_ids = sorted(documents)

    # Tokenise every document once; sets give O(1) membership tests below.
    doc_terms = {doc_id: set(documents[doc_id].lower().split()) for doc_id in doc_ids}
    terms = sorted(set().union(*doc_terms.values()))

    # Term-document incidence matrix: one row per term, one column per document.
    matrix = [
        [1 if term in doc_terms[doc_id] else 0 for doc_id in doc_ids]
        for term in terms
    ]

    # Inverted index: iterating the sorted IDs keeps every posting list sorted.
    inverted_index = {
        term: [doc_id for doc_id in doc_ids if term in doc_terms[doc_id]]
        for term in terms
    }

    return terms, matrix, inverted_index

# Test documents; IDs 1-4 are assigned in listing order.
documents = dict(enumerate(
    (
        "breakthrough drug for schizophrenia",
        "new schizophrenia drug",
        "new approach for treatment of schizophrenia",
        "new hopes for schizophrenia patients",
    ),
    start=1,
))

# Generate both representations
terms, matrix, index = exercise_two(documents)

# Print the term-document matrix. Every cell, including the last one on a
# line, is followed by a tab before the newline.
print("Term-Document Incidence Matrix:", "==============================", sep="\n")
doc_count = len(documents)
header_cells = ["Terms/Docs:"] + [f"D{col + 1}" for col in range(doc_count)]
print("\t".join(header_cells), end="\t\n")
for term, row in zip(terms, matrix):
    row_cells = [f"{term:<12}"] + [str(row[col]) for col in range(doc_count)]
    print("\t".join(row_cells), end="\t\n")

# Print the inverted index, one term per line.
print("\nInverted Index:", "==============", sep="\n")
for term in index:
    print(f"{term}: {index[term]}")

Term-Document Incidence Matrix:
Terms/Docs:	D1	D2	D3	D4	
approach    	0	0	1	0	
breakthrough	1	0	0	0	
drug        	1	1	0	0	
for         	1	0	1	1	
hopes       	0	0	0	1	
new         	0	1	1	1	
of          	0	0	1	0	
patients    	0	0	0	1	
schizophrenia	1	1	1	1	
treatment   	0	0	1	0	

Inverted Index:
approach: [3]
breakthrough: [1]
drug: [1, 2]
for: [1, 3, 4]
hopes: [4]
new: [2, 3, 4]
of: [3]
patients: [4]
schizophrenia: [1, 2, 3, 4]
treatment: [3]


In [None]:
def build_index(documents):
    """Build a sorted inverted index from a collection of documents.

    Each document's text is lower-cased and split on whitespace.

    Args:
        documents: Mapping of document ID to the document's text.

    Returns:
        A dict whose keys are the terms in sorted order and whose values are
        the sorted lists of IDs of the documents containing each term.
    """
    # Collect postings in sets so a term repeated within a document is
    # de-duplicated in O(1) rather than by scanning the posting list.
    postings_by_term = defaultdict(set)
    for doc_id, text in documents.items():
        for term in text.lower().split():
            postings_by_term[term].add(doc_id)
    return {term: sorted(postings_by_term[term]) for term in sorted(postings_by_term)}

def intersect(list1, list2):
    """Return the elements common to two posting lists (Boolean AND).

    Walks both lists with one cursor each, always advancing the cursor that
    points at the smaller element, so both inputs are expected to be sorted
    in ascending order.

    Args:
        list1: First sorted posting list.
        list2: Second sorted posting list.

    Returns:
        A new list with the elements present in both inputs, in order.
    """
    common = []
    left_pos, right_pos = 0, 0
    while left_pos < len(list1) and right_pos < len(list2):
        left, right = list1[left_pos], list2[right_pos]
        if left == right:
            # Match: keep it and move past it in both lists.
            common.append(left)
            left_pos += 1
            right_pos += 1
            continue
        if left < right:
            left_pos += 1
        else:
            right_pos += 1
    return common

def union(list1, list2):
    """Merge two posting lists into one (Boolean OR).

    Performs a linear merge, so both inputs are expected to be sorted in
    ascending order. An element found at the same point in both lists is
    emitted once.

    Args:
        list1: First sorted posting list.
        list2: Second sorted posting list.

    Returns:
        A new list containing the merged elements in order.
    """
    merged = []
    left_pos, right_pos = 0, 0
    while left_pos < len(list1) and right_pos < len(list2):
        left, right = list1[left_pos], list2[right_pos]
        if left == right:
            # Present in both lists: emit once, advance both cursors.
            merged.append(left)
            left_pos += 1
            right_pos += 1
            continue
        if left < right:
            merged.append(left)
            left_pos += 1
        else:
            merged.append(right)
            right_pos += 1
    # At most one of the lists still has elements left; append that tail.
    merged += list1[left_pos:]
    merged += list2[right_pos:]
    return merged

def complement(posting_list, all_docs):
    """Return the documents that are NOT in a posting list (Boolean NOT).

    Args:
        posting_list: Document IDs to exclude.
        all_docs: Every document ID in the collection.

    Returns:
        A sorted list of the distinct IDs in ``all_docs`` that do not appear
        in ``posting_list``.
    """
    excluded = set(posting_list)
    return sorted({doc_id for doc_id in all_docs if doc_id not in excluded})

def exercise_three(documents):
    """Evaluate two fixed Boolean queries against a document collection.

    The queries are:
        a. schizophrenia AND drug
        b. for AND NOT(drug OR approach)

    Args:
        documents: Mapping of document ID to the document's text.

    Returns:
        A tuple ``(result_a, result_b)`` with the list of matching document
        IDs for query a and query b respectively.
    """
    index = build_index(documents)
    all_docs = sorted(documents.keys())

    def postings(term):
        # A term that never occurs in the corpus matches no document; treat
        # it as an empty posting list instead of raising KeyError.
        return index.get(term, [])

    # Query a: schizophrenia AND drug
    result_a = intersect(postings('schizophrenia'), postings('drug'))

    # Query b: for AND NOT(drug OR approach)
    drug_or_approach = union(postings('drug'), postings('approach'))
    not_drug_or_approach = complement(drug_or_approach, all_docs)
    result_b = intersect(postings('for'), not_drug_or_approach)

    return result_a, result_b

# Test documents; IDs 1-4 are assigned in listing order.
documents = dict(enumerate(
    (
        "breakthrough drug for schizophrenia",
        "new schizophrenia drug",
        "new approach for treatment of schizophrenia",
        "new hopes for schizophrenia patients",
    ),
    start=1,
))

result_a, result_b = exercise_three(documents)

# Report the matching document IDs for each query.
print("Query Results:", "=============", sep="\n")
print("a. schizophrenia AND drug:", result_a)
print("b. for AND NOT(drug OR approach):", result_b)

# Verify results: list the corpus so the answers can be checked by eye.
print("\nVerification:", "============", "Documents:", sep="\n")
for doc_id in documents:
    print(f"Doc {doc_id}: {documents[doc_id]}")

Query Results:
a. schizophrenia AND drug: [1, 2]
b. for AND NOT(drug OR approach): [4]

Verification:
Documents:
Doc 1: breakthrough drug for schizophrenia
Doc 2: new schizophrenia drug
Doc 3: new approach for treatment of schizophrenia
Doc 4: new hopes for schizophrenia patients
