In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Two toy documents. "for" appears twice in the first one, so its raw
# term count in the resulting matrix is 2.
document1 = 'the man went out for a walk for'
document2 = 'the children sat around the fire'
documents = [document1, document2]

# Learn the vocabulary from both documents and count term occurrences
# per document in a single call.
vectorizer = CountVectorizer()
tf_matrix = vectorizer.fit_transform(documents)

# Convert to a dense array: one row per document, one column per
# vocabulary term.
tf_array = tf_matrix.toarray()

# Bare expression so the notebook displays the array as the cell output.
tf_array

array([[0, 0, 0, 2, 1, 1, 0, 1, 1, 1],
       [1, 1, 1, 0, 0, 0, 1, 2, 0, 0]])

In [None]:
import numpy as np

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same two toy documents, this time weighted by TF-IDF instead of raw counts.
document1 = 'the man went out for a walk'
document2 = 'the children sat around the fire'
documents = [document1, document2]

# Fit the vocabulary/IDF weights and transform both documents in one call.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Dense view: one row per document, one column per vocabulary term.
tfidf_array = tfidf_matrix.toarray()

# Round to two decimals purely for readable display.
# NOTE: relies on `np` (numpy) having been imported in an earlier cell.
rounded_weights = np.round(tfidf_array, 2)
print(rounded_weights)

[[0.   0.   0.   0.43 0.43 0.43 0.   0.3  0.43 0.43]
 [0.41 0.41 0.41 0.   0.   0.   0.41 0.58 0.   0.  ]]


# Question 02

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

document1 = 'the man went out for a walk'
document2 = 'the children sat around the fire'
documents = [document1, document2]

# Represent both documents as TF-IDF vectors over a shared vocabulary.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Row 0 is document1, row 1 is document2; compare the two rows.
first_doc_vector = tfidf_matrix[0]
second_doc_vector = tfidf_matrix[1]
cosine_sim = cosine_similarity(first_doc_vector, second_doc_vector)

print(cosine_sim)

[[0.17578608]]


# Task
Build an inverted index and a positional index for the given corpus. Then, perform the following retrieval tasks using the inverted index: "schizophrenia AND drug" and "for AND NOT(drug OR approach)".

## Build inverted index

### Subtask:
Create code to build an inverted index for the given corpus.


**Reasoning**:
The instructions require building an inverted index for a given corpus. I will define the corpus, initialize an empty dictionary for the inverted index, and then iterate through the documents and terms to populate the index according to the provided steps.



In [None]:
import string

corpus = [
    "Schizophrenia and other psychotic disorders are severe mental illnesses.",
    "Treatment often involves antipsychotic drug medications and psychotherapy.",
    "Research is ongoing to find new approaches to treat schizophrenia."
]

# The punctuation-stripping table is identical for every document, so it is
# built once here instead of once per document.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _tokenize(text):
    """Return the lowercase, whitespace-separated terms of *text*.

    Every character in ``string.punctuation`` is removed before splitting.
    """
    return text.translate(_PUNCTUATION_TABLE).lower().split()


def build_inverted_index(documents):
    """Build an inverted index mapping each term to the set of document ids.

    Args:
        documents: Iterable of document strings. A document's id is its
            0-based position in the iterable.

    Returns:
        A dict ``{term: {doc_id, ...}}``. Keys are inserted in order of first
        occurrence; an empty input yields an empty dict.
    """
    index = {}
    for doc_id, document in enumerate(documents):
        for term in _tokenize(document):
            # setdefault creates the posting set the first time a term is seen.
            index.setdefault(term, set()).add(doc_id)
    return index


inverted_index = build_inverted_index(corpus)

print(inverted_index)

{'schizophrenia': {0, 2}, 'and': {0, 1}, 'other': {0}, 'psychotic': {0}, 'disorders': {0}, 'are': {0}, 'severe': {0}, 'mental': {0}, 'illnesses': {0}, 'treatment': {1}, 'often': {1}, 'involves': {1}, 'antipsychotic': {1}, 'drug': {1}, 'medications': {1}, 'psychotherapy': {1}, 'research': {2}, 'is': {2}, 'ongoing': {2}, 'to': {2}, 'find': {2}, 'new': {2}, 'approaches': {2}, 'treat': {2}}


## Implement AND query

### Subtask:
Write code to perform the retrieval task "schizophrenia AND drug" using the inverted index.


**Reasoning**:
Access the sets for "schizophrenia" and "drug" from the inverted index and perform an intersection to find documents containing both.



In [None]:
# Boolean AND: a document matches only if it is in the posting set of both
# terms. A term missing from the index contributes an empty posting set.
schizophrenia_docs, drug_docs = (
    inverted_index.get(term, set()) for term in ('schizophrenia', 'drug')
)

result_docs = schizophrenia_docs & drug_docs

print(result_docs)

set()


## Implement AND NOT(OR) query

### Subtask:
Write code to perform the retrieval task "for AND NOT(drug OR approach)" using the inverted index.


**Reasoning**:
Get the document sets for the terms 'for', 'drug', and 'approach' from the inverted_index, find the union of the document sets for 'drug' and 'approach', find the difference between the document set for 'for' and the union of 'drug' and 'approach' document sets, and print the result.



In [None]:
# Query: for AND NOT(drug OR approach).
# Lookups are exact-match: a term that is not a key of the index (no stemming
# is applied, so e.g. 'approach' will not match 'approaches') yields an empty
# posting set.
for_docs, drug_docs, approach_docs = (
    inverted_index.get(term, set()) for term in ('for', 'drug', 'approach')
)

# drug OR approach -> union of the two posting sets.
drug_or_approach_docs = drug_docs | approach_docs

# for AND NOT(...) -> documents containing 'for' minus the excluded ones.
result_docs = for_docs - drug_or_approach_docs

print(result_docs)

set()


## Build positional index

### Subtask:
Create code to build a positional index for the given corpus.


**Reasoning**:
Initialize the positional index and iterate through the corpus, processing each document to build the positional index based on the instructions.



In [None]:
import string

# Maps term -> {doc_id -> [0-based token positions of the term in that doc]}.
positional_index = {}

# Translation table that deletes every ASCII punctuation character.
strip_punctuation = str.maketrans('', '', string.punctuation)

for doc_id, document in enumerate(corpus):
    # Same normalisation as the inverted index: drop punctuation, lowercase,
    # then split on whitespace.
    tokens = document.translate(strip_punctuation).lower().split()

    for position, token in enumerate(tokens):
        # Create the per-term and per-document entries on first sight, then
        # record where the token occurred.
        postings = positional_index.setdefault(token, {})
        postings.setdefault(doc_id, []).append(position)

print(positional_index)

{'schizophrenia': {0: [0], 2: [9]}, 'and': {0: [1], 1: [6]}, 'other': {0: [2]}, 'psychotic': {0: [3]}, 'disorders': {0: [4]}, 'are': {0: [5]}, 'severe': {0: [6]}, 'mental': {0: [7]}, 'illnesses': {0: [8]}, 'treatment': {1: [0]}, 'often': {1: [1]}, 'involves': {1: [2]}, 'antipsychotic': {1: [3]}, 'drug': {1: [4]}, 'medications': {1: [5]}, 'psychotherapy': {1: [7]}, 'research': {2: [0]}, 'is': {2: [1]}, 'ongoing': {2: [2]}, 'to': {2: [3, 7]}, 'find': {2: [4]}, 'new': {2: [5]}, 'approaches': {2: [6]}, 'treat': {2: [8]}}


## Summary:

### Data Analysis Key Findings

*   The inverted index was successfully built, storing terms and the documents they appear in (e.g., 'schizophrenia' is in documents 0 and 2, 'drug' is in document 1).
*   The retrieval task "schizophrenia AND drug" using the inverted index resulted in an empty set, indicating no documents contain both terms.
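*   The retrieval task "for AND NOT(drug OR approach)" using the inverted index also resulted in an empty set: the term 'for' does not occur in any document, so there are no candidate documents from which to exclude those containing 'drug' or 'approach'. Note that the index uses exact term matching without stemming, so the query term 'approach' does not match the indexed term 'approaches'.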
*   The positional index was successfully built, storing terms, the documents they appear in, and the specific positions within those documents (e.g., 'schizophrenia' appears at position 0 in document 0 and position 9 in document 2).

### Insights or Next Steps

*   The current corpus is small and results in limited document retrieval for complex queries. Expanding the corpus would provide more meaningful results for the retrieval tasks.
*   With the positional index built, the next step would be to implement and test phrase queries to demonstrate its additional capabilities beyond the inverted index.
