In [1]:
# ---------------------------------------------------------
#Importing some Python libraries
# ---------------------------------------------------------
import csv
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from nltk import word_tokenize

In [2]:
# Download the 'punkt' and 'punkt_tab' NLTK resources
# (presumably the tokenizer data used by word_tokenize -- confirm).
# The value of the last call is what the cell displays as its result.
import nltk 
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/antonioduran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/antonioduran/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
class StemTokenizer:
    """Callable tokenizer: split text with ``word_tokenize`` and Porter-stem each token.

    An instance can be used anywhere a ``tokenizer`` callable taking one
    document string is expected (it is passed to ``CountVectorizer`` in this
    notebook). Every token returned by ``word_tokenize`` is stemmed and kept;
    no filtering is done here, so punctuation tokens such as '.' pass through.
    """

    def __init__(self):
        # A single stemmer is created per tokenizer and reused for every call.
        self.stemmer = PorterStemmer()

    def __call__(self, doc):
        """Return the stems of the tokens found in ``doc``, in token order."""
        stem = self.stemmer.stem
        return list(map(stem, word_tokenize(doc)))

In [4]:
# ---------------------------------------------------------
# Reading the data in a csv file
# ---------------------------------------------------------
# newline='' hands line-ending handling to the csv module (the csv docs require
# it so quoted fields containing newlines parse correctly); the explicit
# encoding keeps the result independent of the platform's default.
with open('collection.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # skip the header row; no error if the file is empty
    # Keep the first column of each data row. Blank lines come back as []
    # and are skipped instead of raising IndexError on row[0].
    documents = [row[0] for row in reader if row]

In [5]:
# ---------------------------------------------------------
# Print original documents
# ---------------------------------------------------------
# Show the raw document strings exactly as read from the CSV, before any
# tokenization or stemming is applied.

print(documents)

['I love a dog and a cat.', 'She loves her cat and dogs.', 'They love their cat.']


In [6]:
# ---------------------------------------------------------
# Instantiate CountVectorizer informing 'word' as the analyzer, Porter stemmer as the tokenizer, stop_words as the identified stop words,
# unigrams and bigrams as the ngram_range, and binary representation as the weighting scheme
# ---------------------------------------------------------
# NOTE(review): the built-in 'english' stop list holds unstemmed words while
# StemTokenizer emits stems -- confirm the list still matches the tokens
# (scikit-learn may warn about this inconsistency).
vectorizer = CountVectorizer(
    analyzer      = 'word',
    tokenizer     = StemTokenizer(),
    token_pattern = None,        # ignored when a tokenizer is given; None avoids the "will not be used" warning
    stop_words    = 'english',
    ngram_range   = (1, 2),      # unigrams and bigrams
    binary        = True         # 1/0 presence instead of raw term counts
)

In [7]:
# ---------------------------------------------------------
# Fit the vectorizer to the documents and encode them
# ---------------------------------------------------------
# fit_transform learns the vocabulary and builds the document-term matrix in a
# single call, so each document is tokenized and stemmed once rather than
# once for fit() and again for transform().
document_matrix = vectorizer.fit_transform(documents)



In [8]:
# ---------------------------------------------------------
# Inspect vocabulary
# ---------------------------------------------------------
# List the terms (unigrams and bigrams) the fitted vectorizer learned;
# .tolist() converts the returned array to a plain list for printing.
print("Vocabulary:", vectorizer.get_feature_names_out().tolist())

Vocabulary: ['.', 'cat', 'cat .', 'cat dog', 'dog', 'dog .', 'dog cat', 'love', 'love cat', 'love dog']


In [9]:
# ---------------------------------------------------------
# Encode the query with the vectorizer already fitted on the documents
# ---------------------------------------------------------
# Only transform() is called here (the vectorizer is not re-fitted), so the
# query is encoded against the vocabulary learned from the documents.

query = ["I love dogs"]
query_vector = vectorizer.transform(query)

In [10]:
# ---------------------------------------------------------
# Convert the sparse matrices to dense arrays
# ---------------------------------------------------------
# The dense query vector is derived from `vectorizer` and `query` rather than
# from the current value of `query_vector`: this cell rebinds `query_vector`
# from a sparse matrix to a dense 1-D array, and calling .toarray() on that
# dense result would raise AttributeError if the cell were run again.
doc_vectors = document_matrix.toarray()
query_vector = vectorizer.transform(query).toarray()[0]

In [11]:
# ---------------------------------------------------------
# Compute dot product
# ---------------------------------------------------------
# One score per document: the sum of the element-wise products between that
# document's vector and the query vector.
scores = [
    sum(doc_weight * query_weight
        for doc_weight, query_weight in zip(doc_vector, query_vector))
    for doc_vector in doc_vectors
]

In [12]:
# ---------------------------------------------------------
# Sort documents by score (descending)
# ---------------------------------------------------------
# Pair every document index with its score and order the pairs from highest
# to lowest score. The sort is stable, so documents with equal scores keep
# their original relative order.
ranking = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

In [16]:
# Print the query followed by a blank line, then each document with its score
# in ranked order (best match first).
print("Query:", query[0], end="\n\n")
for doc_idx, score in ranking:
    print(f"Document {doc_idx}: {documents[doc_idx]} - Score: {score}")

Query: I love dogs

Document 0: I love a dog and a cat. - Score: 3
Document 1: She loves her cat and dogs. - Score: 2
Document 2: They love their cat. - Score: 1
