# Question 10 - Compute TF-IDF features

In [26]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the retrieval dataset; the 'text' column holds one document per row.
vi_data_df = pd.read_csv("./vi_text_retrieval.csv")
# Lowercase every document before vectorizing.
context = [doc.lower() for doc in vi_data_df['text']]

tfidf_vectorizer = TfidfVectorizer()

# Fit the vocabulary / IDF weights on the corpus and embed every document.
# The result is a sparse matrix with one row per document.
context_embedded = tfidf_vectorizer.fit_transform(context)

# Print the TF-IDF weight of the first vocabulary term (column 0) in the 8th
# document (row 7). The sparse matrix is indexed directly: calling .toarray()
# would densify the entire corpus just to read a single value.
print(context_embedded[7, 0])

0.31126580760710637


# Question 11 - Compute cosine similarity

In [27]:
from sklearn.metrics.pairwise import cosine_similarity

def tfidf_search(question, tfidf_vectorizer, top_d=5, doc_embeddings=None):
    """Rank documents by cosine similarity to a question in TF-IDF space.

    Args:
        question: Query string; it is lowercased before being vectorized.
        tfidf_vectorizer: Fitted vectorizer exposing ``transform``.
        top_d: Maximum number of documents to return. Values <= 0 return an
            empty list.
        doc_embeddings: Document TF-IDF matrix to search. When omitted, the
            notebook-level ``context_embedded`` matrix is used.

    Returns:
        A list of ``{'id': row_index, 'cosine_score': score}`` dicts ordered
        from the highest to the lowest cosine similarity.
    """
    # Guard explicitly: a slice of [-0:] would select *every* document, and a
    # negative top_d would produce an arbitrary tail of the ranking.
    if top_d <= 0:
        return []

    if doc_embeddings is None:
        doc_embeddings = context_embedded

    query_embedded = tfidf_vectorizer.transform([question.lower()])

    # One similarity per document, flattened from shape (1, n_docs).
    cosine_scores = cosine_similarity(query_embedded, doc_embeddings).reshape((-1,))

    # argsort is ascending, so take the last top_d indices and reverse them.
    results = []
    for idx in cosine_scores.argsort()[-top_d:][::-1]:
        doc_score = {
            'id': idx,
            'cosine_score': cosine_scores[idx]
        }
        results.append(doc_score)

    return results

# Try the cosine retriever on the first question in the dataset: show the five
# best-matching documents, then the similarity of the top hit on its own line.
question = vi_data_df['question'].iloc[0]
results = tfidf_search(question, tfidf_vectorizer, top_d=5)
print(results)
print(results[0]['cosine_score'])


[{'id': 0, 'cosine_score': 0.6279910475266972}, {'id': 97, 'cosine_score': 0.21145795862251632}, {'id': 136, 'cosine_score': 0.1771475314979447}, {'id': 384, 'cosine_score': 0.15971218395887335}, {'id': 118, 'cosine_score': 0.1578067474650028}]
0.6279910475266972


# Question 12 - Compute correlation

In [28]:
def corr_search(question, tfidf_vectorizer, top_d=5, doc_embeddings=None):
    """Rank documents by Pearson correlation with a question in TF-IDF space.

    Args:
        question: Query string; it is lowercased before being vectorized.
        tfidf_vectorizer: Fitted vectorizer exposing ``transform``.
        top_d: Maximum number of documents to return. Values <= 0 return an
            empty list.
        doc_embeddings: Document TF-IDF matrix to search (must expose
            ``toarray``). When omitted, the notebook-level
            ``context_embedded`` matrix is used.

    Returns:
        A list of ``{'id': row_index, 'corr_score': score}`` dicts ordered
        from the highest to the lowest correlation. Documents whose
        correlation is undefined (NaN, e.g. an all-zero vector) are ranked
        last instead of first.
    """
    # Guard explicitly: a slice of [-0:] would select *every* document, and a
    # negative top_d would produce an arbitrary tail of the ranking.
    if top_d <= 0:
        return []

    if doc_embeddings is None:
        doc_embeddings = context_embedded

    query_vec = tfidf_vectorizer.transform([question.lower()]).toarray()[0]
    doc_matrix = doc_embeddings.toarray()

    # Pearson correlation of the query against each document row, computed
    # directly. np.corrcoef(query, docs) would build the full
    # (n_docs + 1) x (n_docs + 1) matrix only for its first row to be used.
    query_centered = query_vec - query_vec.mean()
    docs_centered = doc_matrix - doc_matrix.mean(axis=1, keepdims=True)
    covariances = docs_centered @ query_centered
    norms = np.sqrt(
        (query_centered @ query_centered) * (docs_centered ** 2).sum(axis=1)
    )
    # Zero-variance vectors give 0/0 -> NaN, matching np.corrcoef; the clip
    # mirrors np.corrcoef's clamping of round-off outside [-1, 1].
    with np.errstate(divide='ignore', invalid='ignore'):
        corr_scores = np.clip(covariances / norms, -1.0, 1.0)

    # argsort places NaN at the end, which the tail slice below would treat as
    # the best matches; rank undefined correlations lowest instead.
    rank_keys = np.where(np.isnan(corr_scores), -np.inf, corr_scores)

    # argsort is ascending, so take the last top_d indices and reverse them.
    results = []
    for idx in rank_keys.argsort()[-top_d:][::-1]:
        doc = {
            'id': idx,
            'corr_score': corr_scores[idx]
        }
        results.append(doc)

    return results

# Try the correlation retriever on the first question in the dataset and
# report the score of the second-ranked document (index 1 of the results).
question = vi_data_df['question'].iloc[0]
results = corr_search(question, tfidf_vectorizer, top_d=5)
print(results[1]['corr_score'])


0.20734246471973702
