In [None]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.0-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.7/224.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Download the Punkt tokenizer data that word_tokenize relies on.
# Older NLTK releases load the 'punkt' resource, while newer releases look up
# 'punkt_tab' instead, so both are requested. On a release that does not know
# one of the names, nltk.download() just reports the problem and returns False.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Stream the CSV in fixed-size chunks and keep only the rows whose 'link'
# value contains 'facebook' (rows with a missing link are excluded via na=False)
chunk_size = 1000
facebook_posts = [
    chunk[chunk['link'].str.contains('facebook', na=False)]
    for chunk in pd.read_csv('/content/Assignment_6_AI.csv', chunksize=chunk_size)
]

# Stitch the filtered chunks together into a single DataFrame
facebook_df = pd.concat(facebook_posts)

# Tokenize posts
def tokenize_posts(posts):
    """Tokenize every post and re-join its tokens with single spaces.

    Args:
        posts: Iterable of post values; entries may be non-strings.

    Returns:
        A list with one string per input entry, in the same order. String
        entries are passed through ``word_tokenize`` and joined with ' ';
        any entry that is not a ``str`` becomes the empty string.
    """
    return [
        ' '.join(word_tokenize(text)) if isinstance(text, str) else ''
        for text in posts
    ]

# Store a tokenized, space-joined copy of each message next to the original
tokenized_messages = tokenize_posts(facebook_df['message'])
facebook_df['tokenized_message'] = tokenized_messages

# Encode the tokenized messages with a pretrained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = facebook_df['tokenized_message'].tolist()
embeddings = model.encode(sentences, convert_to_numpy=True)

# Now embeddings is a numpy array containing the embeddings for the filtered posts

# Function to get most relevant posts
def get_most_relevant_posts(query, embeddings, facebook_df, model, top_n=3):
    """Return the posts whose embeddings are most similar to ``query``.

    Args:
        query: Text to search for; it is encoded with ``model.encode``.
        embeddings: 2-D array of post embeddings. Row ``i`` must belong to
            the post at positional row ``i`` of ``facebook_df``, because
            results are looked up by position.
        facebook_df: DataFrame with a 'message' column holding the post text.
        model: Object exposing ``encode(list_of_str, convert_to_numpy=True)``.
        top_n: Maximum number of posts to return.

    Returns:
        A list of ``(message, similarity)`` tuples ordered from most to least
        similar, holding at most ``top_n`` entries. The list is empty when
        ``top_n`` is not positive or when there are no embeddings.
    """
    # Guard first: a slice of [-0:] would select *every* post, a negative
    # top_n would select an arbitrary tail, and cosine_similarity rejects an
    # empty embedding matrix.
    if top_n <= 0 or len(embeddings) == 0:
        return []

    query_embedding = model.encode([query], convert_to_numpy=True)
    similarities = cosine_similarity(query_embedding, embeddings).flatten()

    # argsort is ascending; reverse it and keep the leading top_n positions.
    top_indices = np.argsort(similarities)[::-1][:top_n]

    # Index the column once instead of materializing a full row per hit.
    messages = facebook_df['message']
    return [(messages.iloc[idx], similarities[idx]) for idx in top_indices]

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [17]:
# Demo: look up the posts closest to a sample query and print each with its score
query = "Jupyter Notebooks"
relevant_posts = get_most_relevant_posts(query, embeddings, facebook_df, model)
for message_text, score in relevant_posts:
    print(f"Post: {message_text}\nSimilarity: {score}\n")

Post: A cartoon by P. C. Vey.
Similarity: 0.3361234664916992

Post: "Perhaps we might squeeze one in [a serious conversation about this] amid the national obsession with every James Comey memo-to-self?" -- Charles Krauthammer
Similarity: 0.3288462162017822

Post: Life has changed for the 8-year-old thanks to the "Fab Lab" at the University of Texas at Arlington.
Similarity: 0.295367956161499

