In [1]:
# MIND News Recommender with SQLite Google Colab Notebook

## Install Dependencies

!pip install transformers accelerate torch faiss-cpu bitsandbytes datasets gradio


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting gradio
  Downloading gradio-5.23.1-py3-none-any.whl.metadata (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Coll

In [2]:
## Import Libraries and Login

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
import faiss
import numpy as np
import pandas as pd
import sqlite3
from huggingface_hub import login
import gradio as gr
import threading

login()  # Enter your Hugging Face token when prompted


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
## Load LLaMA 3.2 1B Model

model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_4bit=True,
    torch_dtype=torch.float16
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [4]:
## Load MIND Dataset Files

news_file = "/content/drive/MyDrive/MIND Dataset/MINDsmall_train/news.tsv"
behaviors_file = "/content/drive/MyDrive/MIND Dataset/MINDsmall_train/behaviors.tsv"

news = pd.read_csv(news_file, sep="\t", names=["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"])
news['text'] = news['title'] + " " + news['abstract']

behaviors = pd.read_csv(behaviors_file, sep="\t", names=["impression_id", "user_id", "time", "history", "impressions"])


In [22]:
# Set up SQLite for user history tracking

conn = sqlite3.connect('user_interactions.db', check_same_thread=False)
c = conn.cursor()
c.execute('''CREATE TABLE IF NOT EXISTS user_history (
                user_id TEXT,
                category TEXT,
                clicks INTEGER,
                PRIMARY KEY (user_id, category))''')
conn.commit()

def get_db_connection():
    return sqlite3.connect('user_interactions.db', check_same_thread=False)


In [6]:
## Load Embedding Model

embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embed_tokenizer = AutoTokenizer.from_pretrained(embed_model_name)
embed_model = AutoModel.from_pretrained(embed_model_name).to('cuda')

def generate_embedding(text):
    if not isinstance(text, str):
        text = str(text)
    inputs = embed_tokenizer([text], return_tensors='pt', truncation=True, padding=True, max_length=512).to('cuda')
    with torch.no_grad():
        outputs = embed_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

news['embedding'] = news['text'].apply(generate_embedding)
news_embeddings = np.vstack(news['embedding'].values)


## Build FAISS Index

index = faiss.IndexFlatL2(news_embeddings.shape[1])
index.add(news_embeddings.astype('float32'))


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [28]:
## Update SQLite with user click interaction

def update_user_click(user_id, category):
    with get_db_connection() as conn_thread:
        c_thread = conn_thread.cursor()

        print(f"🔄 Checking if user {user_id} clicked on category {category}...")

        # Check if the user-category entry exists
        c_thread.execute("SELECT clicks FROM user_history WHERE user_id=? AND category=?", (user_id, category))
        row = c_thread.fetchone()

        if row:
            new_clicks = row[0] + 1
            c_thread.execute("UPDATE user_history SET clicks = ? WHERE user_id=? AND category=?", (new_clicks, user_id, category))
            print(f"✅ Updated {category} clicks for user {user_id}. New Count: {new_clicks}")
        else:
            c_thread.execute("INSERT INTO user_history (user_id, category, clicks) VALUES (?, ?, 1)", (user_id, category))
            print(f"✅ Inserted new entry: User {user_id} | Category {category} | Clicks: 1")

        conn_thread.commit()


## Thread-safe get weighted categories for recommendation

def get_user_preferred_categories(user_id):
    conn_thread = get_db_connection()
    c_thread = conn_thread.cursor()
    c_thread.execute("SELECT category, clicks FROM user_history WHERE user_id=?", (user_id,))
    results = c_thread.fetchall()
    conn_thread.close()
    user_categories = {row[0]: row[1] for row in results}
    return user_categories


def recommend_articles(user_query, user_id=None, top_k=5):
    user_categories = get_user_preferred_categories(user_id) if user_id else {}

    if not user_categories:
        # New user: Recommend random articles from different categories
        recs = news.groupby('category', group_keys=False).apply(lambda x: x.sample(1)).reset_index(drop=True)
        explanation_prompt = f"These articles were recommended based on general interest as you are a new user searching for: {user_query}."
    else:
        # Existing user: Recommend based on past interactions
        query_embedding = generate_embedding(user_query).reshape(1, -1).astype('float32')
        distances, indices = index.search(query_embedding, top_k * 10)
        candidates = news.iloc[indices[0]].copy()

        candidates['score'] = candidates['category'].map(lambda x: user_categories.get(x, 0))
        recs = candidates.sort_values(by='score', ascending=False).head(top_k)  # Assign recs here

        explanation_prompt = f"These articles were recommended based on your previous reading preferences and interests related to: {user_query}."

    # Generate explanation with LLaMA
    input_ids = tokenizer(explanation_prompt, return_tensors='pt').input_ids.to('cuda')
    output = model.generate(input_ids, max_length=150)
    explanation = tokenizer.decode(output[0], skip_special_tokens=True)

    formatted_recs = recs[['title', 'category', 'abstract', 'url']].to_dict(orient='records')
    return formatted_recs, explanation




## Test that groupby does not raise warnings

import warnings
with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always")
    test_recs = news.groupby('category', group_keys=False).apply(lambda x: x.sample(1)).reset_index(drop=True)
    if w:
        print("Warnings detected:", w)
    else:
        print("No warnings; deprecation fix works correctly.")




In [29]:
## Test that groupby does not raise warnings

import warnings
with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always")
    test_recs = news.groupby('category', group_keys=False).apply(lambda x: x.sample(1)).reset_index(drop=True)
    if w:
        print("Warnings detected:", w)
    else:
        print("No warnings; deprecation fix works correctly.")




In [None]:
## Gradio UI for testing with clickable links

def recommend_interface(query, user_id):
    recs, explanation = recommend_articles(query, user_id=user_id)

    # Ensure we update user history when submit is clicked
    if recs:
        # Get the top recommended article's category
        top_category = recs[0]['category']
        print(f"🖱️ User {user_id} submitted query. Updating clicks for category: {top_category}")

        # Update click history based on submit action
        update_user_click(user_id, top_category)

    output_text = "\n\n".join([
        f"**Title:** [{r['title']}]({r['url']})\n**Abstract:** {r['abstract']}"
        for r in recs
    ])
    return output_text, explanation


ui = gr.Interface(
    fn=recommend_interface,
    inputs=[gr.Textbox(label="Enter your query"), gr.Textbox(label="User ID")],
    outputs=[gr.Markdown(label="Recommended Articles with Links"), gr.Textbox(label="LLaMA Explanation")],
    title="News Recommendation System with LLM concepts",
    description="Recommends articles based on user click history stored in SQLite with proportional scoring for categories. New users get diverse suggestions. Includes clickable article links.")

ui.launch(debug=True)


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://3ac3dae3602ddae43f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


🖱️ User u1 submitted query. Updating clicks for category: sports
🔄 Checking if user u1 clicked on category sports...
✅ Updated sports clicks for user u1. New Count: 5


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


🖱️ User u1 submitted query. Updating clicks for category: sports
🔄 Checking if user u1 clicked on category sports...
✅ Updated sports clicks for user u1. New Count: 6


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


🖱️ User u1 submitted query. Updating clicks for category: sports
🔄 Checking if user u1 clicked on category sports...
✅ Updated sports clicks for user u1. New Count: 7


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


🖱️ User u2 submitted query. Updating clicks for category: news
🔄 Checking if user u2 clicked on category news...
✅ Inserted new entry: User u2 | Category news | Clicks: 1


  recs = news.groupby('category', group_keys=False).apply(lambda x: x.sample(1)).reset_index(drop=True)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


🖱️ User u10 submitted query. Updating clicks for category: autos
🔄 Checking if user u10 clicked on category autos...
✅ Inserted new entry: User u10 | Category autos | Clicks: 1


  recs = news.groupby('category', group_keys=False).apply(lambda x: x.sample(1)).reset_index(drop=True)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


🖱️ User 11 submitted query. Updating clicks for category: autos
🔄 Checking if user 11 clicked on category autos...
✅ Inserted new entry: User 11 | Category autos | Clicks: 1


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


🖱️ User 11 submitted query. Updating clicks for category: foodanddrink
🔄 Checking if user 11 clicked on category foodanddrink...
✅ Inserted new entry: User 11 | Category foodanddrink | Clicks: 1


In [10]:
c.execute('''SELECT * FROM user_history''')

<sqlite3.Cursor at 0x7f0d5de8c5c0>

In [19]:
c.execute('''SELECT * FROM user_history''')  # Execute the query

rows = c.fetchall()  # Fetch all rows


for row in rows:
    print(row)

OperationalError: no such table: user_history

In [13]:
c.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='user_history'")
table_exists = c.fetchone()
if table_exists:
    print("✅ Table 'user_history' exists.")
else:
    print("❌ Table 'user_history' was not created.")

✅ Table 'user_history' exists.
