# **SENTIMENT ANALYSIS**

In [None]:
import pandas as pd
# Load the comments CSV from the Kaggle input directory into a DataFrame.
comments_data2 = pd.read_csv(
    filepath_or_buffer="/kaggle/input/preprocessed/cleaned_sentiment2.csv"
)

In [None]:
from datasets import Dataset

# Build a Hugging Face Dataset from a copy of the two columns used below:
# the comment text and the name of the model suggested for it.
dataset = Dataset.from_pandas(
    comments_data2.loc[:, ["sentiment_comment", "suggested_model"]].copy()
)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
from tqdm import tqdm
import pandas as pd

# ---------------------------
# Device
# ---------------------------
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

In [None]:
import os
# ---------------------------
# Prepare DataFrame
# ---------------------------
# Use a clean 0..n-1 index so row labels can be used directly as positions.
comments_data2 = comments_data2.reset_index(drop=True)

# Try to resume from checkpoint
checkpoint_file = "/kaggle/working/sentiment_checkpoint.csv"
if os.path.exists(checkpoint_file):
    print("🔄 Loading checkpoint...")
    comments_data2 = pd.read_csv(checkpoint_file)
    # A checkpoint written without the prediction column would make the
    # "unfinished rows" filter fail with a KeyError, so create it if missing.
    if "sentiment_prediction" not in comments_data2.columns:
        comments_data2["sentiment_prediction"] = ""
    # Empty cells are read back as NaN; if every cell is empty the column
    # becomes float64, which conflicts with the string predictions written
    # later. Keeping it as object dtype avoids that (NaN stays NaN, so the
    # isna()-based resume filter still works).
    comments_data2["sentiment_prediction"] = (
        comments_data2["sentiment_prediction"].astype(object)
    )
else:
    comments_data2["sentiment_prediction"] = ""  # init empty column
    
# ---------------------------
# Define models
# ---------------------------
# Maps the display name (compared against the "suggested_model" column in the
# inference cells) to the Hugging Face model identifier to load.
MODEL_NAMES = {
    "Twitter-RoBERTa-Latest": "cardiffnlp/twitter-roberta-base-sentiment-latest",
    "Multilingual-BERT": "nlptown/bert-base-multilingual-uncased-sentiment",
}

tokenizers = {}
models = {}

for name, path in MODEL_NAMES.items():
    tokenizers[name] = AutoTokenizer.from_pretrained(path)
    # Move the classifier to the selected device and switch it to eval mode
    # (turns off dropout) before storing it.
    models[name] = (
        AutoModelForSequenceClassification.from_pretrained(path).to(device).eval()
    )

In [None]:
BATCH_SIZE = 8  # adjust based on GPU memory
SAVE_EVERY = 10000  # save every 10000 batches (tweak this)

# ---------------------------
# Batch inference per model
# ---------------------------
# For each model, score only the rows routed to it whose prediction is still
# empty, so resuming from a checkpoint skips rows that are already done.
for model_name in MODEL_NAMES:
    unfinished = comments_data2["sentiment_prediction"].isna() | (
        comments_data2["sentiment_prediction"] == ""
    )
    idxs = comments_data2.index[
        (comments_data2["suggested_model"] == model_name) & unfinished
    ].tolist()  # unfinished rows

    if not idxs:
        print(f"ℹ️ No unfinished rows for {model_name}")
        continue

    print(f"Processing {len(idxs)} comments for model {model_name}...")
    print(f"👉 Resuming {model_name} from idx {idxs[0]} to {idxs[-1]}")

    tokenizer = tokenizers[model_name]
    model = models[model_name]

    # Class index -> label text. This depends only on the model, so it is
    # built once per model instead of once per batch.
    if model_name == "Twitter-RoBERTa-Latest":
        label_map = {0: "negative", 1: "neutral", 2: "positive"}
    else:  # Multilingual-BERT 1–5 stars
        label_map = {0: "1 star", 1: "2 stars", 2: "3 stars", 3: "4 stars", 4: "5 stars"}

    for batch_num, start in enumerate(tqdm(range(0, len(idxs), BATCH_SIZE))):
        end = min(start + BATCH_SIZE, len(idxs))
        batch_idxs = idxs[start:end]

        # Comments read from CSV can be NaN (float) when the cell was empty;
        # the tokenizer only accepts strings, so coerce missing values to ""
        # and everything else to str instead of aborting the whole run.
        batch_texts = [
            "" if pd.isna(text) else str(text)
            for text in comments_data2.loc[batch_idxs, "sentiment_comment"]
        ]

        # Tokenize (pad to the longest text in the batch, cap at 512 tokens)
        inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Forward pass without gradient tracking
        with torch.no_grad():
            logits = model(**inputs).logits
            probs = softmax(logits, dim=-1)
            pred_labels = torch.argmax(probs, dim=-1).cpu().tolist()
            pred_scores = probs.max(dim=-1).values.cpu().tolist()

        # Save results back into DataFrame as "<label> (<probability>)"
        comments_data2.loc[batch_idxs, "sentiment_prediction"] = [
            f"{label_map[label_idx]} ({round(score, 3)})"
            for label_idx, score in zip(pred_labels, pred_scores)
        ]

        # Save checkpoint every SAVE_EVERY batches
        if (batch_num + 1) % SAVE_EVERY == 0:
            comments_data2.to_csv(checkpoint_file, index=False)
            print(f"💾 Checkpoint saved at batch {batch_num+1}")

# Final save
comments_data2.to_csv(checkpoint_file, index=False)
print("✅ All done! Results saved to", checkpoint_file)

In [None]:
BATCH_SIZE = 8  # adjust based on GPU memory
SAVE_EVERY = 10000  # save every 10000 batches (tweak this)

# ---------------------------
# Batch inference per model
# ---------------------------
# Running this cell again is safe: only rows whose prediction is still empty
# are selected, so rows that already have a prediction are not re-scored.
for model_name in MODEL_NAMES:
    unfinished = comments_data2["sentiment_prediction"].isna() | (
        comments_data2["sentiment_prediction"] == ""
    )
    idxs = comments_data2.index[
        (comments_data2["suggested_model"] == model_name) & unfinished
    ].tolist()  # unfinished rows

    if not idxs:
        print(f"ℹ️ No unfinished rows for {model_name}")
        continue

    print(f"Processing {len(idxs)} comments for model {model_name}...")
    print(f"👉 Resuming {model_name} from idx {idxs[0]} to {idxs[-1]}")

    tokenizer = tokenizers[model_name]
    model = models[model_name]

    # Class index -> label text. This depends only on the model, so it is
    # built once per model instead of once per batch.
    if model_name == "Twitter-RoBERTa-Latest":
        label_map = {0: "negative", 1: "neutral", 2: "positive"}
    else:  # Multilingual-BERT 1–5 stars
        label_map = {0: "1 star", 1: "2 stars", 2: "3 stars", 3: "4 stars", 4: "5 stars"}

    for batch_num, start in enumerate(tqdm(range(0, len(idxs), BATCH_SIZE))):
        end = min(start + BATCH_SIZE, len(idxs))
        batch_idxs = idxs[start:end]

        # Comments read from CSV can be NaN (float) when the cell was empty;
        # the tokenizer only accepts strings, so coerce missing values to ""
        # and everything else to str instead of aborting the whole run.
        batch_texts = [
            "" if pd.isna(text) else str(text)
            for text in comments_data2.loc[batch_idxs, "sentiment_comment"]
        ]

        # Tokenize (pad to the longest text in the batch, cap at 512 tokens)
        inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Forward pass without gradient tracking
        with torch.no_grad():
            logits = model(**inputs).logits
            probs = softmax(logits, dim=-1)
            pred_labels = torch.argmax(probs, dim=-1).cpu().tolist()
            pred_scores = probs.max(dim=-1).values.cpu().tolist()

        # Save results back into DataFrame as "<label> (<probability>)"
        comments_data2.loc[batch_idxs, "sentiment_prediction"] = [
            f"{label_map[label_idx]} ({round(score, 3)})"
            for label_idx, score in zip(pred_labels, pred_scores)
        ]

        # Save checkpoint every SAVE_EVERY batches
        if (batch_num + 1) % SAVE_EVERY == 0:
            comments_data2.to_csv(checkpoint_file, index=False)
            print(f"💾 Checkpoint saved at batch {batch_num+1}")

# Final save
comments_data2.to_csv(checkpoint_file, index=False)
print("✅ All done! Results saved to", checkpoint_file)

In [None]:
import pandas as pd
# Read the saved checkpoint back from disk and show its last five rows.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/working/sentiment_checkpoint.csv'
)
data.tail(n=5)

In [None]:
import pandas as pd
# Read the saved checkpoint back from disk and show its last five rows.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/working/sentiment_checkpoint.csv'
)
data.tail(n=5)


In [None]:

# ---------------------------
# 7️⃣ Save back to DataFrame
# ---------------------------
# Overwrite the in-memory prediction column with the one read from the
# checkpoint file; pandas aligns the two on their row index.
comments_data2["sentiment_prediction"] = data.loc[:, "sentiment_prediction"]

In [None]:
# Write the final predictions to the Kaggle working directory, without the index column.
comments_data2.to_csv(
    path_or_buf="/kaggle/working/sentiment_prediction.csv",
    index=False,
)

In [None]:
# Preview the first five rows.
comments_data2.head(n=5)

In [None]:
# Preview the last five rows.
comments_data2.tail(n=5)

# **RELEVANCE ANALYSIS**

In [None]:
# Load the CSV of comments joined with their video text.
joined_data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/relevant-data-test/comments_with_video_text.csv"
)

In [None]:
# Preview the first five rows.
joined_data.head(n=5)

In [None]:
import os
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import torch

# -------------------------
# CONFIG
# -------------------------
MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"  # SentenceTransformer model to load
BATCH_SIZE = 32  # GPU mini-batch size (rows encoded per step)
SAVE_EVERY = 10000  # number of batches between checkpoint writes
CHECKPOINT_FILE = "/kaggle/working/joined_data_with_similarity.csv"  # resume/output file

In [None]:
# -------------------------
# LOAD MODEL (GPU if available)
# -------------------------
# Pick the device name as a plain string and load the embedding model onto it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

model = SentenceTransformer(MODEL_NAME, device=device)

In [None]:
# If a checkpoint exists, resume from it; otherwise start from the
# joined_data frame already loaded earlier in the notebook.
if os.path.exists(CHECKPOINT_FILE):
    joined_data = pd.read_csv(CHECKPOINT_FILE)
    print(f"Resumed from checkpoint: {CHECKPOINT_FILE}")
else:
    print(f"⚠️ No checkpoint found. Creating new file: {CHECKPOINT_FILE}")
    # if you are creating new, you must load the raw joined_data first
    # e.g., joined_data = pd.read_csv("joined_data.csv")
    joined_data["similarity_score"] = None
    joined_data["relevance"] = None

# Add missing columns (covers a checkpoint file that lacks either of them)
if "similarity_score" not in joined_data.columns:
    joined_data["similarity_score"] = None
if "relevance" not in joined_data.columns:
    joined_data["relevance"] = None

# Ensure text columns are strings, with missing values as empty strings.
# fillna must run BEFORE astype(str): astype(str) turns NaN into the literal
# text "nan", after which fillna has nothing left to replace.
joined_data["video_text"] = joined_data["video_text"].fillna("").astype(str)
joined_data["sentiment_comment"] = joined_data["sentiment_comment"].fillna("").astype(str)

In [None]:
# -------------------------
# FIND unfinished rows
# -------------------------
# Cosine similarity strictly above this value is labelled "Relevant".
RELEVANCE_THRESHOLD = 0.4

# A row is unfinished when its similarity score is missing or an empty string.
idxs = joined_data.index[
    (joined_data["similarity_score"].isna()) |
    (joined_data["similarity_score"] == "")
].tolist()

if not idxs:
    print("ℹ️ No unfinished rows — everything already processed.")
else:
    total_rows = len(idxs)
    print(f"Processing {total_rows} rows for model {MODEL_NAME}...")
    print(f"👉 Resuming from idx {idxs[0]} to {idxs[-1]}")

    # -------------------------
    # PROCESS IN BATCHES
    # -------------------------
    for batch_num, start in enumerate(
        tqdm(range(0, total_rows, BATCH_SIZE), desc="Batches")
    ):
        end = min(start + BATCH_SIZE, total_rows)
        batch_idxs = idxs[start:end]
        batch = joined_data.loc[batch_idxs]

        # Encode the video text and the comment text of each row separately.
        video_vecs = model.encode(
            batch["video_text"].tolist(),
            batch_size=BATCH_SIZE,
            convert_to_tensor=True,
            device=device,
            show_progress_bar=False,
        )
        comment_vecs = model.encode(
            batch["sentiment_comment"].tolist(),
            batch_size=BATCH_SIZE,
            convert_to_tensor=True,
            device=device,
            show_progress_bar=False,
        )

        # Similarity: cos_sim returns the full pairwise matrix; the diagonal
        # holds each row's video text compared with its own comment.
        sims = util.cos_sim(video_vecs, comment_vecs).diagonal().cpu().tolist()

        # Save results
        joined_data.loc[batch_idxs, "similarity_score"] = sims
        joined_data.loc[batch_idxs, "relevance"] = [
            "Relevant" if s > RELEVANCE_THRESHOLD else "Not relevant" for s in sims
        ]

        # Save checkpoint every SAVE_EVERY batches
        if (batch_num + 1) % SAVE_EVERY == 0:
            joined_data.to_csv(CHECKPOINT_FILE, index=False)
            # `end` is the exact number of rows handled so far; multiplying
            # the batch count by BATCH_SIZE over-counts when the final batch
            # is partial and would report a negative number of rows left.
            rows_done = end
            rows_left = total_rows - rows_done
            print(f"💾 Checkpoint saved at batch {batch_num+1} "
                  f"({rows_done}/{total_rows} rows, {rows_left} left)")

    # -------------------------
    # FINAL SAVE
    # -------------------------
    joined_data.to_csv(CHECKPOINT_FILE, index=False)
    print(f"✅ All done! Results saved to {CHECKPOINT_FILE}")