In [2]:
import re
import json
import os
from typing import Dict, List, Optional
from pathlib import Path


# Define the expected path for the external slang dictionary file
#SLANG_FILE_PATH = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'slangs.json')
# Project root (assuming this script is anywhere inside the project)
PROJECT_ROOT = Path(__file__).resolve().parent
while not (PROJECT_ROOT / "data").exists():
    PROJECT_ROOT = PROJECT_ROOT.parent

# Constant path to your JSON
SLANG_FILE_PATH = PROJECT_ROOT / "data" / "slangs.json"

def _load_slang_map() -> Dict[str, str]:
    """
    Attempts to load the slang map from the JSON file path.
    Falls back to a hardcoded dictionary if the file is not found or fails to load.
    """
    # Hardcoded fallback dictionary (development/testing)
    FALLBACK_SLANG_MAP: Dict[str, str] = {
        "lol": "laughing out loud",
        "brb": "be right back",
        "btw": "by the way",
        "imo": "in my opinion",
        "imho": "in my humble opinion",
        "thx": "thanks",
        "ty": "thank you",
        "tldr": "too long didn't read",
        "ikr": "i know, right",
        "fomo": "fear of missing out",
        "smh": "shaking my head",
        #"omg": "oh my god",
        "wtf": "what the hell",
        "lmao": "laughing my a** off",
        "np": "no problem",
        "afk": "away from keyboard",
        "gg": "good game",
        "gtg": "got to go",
        "rofl": "rolling on the floor laughing",
    }
    
    try:
        # Load the external JSON file
        with open(SLANG_FILE_PATH, 'r', encoding='utf-8') as f:
            print(f"INFO: Successfully loaded slang map from {SLANG_FILE_PATH}")
            return json.load(f)
    except FileNotFoundError:
        print(f"WARNING: Slang file not found at {SLANG_FILE_PATH}. Using hardcoded fallback map.")
        return FALLBACK_SLANG_MAP
    except json.JSONDecodeError:
        print(f"ERROR: Failed to decode JSON from {SLANG_FILE_PATH}. Using hardcoded fallback map.")
        return FALLBACK_SLANG_MAP
    except Exception as e:
        print(f"An unexpected error occurred during slang map loading: {e}. Using hardcoded fallback map.")
        return FALLBACK_SLANG_MAP

# Load the slang map once when the module is imported
SLANG_MAP = _load_slang_map()

def expand_slang_and_abbreviations(text: str) -> str:
    """
    Standardizes text by expanding common internet slang and abbreviations 
    based on the loaded map. This function is case-insensitive during matching.

    Args:
        text: The input string (e.g., "This is awesome lol btw I'm late").

    Returns:
        The string with slang expanded (e.g., "This is awesome laughing out loud by the way I'm late").
    """
    if not text:
        return text

    # 1. Lowercase the entire text and split into words
    words: List[str] = text.lower().split()
    
    # 2. Process words and replace slang
    expanded_words: List[str] = []
    
    for word in words:
        # Check if the word, after stripping any trailing punctuation, is in the slang map.
        # We need to preserve punctuation (like periods or commas) if they are attached 
        # to the slang, but we match the core word.
        clean_word = re.sub(r'[\W_]+$', '', word) # Remove trailing non-word characters
        
        if clean_word in SLANG_MAP:
            # Replace the word with the expanded version
            expanded_slang = SLANG_MAP[clean_word]
            
            # If the original word had trailing punctuation (e.g., "lol."), 
            # we re-attach it to the expanded phrase.
            trailing_punc = re.search(r'([\W_]+$)', word)
            if trailing_punc:
                # We need to correctly handle multi-word expansion (e.g., "laughing out loud")
                # and re-attach punctuation to the last word of the expanded phrase.
                expanded_slang_list = expanded_slang.split()
                if expanded_slang_list:
                    expanded_slang_list[-1] += trailing_punc.group(1)
                expanded_words.extend(expanded_slang_list)
            else:
                expanded_words.extend(expanded_slang.split())
        else:
            expanded_words.append(word)

    return ' '.join(expanded_words)

# Example Usage to test the function
if __name__ == "__main__":
    test_comment_1 = "OMG this is the best tutorial ever lol! IKR?"
    test_comment_2 = "BTW, I think this is too long. TLDR. SMH."
    test_comment_3 = "thx for the help. np."
    from pathlib import Path
    
    print(f"Original 1: {test_comment_1}")
    print(f"Expanded 1: {expand_slang_and_abbreviations(test_comment_1)}\n")
    
    print(f"Original 2: {test_comment_2}")
    print(f"Expanded 2: {expand_slang_and_abbreviations(test_comment_2)}\n")

    print(f"Original 3: {test_comment_3}")
    print(f"Expanded 3: {expand_slang_and_abbreviations(test_comment_3)}\n")


NameError: name '__file__' is not defined

In [9]:
import json

# Load the JSON file
file_path = os.path.join("..", "data", "slang_words_normalized.json")
with open(file_path, "r", encoding="utf-8") as f:
    slang_dict = json.load(f)

# Check a few slangs
print(slang_dict.get("omg"))   # Example: Check "OMG"
print(slang_dict.get("BTW"))   # Example: Check "BRB"
print(slang_dict.get("omg"))   # Example: Check "LOL"
num_entries = len(slang_dict)
print(f"Total entries in slang JSON: {num_entries}")

oh my god
None
oh my god
Total entries in slang JSON: 2886


In [1]:
from transformers import MarianMTModel, MarianTokenizer
import torch

# Model name for multi-language ‚Üí English
MODEL_NAME = "Helsinki-NLP/opus-mt-mul-en"

# Load tokenizer and model
tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)
model = MarianMTModel.from_pretrained(MODEL_NAME)

# Use GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f"Model loaded on device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Model loaded on device: cpu


In [2]:
def translate_offline(sentences, batch_size=8):
    """
    Translate a list of sentences to English using MarianMT offline.
    """
    translations = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        # Tokenize
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)
        # Generate translation IDs
        translated = model.generate(**inputs)
        # Decode translation
        translated_texts = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
        translations.extend(translated_texts)
    return translations

In [3]:
examples = [
    "Bonjour, comment √ßa va ?",        # French
    "Hola amigo, ¬øc√≥mo est√°s?",        # Spanish
    "‡§Ø‡§π ‡§¨‡§π‡•Å‡§§ ‡§Ö‡§ö‡•ç‡§õ‡§æ ‡§π‡•à",                 # Hindi
    "Dies ist ein Test",               # German
    "Ciao, come stai?",               # Italian
]

translated = translate_offline(examples)
for original, trans in zip(examples, translated):
    print(f"Original: {original}\nTranslated: {trans}\n")

Original: Bonjour, comment √ßa va ?
Translated: Hello, how's it going?

Original: Hola amigo, ¬øc√≥mo est√°s?
Translated: Hey, buddy, how are you?

Original: ‡§Ø‡§π ‡§¨‡§π‡•Å‡§§ ‡§Ö‡§ö‡•ç‡§õ‡§æ ‡§π‡•à
Translated: It's very good.

Original: Dies ist ein Test
Translated: This is a test

Original: Ciao, come stai?
Translated: Hey, how are you?



In [3]:
import pandas as pd

# Load both CSV files
df1 = pd.read_csv("C:/Users/Vineeth/Desktop/SVRE_P1/data/raw/comments.csv")
df2 = pd.read_csv("C:/Users/Vineeth/Desktop/SVRE_P1/data/raw/raw_comments.csv")

# Concatenate row-wise
df_combined = pd.concat([df1, df2], ignore_index=True)

# Save to a new CSV
df_combined.to_csv("C:/Users/Vineeth/Desktop/SVRE_P1/data/processed/all_comments_combined.csv", index=False)

In [1]:
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

PyTorch version: 2.5.1+cu121
CUDA available: True
CUDA version: 12.1
GPU: NVIDIA GeForce RTX 3050 Laptop GPU
GPU Memory: 4.00 GB


In [None]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# ---------------------- Configuration ----------------------
MODEL_PATH = "C:\Users\Vineeth\Desktop\SVRE_P1\models\distillbert"  # Path to your saved model folder
device = torch.device("cpu")  # Use CPU for inference

# ---------------------- Load model and tokenizer ----------------------
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_PATH)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
model.to(device)
model.eval()  # Set model to evaluation mode

# ---------------------- Label mappings ----------------------
# Make sure these match the mapping used during training
label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {v: k for k, v in label2id.items()}

# ---------------------- Test sentences ----------------------
sentences = [
    "I love this product! It's amazing.",
    "The movie was okay, not great but not bad either.",
    "This is the worst service I have ever received."
]

# ---------------------- Tokenization ----------------------
inputs = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors="pt"
)

# Move inputs to the correct device
inputs = {k: v.to(device) for k, v in inputs.items()}

# ---------------------- Inference ----------------------
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_indices = torch.argmax(logits, dim=-1).cpu().numpy()

# Map predicted indices back to labels
predicted_labels = [id2label[idx] for idx in predicted_indices]

# ---------------------- Display results ----------------------
for sentence, label in zip(sentences, predicted_labels):
    print(f"Sentence: {sentence}")
    print(f"Predicted Sentiment: {label}")
    print("-" * 50)


In [3]:
from transformers import DistilBertForSequenceClassification

model_path = r"C:\Users\Vineeth\Desktop\SVRE_P1\models\distillbert"
model = DistilBertForSequenceClassification.from_pretrained(model_path)

print("id2label:", model.config.id2label)
print("label2id:", model.config.label2id)


  from .autonotebook import tqdm as notebook_tqdm


id2label: {0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2'}
label2id: {'LABEL_0': 0, 'LABEL_1': 1, 'LABEL_2': 2}


In [4]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
import torch

model_path = r"C:\Users\Vineeth\Desktop\SVRE_P1\models\distillbert"
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

test_sentences = [
    "I love this product!",    # positive
    "This is terrible.",       # negative
    "It's okay, not bad."      # neutral
]

inputs = tokenizer(test_sentences, padding=True, truncation=True, return_tensors="pt").to(device)
with torch.no_grad():
    logits = model(**inputs).logits
    preds = torch.argmax(logits, dim=1).cpu().numpy()

for s, p in zip(test_sentences, preds):
    print(f"{s} ‚Üí {p}")


I love this product! ‚Üí 2
This is terrible. ‚Üí 0
It's okay, not bad. ‚Üí 1


In [5]:
from transformers import DistilBertForSequenceClassification

model_path = r"C:\Users\Vineeth\Desktop\SVRE_P1\models\distillbert"
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Map old labels to numeric scores directly
id2label = {0: -1, 1: 0, 2: 1}  # 0->Negative(-1), 1->Neutral(0), 2->Positive(1)
label2id = {v: k for k, v in id2label.items()}

# Update model config
model.config.id2label = id2label
model.config.label2id = label2id

# Save updated model
model.save_pretrained(model_path)
print("‚úÖ Model labels updated to -1, 0, 1 and saved.")


SafetensorError: Error while serializing: I/O error: The requested operation cannot be performed on a file with a user-mapped section open. (os error 1224)

In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
import torch

# Load model and tokenizer
model_path = r"C:\Users\Vineeth\Desktop\SVRE_P1\models\distillbert"
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Test sentences
test_sentences = [
    "I love this product!",    # positive
    "This is terrible.",       # negative
    "It's okay, not bad."      # neutral
]

# Tokenize inputs
inputs = tokenizer(test_sentences, padding=True, truncation=True, return_tensors="pt").to(device)

# Get predictions
with torch.no_grad():
    logits = model(**inputs).logits
    preds = torch.argmax(logits, dim=1).cpu().numpy()

# Map model IDs to -1, 0, 1
id2score = {0: -1, 1: 0, 2: 1}

# Print results
for s, p in zip(test_sentences, preds):
    score = id2score[p]
    print(f"{s} ‚Üí {score}")


I love this product! ‚Üí 1
This is terrible. ‚Üí -1
It's okay, not bad. ‚Üí 0


In [2]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
import torch
import pandas as pd

# -------------------- Load model and tokenizer --------------------
model_path = r"C:\Users\Vineeth\Desktop\SVRE_P1\models\distillbert_finetuned_v2"
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# -------------------- Load inference data --------------------
df = pd.read_csv(r"C:\Users\Vineeth\Desktop\SVRE_P1\data\inference\inference_new.csv")  # CSV must have a column 'Comment'
comments = df['text'].tolist()

# -------------------- Tokenize and predict --------------------
inputs = tokenizer(comments, padding=True, truncation=True, return_tensors="pt").to(device)
with torch.no_grad():
    logits = model(**inputs).logits
    preds = torch.argmax(logits, dim=1).cpu().numpy()

# -------------------- Map model IDs to sentiment scores --------------------
id2score = {0: -1, 1: 0, 2: 1}
scores = [id2score[p] for p in preds]

# -------------------- Add predictions to DataFrame --------------------
df['Predicted_Score'] = scores
df['Predicted_Label'] = ['Negative' if s==-1 else 'Neutral' if s==0 else 'Positive' for s in scores]

# -------------------- Calculate overall score --------------------
overall_score = sum(scores)

# -------------------- Save results --------------------
output_path = r"C:\Users\Vineeth\Desktop\SVRE_P1\data\inference_results2.csv"
df.to_csv(output_path, index=False)

# -------------------- Print results --------------------
for comment, label, score in zip(df['text'], df['Predicted_Label'], df['Predicted_Score']):
    print(f"Comment: {comment}\nPredicted Label: {label}, Score: {score}\n")

print(f"Overall sentiment score: {overall_score}")


Comment: excellent tutorial the visuals were great
Predicted Label: Positive, Score: 1

Comment: i loved spending an hour just to get to the main point brilliant pacing
Predicted Label: Positive, Score: 1

Comment: oh yes very clear i totally understand how that works now what a masterpiece
Predicted Label: Positive, Score: 1

Comment: the instructor explained the formula clearly and effectively
Predicted Label: Positive, Score: 1

Comment: i feel much more confident about the topic now
Predicted Label: Positive, Score: 1

Comment: a solid explanation of a complex subject
Predicted Label: Neutral, Score: 0

Comment: the content was a bit advanced but still very useful
Predicted Label: Neutral, Score: 0

Comment: i appreciate the effort put into this lesson
Predicted Label: Positive, Score: 1

Comment: my brain cells are having a party of pure confusion this is top-tier education
Predicted Label: Positive, Score: 1

Comment: the audio quality is so bad i can barely hear what the instruc

In [4]:
# üß© Step 1: Install dependencies (if not already)
!pip install google-api-python-client tqdm


Collecting google-api-python-client
  Using cached google_api_python_client-2.184.0-py3-none-any.whl.metadata (7.0 kB)
Collecting google-auth-httplib2<1.0.0,>=0.2.0 (from google-api-python-client)
  Using cached google_auth_httplib2-0.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0,>=1.31.5 (from google-api-python-client)
  Using cached google_api_core-2.26.0-py3-none-any.whl.metadata (3.2 kB)
Using cached google_api_python_client-2.184.0-py3-none-any.whl (14.3 MB)
Using cached google_api_core-2.26.0-py3-none-any.whl (162 kB)
Using cached google_auth_httplib2-0.2.0-py2.py3-none-any.whl (9.3 kB)
Installing collected packages: google-auth-httplib2, google-api-core, google-api-python-client

   ------------- -------------------------- 1/3 [google-api-core]
   ------------- -------------------------- 1/3 [google-api-core]
   ------------- -------------------------- 1/3 [google-api-core]
   ------------- -------------------------- 

In [13]:
import os
import pandas as pd
from googleapiclient.discovery import build
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
import torch
from src.components.data_preprocessing import TextPreprocessor
from dotenv import load_dotenv


ModuleNotFoundError: No module named 'src'

In [6]:
# üß© Step 3: Set up YouTube API
API_KEY = "AIzaSyCoz9NrmBu5mFRm_-qD4XoTFaqu7AGvGeU"  # üîπ Replace with your YouTube API key
youtube = build('youtube', 'v3', developerKey=API_KEY)

# Function to extract video ID from a full YouTube URL
def extract_video_id(url):
    import re
    pattern = r"(?:v=|\/)([0-9A-Za-z_-]{11}).*"
    match = re.search(pattern, url)
    return match.group(1) if match else url


In [7]:
# üß© Step 4: Fetch YouTube comments
def fetch_youtube_comments(video_url, max_comments=100):
    video_id = extract_video_id(video_url)
    comments = []
    
    request = youtube.commentThreads().list(
        part="snippet",
        videoId=video_id,
        maxResults=100,
        textFormat="plainText"
    )
    response = request.execute()

    while response:
        for item in response["items"]:
            comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            comments.append(comment)
            if len(comments) >= max_comments:
                return comments
        
        if "nextPageToken" in response:
            response = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                pageToken=response["nextPageToken"],
                maxResults=100,
                textFormat="plainText"
            ).execute()
        else:
            break
    return comments


In [8]:
# üß© Step 5: Test comment fetching
video_url = "https://www.youtube.com/watch?v=liBWJp2OfUU&list=PLXj4XH7LcRfDlQklXu3Hrtru-bm2dJ9Df&index=2"  # üîπ Replace with an educational video link
comments = fetch_youtube_comments(video_url, max_comments=200)
print(f"‚úÖ Fetched {len(comments)} comments.")
pd.DataFrame(comments, columns=["comment"]).head(10)


‚úÖ Fetched 12 comments.


Unnamed: 0,comment
0,happy teachers day sir
1,Thanku sir
2,Thank you sir for ur easy explanation
3,Thanku so much
4,Is pop and structured programing same
5,üòçüòçüòçplease do make these videos.
6,Plz explain us concept of oop in c++
7,Sir i learned this from u r class only it is v...
8,Nice sir tq for explaining the opp and pop
9,Sir this video is super good to know differenc...


In [9]:
# üß© Step 6: Load your fine-tuned model
model_path = r"C:\Users\Vineeth\Desktop\SVRE_P1\models\distillbert_finetuned_v2"

model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Label mapping
id2label = {0: "Negative", 1: "Neutral", 2: "Positive"}
id2score = {0: -1, 1: 0, 2: 1}


In [10]:
# üß© Step 7: Predict sentiment for comments
def predict_sentiment(comments):
    results = []
    batch_size = 16  # Adjust for GPU memory
    
    for i in tqdm(range(0, len(comments), batch_size)):
        batch = comments[i:i + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            logits = model(**inputs).logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
        
        for text, pred in zip(batch, preds):
            results.append({
                "comment": text,
                "pred_label": id2label[pred],
                "score": id2score[pred]
            })
    return pd.DataFrame(results)


In [11]:
# üß© Step 8: Run inference on fetched comments
df_results = predict_sentiment(comments)
df_results.head(10)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  3.14it/s]


Unnamed: 0,comment,pred_label,score
0,happy teachers day sir,Neutral,0
1,Thanku sir,Positive,1
2,Thank you sir for ur easy explanation,Positive,1
3,Thanku so much,Positive,1
4,Is pop and structured programing same,Neutral,0
5,üòçüòçüòçplease do make these videos.,Neutral,0
6,Plz explain us concept of oop in c++,Neutral,0
7,Sir i learned this from u r class only it is v...,Positive,1
8,Nice sir tq for explaining the opp and pop,Neutral,0
9,Sir this video is super good to know differenc...,Neutral,0


In [None]:
# üß© Step 9: Calculate overall sentiment score
overall_score = df_results["score"].sum()
print(f"üéØ Overall sentiment score: {overall_score}")

# Show 10 random predictions
sample_predictions = df_results.sample(10, random_state=42)
sample_predictions
