In [None]:
# Install pandas, scikit-learn and the Gemini SDK into the runtime (-q keeps pip's output short).
!pip install -q pandas scikit-learn google-generativeai

In [None]:
# Recursively list the working directory to see which files are currently present.
%ls -R

In [None]:
from google.colab import files

# The result of the upload is kept in `uploaded`.
# NOTE(review): the extraction cell below opens "sentiment labelled sentences.zip" from the
# working directory, so that archive is presumably the file to pick here.
uploaded = files.upload()   # this will open a file picker


# ZIP File Extraction & Dataset Loading

In [None]:
import zipfile

zip_path = "sentiment labelled sentences.zip"   # file is in /content/

with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall("sentiment_labelled_sentences")  # extract into this folder

print("✅ Extracted successfully!")
!ls sentiment_labelled_sentences


In [None]:
import pandas as pd

# Tab-separated file without a header row; the two columns are named "text" and "label".
amazon_path = "sentiment_labelled_sentences/sentiment labelled sentences/amazon_cells_labelled.txt"
amazon_df = pd.read_csv(amazon_path, sep="\t", header=None, names=["text", "label"])

# Quick sanity check: dimensions plus the first few rows.
print("Amazon shape:", amazon_df.shape)
print(amazon_df.head())


In [None]:
import csv

# The IMDb sentences contain stray double-quote characters. With pandas' default quoting,
# a sentence that opens a quote swallows the following lines into a single field, so rows
# are silently merged and labels are lost. QUOTE_NONE treats quotes as ordinary text,
# giving exactly one row per line.
# NOTE(review): the dataset is expected to hold 1000 IMDb sentences — confirm against the
# shape printed below.
imdb_df = pd.read_csv(
    "sentiment_labelled_sentences/sentiment labelled sentences/imdb_labelled.txt",
    delimiter="\t", header=None, names=["text", "label"],
    quoting=csv.QUOTE_NONE,
)

print("IMDb shape:", imdb_df.shape)
print(imdb_df.head())


In [None]:
# Tab-separated file without a header row; the two columns are named "text" and "label".
yelp_path = "sentiment_labelled_sentences/sentiment labelled sentences/yelp_labelled.txt"
yelp_df = pd.read_csv(yelp_path, sep="\t", header=None, names=["text", "label"])

# Quick sanity check: dimensions plus the first few rows.
print("Yelp shape:", yelp_df.shape)
print(yelp_df.head())


# Step 1: The Classic ML Classifier

In [None]:
# ===============================
# 📌 Step 1: Import Libraries
# ===============================
import pandas as pd
import numpy as np
import re
import string
import nltk

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Download NLTK's "stopwords" resource before it is read below.
nltk.download("stopwords")
from nltk.corpus import stopwords
# English stopwords held as a set; clean_text() filters tokens against it.
stop_words = set(stopwords.words("english"))

In [None]:
# ===============================
# 📌 Step 2: Load the datasets
# (already extracted earlier)
# ===============================
import csv

# Folder created by the extraction cell; all three labelled files live here.
DATA_DIR = "sentiment_labelled_sentences/sentiment labelled sentences"

# Shared parsing options: tab-separated, no header row, columns named "text" and "label".
# QUOTE_NONE matters: some sentences (notably in the IMDb file) contain stray double
# quotes, and with pandas' default quoting such a sentence swallows the following lines
# into one field, silently merging rows. Treating quotes as plain text keeps one row per line.
READ_OPTIONS = dict(
    delimiter="\t", header=None, names=["text", "label"], quoting=csv.QUOTE_NONE
)

amazon_df = pd.read_csv(f"{DATA_DIR}/amazon_cells_labelled.txt", **READ_OPTIONS)
imdb_df = pd.read_csv(f"{DATA_DIR}/imdb_labelled.txt", **READ_OPTIONS)
yelp_df = pd.read_csv(f"{DATA_DIR}/yelp_labelled.txt", **READ_OPTIONS)

# Combine into one dataset (fresh 0..n-1 index)
df = pd.concat([amazon_df, imdb_df, yelp_df], ignore_index=True)
print("✅ Dataset loaded. Shape:", df.shape)

In [None]:
# ===============================
# 📌 Step 3: Preprocess text
# ===============================
def clean_text(text):
    """
    Normalise one review for bag-of-words features.

    The text is lowercased, URL-like chunks (starting with "http" or "www") are
    dropped, every character other than a-z or whitespace is removed, and the
    remaining whitespace-separated tokens are filtered against the module-level
    ``stop_words`` set.

    Returns the surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    without_urls = re.sub(r"http\S+|www\S+", "", lowered)
    letters_only = re.sub(r"[^a-z\s]", "", without_urls)
    return " ".join(word for word in letters_only.split() if word not in stop_words)

# Store the cleaned text in a new "clean_text" column, computed row by row from "text".
df["clean_text"] = df["text"].apply(clean_text)
print("✅ Text preprocessing complete.")

In [None]:
# ===============================
# 📌 Step 4: Vectorization (TF-IDF)
# ===============================
y = df["label"]

# Train-test split on the raw text FIRST. Fitting TF-IDF on the full corpus before
# splitting would let the held-out sentences shape the vocabulary and IDF weights
# (data leakage), making the test scores optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df["clean_text"], y, test_size=0.2, random_state=42, stratify=y
)

# Learn vocabulary/IDF from the training text only, then reuse it for the test text.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus matrix under the train-fitted vocabulary; kept only so the name `X`
# stays defined for any cell that still refers to it.
X = vectorizer.transform(df["clean_text"])

print("✅ Data vectorized and split. Train size:", X_train.shape, " Test size:", X_test.shape)


In [None]:
# ===============================
# 📌 Step 5: Train Classic ML Models
# ===============================
# Candidate classifiers, keyed by the display name used in the reports and plot titles.
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("Naive Bayes", MultinomialNB()),
    ("SVM", LinearSVC()),
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
])

# name -> [accuracy, precision, recall, f1], filled in by the loop below.
results = {}
metric_fns = (accuracy_score, precision_score, recall_score, f1_score)

for name, model in models.items():
    # fit() returns the estimator itself, so training and prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Scores are computed in the same order as the row labels used for the comparison table.
    acc, prec, rec, f1 = (score(y_test, y_pred) for score in metric_fns)
    results[name] = [acc, prec, rec, f1]

    print(f"\n📌 {name} Results")
    print(classification_report(y_test, y_pred))

    # Confusion matrix drawn as an annotated heatmap.
    cm = confusion_matrix(y_test, y_pred)
    ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    ax.set(title=f"{name} - Confusion Matrix", xlabel="Predicted", ylabel="Actual")
    plt.show()

In [None]:
# ===============================
# 📌 Step 6: Compare Models
# ===============================
# One column per model, one row per metric; the row labels follow the order of the
# [acc, prec, rec, f1] lists stored in `results` by the training loop.
results_df = pd.DataFrame(results, index=["Accuracy", "Precision", "Recall", "F1"])
print("\n✅ Model Comparison:")
print(results_df)

# Step 2: The Modern LLM Analyzer

In [None]:
# ===============================
# 📌 Step 1: Install Gemini SDK
# ===============================
# NOTE(review): google-generativeai is already installed by the first cell of this notebook;
# this line only matters when the LLM section is run on its own.
!pip install -q google-generativeai


In [None]:
# ===============================
# 📌 Step 2: Import Libraries
# ===============================
import os
import pandas as pd
import google.generativeai as genai
from sklearn.metrics import accuracy_score, classification_report

In [None]:
from google.colab import userdata

# Never leave the secret as the cell's last expression: Jupyter would render the key in
# the cell output, and it would then be saved and shared with the notebook.
# Report only whether a value is available.
print("GOOGLE_API_KEY available:", bool(userdata.get('GOOGLE_API_KEY')))

In [None]:
# ===============================
# 📌 Step 3: Configure Gemini API
# ===============================
# The key is read from a Colab secret named GOOGLE_API_KEY (not from an OS environment
# variable), so it never has to appear in the notebook source.
from google.colab import userdata
import google.generativeai as genai

# Retrieve securely from Colab Secrets
api_key = userdata.get("GOOGLE_API_KEY")

# Stop early with a clear message if the secret came back empty.
if not api_key:
    raise RuntimeError("❌ GOOGLE_API_KEY not found in Colab secrets")

# Configure Gemini client
genai.configure(api_key=api_key)
print("✅ Gemini API configured successfully (key loaded securely)")


In [None]:
# ===============================
# 📌 Step 4: Define Sentiment Function
# ===============================
import re

# A standalone sentiment token in a reply: the words positive/negative or the digits 1/0.
# Word boundaries keep digits inside larger numbers (e.g. "10") from counting as a label.
_SENTIMENT_TOKEN = re.compile(r"\b(positive|negative|1|0)\b", re.IGNORECASE)


def gemini_sentiment(text):
    """
    Ask Gemini to classify the sentiment of one review.

    Uses the module-level ``gemini_model``, which must be initialised before the
    first call (the name is looked up when the function runs, not when it is defined).

    Args:
        text: The review text; it is inserted verbatim into the prompt.

    Returns:
        1 for positive, 0 for negative, -1 when the reply contains no recognisable
        label or carries no text at all.

    Errors raised by ``generate_content`` itself (network, quota, ...) are not
    caught here and propagate to the caller.
    """
    prompt = f"Classify the sentiment of this review as Positive (1) or Negative (0):\n\n{text}"
    response = gemini_model.generate_content(prompt)

    try:
        prediction = response.text.strip()
    except ValueError:
        # The SDK's `.text` accessor raises ValueError when the reply has no text part
        # (for example a blocked response); treat that like an unparseable answer.
        return -1

    # Decide on the FIRST label token only. Plain substring checks would return 1 for a
    # reply such as "Negative (0), not Positive (1)" merely because "1" occurs somewhere.
    found = _SENTIMENT_TOKEN.search(prediction)
    if found is None:
        return -1  # fallback for ambiguous cases
    return 1 if found.group(1).lower() in ("positive", "1") else 0

In [None]:
# ===============================
# 📌 Step 4.1: Initialize Gemini Model
# ===============================
# Create the shared model handle; gemini_sentiment() reads the module-level name
# `gemini_model` each time it is called, so this cell must run before the first call.
# The model name used here is 'models/gemini-1.5-flash-latest'; swap the string to try another model.
gemini_model = genai.GenerativeModel('models/gemini-1.5-flash-latest')

print("✅ Gemini model initialized.")

In [None]:
# ===============================
# 📌 Step 5: Run on Sample Dataset
# ===============================
# The three labelled files are re-read here so this section does not depend on the
# Classic ML cells having been run (note: this rebinds `df` to the freshly loaded data).
# The sample is kept very small to stay within the Gemini API quota.
import csv
import pandas as pd

# Folder created by the extraction cell; all three labelled files live here.
DATA_DIR = "sentiment_labelled_sentences/sentiment labelled sentences"

# Shared parsing options: tab-separated, no header row, columns named "text" and "label".
# QUOTE_NONE matters: some sentences (notably in the IMDb file) contain stray double
# quotes, and with pandas' default quoting such a sentence swallows the following lines
# into one field, silently merging rows. Treating quotes as plain text keeps one row per line.
READ_OPTIONS = dict(
    delimiter="\t", header=None, names=["text", "label"], quoting=csv.QUOTE_NONE
)

amazon_df = pd.read_csv(f"{DATA_DIR}/amazon_cells_labelled.txt", **READ_OPTIONS)
imdb_df = pd.read_csv(f"{DATA_DIR}/imdb_labelled.txt", **READ_OPTIONS)
yelp_df = pd.read_csv(f"{DATA_DIR}/yelp_labelled.txt", **READ_OPTIONS)

# Combine into one dataset
df = pd.concat([amazon_df, imdb_df, yelp_df], ignore_index=True)
print("✅ Dataset loaded. Shape:", df.shape)

# Fixed seed so the same three reviews are drawn on every run.
sample_df = df.sample(3, random_state=42).reset_index(drop=True)

# Apply Gemini sentiment analysis (one API call per sampled row)
sample_df["gemini_pred"] = sample_df["text"].apply(gemini_sentiment)

print(sample_df[["text", "label", "gemini_pred"]])

In [None]:
# ===============================
# 📌 Step 6: Evaluate Performance
# ===============================
y_true = sample_df["label"]
y_pred = sample_df["gemini_pred"]

# gemini_sentiment() returns -1 when a reply could not be interpreted. Surface that count
# explicitly instead of letting -1 show up as a third "class" in the report.
n_unparsed = int((y_pred == -1).sum())
if n_unparsed:
    print(f"⚠️ {n_unparsed} of {len(y_pred)} Gemini replies could not be parsed (counted as wrong).")

# Unparsed replies never equal a true label, so they count against accuracy.
print("\n✅ Gemini Accuracy:", accuracy_score(y_true, y_pred))

# Restrict the report to the two real labels; zero_division=0 avoids undefined-metric
# warnings when the tiny sample happens to miss a class.
print(
    "\nClassification Report:\n",
    classification_report(y_true, y_pred, labels=[0, 1], zero_division=0),
)

In [None]:
import google.generativeai as genai

# Keep only the models whose supported methods include 'generateContent',
# then print their names one per line.
content_models = (
    candidate
    for candidate in genai.list_models()
    if 'generateContent' in candidate.supported_generation_methods
)
for m in content_models:
    print(m.name)