<a href="https://colab.research.google.com/github/syedrizwan-afk/AI_Auto_Reply/blob/main/nlp_sentiment_noauth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP Sentiment Project ‚Äî No Auth (use_auth_token=False)

This notebook forces Hugging Face model downloads to run anonymously (`use_auth_token=False`) so it won't ask for an API key for public models.

## 1. Setup & Imports

In [7]:
# !pip install -q pandas numpy matplotlib seaborn scikit-learn nltk joblib requests transformers datasets evaluate sentence-transformers accelerate
import os, re, string, joblib, requests, traceback
from io import StringIO
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
import nltk; nltk.download('stopwords', quiet=True)
print('‚úÖ Imports ready')

‚úÖ Imports ready


## 2. Robust Loader (Local ‚Üí Signed URL ‚Üí Fallback) + Automatic Column Detection

In [8]:
local_path = 'data/my_reviews.csv'   # put your local CSV here if you have it
signed_url = "https://storage.googleapis.com/kagglesdsdata/datasets/605165/1085454/test.txt?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20251101%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20251101T201945Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=8c9db2a3261d8b204babfacc6ff03deb7543f04d33b5dad7d6b3da7402d04de1513826c749245635687a6d7cec7a011418f665abe1316024ec5e22dcd3b5bf765f051b91bf1ec89c3628f268e20b01491844f729874d16fe9a0ed4aaaa6732bd9073df25505d11c6c4c4a60d323a6de9e776d6420754fc57723cf7cb13ab7743048c8694b6b671c7113654f35f19f90cc26ad5add3d6df29c7b8c46f8df84e8c204b498010b48b7e78e230416b041a10ef0344b260a7a457db2c8b2471aafc1401f7d8404334cdf9d5d7e10912e9344afce598e8c39bc2546e65222f55e5c0b27af6b8469e57ae30b5a562cb59d2b60896c908d65635aa61f29d1348eb492ee6"  # replace with your signed URL or leave as ""
fallback_url = 'https://raw.githubusercontent.com/kolaveridi/kaggle-Twitter-US-Airline-Sentiment-/master/Tweets.csv'
save_path = 'data/reviews_real.csv'
os.makedirs('data', exist_ok=True)

# === helpers ===
def try_parse_text(content):
    """Try parsing CSV with common separators, then JSON lines."""
    for sep in [',','\t','|',';']:
        try:
            df = pd.read_csv(StringIO(content), sep=sep)
            if df.shape[1] > 1:
                return df
        except Exception:
            continue
    # try json lines
    try:
        return pd.read_json(StringIO(content), lines=True)
    except Exception:
        pass
    raise ValueError("Unable to parse downloaded content into a DataFrame.")

# === load ===
df = None

# 1) local file preferred
if os.path.exists(local_path):
    print("üìÇ Using local dataset:", local_path)
    df = pd.read_csv(local_path)
else:
    # 2) try signed_url if provided
    if signed_url and signed_url.strip():
        try:
            print("üåê Attempting download from signed URL...")
            resp = requests.get(signed_url, timeout=30)
            resp.raise_for_status()
            print("‚úÖ Download succeeded from signed URL (status_code=%s)" % resp.status_code)
            content = resp.content.decode('utf-8', errors='replace')
            df = try_parse_text(content)
        except Exception as e:
            print("‚ùå Signed URL download/parse failed:", type(e).__name__, str(e))
            traceback.print_exc()
            df = None

    # 3) fallback public dataset if signed URL not available or failed
    if df is None:
        try:
            print("‚¨áÔ∏è Downloading fallback public dataset (Twitter Airline)...")
            resp = requests.get(fallback_url, timeout=30)
            resp.raise_for_status()
            df_f = pd.read_csv(StringIO(resp.content.decode('utf-8', errors='replace')))
            # expected columns: 'text' and 'airline_sentiment'
            if {'text','airline_sentiment'}.issubset(df_f.columns):
                df = df_f[['text','airline_sentiment']].rename(columns={'airline_sentiment':'label'})
                print("‚úÖ Fallback dataset loaded. Shape:", df.shape)
            else:
                # fallback: take first two columns as text,label
                if df_f.shape[1] >= 2:
                    df = df_f.iloc[:, :2].copy()
                    df.columns = ['text','label']
                    print("‚ö†Ô∏è Fallback dataset: using first two columns as text,label. Shape:", df.shape)
                else:
                    raise ValueError("Fallback dataset could not be parsed (too few columns).")
        except Exception as e:
            print("‚ùå Failed to load fallback dataset. See traceback:")
            traceback.print_exc()
            raise RuntimeError("All dataset loading attempts failed. Provide a local CSV or a working signed URL.") from e

# === automatic column detection & normalization ===
print("Columns detected:", list(df.columns))
candidate_text_cols = ['text','tweet','review','content','message','body','sentence','comment']
candidate_label_cols = ['label','sentiment','airline_sentiment','sentiment_label','polarity','target']

cols_lower = {c.lower(): c for c in df.columns}
found_text = next((cols_lower[t] for t in candidate_text_cols if t in cols_lower), None)
found_label = next((cols_lower[l] for l in candidate_label_cols if l in cols_lower), None)

# heuristics for text column if not found
if found_text is None:
    obj_cols = [c for c in df.columns if df[c].dtype == object]
    if not obj_cols:
        raise ValueError("No string-like column found for text.")
    avg_len = {c: df[c].astype(str).map(len).mean() for c in obj_cols}
    found_text = max(avg_len, key=avg_len.get)
    print(f"‚ö†Ô∏è Heuristic chosen text column: '{found_text}' (largest average length).")

# heuristics for label column if not found
if found_label is None:
    other_cols = [c for c in df.columns if c != found_text]
    if not other_cols:
        raise ValueError("No candidate label column found.")
    # pick column with small unique count (likely labels)
    unique_counts = sorted([(c, df[c].nunique()) for c in other_cols], key=lambda x: (x[1], x[0]))
    chosen = next((c for c, uq in unique_counts if uq <= max(100, max(1, int(len(df)/10)))), None)
    if chosen is None:
        chosen = unique_counts[0][0]
    found_label = chosen
    print(f"‚ö†Ô∏è Heuristic chosen label column: '{found_label}' (unique values={df[found_label].nunique()}).")

# rename to standard names and final clean
df = df.rename(columns={found_text: 'text', found_label: 'label'})
if 'airline_sentiment' in df.columns and 'label' not in df.columns:
    df = df.rename(columns={'airline_sentiment':'label'})

df['text'] = df['text'].astype(str).str.strip()
df = df[df['text'] != ''].reset_index(drop=True)

# save normalized dataset
df.to_csv(save_path, index=False)
print("‚úÖ Normalized dataset saved to", save_path)
print("Shape:", df.shape)
print("Label distribution:\n", df['label'].value_counts().head(20))

üåê Attempting download from signed URL...
‚úÖ Download succeeded from signed URL (status_code=200)
Columns detected: ['im feeling rather rotten so im not very ambitious right now', 'sadness']
‚ö†Ô∏è Heuristic chosen text column: 'im feeling rather rotten so im not very ambitious right now' (largest average length).
‚ö†Ô∏è Heuristic chosen label column: 'sadness' (unique values=6).
‚úÖ Normalized dataset saved to data/reviews_real.csv
Shape: (1999, 2)
Label distribution:
 label
joy         695
sadness     580
anger       275
fear        224
love        159
surprise     66
Name: count, dtype: int64


## 3. TF-IDF Baseline

In [9]:
from sklearn.preprocessing import LabelEncoder
stop_words = set(stopwords.words('english'))
def preprocess(t):
    t = re.sub(r"http\S+|www\S+", "", t.lower())
    t = re.sub(r"@\w+", "", t)
    t = t.translate(str.maketrans('', '', string.punctuation))
    return " ".join([w for w in t.split() if w not in stop_words])
df['clean_text'] = df['text'].apply(preprocess)
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X = tfidf.fit_transform(df['clean_text'])
le = LabelEncoder(); y = le.fit_transform(df['label'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)
svm = LinearSVC().fit(X_train, y_train)
pred = svm.predict(X_test)
print(classification_report(y_test, pred, target_names=le.classes_))

              precision    recall  f1-score   support

       anger       0.80      0.62      0.70        69
        fear       0.83      0.61      0.70        56
         joy       0.70      0.87      0.78       174
        love       0.77      0.42      0.55        40
     sadness       0.77      0.85      0.81       145
    surprise       0.71      0.31      0.43        16

    accuracy                           0.75       500
   macro avg       0.76      0.61      0.66       500
weighted avg       0.76      0.75      0.74       500



## 4. DistilBERT Fine-tuning (use_auth_token=False)

In [10]:
from datasets import Dataset
print("Preparing HF Dataset...")
ds = Dataset.from_pandas(df[['text','label']])
ds = ds.class_encode_column('label')
ds = ds.train_test_split(test_size=0.2, seed=42, stratify_by_column='label')
train_ds, test_ds = ds['train'], ds['test']
print("Train/Test:", len(train_ds), len(test_ds))

Preparing HF Dataset...


Casting to class labels:   0%|          | 0/1999 [00:00<?, ? examples/s]

Train/Test: 1599 400


In [17]:
# Tokenize the datasets
def tokenize_fn(ex):
    return tokenizer(ex['text'], truncation=True, padding='max_length', max_length=128)

train_ds = train_ds.map(tokenize_fn, batched=True)
test_ds = test_ds.map(tokenize_fn, batched=True)

# Set the format for PyTorch
train_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print("‚úÖ Datasets tokenized and formatted for PyTorch")

Map:   0%|          | 0/1599 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

‚úÖ Datasets tokenized and formatted for PyTorch


In [12]:
# Robust TrainingArguments construction (works across transformers versions)
from transformers import TrainingArguments
import transformers, traceback

print("transformers version:", transformers.__version__)

try:
    # Preferred modern form (works on most 4.x versions)
    args = TrainingArguments(
        output_dir='models/bert',
        num_train_epochs=1,
        per_device_train_batch_size=8,
        eval_strategy='epoch',   # preferred
        save_strategy='epoch',
        logging_strategy='epoch'
    )
    print("‚úÖ TrainingArguments created with eval_strategy='epoch'")
except TypeError as e:
    # Fallback for environments that reject evaluation_strategy
    print("‚ö†Ô∏è eval_strategy not accepted by TrainingArguments (falling back).")
    traceback.print_exc()
    # Choose reasonable step counts based on dataset size heuristic if available
    # If train dataset exists and has a notion of length, compute steps; else use defaults
    try:
        train_len = len(train_ds) if 'train_ds' in globals() else None
        per_device = 8
        if train_len:
            # approximate steps per epoch (floor)
            steps_per_epoch = max(1, (train_len // per_device))
            eval_steps = max(50, min(500, steps_per_epoch // 2))
            save_steps = eval_steps
        else:
            eval_steps = 500
            save_steps = 500
    except Exception:
        eval_steps = 500
        save_steps = 500

    args = TrainingArguments(
        output_dir='models/bert',
        num_train_epochs=1,
        per_device_train_batch_size=8,
        do_eval=True,
        evaluation_steps=eval_steps,
        save_steps=save_steps,
        logging_steps=eval_steps
    )
    print(f"‚úÖ Fallback TrainingArguments created (evaluation every {eval_steps} steps).")

# You can print args to inspect
print(args)

transformers version: 4.57.1
‚úÖ TrainingArguments created with eval_strategy='epoch'
TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=True,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False

In [16]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()  # uncomment to train (GPU recommended)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,1.1072,0.697341,0.79,0.750746


TrainOutput(global_step=200, training_loss=1.1072147369384766, metrics={'train_runtime': 1273.1326, 'train_samples_per_second': 1.256, 'train_steps_per_second': 0.157, 'total_flos': 52957620039168.0, 'train_loss': 1.1072147369384766, 'epoch': 1.0})

In [15]:
# Tokenize the datasets
def tokenize_fn(ex):
    return tokenizer(ex['text'], truncation=True, padding='max_length', max_length=128)

train_ds = train_ds.map(tokenize_fn, batched=True)
test_ds = test_ds.map(tokenize_fn, batched=True)

# Set the format for PyTorch
train_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print("‚úÖ Datasets tokenized and formatted for PyTorch")

Map:   0%|          | 0/1599 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

‚úÖ Datasets tokenized and formatted for PyTorch


In [18]:
metrics = trainer.evaluate()
print("üìä Evaluation results:", metrics)

trainer.save_model("models/distilbert_sentiment")
print("‚úÖ Model saved to models/distilbert_sentiment/")




üìä Evaluation results: {'eval_loss': 0.6973409056663513, 'eval_accuracy': 0.79, 'eval_f1_weighted': 0.7507463618343658, 'eval_runtime': 109.1169, 'eval_samples_per_second': 3.666, 'eval_steps_per_second': 0.458, 'epoch': 1.0}
‚úÖ Model saved to models/distilbert_sentiment/
