In [1]:
print(8+9)

17


In [2]:
print(8+9)

17


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import numpy as np
import time
import torch
import gc

print("Script started...")
start_time = time.time()


# --- 0. CHECK FOR GPU ---
if torch.cuda.is_available():
    print(" GPU is available! We will use it for training.")
    print(f"Found {torch.cuda.device_count()} GPUs.")
    torch.cuda.empty_cache()
else:
    print(" FATAL: GPU is NOT available. Exiting.")
    exit() # Exit if no GPU

# --- 1. File Paths ---
# These are local paths, assuming the files are in the same folder
PATH_TRAIN = 'D:/vvv/clean_train.csv'
PATH_TEST = 'D:/vvv/clean_test.csv'
# ------------------------------------------------

# --- 2. Load Data (THE CORRECT WAY) ---
try:
    print("Loading 100,000 random rows from clean_train.csv...")
    df_train_full = pd.read_csv(PATH_TRAIN)
    train_df = df_train_full.sample(n=100000, random_state=42)
    del df_train_full
    
    print("Loading 20,000 random rows from clean_test.csv...")
    df_test_full = pd.read_csv(PATH_TEST)
    test_df = df_test_full.sample(n=20000, random_state=42)
    del df_test_full
    
    print(f"Loaded {len(train_df)} training rows.")
    print(f"Loaded {len(test_df)} test rows.")

except FileNotFoundError as e:
    print(f" Error: File not found. Make sure {PATH_TRAIN} and {PATH_TEST} are in the same folder.")
    print(f"Details: {e}")
    raise e

# --- 3. Fix the Labels ---
print("\nMapping labels to integers...")
label_map = {
    'negative': 0,
    'neutral': 1,
    'positive': 2
}
id2label = {v: k for k, v in label_map.items()}

train_df['label'] = train_df['label'].map(label_map)
test_df['label'] = test_df['label'].map(label_map)
train_df = train_df.dropna(subset=['text', 'label'])
test_df = test_df.dropna(subset=['text', 'label'])
train_df['text'] = train_df['text'].astype(str)
test_df['text'] = test_df['text'].astype(str)
train_df['label'] = train_df['label'].astype(int)
test_df['label'] = test_df['label'].astype(int)
print("Label mapping complete.")

# --- 4. Convert to Hugging Face 'Dataset' format ---
print("\nConverting pandas data to Hugging Face Dataset...")
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
raw_datasets = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})
del train_df, test_df
gc.collect()
print("Conversion complete. Final DataFrames deleted.")

# --- 5. Load Tokenizer & Tokenize ---
model_checkpoint = "xlm-roberta-base"
print(f"\nLoading tokenizer for {model_checkpoint}...")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

print("Tokenizing the datasets...")
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print("Tokenization complete.")

# --- 6. Load the Model ---
print(f"\nLoading model {model_checkpoint}...")
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, 
    num_labels=3,
    id2label=id2label,
    label2id=label_map
)
print("Model loading complete.")

# --- 7. Set Up Training ---
print("\nSetting up training arguments...")
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# We are training for 3 epochs (3 passes over the data)
training_args = TrainingArguments(
    output_dir="xlm-r-results",
    eval_strategy="epoch",
    save_strategy="no",
    num_train_epochs=3,  # Train for 3 epochs
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_strategy="epoch",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

# --- 8. Train! ---
print("\n--- STARTING MODEL TRAINING (3 Epochs) ---")
trainer.train()
print("--- TRAINING COMPLETE ---")

# --- 9. Evaluate! ---
print("\n--- EVALUATING MODEL ---")
eval_results = trainer.evaluate()

print("\n--- (TRUE) XLM-R MODEL RESULTS ---")
print(f"Overall Accuracy: {eval_results['eval_accuracy'] * 100:.2f}%")
print(eval_results)

total_time = time.time() - start_time
print(f"\nTotal script time: {total_time:.2f} seconds.")
print("--- END OF SCRIPT ---")

print("\n--- SAVING THE MODEL ---")
# This saves the trained model and tokenizer to a folder named 'my_final_model'
trainer.save_model("my_final_model")
tokenizer.save_pretrained("my_final_model")
print("Model saved to 'my_final_model' folder.")

  from .autonotebook import tqdm as notebook_tqdm


Script started...
 GPU is available! We will use it for training.
Found 1 GPUs.
Loading 100,000 random rows from clean_train.csv...
Loading 20,000 random rows from clean_test.csv...
Loaded 100000 training rows.
Loaded 20000 test rows.

Mapping labels to integers...
Label mapping complete.

Converting pandas data to Hugging Face Dataset...
Conversion complete. Final DataFrames deleted.

Loading tokenizer for xlm-roberta-base...
Tokenizing the datasets...


Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100000/100000 [00:21<00:00, 4593.82 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 20000/20000 [00:05<00:00, 3538.84 examples/s]


Tokenization complete.

Loading model xlm-roberta-base...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loading complete.

Setting up training arguments...

--- STARTING MODEL TRAINING (3 Epochs) ---


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0941,1.089427,0.36095
2,1.088,1.118532,0.37605
3,1.063,1.109724,0.34445


--- TRAINING COMPLETE ---

--- EVALUATING MODEL ---



--- (TRUE) XLM-R MODEL RESULTS ---
Overall Accuracy: 34.45%
{'eval_loss': 1.1097241640090942, 'eval_accuracy': 0.34445, 'eval_runtime': 66.9834, 'eval_samples_per_second': 298.581, 'eval_steps_per_second': 37.323, 'epoch': 3.0}

Total script time: 7746.66 seconds.
--- END OF SCRIPT ---

--- SAVING THE MODEL ---
Model saved to 'my_final_model' folder.


In [5]:
import pandas as pd
from datasets import Dataset, DatasetDict
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import numpy as np
import time
import torch
import gc

print("Script started...")
start_time = time.time()

# --- 0. CHECK FOR GPU ---
if torch.cuda.is_available():
    print("âœ… GPU is available! We will use it for training.")
    print(f"Found {torch.cuda.device_count()} GPUs.")
    torch.cuda.empty_cache()
else:
    print("ðŸ›‘ FATAL: GPU is NOT available. Exiting.")
    exit()

# --- 1. File Paths ---
PATH_TRAIN = 'clean_train.csv'
PATH_TEST = 'clean_test.csv'

# --- 2. Load Data (YOUR BALANCED METHOD) ---
try:
    print("Loading full training data to balance it...")
    # 1. Load the full training file
    df_full = pd.read_csv(PATH_TRAIN)
    
    # 2. Separate into 3 groups
    df_pos = df_full[df_full['label'] == 'positive']
    df_neg = df_full[df_full['label'] == 'negative']
    df_neu = df_full[df_full['label'] == 'neutral']
    
    print(f"Found: {len(df_pos)} Pos, {len(df_neg)} Neg, {len(df_neu)} Neu")
    
    # 3. Take 33,333 from EACH (Total ~100k)
    # We use min() just in case one class has fewer than 33k rows
    n_samples = 33333
    df_pos_sample = df_pos.sample(n=n_samples, random_state=42)
    df_neg_sample = df_neg.sample(n=n_samples, random_state=42)
    df_neu_sample = df_neu.sample(n=n_samples, random_state=42)
    
    # 4. Combine and SHUFFLE (Critical!)
    train_df = pd.concat([df_pos_sample, df_neg_sample, df_neu_sample])
    train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Clear memory
    del df_full, df_pos, df_neg, df_neu, df_pos_sample, df_neg_sample, df_neu_sample
    gc.collect()
    
    print("\n--- BALANCED TRAINING DATA ---")
    print(train_df['label'].value_counts())
    print("------------------------------")

    # Load Test Data (We keep this random to represent real world)
    print("Loading test data...")
    df_test_full = pd.read_csv(PATH_TEST)
    test_df = df_test_full.sample(n=20000, random_state=42)
    del df_test_full
    
except FileNotFoundError as e:
    print(f"ðŸ›‘ Error: {e}")
    raise e

# --- 3. Fix Labels ---
print("\nProcessing labels...")
# Clean labels
train_df['label'] = train_df['label'].astype(str).str.strip()
test_df['label'] = test_df['label'].astype(str).str.strip()

label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
id2label = {v: k for k, v in label_map.items()}

train_df['label'] = train_df['label'].map(label_map)
test_df['label'] = test_df['label'].map(label_map)

train_df = train_df.dropna(subset=['text', 'label'])
test_df = test_df.dropna(subset=['text', 'label'])

train_df['text'] = train_df['text'].astype(str)
test_df['text'] = test_df['text'].astype(str)
train_df['label'] = train_df['label'].astype(int)
test_df['label'] = test_df['label'].astype(int)

# --- 4. Convert to Dataset ---
print("Converting to Hugging Face Dataset...")
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
raw_datasets = DatasetDict({'train': train_dataset, 'test': test_dataset})
del train_df, test_df
gc.collect()

# --- 5. Tokenize ---
model_checkpoint = "xlm-roberta-base"
print(f"\nLoading tokenizer ({model_checkpoint})...")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

print("Tokenizing...")
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# --- 6. Model ---
print(f"\nLoading model ({model_checkpoint})...")
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, 
    num_labels=3,
    id2label=id2label,
    label2id=label_map
)

# --- 7. Training Setup ---
print("\nSetting up training...")
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="xlm-r-results-balanced",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4, # Smooths out the learning
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_strategy="steps",
    logging_steps=50,
    report_to="none",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

# --- 8. Train ---
print("\n--- STARTING BALANCED TRAINING ---")
trainer.train()
print("--- TRAINING COMPLETE ---")

# --- 9. Evaluate & Save ---
print("\n--- EVALUATING ---")
eval_results = trainer.evaluate()
print(f"Overall Accuracy: {eval_results['eval_accuracy'] * 100:.2f}%")

print("\n--- SAVING FINAL MODEL ---")
trainer.save_model("my_final_model")
tokenizer.save_pretrained("my_final_model")
print("Model saved to 'my_final_model'")

total_time = time.time() - start_time
print(f"\nTotal script time: {total_time:.2f} seconds.")

Script started...
âœ… GPU is available! We will use it for training.
Found 1 GPUs.
Loading full training data to balance it...
Found: 1165038 Pos, 1149231 Neg, 833209 Neu

--- BALANCED TRAINING DATA ---
label
positive    33333
neutral     33333
negative    33333
Name: count, dtype: int64
------------------------------
Loading test data...

Processing labels...
Converting to Hugging Face Dataset...

Loading tokenizer (xlm-roberta-base)...
Tokenizing...


Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 99999/99999 [00:10<00:00, 9484.68 examples/s] 
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 20000/20000 [00:02<00:00, 9477.22 examples/s] 



Loading model (xlm-roberta-base)...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Setting up training...

--- STARTING BALANCED TRAINING ---


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6234,0.602264,0.74475
2,0.5203,0.580835,0.756
3,0.4106,0.589362,0.7682


--- TRAINING COMPLETE ---

--- EVALUATING ---


Overall Accuracy: 75.60%

--- SAVING FINAL MODEL ---
Model saved to 'my_final_model'

Total script time: 4947.76 seconds.
