In [None]:
# STEP 1: Mount Google Drive and set paths
# This cell only runs inside Google Colab: `google.colab` and the `!pip`
# shell escape are notebook features, not plain Python.
from google.colab import drive
drive.mount('/content/drive')
# Upgrade transformers before anything below imports it.
!pip install -U transformers --quiet

# Set paths
data_path = '/content/drive/MyDrive/training new 1.xlsx'  # Excel sheet read below for its 'input' and 'Class' columns; change if needed
model_name = 'roberta-base'  # checkpoint used for fine-tuning and for the pretrained baseline
# Directory that receives checkpoints, the saved model/tokenizer and both
# embedding spreadsheets. It already ends in "/", and later f-strings append
# "/..." to it, so the resulting paths contain a double slash.
output_path = '/content/drive/MyDrive/models/roberta_finetuned/'

# STEP 2: Load the spreadsheet, encode the class labels, make a stratified split
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch

df = pd.read_excel(data_path)

# astype(str) turns every cell (numbers and NaN included) into a string.
texts = df['input'].astype(str).tolist()
# Original, un-encoded class values; reused later as the 'Class' column of the
# embedding spreadsheets.
labels = df['Class'].tolist()

# Map the class values to consecutive integer ids.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
# One output unit per distinct class seen by the encoder.
num_labels = len(label_encoder.classes_)

# 90/10 split, stratified on the encoded label, reproducible via the fixed seed.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    encoded_labels,
    test_size=0.1,
    stratify=encoded_labels,
    random_state=42,
)

# STEP 3: Tokenization and Dataset
from transformers import AutoTokenizer

# Tokenizer belonging to the `model_name` checkpoint; shared by the datasets
# and by both embedding-extraction steps.
tokenizer = AutoTokenizer.from_pretrained(model_name)

class RoBERTaDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes all texts once, at construction time.

    Args:
        texts: Sequence of strings passed to ``tokenizer`` in a single call.
        labels: Per-text labels, indexable by position.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True,
            padding='max_length', max_length=max_len)``; its result must be
            indexable by ``'input_ids'`` and ``'attention_mask'``.
        max_len: Length every sequence is truncated/padded to.
    """

    # Encoding fields copied into every sample, in this order.
    _FEATURES = ('input_ids', 'attention_mask')

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.labels = labels
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=max_len,
        )

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors: the two encoding fields plus ``'labels'``."""
        sample = {
            name: torch.tensor(self.encodings[name][idx])
            for name in self._FEATURES
        }
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Both splits are tokenized eagerly here, using the dataset's default max_len.
train_dataset = RoBERTaDataset(train_texts, train_labels, tokenizer)
val_dataset = RoBERTaDataset(val_texts, val_labels, tokenizer)

# STEP 4: Fine-tune the model
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score

# output_hidden_states=True stays on so the later embedding-extraction step can
# read `outputs.hidden_states` from this same model object.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels, output_hidden_states=True
)


def keep_logits_only(logits, labels):
    """Drop everything except the classification logits before metric accumulation.

    Because the model returns hidden states, the Trainer's per-batch
    "logits" are a tuple ``(logits, hidden_states)``. Keeping only the first
    element stops the Trainer from accumulating every hidden state of the
    validation set in memory.

    Args:
        logits: Tensor of logits, or a tuple/list whose first item is that tensor.
        labels: Batch labels (unused; required by the Trainer hook signature).

    Returns:
        The logits tensor.
    """
    if isinstance(logits, (tuple, list)):
        return logits[0]
    return logits


def compute_metrics(eval_pred):
    """Compute validation accuracy from a ``(predictions, labels)`` pair.

    Args:
        eval_pred: Pair of model predictions and reference label ids.
            Predictions may be a logits array or a tuple/list whose first item
            is the logits array (the case when hidden states are returned).

    Returns:
        ``{"accuracy": <float>}``.
    """
    logits, labels = eval_pred
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    preds = logits.argmax(axis=-1)
    return {"accuracy": accuracy_score(labels, preds)}


training_args = TrainingArguments(
    output_dir=output_path,
    # Without an explicit strategy the default is "no": `eval_steps` is ignored
    # and neither `eval_dataset` nor `compute_metrics` is ever used.
    # (`eval_strategy` is the argument name in current transformers releases,
    # which is what the `pip install -U` in STEP 1 installs.)
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir=f"{output_path}/logs",
    logging_steps=100,
    report_to="none",
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU runtimes
    # instead of failing at argument validation.
    fp16=torch.cuda.is_available()
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=keep_logits_only
)

trainer.train()

# Save fine-tuned model and tokenizer
# Both go to the same directory, next to the Trainer's checkpoints.
trainer.save_model(output_path)
tokenizer.save_pretrained(output_path)

# STEP 5: Extract and Save Fine-Tuned Embeddings
from tqdm import tqdm

model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

finetuned_embeddings = []

# NOTE(review): `texts` is the full dataset, including the rows the model was
# fine-tuned on, so anything evaluated on these embeddings sees features that
# were shaped by its own labels.
with torch.no_grad():
    for text in tqdm(texts, desc="Extracting Fine-Tuned Embeddings"):
        # One text per forward pass, so no padding is needed. Padding every
        # input to 512 tokens made the plain mean below average over <pad>
        # positions and diluted the embedding of short texts.
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        # Request hidden states explicitly rather than relying on model config.
        outputs = model(**inputs, output_hidden_states=True)
        last_hidden = outputs.hidden_states[-1]
        # Attention-masked mean: only real tokens contribute, so the result
        # stays correct even if padding or batching is reintroduced.
        mask = inputs['attention_mask'].unsqueeze(-1).to(last_hidden.dtype)
        mean_emb = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        finetuned_embeddings.append(mean_emb.squeeze(0).cpu())

# One row per text, one column per embedding dimension, plus the original label.
finetuned_df = pd.DataFrame([e.numpy() for e in finetuned_embeddings])
finetuned_df['Class'] = labels
finetuned_df.to_excel(f'{output_path}/roberta_finetuned_embeddings.xlsx', index=False)

# STEP 6: Extract and Save Pretrained Embeddings
from transformers import AutoModel

# Untouched base checkpoint, used as the baseline for the fine-tuned embeddings.
pre_model = AutoModel.from_pretrained(model_name, output_hidden_states=True).to(device)
pre_model.eval()

pretrained_embeddings = []

with torch.no_grad():
    for text in tqdm(texts, desc="Extracting Pretrained Embeddings"):
        # One text per forward pass, so no padding is needed. Padding every
        # input to 512 tokens made the plain mean below average over <pad>
        # positions and diluted the embedding of short texts.
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = pre_model(**inputs, output_hidden_states=True)
        last_hidden = outputs.hidden_states[-1]
        # Attention-masked mean: only real tokens contribute, so the result
        # stays correct even if padding or batching is reintroduced.
        mask = inputs['attention_mask'].unsqueeze(-1).to(last_hidden.dtype)
        mean_emb = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        pretrained_embeddings.append(mean_emb.squeeze(0).cpu())

# One row per text, one column per embedding dimension, plus the original label.
pretrained_df = pd.DataFrame([e.numpy() for e in pretrained_embeddings])
pretrained_df['Class'] = labels
pretrained_df.to_excel(f'{output_path}/roberta_pretrained_embeddings.xlsx', index=False)

print("ALL DONE SUCCESSFULLY!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,1.0993
200,1.0266
300,1.0274
400,0.9974
500,0.9922
600,0.9943
700,0.884
800,0.946
900,0.851
1000,0.909


Extracting Fine-Tuned Embeddings: 100%|██████████| 1680/1680 [00:29<00:00, 56.93it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Extracting Pretrained Embeddings: 100%|██████████| 1680/1680 [00:52<00:00, 31.86it/s]


✅ ALL DONE SUCCESSFULLY!


In [None]:
# ===============================
# RoBERTa Classifier Evaluation Pipeline in Colab
# This cell cross-validates classical classifiers on the fine-tuned
# embedding spreadsheet loaded below.
# ===============================

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Install required packages
# (`!` is a notebook shell escape; this line is not valid in a plain .py file.)
!pip install imbalanced-learn xgboost openpyxl

# Imports
import pandas as pd
import numpy as np

# Models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Preprocessing & Evaluation
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from collections import defaultdict

# Load dataset
# Spreadsheet of fine-tuned embeddings: a 'Class' column holding the label,
# every other column is treated as a feature.
file_path = "/content/drive/MyDrive/models/roberta_finetuned/roberta_finetuned_embeddings.xlsx"  # <-- Change if needed
data = pd.read_excel(file_path)

X = data.drop(columns=['Class'])  # feature columns only
y = data['Class']  # target labels

# Handle missing and scale
# Replace NaNs with the column mean.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Standardize every feature column.
# NOTE(review): both transformers are fitted on every row, so any
# cross-validation run on X_scaled has test-fold statistics baked into its
# training features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Define models
# Name -> unfitted estimator. Every estimator that accepts a seed is fixed to
# random_state=42; class_weight='balanced' is set where the estimator has it.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=5, class_weight='balanced', random_state=42),
    "SVM": SVC(C=1, kernel='rbf', gamma='scale', probability=True, class_weight='balanced', random_state=42),
    # `use_label_encoder` was dropped: the installed XGBoost ignores it and
    # prints 'Parameters: { "use_label_encoder" } are not used.' on every fit.
    "XGBoost": XGBClassifier(learning_rate=0.05, max_depth=8, n_estimators=150, subsample=0.8, colsample_bytree=0.8, eval_metric='mlogloss', random_state=42),
    "Logistic Regression": LogisticRegression(C=1.0, penalty='l2', solver='liblinear', class_weight='balanced', max_iter=1000, random_state=42),
    "Naive Bayes": GaussianNB(var_smoothing=1e-9),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=5, class_weight='balanced', random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=7, metric='minkowski', p=2),
    "MLP": MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', alpha=0.0001, max_iter=300, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
}

# Store results
results_cv = {}
results_precision_recall = {}
per_class_report = defaultdict(dict)

# Stratified K-Fold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Raw features: imputation and scaling are fitted inside every fold on the
# training rows only, so no statistic of the held-out rows reaches training.
X_raw = X.to_numpy()

# classification_report(output_dict=True) keys its per-class entries by str(label).
all_labels = y.unique()
class_keys = [str(label) for label in all_labels]

# NOTE: the loop variables `model_name` / `model` are notebook globals and
# overwrite any earlier objects bound to those names.
for model_name, model in models.items():
    accs, f1s, precs, recs = [], [], [], []
    classwise_reports = []

    for train_idx, test_idx in skf.split(X_raw, y):
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Fit preprocessing on the training fold, then apply it to both folds.
        fold_imputer = SimpleImputer(strategy='mean')
        fold_scaler = StandardScaler()
        X_train = fold_scaler.fit_transform(fold_imputer.fit_transform(X_raw[train_idx]))
        X_test = fold_scaler.transform(fold_imputer.transform(X_raw[test_idx]))

        # SMOTE: oversample the training fold only; the test fold stays untouched.
        sm = SMOTE(random_state=42)
        X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

        model.fit(X_train_res, y_train_res)
        y_pred = model.predict(X_test)

        accs.append(accuracy_score(y_test, y_pred))
        f1s.append(f1_score(y_test, y_pred, average='weighted'))
        precs.append(precision_score(y_test, y_pred, average='weighted', zero_division=0))
        recs.append(recall_score(y_test, y_pred, average='weighted'))

        # Per-class report
        report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        classwise_reports.append(report_dict)

    # Aggregate the support-weighted metrics across folds
    results_cv[model_name] = {
        "Mean Accuracy": np.mean(accs),
        "Std Accuracy": np.std(accs),
        "Mean F1-Score": np.mean(f1s),
        "Std F1-Score": np.std(f1s),
    }

    results_precision_recall[model_name] = {
        "Mean Precision": np.mean(precs),
        "Std Precision": np.std(precs),
        "Mean Recall": np.mean(recs),
        "Std Recall": np.std(recs),
    }

    # Per-class mean report
    for label in class_keys:
        # Only folds whose report contains this class contribute to its mean.
        rows = [fold[label] for fold in classwise_reports if label in fold]
        for metric_key, column in (('precision', 'Precision'), ('recall', 'Recall'), ('f1-score', 'F1')):
            # NaN (without a NumPy warning) when the class never showed up.
            value = np.mean([row[metric_key] for row in rows]) if rows else float('nan')
            per_class_report[model_name][f"Class {label} {column}"] = value

# Convert to DataFrames
# Transposed so each row is a model and each column a metric; the two summary
# tables are ranked best-first by mean F1 and mean precision respectively.
cv_df = pd.DataFrame(results_cv).T.sort_values(by='Mean F1-Score', ascending=False)
prec_rec_df = pd.DataFrame(results_precision_recall).T.sort_values(by='Mean Precision', ascending=False)
per_class_df = pd.DataFrame(per_class_report).T

# Save results to Excel
# NOTE: this rebinds the notebook-global `output_path` to a file path; the
# fine-tuning cell uses the same name for its model directory.
output_path = "/content/drive/MyDrive/roberta_model_results.xlsx"
# One workbook, one sheet per table.
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
    cv_df.to_excel(writer, sheet_name='CrossVal_Accuracy_F1')
    prec_rec_df.to_excel(writer, sheet_name='Precision_Recall')
    per_class_df.to_excel(writer, sheet_name='Per_Class_Report')

print("\n✅ Results saved to:", output_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




✅ Results saved to: /content/drive/MyDrive/roberta_model_results.xlsx


In [None]:
# ===============================
# RoBERTa Classifier Evaluation Pipeline in Colab
# This cell cross-validates classical classifiers on the pretrained
# (not fine-tuned) embedding spreadsheet loaded below.
# ===============================

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Install required packages
# (`!` is a notebook shell escape; this line is not valid in a plain .py file.)
!pip install imbalanced-learn xgboost openpyxl

# Imports
import pandas as pd
import numpy as np

# Models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Preprocessing & Evaluation
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from collections import defaultdict

# Load dataset
# Spreadsheet of pretrained embeddings: a 'Class' column holding the label,
# every other column is treated as a feature.
# NOTE(review): the extraction step writes roberta_pretrained_embeddings.xlsx
# into /content/drive/MyDrive/models/roberta_finetuned/, not directly into
# MyDrive -- confirm the file read here is the current one and not a stale copy.
file_path = "/content/drive/MyDrive/roberta_pretrained_embeddings.xlsx"  # <-- Change if needed
data = pd.read_excel(file_path)

X = data.drop(columns=['Class'])  # feature columns only
y = data['Class']  # target labels

# Handle missing and scale
# Replace NaNs with the column mean.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Standardize every feature column.
# NOTE(review): both transformers are fitted on every row, so any
# cross-validation run on X_scaled has test-fold statistics baked into its
# training features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Define models
# Name -> unfitted estimator. Every estimator that accepts a seed is fixed to
# random_state=42; class_weight='balanced' is set where the estimator has it.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=5, class_weight='balanced', random_state=42),
    "SVM": SVC(C=1, kernel='rbf', gamma='scale', probability=True, class_weight='balanced', random_state=42),
    # `use_label_encoder` was dropped: the installed XGBoost ignores it and
    # prints 'Parameters: { "use_label_encoder" } are not used.' on every fit.
    "XGBoost": XGBClassifier(learning_rate=0.05, max_depth=8, n_estimators=150, subsample=0.8, colsample_bytree=0.8, eval_metric='mlogloss', random_state=42),
    "Logistic Regression": LogisticRegression(C=1.0, penalty='l2', solver='liblinear', class_weight='balanced', max_iter=1000, random_state=42),
    "Naive Bayes": GaussianNB(var_smoothing=1e-9),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=5, class_weight='balanced', random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=7, metric='minkowski', p=2),
    "MLP": MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', alpha=0.0001, max_iter=300, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
}

# Store results
results_cv = {}
results_precision_recall = {}
per_class_report = defaultdict(dict)

# Stratified K-Fold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Raw features: imputation and scaling are fitted inside every fold on the
# training rows only, so no statistic of the held-out rows reaches training.
X_raw = X.to_numpy()

# classification_report(output_dict=True) keys its per-class entries by str(label).
all_labels = y.unique()
class_keys = [str(label) for label in all_labels]

# NOTE: the loop variables `model_name` / `model` are notebook globals and
# overwrite any earlier objects bound to those names.
for model_name, model in models.items():
    accs, f1s, precs, recs = [], [], [], []
    classwise_reports = []

    for train_idx, test_idx in skf.split(X_raw, y):
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Fit preprocessing on the training fold, then apply it to both folds.
        fold_imputer = SimpleImputer(strategy='mean')
        fold_scaler = StandardScaler()
        X_train = fold_scaler.fit_transform(fold_imputer.fit_transform(X_raw[train_idx]))
        X_test = fold_scaler.transform(fold_imputer.transform(X_raw[test_idx]))

        # SMOTE: oversample the training fold only; the test fold stays untouched.
        sm = SMOTE(random_state=42)
        X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

        model.fit(X_train_res, y_train_res)
        y_pred = model.predict(X_test)

        accs.append(accuracy_score(y_test, y_pred))
        f1s.append(f1_score(y_test, y_pred, average='weighted'))
        precs.append(precision_score(y_test, y_pred, average='weighted', zero_division=0))
        recs.append(recall_score(y_test, y_pred, average='weighted'))

        # Per-class report
        report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        classwise_reports.append(report_dict)

    # Aggregate the support-weighted metrics across folds
    results_cv[model_name] = {
        "Mean Accuracy": np.mean(accs),
        "Std Accuracy": np.std(accs),
        "Mean F1-Score": np.mean(f1s),
        "Std F1-Score": np.std(f1s),
    }

    results_precision_recall[model_name] = {
        "Mean Precision": np.mean(precs),
        "Std Precision": np.std(precs),
        "Mean Recall": np.mean(recs),
        "Std Recall": np.std(recs),
    }

    # Per-class mean report
    for label in class_keys:
        # Only folds whose report contains this class contribute to its mean.
        rows = [fold[label] for fold in classwise_reports if label in fold]
        for metric_key, column in (('precision', 'Precision'), ('recall', 'Recall'), ('f1-score', 'F1')):
            # NaN (without a NumPy warning) when the class never showed up.
            value = np.mean([row[metric_key] for row in rows]) if rows else float('nan')
            per_class_report[model_name][f"Class {label} {column}"] = value

# Convert to DataFrames
# Transposed so each row is a model and each column a metric; the two summary
# tables are ranked best-first by mean F1 and mean precision respectively.
cv_df = pd.DataFrame(results_cv).T.sort_values(by='Mean F1-Score', ascending=False)
prec_rec_df = pd.DataFrame(results_precision_recall).T.sort_values(by='Mean Precision', ascending=False)
per_class_df = pd.DataFrame(per_class_report).T

# Save results to Excel
# NOTE: this rebinds the notebook-global `output_path` to a file path; the
# fine-tuning cell uses the same name for its model directory.
output_path = "/content/drive/MyDrive/preroberta_model_results.xlsx"
# One workbook, one sheet per table.
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
    cv_df.to_excel(writer, sheet_name='CrossVal_Accuracy_F1')
    prec_rec_df.to_excel(writer, sheet_name='Precision_Recall')
    per_class_df.to_excel(writer, sheet_name='Per_Class_Report')

print("\n✅ Results saved to:", output_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




✅ Results saved to: /content/drive/MyDrive/preroberta_model_results.xlsx
