!!!!!!Section I: Sentiment Analysis on Car Reviews!!!!!!

In [None]:
# Logistic Regression Model
# Training and validation sets

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Load training data
train_data = pd.read_excel('/content/Train_data.xlsx')
reviews = train_data['Review']
targets = train_data['Target']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(reviews, targets, test_size=0.2, random_state=42)

# Vectorize the text using TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=500)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Predict on the validation set
val_predictions = model.predict(X_val_tfidf)

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_val, val_predictions)
f1 = f1_score(y_val, val_predictions, average='weighted')

print("Validation Accuracy:", accuracy)
print("Validation F1 Score:", f1)


Validation Accuracy: 0.7560975609756098
Validation F1 Score: 0.6813486370157819


In [None]:
# Predictions for test set using logistic regression

# Load testing data
test_data = pd.read_excel('/content/Test_features.xlsx')
test_reviews = test_data['Review']

# Transform test reviews to match the training vectorizer
X_test_tfidf = vectorizer.transform(test_reviews)

# Predict using the trained model
test_predictions = model.predict(X_test_tfidf)

# Save predictions on excel
output = pd.DataFrame({'review': test_reviews, 'predicted_target': test_predictions})
output.to_excel('test_predictions.xlsx', index=False)
print("Test predictions saved to 'test_predictions.xlsx'")


Test predictions saved to 'test_predictions.xlsx'


In [None]:
#Save predictions as a txt file

import pandas as pd

# Load the test predictions file
predictions_df = pd.read_excel('/content/test_predictions.xlsx')

# Extract only the predicted labels
predicted_labels = predictions_df['predicted_target']

# Save the predicted labels to a .txt file
predicted_labels.to_csv('predicted_labels.txt', index=False, header=False)
print("Predicted labels saved to 'predicted_labels.txt'")


Predicted labels saved to 'predicted_labels.txt'


In [None]:
# First SVM model (Best performance)

# Training and validation sets

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

# Load training data
train_data = pd.read_excel('/content/Train_data.xlsx')
reviews = train_data['Review']
targets = train_data['Target']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(reviews, targets, test_size=0.2, random_state=42)

# Vectorize the text using TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=500)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# Train an SVM classifier
model = SVC(kernel='linear')
model.fit(X_train_tfidf, y_train)

# Predict on the validation set
val_predictions = model.predict(X_val_tfidf)

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_val, val_predictions)
f1 = f1_score(y_val, val_predictions, average='weighted')

print("Validation Accuracy:", accuracy)
print("Validation F1 Score:", f1)


Validation Accuracy: 0.8048780487804879
Validation F1 Score: 0.7516629711751662


In [None]:
# This is the Preds.txt file that was submitted on Codalab and got the highest score

# Predictions for test set using the first SVM model

# Load testing data
test_data = pd.read_excel('/content/Test_features.xlsx')
test_reviews = test_data['Review']

# Transform test reviews to match the training vectorizer
X_test_tfidf = vectorizer.transform(test_reviews)

# Predict using the trained model
test_predictions = model.predict(X_test_tfidf)

# Save predictions to a .txt file
with open('preds.txt', 'w') as f:
    for label in test_predictions:
        f.write(f"{label}\n")
print("Predicted labels saved to 'preds.txt'")


Predicted labels saved to 'preds.txt'


In [None]:
# Optimized SVM model (did not outperform the first SVM model)

# Training and validation sets

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load training data
train_data = pd.read_excel('/content/Train_data.xlsx')
reviews = train_data['Review']
targets = train_data['Target']

# Vectorize the text using TfidfVectorizer with more features and n-grams
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(reviews)

# Set up the parameter grid for GridSearch
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']  }

# Initialize the SVM model
svc = SVC()

# Use GridSearchCV to find the best parameters
grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='f1_weighted', verbose=2)
grid_search.fit(X_train_tfidf, targets)

# Get the best estimator and parameters
best_model = grid_search.best_estimator_
print("Best SVM Model:", best_model)
print("Best Parameters:", grid_search.best_params_)


Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=   0.0s
[CV] END ....................C=0.1, gamma=scale,

In [None]:
# Predictions for test set using the optimized SVM model

# Load test data
test_data = pd.read_excel('/content/Test_features.xlsx')
test_reviews = test_data['Review']
X_test_tfidf = vectorizer.transform(test_reviews)

# Predict with the best SVM model
test_predictions = best_model.predict(X_test_tfidf)

# Save predictions to a .txt file
with open('optimized_predicted_labels.txt', 'w') as f:
    for label in test_predictions:
        f.write(f"{label}\n")
print("Optimized predicted labels saved to 'optimized_predicted_labels.txt'")


Optimized predicted labels saved to 'optimized_predicted_labels.txt'


In [None]:
# BERT (poorest performance on Codalab)

# Training, validation and test sets

!pip install transformers datasets torch scikit-learn

import torch
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

# Load training data
train_data = pd.read_excel('/content/Train_data.xlsx')
reviews = train_data['Review'].tolist()
targets = train_data['Target'].tolist()

# Load testing data
test_data = pd.read_excel('/content/Test_features.xlsx')
test_reviews = test_data['Review'].tolist()

# Load the pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Preprocess the data (tokenize)
def tokenize_function(examples):
    return tokenizer(examples['Review'], padding='max_length', truncation=True, max_length=128)

# Convert the data into the Dataset format required by Hugging Face
train_dataset = Dataset.from_dict({'Review': reviews, 'Target': targets})
test_dataset = Dataset.from_dict({'Review': test_reviews})

# Apply the tokenization to the dataset
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.rename_column('Target', 'labels')

# Set up the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    report_to="none",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
)

# Train the model
trainer.train()

# Predict on the test set
predictions = trainer.predict(test_dataset)

# Get the predicted labels
predicted_labels = predictions.predictions.argmax(axis=1)

# Save predictions to a .txt file
with open('bert_predictions.txt', 'w') as f:
    for label in predicted_labels:
        f.write(f"{label}\n")



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/201 [00:00<?, ? examples/s]

Map:   0%|          | 0/44 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,1.1018,1.00742
2,0.9121,0.861982
3,0.7919,0.76851


In [None]:
# Optimized BERT (got the same predictions as the initial BERT model)

# Training, validation and test sets

import torch
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Load training data
train_data = pd.read_excel('/content/Train_data.xlsx')
reviews = train_data['Review'].tolist()
targets = train_data['Target'].tolist()

# Load testing data
test_data = pd.read_excel('/content/Test_features.xlsx')
test_reviews = test_data['Review'].tolist()

# Preprocess the data
def preprocess_text(text):
    return " ".join(text.lower().split())

train_data['Review'] = train_data['Review'].apply(preprocess_text)
test_data['Review'] = test_data['Review'].apply(preprocess_text)

# Load the pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['Review'], padding='max_length', truncation=True, max_length=128)

# Convert the data into the Dataset format required by Hugging Face
train_dataset = Dataset.from_dict({'Review': reviews, 'Target': targets})
test_dataset = Dataset.from_dict({'Review': test_reviews})

# Apply tokenization to the dataset
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.rename_column('Target', 'labels')

# Compute class weights to handle imbalanced classes
class_weights = compute_class_weight('balanced', classes=np.array([0, 1, 2]), y=targets)
class_weights = torch.tensor(class_weights, dtype=torch.float)

# Set up the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    report_to="none",
    lr_scheduler_type="linear",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
    compute_metrics=lambda p: {
        'accuracy': accuracy_score(p.predictions.argmax(axis=1), p.label_ids),
        'f1': f1_score(p.predictions.argmax(axis=1), p.label_ids, average='weighted')
    },
)

# Train the model
trainer.train()

# Predict on the test set
predictions = trainer.predict(test_dataset)

# Get the predicted labels
predicted_labels = predictions.predictions.argmax(axis=1)

# Save predictions to a .txt file
with open('bert_predictions.txt', 'w') as f:
    for label in predicted_labels:
        f.write(f"{label}\n")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/201 [00:00<?, ? examples/s]

Map:   0%|          | 0/44 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.1054,1.092291,0.318408,0.297542
2,1.0879,1.048477,0.587065,0.677925
3,1.0606,0.99449,0.631841,0.77439
4,0.9872,0.917439,0.631841,0.77439
5,0.9007,0.855772,0.631841,0.77439


!!!!!!Section II: Multi-Label Docker Commit Classification!!!!!!

In [None]:
# Random Forest with OneVsRest Classification

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import hamming_loss, f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

# Load Data
labeled_train = pd.read_csv("/content/training_label.csv")
unlabeled_train = pd.read_csv("/content/training_data.csv")

labeled_val = pd.read_csv("/content/validation_label.csv")
unlabeled_val = pd.read_csv("/content/validation_data.csv")

test_df = pd.read_csv("/content/testing_data.csv")

# Merge Labeled and Unlabeled Data
train_df = pd.merge(labeled_train, unlabeled_train, on="id")
val_df = pd.merge(labeled_val, unlabeled_val, on="id")

# Combine text features for training and validation
def combine_text(df):
    return (
        df['Subject'].fillna('') + ' ' +
        df['Message'].fillna('') + ' ' +
        df['Old Contents'].fillna('') + ' ' +
        df['New Contents'].fillna('')
    )

X_train = combine_text(train_df)
X_val = combine_text(val_df)
X_test = combine_text(test_df)

# Process Labels
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_df['Ground truth'].str.split(','))
y_val = mlb.transform(val_df['Ground truth'].str.split(','))

# Feature Extraction
tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

# Model: Try RandomForestClassifier or other ensemble methods for better performance
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
model = OneVsRestClassifier(rf)

# CalibratedClassifierCV to improve probability estimates
calibrated_model = CalibratedClassifierCV(model, method='sigmoid', cv='prefit')

# Fit the model
model.fit(X_train_tfidf, y_train)

# Predict probabilities to tune thresholds
y_val_prob = model.predict_proba(X_val_tfidf)

# Custom Thresholding to Improve Predictions
threshold = 0.5
y_val_pred = (y_val_prob > threshold).astype(int)

# Validation Evaluation
hamming = hamming_loss(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred, average='micro')
print(f"Hamming Loss: {hamming:.4f}")
print(f"F1 Score (Micro): {f1:.4f}")

# Predict Test Set
y_test_prob = model.predict_proba(X_test_tfidf)
y_test_pred = (y_test_prob > threshold).astype(int)

# Format Predictions
predicted_labels = mlb.inverse_transform(y_test_pred)

# Ensure every id in test set gets at least one label
test_df['Ground truth'] = [','.join(labels) if labels else 'Maintenance/Other' for labels in predicted_labels]

submission = test_df[['id', 'Ground truth']]
submission.to_csv("submission.csv", index=False)
print("Predictions saved to submission.csv")


Hamming Loss: 0.0719
F1 Score (Micro): 0.5352
Predictions saved to submission.csv


In [None]:
# Gradient Boosting Classifier (Best performance)
# This is the Preds.txt file that was submitted on Codalab and got the highest score

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import hamming_loss, f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.calibration import CalibratedClassifierCV
import numpy as np

# Load Data
labeled_train = pd.read_csv("/content/training_label.csv")
unlabeled_train = pd.read_csv("/content/training_data.csv")
labeled_val = pd.read_csv("/content/validation_label.csv")
unlabeled_val = pd.read_csv("/content/validation_data.csv")

test_df = pd.read_csv("/content/testing_data.csv")

# Merge DataFrames
train_df = pd.merge(labeled_train, unlabeled_train, on="id")
val_df = pd.merge(labeled_val, unlabeled_val, on="id")

# Combine Text Features
def combine_text(df):
    return (
        df['Subject'].fillna('') + ' ' +
        df['Message'].fillna('') + ' ' +
        df['Old Contents'].fillna('') + ' ' +
        df['New Contents'].fillna('')
    )

X_train = combine_text(train_df)
X_val = combine_text(val_df)
X_test = combine_text(test_df)

# Process Labels
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_df['Ground truth'].str.split(','))
y_val = mlb.transform(val_df['Ground truth'].str.split(','))

# Feature Extraction
tfidf = TfidfVectorizer(max_features=15000, ngram_range=(1, 2), stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

# Use Gradient Boosting for better performance
gb = GradientBoostingClassifier(n_estimators=200, max_depth=7, random_state=42)
model = OneVsRestClassifier(gb)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'estimator__learning_rate': [0.05, 0.1, 0.2],
    'estimator__n_estimators': [100, 200],
    'estimator__max_depth': [5, 7, 10]
}
grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, scoring='f1_micro')
grid_search.fit(X_train_tfidf, y_train)

# Get the best model from GridSearchCV
best_model = grid_search.best_estimator_

# Calibrate the model
calibrated_model = CalibratedClassifierCV(best_model, method='sigmoid', cv='prefit')

# Predict probabilities on the validation set
y_val_prob = best_model.predict_proba(X_val_tfidf)

# Custom thresholding
threshold = 0.4
y_val_pred = (y_val_prob > threshold).astype(int)

# Evaluate the model
hamming = hamming_loss(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred, average='micro')
print(f"Hamming Loss: {hamming:.4f}")
print(f"F1 Score (Micro): {f1:.4f}")

# Predict Test Set
y_test_prob = best_model.predict_proba(X_test_tfidf)
y_test_pred = (y_test_prob > threshold).astype(int)

# Format Predictions
predicted_labels = mlb.inverse_transform(y_test_pred)
test_df['Ground truth'] = [','.join(labels) if labels else 'Maintenance/Other' for labels in predicted_labels]

# Save submission
submission = test_df[['id', 'Ground truth']]
submission.to_csv("submission.csv", index=False)
print("Predictions saved to submission.csv")


Hamming Loss: 0.0708
F1 Score (Micro): 0.6798
Predictions saved to submission.csv


!!!!!!Section III: HIPA-AI!!!!!!

In [None]:
# Logistic Regression (baseline model) (first approach)

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.utils.class_weight import compute_class_weight

# Load the datasets
train_data = pd.read_csv('/content/train.csv', encoding='latin-1')
dev_data = pd.read_csv('/content/dev.csv', encoding='latin-1')
test_data = pd.read_csv('/content/test_data.csv', encoding='latin-1')

# Preprocessing and Feature Extraction
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)

# Train/Test split
X_train = train_data['Features']
y_train = train_data['Label']
X_dev = dev_data['Features']
y_dev = dev_data['Label']

# Handle class imbalance by using class weights
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight_dict = {classes[i]: class_weights[i] for i in range(len(classes))}

# Model Building and Training
model = make_pipeline(vectorizer, LogisticRegression(class_weight='balanced', random_state=42))

# Train the model
model.fit(X_train, y_train)

# Evaluate the model on the development set
y_pred_dev = model.predict(X_dev)

# Calculate metrics
accuracy = accuracy_score(y_dev, y_pred_dev)
precision = precision_score(y_dev, y_pred_dev, pos_label='yes')
recall = recall_score(y_dev, y_pred_dev, pos_label='yes')
f1 = f1_score(y_dev, y_pred_dev, pos_label='yes')

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

# Predict on the test data
X_test = test_data['Features']
test_predictions = model.predict(X_test)

# Save predictions to preds.txt
with open('preds.txt', 'w') as f:
    for label in test_predictions:
        f.write(label + '\n')

print("Predictions saved to preds.txt")


Accuracy: 0.5000
Precision: 1.0000
Recall: 0.1667
F1 Score: 0.2857
Predictions saved to preds.txt


In [None]:
# DistilBERT (second approach)
# This is the Preds.txt file that was submitted on Codalab and got the best scores for the overall evaluation metrics

!pip install datasets
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import gc

# Load the datasets
train_data = pd.read_csv('/content/train.csv', encoding='latin-1')
dev_data = pd.read_csv('/content/dev.csv', encoding='latin-1')
test_data = pd.read_csv('/content/test_data.csv', encoding='latin-1')

# Preprocess the data
train_data = train_data[['Features', 'Label']]
dev_data = dev_data[['Features', 'Label']]

# Initialize DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['Features'], padding='max_length', truncation=True, max_length=128)

# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
dev_dataset = Dataset.from_pandas(dev_data)

# Tokenize the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
dev_dataset = dev_dataset.map(tokenize_function, batched=True)

# Prepare labels (convert 'yes' and 'no' to binary)
train_dataset = train_dataset.map(lambda examples: {'labels': [1 if label == 'yes' else 0 for label in examples['Label']]}, batched=True)
dev_dataset = dev_dataset.map(lambda examples: {'labels': [1 if label == 'yes' else 0 for label in examples['Label']]}, batched=True)

# Load Pretrained DistilBERT model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Set up the Trainer with smaller batch size and gradient accumulation
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    disable_tqdm=True,
    report_to="none",
    gradient_accumulation_steps=2,
    save_steps=100,
    eval_steps=100,
    max_grad_norm=1.0,
    dataloader_num_workers=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=lambda p: {
        'accuracy': accuracy_score(p.predictions.argmax(axis=-1), p.label_ids),
        'precision': precision_score(p.predictions.argmax(axis=-1), p.label_ids),
        'recall': recall_score(p.predictions.argmax(axis=-1), p.label_ids),
        'f1': f1_score(p.predictions.argmax(axis=-1), p.label_ids)
    }
)

# Train the model
trainer.train()

# Evaluate the model on the development set
results = trainer.evaluate()

print(f"Evaluation Results: {results}")

# Clear cache after evaluation to free up memory
torch.cuda.empty_cache()
gc.collect()

# Predict on the test set
test_data = test_data['Features']
test_encodings = tokenizer(test_data.tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')
test_preds = model(**test_encodings).logits.argmax(axis=-1)

# Save predictions to preds.txt
with open('preds.txt', 'w') as f:
    for pred in test_preds:
        label = 'yes' if pred == 1 else 'no'
        f.write(label + '\n')

print("Predictions saved to preds.txt")





Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.6995152235031128, 'eval_accuracy': 0.4, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 1.7939, 'eval_samples_per_second': 5.574, 'eval_steps_per_second': 0.557, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.692905843257904, 'eval_accuracy': 0.4, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 2.5438, 'eval_samples_per_second': 3.931, 'eval_steps_per_second': 0.393, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.6871377229690552, 'eval_accuracy': 0.4, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 1.7789, 'eval_samples_per_second': 5.622, 'eval_steps_per_second': 0.562, 'epoch': 3.0}
{'train_runtime': 169.6393, 'train_samples_per_second': 1.415, 'train_steps_per_second': 0.088, 'train_loss': 0.6628288269042969, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.6871377229690552, 'eval_accuracy': 0.4, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 1.7617, 'eval_samples_per_second': 5.676, 'eval_steps_per_second': 0.568, 'epoch': 3.0}
Evaluation Results: {'eval_loss': 0.6871377229690552, 'eval_accuracy': 0.4, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 1.7617, 'eval_samples_per_second': 5.676, 'eval_steps_per_second': 0.568, 'epoch': 3.0}
Predictions saved to preds.txt


In [None]:
# DistilBERT (thrid approach)
# Got the same scores as the initial DistilBERT model

!pip install datasets
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import gc

# Load the datasets
train_data = pd.read_csv('/content/train.csv', encoding='latin-1')
dev_data = pd.read_csv('/content/dev.csv', encoding='latin-1')
test_data = pd.read_csv('/content/test_data.csv', encoding='latin-1')

# Preprocess the data
train_data = train_data[['Features', 'Label']]
dev_data = dev_data[['Features', 'Label']]

# Initialize DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['Features'], padding='max_length', truncation=True, max_length=128)

# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
dev_dataset = Dataset.from_pandas(dev_data)

# Tokenize the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
dev_dataset = dev_dataset.map(tokenize_function, batched=True)

# Prepare labels
train_dataset = train_dataset.map(lambda examples: {'labels': [1 if label == 'yes' else 0 for label in examples['Label']]}, batched=True)
dev_dataset = dev_dataset.map(lambda examples: {'labels': [1 if label == 'yes' else 0 for label in examples['Label']]}, batched=True)

# Load Pretrained DistilBERT model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Set up the Trainer with hyperparameters for tuning
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    disable_tqdm=True,
    report_to="none",
    gradient_accumulation_steps=2,
    save_steps=500,
    eval_steps=500,
    max_grad_norm=1.0,
    dataloader_num_workers=2
)

# Define the compute_metrics function
def compute_metrics(p):
    preds = p.predictions.argmax(axis=-1)
    return {
        'accuracy': accuracy_score(p.label_ids, preds),
        'precision': precision_score(p.label_ids, preds),
        'recall': recall_score(p.label_ids, preds),
        'f1': f1_score(p.label_ids, preds)
    }

# Set up the Trainer with the model and datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model on the development set
results = trainer.evaluate()

print(f"Evaluation Results: {results}")

# Clear cache after evaluation to free up memory
torch.cuda.empty_cache()
gc.collect()

# Predict on the test set
test_data = test_data['Features']
test_encodings = tokenizer(test_data.tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')
test_preds = model(**test_encodings).logits.argmax(axis=-1)

# Save predictions to preds.txt
with open('preds.txt', 'w') as f:
    for pred in test_preds:
        label = 'yes' if pred == 1 else 'no'
        f.write(label + '\n')

print("Predictions saved to preds.txt")




Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.6942738890647888, 'eval_accuracy': 0.5, 'eval_precision': 1.0, 'eval_recall': 0.16666666666666666, 'eval_f1': 0.2857142857142857, 'eval_runtime': 1.7971, 'eval_samples_per_second': 5.564, 'eval_steps_per_second': 0.556, 'epoch': 0.8}
{'eval_loss': 0.7013977766036987, 'eval_accuracy': 0.4, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 1.7651, 'eval_samples_per_second': 5.666, 'eval_steps_per_second': 0.567, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.7010788321495056, 'eval_accuracy': 0.4, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 2.6698, 'eval_samples_per_second': 3.746, 'eval_steps_per_second': 0.375, 'epoch': 2.8}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.6970375776290894, 'eval_accuracy': 0.4, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 1.7906, 'eval_samples_per_second': 5.585, 'eval_steps_per_second': 0.558, 'epoch': 4.0}
{'train_runtime': 228.8483, 'train_samples_per_second': 1.748, 'train_steps_per_second': 0.044, 'train_loss': 0.679990291595459, 'epoch': 4.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.6970375776290894, 'eval_accuracy': 0.4, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 2.7878, 'eval_samples_per_second': 3.587, 'eval_steps_per_second': 0.359, 'epoch': 4.0}
Evaluation Results: {'eval_loss': 0.6970375776290894, 'eval_accuracy': 0.4, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 2.7878, 'eval_samples_per_second': 3.587, 'eval_steps_per_second': 0.359, 'epoch': 4.0}
Predictions saved to preds.txt


In [None]:
# RoBERTa (fourth approach)

!pip install datasets
!pip install transformers

import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import gc
import torch.nn as nn

# Load the datasets
train_data = pd.read_csv('/content/train.csv', encoding='latin-1')
dev_data = pd.read_csv('/content/dev.csv', encoding='latin-1')
test_data = pd.read_csv('/content/test_data.csv', encoding='latin-1')

# Preprocessing
train_data['Label'] = train_data['Label'].apply(lambda x: 1 if x == 'yes' else 0)
dev_data['Label'] = dev_data['Label'].apply(lambda x: 1 if x == 'yes' else 0)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]['Features']
        label = self.data.iloc[index]['Label'] if 'Label' in self.data.columns else -1
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['label'] = torch.tensor(label, dtype=torch.long)
        return item

train_dataset = CustomDataset(train_data, tokenizer, max_len=128)
dev_dataset = CustomDataset(dev_data, tokenizer, max_len=128)

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(train_data['Label']), y=train_data['Label'])
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to('cuda')

# Model
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model.to('cuda')

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    save_total_limit=1
)

# Metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    prec = precision_score(labels, preds, zero_division=1)
    rec = recall_score(labels, preds, zero_division=1)
    f1 = f1_score(labels, preds, zero_division=1)
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()

# Predict on test set
test_encodings = tokenizer(test_data['Features'].tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')
test_encodings = {key: val.to('cuda') for key, val in test_encodings.items()}

with torch.no_grad():
    logits = model(**test_encodings).logits
    probs = torch.softmax(logits, dim=1)
    preds = (probs[:, 1] > 0.6).int()

# Save predictions
pred_labels = ['yes' if pred == 1 else 'no' for pred in preds.cpu().numpy()]
with open("preds.txt", "w") as f:
    f.write("\n".join(pred_labels))


# Preprocess the data
train_data = train_data[['Features', 'Label']]
dev_data = dev_data[['Features', 'Label']]

# Initialize RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize_function(examples):
    return tokenizer(examples['Features'], padding='max_length', truncation=True, max_length=128)

# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
dev_dataset = Dataset.from_pandas(dev_data)

# Tokenize the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
dev_dataset = dev_dataset.map(tokenize_function, batched=True)

# Prepare labels (convert 'yes' and 'no' to binary)
train_dataset = train_dataset.map(lambda examples: {'labels': [1 if label == 'yes' else 0 for label in examples['Label']]}, batched=True)
dev_dataset = dev_dataset.map(lambda examples: {'labels': [1 if label == 'yes' else 0 for label in examples['Label']]}, batched=True)

# Load Pretrained RoBERTa model for sequence classification
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Define class weights
class_weights = torch.tensor([1.0, 2.0])

# Modify the model's forward pass to include class weights in the loss calculation
class CustomRobertaForSequenceClassification(RobertaForSequenceClassification):
    def __init__(self, config, class_weights=None):
        super().__init__(config)
        self.class_weights = class_weights

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return (loss, logits)
        return logits

# Use the modified model with class weights
model = CustomRobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2, class_weights=class_weights)

# Set up the Trainer with hyperparameters for tuning
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir='./logs',
    disable_tqdm=False,
    report_to="none",
    gradient_accumulation_steps=4,
    save_steps=500,
    eval_steps=500,
    max_grad_norm=1.0,
    dataloader_num_workers=2,
    load_best_model_at_end=True,
    fp16=True,
    no_cuda=False,
)

# Define the compute_metrics function
def compute_metrics(p):
    preds = p.predictions.argmax(axis=-1)
    return {
        'accuracy': accuracy_score(p.label_ids, preds),
        'precision': precision_score(p.label_ids, preds),
        'recall': recall_score(p.label_ids, preds),
        'f1': f1_score(p.label_ids, preds)
    }

# Set up the Trainer with the model and datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model on the development set
results = trainer.evaluate()

print(f"Evaluation Results: {results}")

# Clear cache after evaluation to free up memory
torch.cuda.empty_cache()
gc.collect()

# Predict on the test set
test_data = test_data['Features']
test_encodings = tokenizer(test_data.tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')

# Ensure the model returns the correct output
with torch.no_grad():
    outputs = model(**test_encodings)

    # If outputs is a tensor, directly use it for prediction
    if isinstance(outputs, torch.Tensor):
        logits = outputs
    else:
        logits = outputs.logits

# Use the logits to make predictions
test_preds = logits.argmax(axis=-1)

# Save predictions to preds.txt
with open('preds.txt', 'w') as f:
    for pred in test_preds:
        label = 'yes' if pred == 1 else 'no'
        f.write(label + '\n')

print("Predictions saved to preds.txt")



Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of CustomRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,No log,0.653208,0.6,0.6,1.0,0.75
2,No log,0.623153,0.6,0.6,1.0,0.75
4,No log,0.611031,0.6,0.6,1.0,0.75


Evaluation Results: {'eval_loss': 0.6110305786132812, 'eval_accuracy': 0.6, 'eval_precision': 0.6, 'eval_recall': 1.0, 'eval_f1': 0.75, 'eval_runtime': 3.5665, 'eval_samples_per_second': 2.804, 'eval_steps_per_second': 0.28, 'epoch': 4.8}
Predictions saved to preds.txt


In [None]:
# RoBERTa (fifth approach)

!pip install transformers
!pip install datasets
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load Datasets
train_data = pd.read_csv('/content/train.csv', encoding='latin-1')
dev_data = pd.read_csv('/content/dev.csv', encoding='latin-1')
test_data = pd.read_csv('/content/test_data.csv', encoding='latin-1')

# Preprocessing
train_data['Label'] = train_data['Label'].apply(lambda x: 1 if x == 'yes' else 0)
dev_data['Label'] = dev_data['Label'].apply(lambda x: 1 if x == 'yes' else 0)

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]['Features']
        label = self.data.iloc[index]['Label'] if 'Label' in self.data.columns else -1
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['label'] = torch.tensor(label, dtype=torch.long)
        return item


train_dataset = CustomDataset(train_data, tokenizer, max_len=128)
dev_dataset = CustomDataset(dev_data, tokenizer, max_len=128)
test_dataset = CustomDataset(test_data, tokenizer, max_len=128)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

# Model
class CustomRobertaModel(nn.Module):
    def __init__(self):
        super(CustomRobertaModel, self).__init__()
        self.model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(2, 2)

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.dropout(outputs.logits)
        logits = self.fc(logits)
        return logits


device = torch.device("cpu")
model = CustomRobertaModel().to(device)

# Training Setup
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()

# Training Loop
def train_model(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)

# Validation Loop
def evaluate_model(model, dataloader):
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, dim=1)

            preds.extend(predicted.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, preds)
    precision = precision_score(true_labels, preds, zero_division=1)
    recall = recall_score(true_labels, preds, zero_division=1)
    f1 = f1_score(true_labels, preds, zero_division=1)
    return accuracy, precision, recall, f1


# Training and Validation
epochs = 6
for epoch in range(epochs):
    train_loss = train_model(model, train_loader, optimizer, criterion)
    accuracy, precision, recall, f1 = evaluate_model(model, dev_loader)
    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation Precision: {precision:.4f}")
    print(f"Validation Recall: {recall:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")
    print("-" * 30)

# Testing and Saving Predictions
model.eval()
test_preds = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)

        test_preds.extend(predicted.cpu().numpy())

# Save predictions
pred_labels = ['yes' if pred == 1 else 'no' for pred in test_preds]
with open("preds.txt", "w") as f:
    f.write("\n".join(pred_labels))

print("Predictions saved to preds.txt")




Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/6
Train Loss: 0.7026
Validation Accuracy: 0.6000
Validation Precision: 0.6000
Validation Recall: 1.0000
Validation F1 Score: 0.7500
------------------------------
Epoch 2/6
Train Loss: 0.6913
Validation Accuracy: 0.4000
Validation Precision: 1.0000
Validation Recall: 0.0000
Validation F1 Score: 0.0000
------------------------------
Epoch 3/6
Train Loss: 0.6909
Validation Accuracy: 0.4000
Validation Precision: 1.0000
Validation Recall: 0.0000
Validation F1 Score: 0.0000
------------------------------
Epoch 4/6
Train Loss: 0.6603
Validation Accuracy: 0.4000
Validation Precision: 1.0000
Validation Recall: 0.0000
Validation F1 Score: 0.0000
------------------------------
Epoch 5/6
Train Loss: 0.6790
Validation Accuracy: 0.4000
Validation Precision: 1.0000
Validation Recall: 0.0000
Validation F1 Score: 0.0000
------------------------------
Epoch 6/6
Train Loss: 0.6427
Validation Accuracy: 0.5000
Validation Precision: 1.0000
Validation Recall: 0.1667
Validation F1 Score: 0.2857
-----