In [None]:
!pip install datasets
!pip install transformers
!pip install accelerate>=0.21.0 -
!pip install optuna
!pip install optuna_integration

In [None]:
import os

# Expose only GPU index 0 to CUDA-based libraries started from this process.
os.environ.update(CUDA_VISIBLE_DEVICES='0')

In [None]:
# ignore warnings
import warnings
import logging
warnings.filterwarnings("ignore")

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset paths used later in
# this notebook live under this mount point.
drive.mount(mountpoint='/content/drive/')

Mounted at /content/drive/


## **Read dataset**

In [None]:
# Root folder of the project on the mounted Drive. Note the trailing slash:
# later cells append sub-paths to this string with plain `+` concatenation.
project_path = (
    '/content/drive/My Drive/Code Documentation Project/Issue Classification/'
)

In [None]:
# Maps each integer label id (0-12) to its human-readable category name.
# The names are listed in id order, so `enumerate` assigns the ids.
lit_class_id_map = dict(enumerate([
    'Action on Issue',
    'Bug Reproduction',
    'Contribution and Commitment',
    'Expected Behaviour',
    'Investigation and Exploration',
    'Motivation',
    'Observed Bug Behaviour',
    'Potential New Issues and Requests',
    'Social Conversation',
    'Solution Discussion',
    'Task Progress',
    'Usage',
    'Workarounds',
]))

In [None]:
    # Score the tokenized test split with the `trainer` currently in the
    # notebook namespace (it is created by another cell, which must run first).
    test_predictions = trainer.predict(tokenized_issue_dataset["test"])
    # Predicted class per example = index of the largest logit in its row.
    test_predictions_argmax = np.argmax(test_predictions.predictions, axis=1)

    # Gold labels of the test fold, in the same row order as the predictions.
    true_labels = np.array(test_df['label'].to_list())

    # Human-readable report first, then the same metrics as a DataFrame.
    print(classification_report(true_labels, test_predictions_argmax))
    report = classification_report(true_labels, test_predictions_argmax, output_dict=True)
    report_df = pd.DataFrame(report).T

    print(report_df)

In [None]:
import copy

import optuna
from transformers import Trainer, TrainingArguments

def objective(trial):
    """Fine-tune for one epoch with sampled hyperparameters; return validation loss.

    Relies on notebook-level names created by the setup cell: `model`,
    `CustomTrainer`, `tokenized_issue_dataset`, `tokenizer`, `data_collator`.

    Args:
        trial: the `optuna` trial used to sample `learning_rate` and
            `per_device_train_batch_size`.

    Returns:
        The `eval_loss` reported by `trainer.evaluate()` on the "val" split
        (lower is better).
    """
    # Suggest values for the hyperparameters
    learning_rate = trial.suggest_categorical('learning_rate', [1e-5, 2e-5, 3e-5])
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [8, 16])

    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=1,
        weight_decay=0.01,
        report_to="none"  # Avoid cluttering console outputs
    )

    trainer = CustomTrainer(
        # Train a private copy: passing the shared `model` would let every
        # trial continue from the weights left behind by the previous trials,
        # so the trials would not be comparable.
        model=copy.deepcopy(model),
        args=training_args,
        train_dataset=tokenized_issue_dataset["train"],
        eval_dataset=tokenized_issue_dataset["val"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()
    eval_results = trainer.evaluate()
    print(eval_results)
    # The value handed back to Optuna is the validation loss.
    return eval_results["eval_loss"]

# The objective is a loss, so the study has to minimize it; with
# direction='maximize' the reported "best" trial was the one with the
# highest validation loss.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=6)

print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


In [None]:
    # Score the tokenized test split with the `trainer` currently in the
    # notebook namespace (it is created by another cell, which must run first).
    test_predictions = trainer.predict(tokenized_issue_dataset["test"])
    # Predicted class per example = index of the largest logit in its row.
    test_predictions_argmax = np.argmax(test_predictions.predictions, axis=1)

    # Gold labels of the test fold, in the same row order as the predictions.
    true_labels = np.array(test_df['label'].to_list())

    # Human-readable report first, then the same metrics as a DataFrame.
    print(classification_report(true_labels, test_predictions_argmax))
    report = classification_report(true_labels, test_predictions_argmax, output_dict=True)
    report_df = pd.DataFrame(report).T

    print(report_df)

In [None]:
import copy

import numpy as np
import optuna
from transformers import Trainer, TrainingArguments

def compute_metrics(eval_pred):
    """Compute accuracy from the (logits, labels) pair produced during evaluation.

    `Trainer.evaluate()` reports the returned entry under the key
    "eval_accuracy", which is the value `objective` reads.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}

def objective(trial):
    """Fine-tune with sampled hyperparameters and return validation accuracy.

    Relies on notebook-level names created by the setup cell: `model`,
    `tokenized_issue_dataset`, `tokenizer`, `data_collator`.

    Args:
        trial: the `optuna` trial used to sample `learning_rate` (log-uniform
            in [1e-5, 2e-5]) and `per_device_train_batch_size`.

    Returns:
        The `eval_accuracy` reported by `trainer.evaluate()` on the "val"
        split (higher is better).
    """
    # Suggest values for the hyperparameters
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 2e-5, log=True)
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [8, 16])

    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=10,
        weight_decay=0.01,
        report_to="none"  # Avoid cluttering console outputs
    )

    trainer = Trainer(
        # Train a private copy: passing the shared `model` would let every
        # trial continue from the weights left behind by the previous trials,
        # so the trials would not be comparable.
        model=copy.deepcopy(model),
        args=training_args,
        train_dataset=tokenized_issue_dataset["train"],
        eval_dataset=tokenized_issue_dataset["val"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        # Without a metrics function `evaluate()` only reports the loss and
        # the "eval_accuracy" lookup below raises KeyError.
        compute_metrics=compute_metrics,
    )

    trainer.train()
    eval_results = trainer.evaluate()

    # The value handed back to Optuna is the validation accuracy.
    return eval_results["eval_accuracy"]

# Accuracy is better when larger, hence direction='maximize'.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


In [None]:
from transformers import AutoTokenizer
from torch import nn
from transformers import Trainer
import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import numpy as np
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset
import datasets
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values (learning rates and per-device batch sizes).
param_grid = dict(
    learning_rate=[1e-5, 2e-5, 3e-5],
    per_device_train_batch_size=[8, 16],
)

# One place to switch the checkpoint (e.g. to "roberta-base"): the tokenizer
# and the classifier below must be loaded from the same one.
MODEL_CHECKPOINT = "bert-base-uncased"
NUM_LABELS = 13

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
# Pads each batch to the length of its longest sequence at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def preprocess_function(examples, text_column_name = "Text Content"):
    """Tokenize the text column of a batch of examples, truncating long inputs.

    Args:
        examples: mapping from column name to values, as passed by
            `Dataset.map`.
        text_column_name: name of the column that holds the raw text.

    Returns:
        The tokenizer output for `examples[text_column_name]`.
    """
    texts = examples[text_column_name]
    return tokenizer(texts, truncation=True)


model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS,
)


# Opt-in switch for the per-fold fine-tuning and test-set report at the end of
# the loop. It defaults to False because this cell used to stop at an
# unconditional `break` right after building `trainer`, which left all of the
# training/evaluation code unreachable. Set it to True to actually train and
# evaluate every fold in the loop.
RUN_FOLD_TRAINING = False


class CustomTrainer(Trainer):
    """`Trainer` whose loss is cross-entropy weighted by per-class weights.

    The weights are read from the notebook-level tensor `class_weights`
    (one entry per label id) each time the loss is computed, so that tensor
    must be set, as the fold loop below does, before training or evaluating.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the class-weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained or evaluated.
            inputs: batch dictionary; must contain "labels".
            return_outputs: when True, return `(loss, outputs)` instead of
                only the loss.
            num_items_in_batch: accepted because recent `transformers`
                releases pass it to `compute_loss`; it is not used here.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Keep the weights and the labels on the same device as the logits.
        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels),
            labels.to(logits.device).view(-1),
        )
        return (loss, outputs) if return_outputs else loss


for fold in range(10,11):
    print(fold)

    # Load this fold's pre-split CSV files, keeping only the text column and
    # the label column (renamed from 'Code' to 'label').
    fold_frames = {}
    for split in ('train', 'val', 'test'):
        csv_path = project_path+'/KFold/Literature/'+split+'_fold'+str(fold)+'.csv'
        fold_frames[split] = (
            pd.read_csv(csv_path)[['Text Content', 'Code']]
            .rename(columns={'Code': 'label'})
        )
    train_df = fold_frames['train']
    val_df = fold_frames['val']
    test_df = fold_frames['test']

    train_dataset = Dataset.from_dict(train_df)
    val_dataset =  Dataset.from_dict(val_df)
    test_dataset = Dataset.from_dict(test_df)

    issue_dataset = datasets.DatasetDict({"train":train_dataset,"val":val_dataset,"test":test_dataset})

    # Balanced class weights: (1 / class count) * (n_samples / n_distinct_labels),
    # ordered by label id so that entry i weights class i.
    num_labels = model.config.num_labels
    label_counts = train_df['label'].value_counts().reindex(range(num_labels), fill_value=0)
    absent_labels = label_counts.index[label_counts == 0].tolist()
    if absent_labels:
        # A class with no training example has no finite weight; fail with a
        # clear message instead of a bare ZeroDivisionError.
        raise ValueError(
            f"fold {fold}: no training examples for label id(s) {absent_labels}; "
            "cannot compute class weights"
        )
    weight_scale = len(train_df) / len(set(train_df['label']))
    class_weights = torch.tensor(
        ((1 / label_counts) * weight_scale).to_numpy(),
        dtype=torch.float,
    )

    tokenized_issue_dataset = issue_dataset.map(preprocess_function, batched=True)

    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=32, #16
        per_device_eval_batch_size=32, #16
        num_train_epochs=10,
        weight_decay=0.01,
    )

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_issue_dataset["train"],
        eval_dataset=tokenized_issue_dataset["val"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # By default stop here: the cell then only prepares the data, the class
    # weights and `trainer` for the other cells of the notebook.
    if not RUN_FOLD_TRAINING:
        break
    trainer.train()

    # Use the model to get predictions
    test_predictions = trainer.predict(tokenized_issue_dataset["test"])
    # For each prediction, create the label with argmax
    test_predictions_argmax = np.argmax(test_predictions[0], axis=1)

    true_labels = np.array(test_df['label'].to_list())
    print(classification_report(true_labels, test_predictions_argmax))
    report = classification_report(true_labels, test_predictions_argmax, output_dict=True)
    report_df = pd.DataFrame(report).transpose()

    print(report_df)
    #report_df.to_csv(project_path+'classification report/Literature/R_CW/rcw_fold'+str(fold)+'.csv', index = False)

    #pred_df=pd.read_csv(project_path+'/Ensemble Data/Literature/fold'+str(fold)+'.csv')
    #pred_df['BERTCW']=test_predictions_argmax
    #pred_df.to_csv(project_path+'/Ensemble Data/Literature/fold'+str(fold)+'.csv', index=False)

