#Installation of necessary libabries

In [None]:
!pip install datasets
!pip install transformers

In [2]:
import os 
os.environ['CUDA_VISIBLE_DEVICES'] ='0' #gpu

In [None]:
datapath='Dataset.csv'

In [5]:
from transformers import AutoTokenizer
from torch import nn
from transformers import Trainer
import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import numpy as np
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset
import datasets
from sklearn import preprocessing
from sklearn.model_selection import KFold,StratifiedKFold

#Tokenizer & Model

In [6]:
#Constructs a RoBERTa tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding.
#This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

#Data collators are objects that will form a batch by using a list of dataset elements as input.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess_function(examples, text_column_name = "text"):
    return tokenizer(examples[text_column_name], truncation=True)

#Roberta-base: Pretrained model on English language using a masked language modeling (MLM) objective. 
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=12)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [8]:
issue_df = pd.read_csv(datapath)
issue_df = issue_df[['Text Content','Code']] #will only  consider text and label for this problem
issue_df = issue_df.rename(columns={'Text Content': 'text', 'Code': 'label'}) #rename the columns
print(issue_df.info())

label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(issue_df['label'])

issue_df['label'] = label_encoder.transform(issue_df['label'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4241 entries, 0 to 4240
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    4241 non-null   object
 1   label   4241 non-null   object
dtypes: object(2)
memory usage: 66.4+ KB
None


# Training and Testing
We have used cross fold validation where the dataset has been split into 10-folds using a stratified sampling. Each fold contains the data of each class in the same proportion and no two folds has any overlap between themselves


In [9]:
#cross-fold validation
skf = StratifiedKFold(n_splits=10, random_state=10, shuffle=True)

fold=0
for trainval_index, test_index in skf.split(issue_df,issue_df['label']):  
    fold=fold+1

    trainval_df = issue_df.iloc[trainval_index]
    test_df = issue_df.iloc[test_index]

    #split for getting validation dataset
    train_df, val_df = train_test_split(trainval_df, test_size=0.05, random_state=10, stratify=trainval_df['label'])

    train_df = train_df[['text', 'label']]
    val_df = val_df[['text', 'label']]
    test_df = test_df[['text', 'label']]

    #create dataset
    train_dataset = Dataset.from_dict(train_df)
    val_dataset =  Dataset.from_dict(val_df)
    test_dataset = Dataset.from_dict(test_df)

    issue_dataset = datasets.DatasetDict({"train":train_dataset,"val":val_dataset,"test":test_dataset})

    #for class weight 
    count_0 = len(train_df[train_df['label'] == 0])
    count_1 = len(train_df[train_df['label'] == 1])
    count_2 = len(train_df[train_df['label'] == 2])
    count_3 = len(train_df[train_df['label'] == 3])
    count_4 = len(train_df[train_df['label'] == 4])
    count_5 = len(train_df[train_df['label'] == 5])
    count_6 = len(train_df[train_df['label'] == 6])
    count_7 = len(train_df[train_df['label'] == 7])
    count_8 = len(train_df[train_df['label'] == 8])
    count_9 = len(train_df[train_df['label'] == 9])
    count_10 = len(train_df[train_df['label'] == 10])
    count_11 = len(train_df[train_df['label'] == 11])


    class_weight_0 = (1 / count_0) * (len(train_df) / len(set(train_df['label'])))
    class_weight_1 = (1 / count_1) * (len(train_df) / len(set(train_df['label'])))
    class_weight_2 = (1 / count_2) * (len(train_df) / len(set(train_df['label'])))
    class_weight_3 = (1 / count_3) * (len(train_df) / len(set(train_df['label'])))
    class_weight_4 = (1 / count_4) * (len(train_df) / len(set(train_df['label'])))
    class_weight_5 = (1 / count_5) * (len(train_df) / len(set(train_df['label'])))
    class_weight_6 = (1 / count_6) * (len(train_df) / len(set(train_df['label'])))
    class_weight_7 = (1 / count_7) * (len(train_df) / len(set(train_df['label'])))
    class_weight_8 = (1 / count_8) * (len(train_df) / len(set(train_df['label'])))
    class_weight_9 = (1 / count_9) * (len(train_df) / len(set(train_df['label'])))
    class_weight_10 = (1 / count_10) * (len(train_df) / len(set(train_df['label'])))
    class_weight_11 = (1 / count_11) * (len(train_df) / len(set(train_df['label'])))

    tokenized_issue_dataset = issue_dataset.map(preprocess_function, batched=True)



    #trainer
    class CustomTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False):
            device = model.device
            labels = inputs.get("labels").to(device)
            # forward pass
            outputs = model(**inputs)
            logits = outputs.get("logits").to(device)
            # compute custom loss (suppose one has 3 labels with different weights)
            loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([class_weight_0,class_weight_1,class_weight_2,class_weight_3,class_weight_4,class_weight_5,class_weight_6,class_weight_7,class_weight_8,class_weight_9,class_weight_10,class_weight_11])).to(device)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
            return (loss, outputs) if return_outputs else loss




    #Training Arguments
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=16, #16
        per_device_eval_batch_size=16, #16
        num_train_epochs=10,
        weight_decay=0.01,
    )

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_issue_dataset["train"],
        eval_dataset=tokenized_issue_dataset["val"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    #Training
    trainer.train()


    #Testing
    
    # Use the model to get predictions
    test_predictions = trainer.predict(tokenized_issue_dataset["test"])
    # For each prediction, create the label with argmax
    test_predictions_argmax = np.argmax(test_predictions[0], axis=1)

    print(classification_report(np.array(test_df['label'].to_list()), test_predictions_argmax))
    report= classification_report(np.array(test_df['label'].to_list()), test_predictions_argmax, output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    report_df.to_csv('fold'+str(fold)+'.csv', index = False)

Map:   0%|          | 0/3625 [00:00<?, ? examples/s]

Map:   0%|          | 0/191 [00:00<?, ? examples/s]

Map:   0%|          | 0/425 [00:00<?, ? examples/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,1.8017
1000,0.808
1500,0.4116
2000,0.2042


              precision    recall  f1-score   support

           0       0.55      0.60      0.57        10
           1       0.33      0.33      0.33        18
           2       0.78      0.64      0.70        11
           3       0.33      0.33      0.33         3
           4       0.71      0.71      0.71       189
           5       1.00      0.50      0.67         2
           6       0.46      0.67      0.55         9
           7       0.64      0.57      0.61        47
           8       0.59      0.62      0.61        85
           9       0.56      0.39      0.46        23
          10       0.52      0.58      0.55        26
          11       0.67      1.00      0.80         2

    accuracy                           0.63       425
   macro avg       0.59      0.58      0.57       425
weighted avg       0.63      0.63      0.63       425



Map:   0%|          | 0/3626 [00:00<?, ? examples/s]

Map:   0%|          | 0/191 [00:00<?, ? examples/s]

Map:   0%|          | 0/424 [00:00<?, ? examples/s]



Step,Training Loss
500,0.4246
1000,0.1629
1500,0.0683
2000,0.0253


              precision    recall  f1-score   support

           0       0.91      1.00      0.95        10
           1       0.94      0.89      0.91        18
           2       0.90      0.90      0.90        10
           3       1.00      1.00      1.00         2
           4       0.88      0.89      0.89       189
           5       0.67      1.00      0.80         2
           6       0.89      1.00      0.94         8
           7       0.96      0.94      0.95        47
           8       0.79      0.73      0.76        85
           9       0.92      0.92      0.92        24
          10       0.90      1.00      0.95        27
          11       1.00      1.00      1.00         2

    accuracy                           0.88       424
   macro avg       0.90      0.94      0.91       424
weighted avg       0.88      0.88      0.88       424



Map:   0%|          | 0/3626 [00:00<?, ? examples/s]

Map:   0%|          | 0/191 [00:00<?, ? examples/s]

Map:   0%|          | 0/424 [00:00<?, ? examples/s]



Step,Training Loss
500,0.1554
1000,0.0765
1500,0.0234


Step,Training Loss
500,0.1554
1000,0.0765
1500,0.0234
2000,0.0062


              precision    recall  f1-score   support

           0       1.00      0.89      0.94         9
           1       0.94      0.94      0.94        18
           2       1.00      1.00      1.00        11
           3       1.00      1.00      1.00         2
           4       0.98      0.98      0.98       189
           5       1.00      1.00      1.00         2
           6       1.00      1.00      1.00         8
           7       0.98      1.00      0.99        47
           8       0.98      0.99      0.98        85
           9       0.96      0.92      0.94        24
          10       0.96      1.00      0.98        26
          11       1.00      1.00      1.00         3

    accuracy                           0.98       424
   macro avg       0.98      0.98      0.98       424
weighted avg       0.98      0.98      0.98       424



In [10]:
#Calculation of weighted F1-score for all samples
sup=[0 for i in range(15)] 
pr=[0 for i in range(15)] 
rc=[0 for i in range(15)] 
f1=[0 for i in range(15)] 

TOTAL_FOLD=3
for fold in range(1,TOTAL_FOLD+1):
  report_df=pd.read_csv('fold'+str(fold)+'.csv')
  for i in range(0,15):
    s=report_df['support'][i]
    sup[i]=sup[i]+report_df['support'][i]
    pr[i]=pr[i]+(s*report_df['precision'][i])
    rc[i]=rc[i]+(s*report_df['recall'][i])
    f1[i]=f1[i]+(s*report_df['f1-score'][i])

for i in range(0,15):
  pr[i]=pr[i]/sup[i]
  rc[i]=rc[i]/sup[i]
  f1[i]=f1[i]/sup[i]
  sup[i]=int(sup[i])

pr = [ round(i, 2) for i in pr ]
rc = [ round(i, 2) for i in rc ]
f1 = [ round(i, 2) for i in f1 ]

df = pd.DataFrame({'pr':pr,'rc':rc,'f1':f1,'sup':sup})
print(df)

df.to_csv('result.csv')



      pr    rc    f1   sup
0   0.81  0.83  0.82    29
1   0.74  0.72  0.73    54
2   0.89  0.84  0.87    32
3   0.71  0.71  0.71     7
4   0.86  0.86  0.86   567
5   0.89  0.83  0.82     6
6   0.77  0.88  0.82    25
7   0.86  0.84  0.85   141
8   0.79  0.78  0.78   255
9   0.82  0.75  0.78    71
10  0.79  0.86  0.83    79
11  0.90  1.00  0.94     7
12  0.86  0.86  0.86     2
13  0.82  0.83  0.82  1273
14  0.83  0.83  0.83  1273
