In [1]:
!pip install transformers



In [2]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from transformers import pipeline
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
from torch.nn.utils.rnn import pad_sequence



In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [4]:
data = pd.read_csv("eclipse_bug_report_data.csv")
data = data.dropna()
two_rows = data[['long_description', 'severity_category']].copy()
two_rows.head()
comments = list(two_rows.long_description.values)
labels = list(two_rows.severity_category.values)

In [5]:
encoder = OneHotEncoder(sparse=False)
# transform data
labels = encoder.fit_transform(np.array(labels).reshape(-1,1))

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2ForSequenceClassification.from_pretrained("gpt2", problem_type="multi_label_classification", num_labels=6)

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
model = model.to(device)

In [8]:
tokenized_comments_temp = []
labels_temp = []
for i, x in enumerate(comments):
    encoded = tokenizer(x, return_tensors='pt')['input_ids']
    if encoded.shape[1] < 1000:
        tokenized_comments_temp.append(encoded)
        labels_temp.append(labels[i])
tokenized_comments = np.array(tokenized_comments_temp)
labels = np.array(labels_temp)


Token indices sequence length is longer than the specified maximum sequence length for this model (4414 > 1024). Running this sequence through the model will result in indexing errors
  tokenized_comments = np.array(tokenized_comments_temp)
  tokenized_comments = np.array(tokenized_comments_temp)


In [9]:
train_x, test_x, train_y, test_y = train_test_split(tokenized_comments_temp, labels, test_size=0.1, random_state=22)
test_y = torch.tensor(test_y)
train_y = torch.tensor(train_y)

In [10]:
test_y = test_y.to(device)
train_y = train_y.to(device)

In [11]:
test_y[0]

tensor([0., 0., 0., 0., 1., 0.], device='cuda:0', dtype=torch.float64)

In [12]:
# setting custom optimization parameters. You may implement a scheduler here as well.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [13]:
optimizer = torch.optim.AdamW(optimizer_grouped_parameters,lr=2e-5)

In [14]:
num_labels=6


In [16]:
# Store our loss and accuracy for plotting
train_loss_set = []
val_f1_accuracy = []
classification_reports = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 50

# trange is a tqdm wrapper around the normal python range
for _ in range(epochs):

  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0 #running loss
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, x in enumerate(train_x):
        
    # Clear out the gradients (by default they accumulate)
    x = x.to(device)
    optimizer.zero_grad()

#######################
    # set the pad token of the model's configuration
    model.config.pad_token_id = model.config.eos_token_id

#######################

    # Forward pass for multilabel classification
    outputs = model(x, token_type_ids=None)
    logits = outputs[0]
    loss_func = BCEWithLogitsLoss()
    loss = loss_func(logits.view(-1,num_labels),train_y[step].type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    # loss_func = BCELoss() 
    # loss = loss_func(torch.sigmoid(logits.view(-1,num_labels)),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    train_loss_set.append(loss.item())    

    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # scheduler.step()
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += train_y[step].size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

###############################################################################
  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Variables to gather full output
  logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

  # Predict
  for i, x in enumerate(test_x):
    x = x.to(device)
    with torch.no_grad():
      # Forward pass
      outs = model(x, token_type_ids=None)
      b_logit_pred = outs[0]
      pred_label = torch.sigmoid(b_logit_pred)

      b_logit_pred = b_logit_pred.cpu().numpy()
      pred_label = pred_label.cpu().numpy()
      b_labels = test_y[i].cpu().numpy()

    tokenized_texts.append(x)
    logit_preds.append(b_logit_pred)
    true_labels.append(test_y[i])
    pred_labels.append(pred_label)
  pred_labels_new = [item for sublist in pred_labels for item in sublist]
  max_idx = np.argmax(pred_labels_new, 1)
  pred_label = np.zeros_like(pred_labels_new)
  pred_label[np.arange(len(max_idx)), max_idx] = 1
  true_label = []
  for each in true_labels:
    true_label.append(each.cpu().numpy())
  val_f1_accuracy.append(f1_score(true_label,pred_label,average=None)*100)
  classification_reports.append(classification_report(true_label, pred_label, output_dict=True))



Train loss: 0.22302209290673863


  _warn_prf(average, modifier, msg_start, len(result))


Train loss: 0.2110807224026213


  _warn_prf(average, modifier, msg_start, len(result))


Train loss: 0.1749935153835563


  _warn_prf(average, modifier, msg_start, len(result))


Train loss: 0.11791655707280761
Train loss: 0.0704906864973867
Train loss: 0.04664115622893446
Train loss: 0.03215968785694683
Train loss: 0.025281219289452768
Train loss: 0.01805752213502225
Train loss: 0.01670685666316454
Train loss: 0.01543137244132148
Train loss: 0.013477892886057165
Train loss: 0.012467140060678021
Train loss: 0.01100715667423694
Train loss: 0.010641700620171706
Train loss: 0.008929608683022712
Train loss: 0.009320284151917333
Train loss: 0.008775985699539052
Train loss: 0.008132969001299162
Train loss: 0.00723331716629985
Train loss: 0.00724500090606248
Train loss: 0.005855904349175587
Train loss: 0.00852729523762161
Train loss: 0.006496218024724115
Train loss: 0.007656469670583249
Train loss: 0.006877227649096827
Train loss: 0.006712790246518347
Train loss: 0.006132971823881704
Train loss: 0.0049812288261702085
Train loss: 0.006676871910728372
Train loss: 0.00574372955232684
Train loss: 0.007151418712238902
Train loss: 0.0038626880895116598
Train loss: 0.0050488

In [17]:
val_f1_accuracy[-1]

array([11.76470588,  5.71428571, 14.17322835,  0.        , 86.62337662,
        0.        ])

In [18]:
classification_reports[-1]

{'0': {'precision': 0.3333333333333333,
  'recall': 0.07142857142857142,
  'f1-score': 0.11764705882352941,
  'support': 14},
 '1': {'precision': 0.09090909090909091,
  'recall': 0.041666666666666664,
  'f1-score': 0.05714285714285715,
  'support': 24},
 '2': {'precision': 0.2727272727272727,
  'recall': 0.09574468085106383,
  'f1-score': 0.14173228346456693,
  'support': 94},
 '3': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 42},
 '4': {'precision': 0.804583835946924,
  'recall': 0.9381153305203939,
  'f1-score': 0.8662337662337662,
  'support': 711},
 '5': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 15},
 'micro avg': {'precision': 0.7533333333333333,
  'recall': 0.7533333333333333,
  'f1-score': 0.7533333333333333,
  'support': 900},
 'macro avg': {'precision': 0.2502589221527702,
  'recall': 0.1911592082444493,
  'f1-score': 0.19712599427745328,
  'support': 900},
 'weighted avg': {'precision': 0.671715506492346,
  'recall': 0.7533333333333333,
 

In [19]:
print(classification_reports[-1])

{'0': {'precision': 0.3333333333333333, 'recall': 0.07142857142857142, 'f1-score': 0.11764705882352941, 'support': 14}, '1': {'precision': 0.09090909090909091, 'recall': 0.041666666666666664, 'f1-score': 0.05714285714285715, 'support': 24}, '2': {'precision': 0.2727272727272727, 'recall': 0.09574468085106383, 'f1-score': 0.14173228346456693, 'support': 94}, '3': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 42}, '4': {'precision': 0.804583835946924, 'recall': 0.9381153305203939, 'f1-score': 0.8662337662337662, 'support': 711}, '5': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 15}, 'micro avg': {'precision': 0.7533333333333333, 'recall': 0.7533333333333333, 'f1-score': 0.7533333333333333, 'support': 900}, 'macro avg': {'precision': 0.2502589221527702, 'recall': 0.1911592082444493, 'f1-score': 0.19712599427745328, 'support': 900}, 'weighted avg': {'precision': 0.671715506492346, 'recall': 0.7533333333333333, 'f1-score': 0.7024816998142612, 'support': 900}

In [20]:
df = pd.DataFrame(classification_reports[-1]).transpose()

In [21]:
df.to_csv('gpt2_report.csv', index= True)