In [None]:
!pip install transformers

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from transformers import pipeline
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, BertTokenizerFast, BertForSequenceClassification, AutoTokenizer, AutoModelForCausalLM
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
from torch.nn.utils.rnn import pad_sequence



In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [3]:
file_name = "eclipse_bug_report_data.csv"

In [4]:
data = pd.read_csv(file_name)
data = data.dropna()
two_rows = data[['long_description', 'severity_category']].copy()
two_rows.head()
comments = list(two_rows.long_description.values)
labels = list(two_rows.severity_category.values)


In [61]:
print("Average length of each description")

tot = 0
count = 0
for each in two_rows.loc[two_rows.severity_category=='minor'].iterrows():
    tot += len(each[1].iloc[0])
    count += 1
print('Minor')
print(tot/count)

tot = 0
count = 0
for each in two_rows.loc[two_rows.severity_category=='major'].iterrows():
    tot += len(each[1].iloc[0])
    count += 1
print('Major')
print(tot/count)

tot = 0
count = 0
for each in two_rows.loc[two_rows.severity_category=='trivial'].iterrows():
    tot += len(each[1].iloc[0])
    count += 1
print('trivial')
print(tot/count)


tot = 0
count = 0
for each in two_rows.loc[two_rows.severity_category=='normal'].iterrows():
    tot += len(each[1].iloc[0])
    count += 1
print('normal')
print(tot/count)

tot = 0
count = 0
for each in two_rows.loc[two_rows.severity_category=='critical'].iterrows():
    tot += len(each[1].iloc[0])
    count += 1
print('critical')
print(tot/count)


tot = 0
count = 0
for each in two_rows.loc[two_rows.severity_category=='blocker'].iterrows():
    tot += len(each[1].iloc[0])
    count += 1
print('blocker')
print(tot/count)

Average length of each description
Minor
589.8232189973614
Major
1346.577731092437
trivial
346.4220779220779
normal
852.5413642960813
critical
1208.8635097493036
blocker
2172.299019607843


In [5]:
encoder = OneHotEncoder(sparse=False)
# transform data
labels1 = encoder.fit_transform(np.array(labels).reshape(-1,1))

In [6]:
print(labels1)

print(two_rows.severity_category.values)

[[0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 ...
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]]
['normal' 'normal' 'normal' ... 'normal' 'normal' 'normal']


In [27]:
print(encoder.categories_)

[array(['blocker', 'critical', 'major', 'minor', 'normal', 'trivial'],
      dtype='<U8')]


## GPT 2

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2ForSequenceClassification.from_pretrained("gpt2", problem_type="multi_label_classification", num_labels=6)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
model = model.to(device)

In [9]:
tokenized_comments_temp = []
labels_temp = []
for i, x in enumerate(comments):
    encoded = tokenizer(x, return_tensors='pt')['input_ids']
    tokenized_comments_temp.append(encoded)
    labels_temp.append(labels1[i])



Token indices sequence length is longer than the specified maximum sequence length for this model (4414 > 1024). Running this sequence through the model will result in indexing errors


In [10]:
t_temps =  []
for each in tokenized_comments_temp:
    if each.shape[1]> 1024:
        t_temps.append(each.narrow(1, 0, 1024))
    else:
        t_temps.append(each)

In [11]:
tokenized_comments = np.array(t_temps)
labels = np.array(labels_temp)

  tokenized_comments = np.array(t_temps)
  tokenized_comments = np.array(t_temps)


In [12]:
tokenized_comments_temp

[tensor([[ 464, 1708, 4519,  ...,  259, 3524,   13]]),
 tensor([[ 464, 1708, 4519,  ...,  259, 3524,   13]]),
 tensor([[20556,  2071,  3740,  1378, 37165,    13,    75,  7087,   323,    13,
            785,    14, 25367,   325,    14,    43,  3705,    12, 29143,  1485]]),
 tensor([[   49,   437,  1586,   257, 14393, 10962,   319,   257, 49355,   351,
          20241,   345,   481,  4003,   326,   262,  1364,   290,   262,  1353,
          11637,   389,   407,  7428,    13,   383,  1738,   329,   428,  2331,
            284,   307,   262,  1245,   340,   561,   423,  1626,   257, 49355,
          49925,   304,    13,    70,    13, 24846, 49925,   810,   262,  7652,
            561,   307, 11266,   416, 29175, 11637,   621,    13,   198,   198,
           6214,   262, 10041,  2792,   329,   262,  5114,    25,  2638,  1378,
           2503,    13,    68, 17043,    13,  2398,    14, 37141,    14,  9630,
             13, 10121,    14,    83,    14,  3365,  2816,  2079,    14]]),
 tensor([[2

In [13]:
tokenized_comments

array([tensor([[ 464, 1708, 4519,  ...,  198,  220,  220]]),
       tensor([[ 464, 1708, 4519,  ...,  296,   75,   13]]),
       tensor([[20556,  2071,  3740,  1378, 37165,    13,    75,  7087,   323,    13,
                  785,    14, 25367,   325,    14,    43,  3705,    12, 29143,  1485]]),
       ...,
       tensor([[40781,   635,  7160,   287,   513,    13,    15,    13,    20,    79,
                  198,   198, 45340,   770,  5434,   373,  7317,  2727,   355,   257,
                17271,   286, 15217,  1303,  1959, 14877,    18, 49954,   198,   198,
                21327,   262,  1708,    25,   198,   986,   198,   198,    32,  1678,
                  198,   198,  3237,    62, 18417,   198,   198,   986,   198,   198,
                  271, 14251,   355,    25,   198,   198,   986,   198, 28311,  1391,
                  198,   361,     7,  7942,     8,   198,  1782,  4929,   357, 12355,
                   13, 17204,    13, 16922,   304,     8, 23884,   220,   198, 11709,
   

In [14]:
train_x, test_x, train_y, test_y = train_test_split(tokenized_comments, labels, test_size=0.1, random_state=22)
test_y = torch.tensor(test_y)
train_y = torch.tensor(train_y)

In [15]:
test_y = test_y.to(device)
train_y = train_y.to(device)

In [16]:
test_y[0]

tensor([0., 0., 0., 0., 1., 0.], device='cuda:0', dtype=torch.float64)

In [17]:
# setting custom optimization parameters. You may implement a scheduler here as well.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [18]:
optimizer = torch.optim.AdamW(optimizer_grouped_parameters,lr=2e-5)

In [19]:
num_labels=6


In [20]:
# Store our loss and accuracy for plotting
train_loss_set = []
val_f1_accuracy = []
classification_reports = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 5

# trange is a tqdm wrapper around the normal python range
for _ in range(epochs):

  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0 #running loss
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, x in enumerate(train_x):
    # Clear out the gradients (by default they accumulate)
    x = x.to(device)
    optimizer.zero_grad()

#######################
    # set the pad token of the model's configuration
    model.config.pad_token_id = model.config.eos_token_id

#######################

    # Forward pass for multilabel classification
    outputs = model(x, token_type_ids=None)
    logits = outputs[0]
    loss_func = BCEWithLogitsLoss()
    loss = loss_func(logits.view(-1,num_labels),train_y[step].type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    # loss_func = BCELoss() 
    # loss = loss_func(torch.sigmoid(logits.view(-1,num_labels)),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    train_loss_set.append(loss.item())    

    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # scheduler.step()
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += train_y[step].size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

###############################################################################
  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Variables to gather full output
  logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

  # Predict
  for i, x in enumerate(test_x):
    x = x.to(device)
    with torch.no_grad():
      # Forward pass
      outs = model(x, token_type_ids=None)
      b_logit_pred = outs[0]
      pred_label = torch.sigmoid(b_logit_pred)

      b_logit_pred = b_logit_pred.cpu().numpy()
      pred_label = pred_label.cpu().numpy()
      b_labels = test_y[i].cpu().numpy()

    tokenized_texts.append(x)
    logit_preds.append(b_logit_pred)
    true_labels.append(test_y[i])
    pred_labels.append(pred_label)
  pred_labels_new = [item for sublist in pred_labels for item in sublist]
  max_idx = np.argmax(pred_labels_new, 1)
  pred_label = np.zeros_like(pred_labels_new)
  pred_label[np.arange(len(max_idx)), max_idx] = 1
  true_label = []
  for each in true_labels:
    true_label.append(each.cpu().numpy())
  val_f1_accuracy.append(f1_score(true_label,pred_label,average=None)*100)
  classification_reports.append(classification_report(true_label, pred_label, output_dict=True))



Train loss: 0.2319931305826892


  _warn_prf(average, modifier, msg_start, len(result))


Train loss: 0.21494072219860502


  _warn_prf(average, modifier, msg_start, len(result))


Train loss: 0.1775106600931077
Train loss: 0.11920294023694906
Train loss: 0.07480642704105647


In [21]:
val_f1_accuracy[-1]

array([14.81481481, 14.54545455, 10.85972851,  0.        , 80.98958333,
        0.        ])

In [22]:
classification_reports[-1]

{'0': {'precision': 0.2222222222222222,
  'recall': 0.1111111111111111,
  'f1-score': 0.14814814814814814,
  'support': 18},
 '1': {'precision': 0.25,
  'recall': 0.10256410256410256,
  'f1-score': 0.14545454545454548,
  'support': 39},
 '2': {'precision': 0.09523809523809523,
  'recall': 0.12631578947368421,
  'f1-score': 0.10859728506787329,
  'support': 95},
 '3': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 32},
 '4': {'precision': 0.8077922077922078,
  'recall': 0.8120104438642297,
  'f1-score': 0.8098958333333333,
  'support': 766},
 '5': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 13},
 'micro avg': {'precision': 0.6645898234683282,
  'recall': 0.6645898234683282,
  'f1-score': 0.6645898234683282,
  'support': 963},
 'macro avg': {'precision': 0.2292087542087542,
  'recall': 0.1920002411688546,
  'f1-score': 0.20201596866731672,
  'support': 963},
 'weighted avg': {'precision': 0.6662164592071134,
  'recall': 0.6645898234683282,
  'f1-score': 0

In [23]:
print(classification_reports[-1])

{'0': {'precision': 0.2222222222222222, 'recall': 0.1111111111111111, 'f1-score': 0.14814814814814814, 'support': 18}, '1': {'precision': 0.25, 'recall': 0.10256410256410256, 'f1-score': 0.14545454545454548, 'support': 39}, '2': {'precision': 0.09523809523809523, 'recall': 0.12631578947368421, 'f1-score': 0.10859728506787329, 'support': 95}, '3': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 32}, '4': {'precision': 0.8077922077922078, 'recall': 0.8120104438642297, 'f1-score': 0.8098958333333333, 'support': 766}, '5': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 13}, 'micro avg': {'precision': 0.6645898234683282, 'recall': 0.6645898234683282, 'f1-score': 0.6645898234683282, 'support': 963}, 'macro avg': {'precision': 0.2292087542087542, 'recall': 0.1920002411688546, 'f1-score': 0.20201596866731672, 'support': 963}, 'weighted avg': {'precision': 0.6662164592071134, 'recall': 0.6645898234683282, 'f1-score': 0.6635891426315422, 'support': 963}, 'samples avg

In [24]:
df = pd.DataFrame(classification_reports[-1]).transpose()

In [25]:
df.to_csv('gpt2_report.csv', index= True)

## Bert Model

In [62]:
data = pd.read_csv(file_name)
data = data.dropna()
two_rows = data[['long_description', 'severity_category']].copy()
two_rows.head()
comments = list(two_rows.long_description.values)
labels = list(two_rows.severity_category.values)
encoder = OneHotEncoder(sparse=False)
# transform data
labels = encoder.fit_transform(np.array(labels).reshape(-1,1))

In [63]:
model_name = 'bert-base-uncased'

In [64]:
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, problem_type="multi_label_classification", num_labels=6)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [65]:
model = model.to(device)

In [69]:
tokenized_comments_temp = []
labels_temp = []
for i, x in enumerate(comments):
    encoded = tokenizer(x, return_tensors='pt')['input_ids']
    tokenized_comments_temp.append(encoded)
    labels_temp.append(labels[i])

t_temps =  []
for each in tokenized_comments_temp:
    if each.shape[1]> 512:
        t_temps.append(each.narrow(1, 0, 512))
    else:
        t_temps.append(each)
tokenized_comments = np.array(t_temps)
labels = np.array(labels_temp)
train_x, test_x, train_y, test_y = train_test_split(tokenized_comments, labels, test_size=0.1, random_state=22)
test_y = torch.tensor(test_y)
train_y = torch.tensor(train_y)
test_y = test_y.to(device)
train_y = train_y.to(device)

  tokenized_comments = np.array(t_temps)
  tokenized_comments = np.array(t_temps)


In [70]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters,lr=2e-5)
num_labels=6

In [71]:
# Store our loss and accuracy for plotting
train_loss_set = []
val_f1_accuracy = []
classification_reports = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 1

# trange is a tqdm wrapper around the normal python range
for _ in range(epochs):

  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0 #running loss
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, x in enumerate(train_x):
    # Clear out the gradients (by default they accumulate)
    x = x.to(device)
    optimizer.zero_grad()

#######################
    # set the pad token of the model's configuration
    model.config.pad_token_id = model.config.eos_token_id

#######################

    # Forward pass for multilabel classification
    outputs = model(x, token_type_ids=None)
    logits = outputs[0]
    loss_func = BCEWithLogitsLoss()
    loss = loss_func(logits.view(-1,num_labels),train_y[step].type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    # loss_func = BCELoss() 
    # loss = loss_func(torch.sigmoid(logits.view(-1,num_labels)),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    train_loss_set.append(loss.item())    

    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # scheduler.step()
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += train_y[step].size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

###############################################################################
  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Variables to gather full output
  logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

  # Predict
  for i, x in enumerate(test_x):
    x = x.to(device)
    with torch.no_grad():
      # Forward pass
      outs = model(x, token_type_ids=None)
      b_logit_pred = outs[0]
      pred_label = torch.sigmoid(b_logit_pred)

      b_logit_pred = b_logit_pred.cpu().numpy()
      pred_label = pred_label.cpu().numpy()
      b_labels = test_y[i].cpu().numpy()

    tokenized_texts.append(x)
    logit_preds.append(b_logit_pred)
    true_labels.append(test_y[i])
    pred_labels.append(pred_label)
  pred_labels_new = [item for sublist in pred_labels for item in sublist]
  max_idx = np.argmax(pred_labels_new, 1)
  pred_label = np.zeros_like(pred_labels_new)
  pred_label[np.arange(len(max_idx)), max_idx] = 1
  true_label = []
  for each in true_labels:
    true_label.append(each.cpu().numpy())
  val_f1_accuracy.append(f1_score(true_label,pred_label,average=None)*100)
  classification_reports.append(classification_report(true_label, pred_label, output_dict=True))

Train loss: 0.22961652225411444


  _warn_prf(average, modifier, msg_start, len(result))


In [72]:
val_f1_accuracy[-1]

array([ 0.        ,  0.        ,  0.        ,  0.        , 88.60613071,
        0.        ])

In [73]:
classification_reports[-1]

{'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 18},
 '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 39},
 '2': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 95},
 '3': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 32},
 '4': {'precision': 0.7954309449636553,
  'recall': 1.0,
  'f1-score': 0.8860613071139387,
  'support': 766},
 '5': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 13},
 'micro avg': {'precision': 0.7954309449636553,
  'recall': 0.7954309449636553,
  'f1-score': 0.7954309449636553,
  'support': 963},
 'macro avg': {'precision': 0.13257182416060923,
  'recall': 0.16666666666666666,
  'f1-score': 0.14767688451898978,
  'support': 963},
 'weighted avg': {'precision': 0.6327103882057736,
  'recall': 0.7954309449636553,
  'f1-score': 0.7048005828133719,
  'support': 963},
 'samples avg': {'precision': 0.7954309449636553,
  'recall': 0.7954309449636553,
  'f1-score': 0.7954309449636553,
  's

In [74]:
df = pd.DataFrame(classification_reports[-1]).transpose()

In [88]:
for each in pred_labels:
    print(each[0])
    break

[0.02946742 0.06747253 0.1629689  0.03375706 0.6466203  0.01727089]


In [None]:
df.to_csv('bert_report.csv', index= True)

## GPT-j

In [None]:
data = pd.read_csv(file_name)
data = data.dropna()
two_rows = data[['long_description', 'severity_category']].copy()
two_rows.head()
comments = list(two_rows.long_description.values)
labels = list(two_rows.severity_category.values)
encoder = OneHotEncoder(sparse=False)
# transform data
labels = encoder.fit_transform(np.array(labels).reshape(-1,1))

In [None]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")

In [None]:
model = model.to(device)

In [None]:
tokenized_comments_temp = []
labels_temp = []
for i, x in enumerate(comments):
    encoded = tokenizer(x, return_tensors='pt')['input_ids']
    if encoded.shape[1] < 2047:
        tokenized_comments_temp.append(encoded)
        labels_temp.append(labels[i])
tokenized_comments = np.array(tokenized_comments_temp)
labels = np.array(labels_temp)
train_x, test_x, train_y, test_y = train_test_split(tokenized_comments_temp, labels, test_size=0.1, random_state=22)
test_y = torch.tensor(test_y)
train_y = torch.tensor(train_y)
test_y = test_y.to(device)
train_y = train_y.to(device)

In [None]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters,lr=2e-5)
num_labels=6

In [None]:
# Store our loss and accuracy for plotting
train_loss_set = []
val_f1_accuracy = []
classification_reports = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 1

# trange is a tqdm wrapper around the normal python range
for _ in range(epochs):

  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0 #running loss
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, x in enumerate(train_x):
    # Clear out the gradients (by default they accumulate)
    x = x.to(device)
    optimizer.zero_grad()

#######################
    # set the pad token of the model's configuration
    model.config.pad_token_id = model.config.eos_token_id

#######################

    # Forward pass for multilabel classification
    outputs = model(x, token_type_ids=None)
    logits = outputs[0]
    loss_func = BCEWithLogitsLoss()
    loss = loss_func(logits.view(-1,num_labels),train_y[step].type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    # loss_func = BCELoss() 
    # loss = loss_func(torch.sigmoid(logits.view(-1,num_labels)),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    train_loss_set.append(loss.item())    

    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # scheduler.step()
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += train_y[step].size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

###############################################################################
  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Variables to gather full output
  logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

  # Predict
  for i, x in enumerate(test_x):
    x = x.to(device)
    with torch.no_grad():
      # Forward pass
      outs = model(x, token_type_ids=None)
      b_logit_pred = outs[0]
      pred_label = torch.sigmoid(b_logit_pred)

      b_logit_pred = b_logit_pred.cpu().numpy()
      pred_label = pred_label.cpu().numpy()
      b_labels = test_y[i].cpu().numpy()

    tokenized_texts.append(x)
    logit_preds.append(b_logit_pred)
    true_labels.append(test_y[i])
    pred_labels.append(pred_label)
  pred_labels_new = [item for sublist in pred_labels for item in sublist]
  max_idx = np.argmax(pred_labels_new, 1)
  pred_label = np.zeros_like(pred_labels_new)
  pred_label[np.arange(len(max_idx)), max_idx] = 1
  true_label = []
  for each in true_labels:
    true_label.append(each.cpu().numpy())
  val_f1_accuracy.append(f1_score(true_label,pred_label,average=None)*100)
  classification_reports.append(classification_report(true_label, pred_label, output_dict=True))

In [None]:
df = pd.DataFrame(classification_reports[-1]).transpose()
df.to_csv('gptj_report.csv', index= True)