In [None]:
!pip install transformers
!pip install torch

In [2]:
import torch
from torch import nn
from transformers import BertModel, BertTokenizer
from torch.utils.data import Dataset

import pandas as pd

In [37]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [3]:
# Load the labelled training data; the 'text' and 'label' columns are the ones
# consumed by CustomDataset further down.
df = pd.read_csv('train.csv')

In [48]:
df.head()

Unnamed: 0,text,label
0,rCriticalSection & rSemaphore removed ...,1
1,[ALSA] Improve SPDIF playback via the P16V...,0
2,KVM: Change the emulator_{read write cmpxc...,0
3,Bump test projects up to .NET 4.5.2 - ...,0
4,Update proto name to fix Windows portabili...,1


In [49]:
# Hold out a random 10% of the rows as the validation frame.
# NOTE(review): no random_state is passed, so the split differs between runs.
df_v = df.sample(frac=0.1)

In [50]:
# Training frame: every row whose index label was not sampled into df_v,
# renumbered from 0 (the old index is discarded).
df_t = df.drop(df_v.index).reset_index(drop=True)

In [51]:
# Renumber the validation rows from 0 as well (the old index is discarded).
# This must run after df_t is built, because df_t is derived from df_v's original index.
df_v = df_v.reset_index(drop=True)

In [6]:
# Tokenizer for the same 'bert-base-uncased' checkpoint that BertClassifier loads.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
class BertClassifier(nn.Module):
  """Binary classifier: a pre-trained BERT encoder followed by a one-unit sigmoid head.

  The submodule attribute names (``bert``, ``linear``, ``sigmoid``) determine
  the keys of ``state_dict()``, which the federated-averaging step relies on.
  """

  def __init__(self):
    super().__init__()
    self.bert = BertModel.from_pretrained('bert-base-uncased')
    # The head maps the 768-wide pooled encoder output to a single logit.
    self.linear = nn.Linear(768, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, input_id, mask):
    """Return one probability per input row.

    Args:
      input_id: token ids, forwarded to the encoder as ``input_ids``.
      mask: attention mask, forwarded to the encoder as ``attention_mask``.

    Returns:
      The sigmoid of the linear head applied to the encoder's pooled output.
    """
    # With return_dict=False the encoder returns a tuple; only its second
    # element (the pooled output) is used.
    _sequence_output, pooled = self.bert(
        input_ids=input_id,
        attention_mask=mask,
        return_dict=False,
    )
    logits = self.linear(pooled)
    return self.sigmoid(logits)

In [5]:
class CustomDataset(Dataset):
  """Torch dataset that tokenizes the ``text`` column and pairs it with ``label``.

  Args:
    df: DataFrame with a ``text`` column and a ``label`` column.
    tokenizer: object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
    max_len: fixed token length; every text is padded or truncated to it.
  """

  def __init__(self, df, tokenizer, max_len):
    self.df = df
    self.titles = df['text']
    self.labels = df['label']
    self.max_len = max_len
    self.tokenizer = tokenizer

  def __getitem__(self, index):
    """Return the tokenized sample at integer position ``index``.

    Returns:
      dict with 1-D tensors ``input_ids``, ``attention_mask`` and
      ``token_type_ids`` (each of length ``max_len``) and a float tensor
      ``targets`` of shape ``(1,)`` holding the label.
    """
    # Positional lookup (.iloc): ``index`` is a position in 0..len-1, which
    # only coincides with the index *labels* when the frame has a fresh
    # 0..n-1 index. A plain ``series[index]`` breaks on sampled or filtered
    # frames whose index was not reset.
    title = self.titles.iloc[index]
    label = self.labels.iloc[index]

    inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

    # The tokenizer is asked for 'pt' tensors; flatten() drops their leading
    # batch dimension so the DataLoader can stack samples itself.
    return {
          'input_ids': inputs['input_ids'].flatten(),
          'attention_mask': inputs['attention_mask'].flatten(),
          'token_type_ids': inputs["token_type_ids"].flatten(),
          'targets': torch.tensor([label], dtype=torch.float)
    }

  def __len__(self):
    """Number of samples (rows of the ``text`` column)."""
    return len(self.titles)


In [52]:
# Wrap the train/validation frames as datasets. 56 is max_len: the fixed token
# length every text is padded or truncated to by CustomDataset.
train_dataset = CustomDataset(df_t, tokenizer, 56)
validation_dataset = CustomDataset(df_v, tokenizer, 56)

In [None]:
train_dataset[1], validation_dataset[0]

({'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1]),
  'input_ids': tensor([  101, 24888,  2213,  1024,  2689,  1996,  7861, 20350,  1035,  1063,
           3191,  4339,  4642,  2361,  2595,  2818,  2290,  1065,  1035,  1008,
           4972,  2000,  2202,  1037, 18315, 14289,  1012,  1012,  1012,  2612,
           1997,  1037,  1060, 20842,  1035,  7861,  9869,  1035, 14931, 18413,
           2061,  2008,  2060, 20587,  2015,  2064,  2224,  2009,  4089,  1012,
           2772,  1011,  2125,  1011,  2011,   102]),
  'targets': tensor([0.]),
  'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0])},
 {'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [53]:
# Batch the datasets in groups of 8. Both loaders reshuffle on every pass and
# load in the main process (num_workers=0).
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, 
    batch_size=8,
    shuffle=True,
    num_workers=0
)

validation_dataset_loader = torch.utils.data.DataLoader(
    validation_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=0
)

In [54]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [55]:
# Centralised baseline: one classifier trained on the whole training frame.
model = BertClassifier()
model.to(device)

def loss_fn(outputs, targets):
    """Binary cross-entropy between predicted probabilities and targets (torch.nn.BCELoss)."""
    return torch.nn.BCELoss()(outputs, targets)

# Adam over every parameter of the model (encoder and head), learning rate 2e-5.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=2e-5)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [56]:
# Train the baseline model for 3 epochs, reporting the mean batch loss on the
# training and validation loaders after each epoch.
for epoch in range(0, 3):

  # Running means of the per-batch loss, restarted every epoch.
  train_loss = 0
  valid_loss = 0

  model.train()
  for idx, data in enumerate(train_data_loader):

    ids = data['input_ids'].to(device, dtype = torch.long)
    mask = data['attention_mask'].to(device, dtype = torch.long)
    targets = data['targets'].to(device, dtype = torch.float)

    outputs = model(ids, mask)
    loss = loss_fn(outputs, targets)

    # Gradients are cleared right after the step, so each backward() starts clean.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    # Incremental mean: mean_k = mean_{k-1} + (x_k - mean_{k-1}) / k, with k = idx + 1.
    train_loss = train_loss + ((1 / (idx + 1)) * (loss.item() - train_loss))
    # print('after loss data in training', loss.item(), train_loss)

  # Validation pass: eval mode, no gradient tracking, no optimizer updates.
  model.eval()
  with torch.no_grad():
    for idx, data in enumerate(validation_dataset_loader, 0):
      ids = data['input_ids'].to(device, dtype = torch.long)
      mask = data['attention_mask'].to(device, dtype = torch.long)
      targets = data['targets'].to(device, dtype = torch.float)

      outputs = model(ids, mask)
      loss = loss_fn(outputs, targets)
      valid_loss =  valid_loss + ((1 / (idx + 1)) * (loss.item() - valid_loss))
  
  print(f"Training loss: {train_loss}")
  print(f"Validation loss: {valid_loss}")

Training loss: 0.44293344861746403
Validation loss: 0.3765771150912927
Training loss: 0.2559768502211685
Validation loss: 0.3589815293317256
Training loss: 0.15793501347014052
Validation loss: 0.2695861232021581


In [None]:
# Load the held-out test data and show its (rows, columns) shape.
test_df = pd.read_csv('test.csv')
test_df.shape

(324, 2)

In [57]:
# Same tokenizer and max_len (56) as the training datasets.
test_data = CustomDataset(test_df, tokenizer, 56)

In [None]:
test_data[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids': tensor([  101, 11892,  2007,  2047,  2690,  8059,  5587,  2005, 13679, 15985,
          1019,  1012,  1018,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]),
 'targets': tensor([0.]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0])}

In [None]:
test_data[0]['input_ids']

tensor([  101, 11892,  2007,  2047,  2690,  8059,  5587,  2005, 13679, 15985,
         1019,  1012,  1018,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0])

In [58]:
# Test loader that yields one sample per batch, in shuffled order.
test_data_loader = torch.utils.data.DataLoader(
    test_data, 
    batch_size=1,
    shuffle=True,
    num_workers=0
)
# model.eval() is called in the evaluation cell below instead.

In [59]:
# Collect targets and predicted probabilities of the baseline model.
# Despite the "val_" prefix, these lists are filled from test_data_loader.
val_targets=[]
val_outputs=[]
model.eval()
with torch.no_grad():
  for idx, data in enumerate(test_data_loader, 0):
    ids = data['input_ids'].to(device, dtype = torch.long)
    mask = data['attention_mask'].to(device, dtype = torch.long)
    targets = data['targets'].to(device, dtype = torch.float)

    outputs = model(ids, mask)
    # Move results to the CPU and append them as plain nested Python lists.
    val_targets.extend(targets.cpu().detach().numpy().tolist())
    val_outputs.extend(outputs.cpu().detach().numpy().tolist())

In [None]:
val_targets, val_outputs

In [61]:
# Threshold the predicted probabilities at 0.5 to obtain hard 0/1 labels.
predictions = (np.array(val_outputs) > 0.5).astype(int)

In [62]:
confusion_matrix(y_true=val_targets, y_pred=predictions)

array([[206,  11],
       [ 21,  86]])

In [63]:
print(classification_report(val_targets, predictions))

              precision    recall  f1-score   support

         0.0       0.91      0.95      0.93       217
         1.0       0.89      0.80      0.84       107

    accuracy                           0.90       324
   macro avg       0.90      0.88      0.89       324
weighted avg       0.90      0.90      0.90       324



## Make local datasets and models

In [64]:
# Shuffle all rows (frac=1) before carving out the local shards.
# NOTE(review): no random_state is passed, so the shards differ between runs.
df_shuffled = df.sample(frac=1)

In [65]:
# Three disjoint local shards of unequal size: the first 1000 shuffled rows,
# the next 500, and whatever remains. Each shard is renumbered from 0.
df_l1 = df_shuffled[:1000].reset_index(drop=True)
df_l2 = df_shuffled[1000:1500].reset_index(drop=True)
df_l3 = df_shuffled[1500:].reset_index(drop=True)

In [66]:
df_l1.shape, df_l2.shape, df_l3.shape

((1000, 2), (500, 2), (331, 2))

### Local model training and testing, loss and optimizer functions

In [67]:
# Generic training routine shared by all local models.
def _batch_loss(model, criterion, batch, device):
  """Move one batch to ``device``, run ``model`` on it and return the loss tensor."""
  ids = batch['input_ids'].to(device, dtype=torch.long)
  mask = batch['attention_mask'].to(device, dtype=torch.long)
  targets = batch['targets'].to(device, dtype=torch.float)
  return criterion(model(ids, mask), targets)


def TrainLocalModel(no_of_epochs,
                    train_data_loader,
                    model,
                    loss,
                    optimizer,
                    valid_data_loader,
                    device):
  """Train ``model`` in place and print the mean train/validation loss per epoch.

  Args:
    no_of_epochs: number of passes over ``train_data_loader``.
    train_data_loader: iterable of batches; each batch is a mapping with
      ``input_ids``, ``attention_mask`` and ``targets`` tensors.
    model: module called as ``model(input_ids, attention_mask)``.
    loss: callable ``loss(outputs, targets)`` returning a scalar tensor.
      This argument is the criterion that is actually used; the function no
      longer falls back to a notebook-global ``loss_fn``.
    optimizer: optimizer built over ``model``'s parameters.
    valid_data_loader: iterable of batches used for the validation pass.
    device: device the batch tensors are moved to; ``model`` must already be
      on it.

  Returns:
    The same ``model`` object, after training.
  """
  for _ in range(no_of_epochs):

    # Running means of the per-batch loss, restarted every epoch.
    train_loss = 0
    valid_loss = 0

    model.train()
    for idx, data in enumerate(train_data_loader):
      # Clear gradients before the backward pass so that stale gradients left
      # on the parameters by earlier code cannot leak into the first step.
      optimizer.zero_grad()
      batch_loss = _batch_loss(model, loss, data, device)
      batch_loss.backward()
      optimizer.step()
      # Incremental mean: mean_k = mean_{k-1} + (x_k - mean_{k-1}) / k.
      train_loss = train_loss + ((1 / (idx + 1)) * (batch_loss.item() - train_loss))

    # Validation pass: eval mode, no gradient tracking, no optimizer updates.
    model.eval()
    with torch.no_grad():
      for idx, data in enumerate(valid_data_loader, 0):
        batch_loss = _batch_loss(model, loss, data, device)
        valid_loss = valid_loss + ((1 / (idx + 1)) * (batch_loss.item() - valid_loss))

    print(f"Training loss: {train_loss}")
    print(f"Validation loss: {valid_loss}")
  return model

In [68]:
def TestLocalModel(test_data_loader, model, device=None):
  """Run ``model`` over ``test_data_loader`` and collect targets and outputs.

  Args:
    test_data_loader: iterable of batches; each batch is a mapping with
      ``input_ids``, ``attention_mask`` and ``targets`` tensors.
    model: module called as ``model(input_ids, attention_mask)``.
    device: device the batch tensors are moved to. When omitted, the device
      of the model's first parameter is used (CPU if the model has no
      parameters), so the function does not depend on a notebook-global
      ``device`` variable.

  Returns:
    ``(targets, outputs)``: two nested Python lists with one entry per sample,
    in the order the loader produced them.
  """
  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

  val_targets = []
  val_outputs = []
  model.eval()
  with torch.no_grad():
    for data in test_data_loader:
      ids = data['input_ids'].to(device, dtype=torch.long)
      mask = data['attention_mask'].to(device, dtype=torch.long)
      targets = data['targets'].to(device, dtype=torch.float)

      outputs = model(ids, mask)
      # Move results to the CPU and append them as plain nested Python lists.
      val_targets.extend(targets.cpu().tolist())
      val_outputs.extend(outputs.cpu().tolist())
  return val_targets, val_outputs

In [69]:
def loss_fn(outputs, targets):
    """Binary cross-entropy between predicted probabilities and targets.

    Both arguments are passed unchanged to a freshly built ``torch.nn.BCELoss``
    with its default settings.
    """
    criterion = torch.nn.BCELoss()
    return criterion(outputs, targets)

# optimizer = torch.optim.Adam(params =  local_model_1.parameters(), lr=2e-5)

### Train local model 1

In [70]:
# Split shard 1 into 90% train / 10% validation (unseeded random sample).
# The validation frame is built from the sampled index labels before the
# training frame's index is reset in place.
train_df_l1 = df_l1.sample(frac=0.9)
valid_df_l1 = df_l1.drop(train_df_l1.index).reset_index(drop=True)
train_df_l1.reset_index(drop=True, inplace=True)

In [71]:
# Datasets for shard 1 (the loaders are built in a later cell); max_len is 56.
train_data_l1 = CustomDataset(train_df_l1, tokenizer, 56)
valid_data_l1 = CustomDataset(valid_df_l1, tokenizer, 56)

In [None]:
train_data_l1[0], valid_data_l1[0]

({'attention_mask': tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0]),
  'input_ids': tensor([ 101, 5604, 2047, 8778, 2544,  102,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0]),
  'targets': tensor([0.]),
  'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0])},
 {'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0

In [72]:
valid_df_l1.shape, train_df_l1.shape

((100, 2), (900, 2))

In [73]:
# Shard 1 loaders: batches of 8, reshuffled on every pass, loaded in the main process.
train_data_loader_l1 = torch.utils.data.DataLoader(
    train_data_l1, 
    batch_size=8,
    shuffle=True,
    num_workers=0
)

valid_data_loader_l1 = torch.utils.data.DataLoader(
    valid_data_l1, 
    batch_size=8,
    shuffle=True,
    num_workers=0
)

In [74]:
# Build local model 1 and its own Adam optimizer (lr 2e-5).
# Note: the global name `optimizer` is rebound here and again for each later model.
local_model_1 = BertClassifier()
local_model_1.to(device)
optimizer = torch.optim.Adam(params = local_model_1.parameters(), lr=2e-5)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [75]:
# Fine-tune local model 1 for 3 epochs on shard 1. TrainLocalModel returns the
# model object it was given, so trained_model_l1 and local_model_1 are the same object.
trained_model_l1 = TrainLocalModel(
    no_of_epochs=3,
    train_data_loader=train_data_loader_l1,
    model=local_model_1,
    loss=loss_fn,
    optimizer= optimizer,
    valid_data_loader=valid_data_loader_l1,
    device=device
)

Training loss: 0.426633126308433
Validation loss: 0.3387983854000385
Training loss: 0.2508876922054101
Validation loss: 0.2901247441768646
Training loss: 0.14970368600194958
Validation loss: 0.27060740613020384


### Train local model 2

In [76]:
# Split shard 2 into 90% train / 10% validation (unseeded random sample).
# The validation frame is built from the sampled index labels before the
# training frame's index is reset in place.
train_df_l2 = df_l2.sample(frac=0.9)
valid_df_l2 = df_l2.drop(train_df_l2.index).reset_index(drop=True)
train_df_l2.reset_index(drop=True, inplace=True)

In [77]:
# Datasets for shard 2 (the loaders are built in a later cell); max_len is 56.
train_data_l2 = CustomDataset(train_df_l2, tokenizer, 56)
valid_data_l2 = CustomDataset(valid_df_l2, tokenizer, 56)

In [None]:
train_data_l2[0], valid_data_l2[0], valid_df_l2.shape, train_df_l2.shape

({'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1]),
  'input_ids': tensor([  101,  1044,  2860,  8202,  1024,  1056, 13668, 12740, 14142,  1024,
           1044,  2860,  8202,  4062,  2005,  1056, 13668, 12740, 14142,  5506,
           2278,  2023,  4062, 14451,  2015,  1996, 25353, 22747,  2015, 14164,
           1997,  1996,  1056, 13668, 12740, 14142,  5506,  2278, 11336,  1012,
           2035,  1996, 10004,  3149,  5300,  2024,  5228,  1999,  3408,  1997,
          19842,  1012,  3149,  2410,  1998,   102]),
  'targets': tensor([0.]),
  'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0])},
 {'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [78]:
# Shard 2 loaders: batches of 8, reshuffled on every pass, loaded in the main process.
train_data_loader_l2 = torch.utils.data.DataLoader(
    train_data_l2, 
    batch_size=8,
    shuffle=True,
    num_workers=0
)

valid_data_loader_l2 = torch.utils.data.DataLoader(
    valid_data_l2, 
    batch_size=8,
    shuffle=True,
    num_workers=0
)

In [79]:
# Build local model 2 and its own Adam optimizer (lr 2e-5); rebinds the global `optimizer`.
local_model_2 = BertClassifier()
local_model_2.to(device)
optimizer = torch.optim.Adam(params =  local_model_2.parameters(), lr=2e-5)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [80]:
# Fine-tune local model 2 for 3 epochs on shard 2. TrainLocalModel returns the
# model object it was given, so trained_model_l2 and local_model_2 are the same object.
trained_model_l2 = TrainLocalModel(
    no_of_epochs=3,
    train_data_loader=train_data_loader_l2,
    model=local_model_2,
    loss=loss_fn,
    optimizer=optimizer,
    valid_data_loader=valid_data_loader_l2,
    device=device
)

Training loss: 0.5992674053760995
Validation loss: 0.5162456333637238
Training loss: 0.3523262322210429
Validation loss: 0.40603523701429367
Training loss: 0.2356419089462673
Validation loss: 0.6843351583395686


In [None]:
# split train and validation dataframes 
# make datasets and dataloaders 
# train models

### Train local model 3

In [81]:
# Split shard 3 into 90% train / 10% validation (unseeded random sample).
# The validation frame is built from the sampled index labels before the
# training frame's index is reset in place.
train_df_l3 = df_l3.sample(frac=0.9)
valid_df_l3 = df_l3.drop(train_df_l3.index).reset_index(drop=True)
train_df_l3.reset_index(drop=True, inplace=True)

In [82]:
# Datasets for shard 3 (the loaders are built in a later cell); max_len is 56.
train_data_l3 = CustomDataset(train_df_l3, tokenizer, 56)
valid_data_l3 = CustomDataset(valid_df_l3, tokenizer, 56)

In [None]:
train_data_l3[0], valid_data_l3[0], valid_df_l3.shape, train_df_l3.shape

({'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1]),
  'input_ids': tensor([  101,  7172,  8316,  2361,  2358,  2850,  8197,  2000,  7919, 11487,
           3645, 13005, 21025,  2102,  1011, 17917,  2078,  1011,  8909,  1024,
           5371,  1024,  1013,  1013,  1013,  2188,  1013, 17917,  2078,  1013,
           7705,  2509,  1013,  8260,  1030, 17267,  2509,  1018,  2094, 23632,
           2575,  2546, 19841,  1011,  1019,  2546, 16048,  1011,  5840, 10790,
           1011,  1038, 22275,  2692,  1011,   102]),
  'targets': tensor([0.]),
  'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0])},
 {'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [83]:
# Shard 3 loaders: batches of 8, reshuffled on every pass, loaded in the main process.
train_data_loader_l3 = torch.utils.data.DataLoader(
    train_data_l3, 
    batch_size=8,
    shuffle=True,
    num_workers=0
)

valid_data_loader_l3 = torch.utils.data.DataLoader(
    valid_data_l3, 
    batch_size=8,
    shuffle=True,
    num_workers=0
)

In [84]:
# Build local model 3 and its own Adam optimizer (lr 2e-5); rebinds the global `optimizer`.
local_model_3 = BertClassifier()
local_model_3.to(device)
optimizer = torch.optim.Adam(params =  local_model_3.parameters(), lr=2e-5)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [85]:
# Fine-tune local model 3 for 3 epochs on shard 3. TrainLocalModel returns the
# model object it was given, so trained_model_l3 and local_model_3 are the same object.
trained_model_l3 = TrainLocalModel(
    no_of_epochs=3,
    train_data_loader=train_data_loader_l3,
    model=local_model_3,
    loss=loss_fn,
    optimizer=optimizer,
    valid_data_loader=valid_data_loader_l3,
    device=device
)

Training loss: 0.609117082467205
Validation loss: 0.6023594260215759
Training loss: 0.39777719072605433
Validation loss: 0.26798893213272096
Training loss: 0.24851545564046032
Validation loss: 0.3860676646232605


In [None]:
# split train and validation dataframes 
# make datasets and dataloaders 
# train models

### Federated Averaging

In [86]:
# Fed Averaging Function
def FedAveraging(members, weights, central_model):
  """Load the weighted average of the members' parameters into ``central_model``.

  Args:
    members: sequence of models whose ``state_dict()`` contains every key of
      ``central_model.state_dict()``.
    weights: one non-negative number per member. They are normalised by their
      sum, so they do not have to add up to 1.
    central_model: model that receives the averaged state (modified in place).

  Returns:
    ``central_model``, after ``load_state_dict`` with the averaged state.

  Raises:
    ValueError: if ``members`` is empty or its length differs from ``weights``.
    ZeroDivisionError: if the weights sum to zero.
  """
  if len(members) == 0:
    raise ValueError("FedAveraging needs at least one member model")
  if len(members) != len(weights):
    raise ValueError("members and weights must have the same length")
  total_weight = float(sum(weights))
  if total_weight == 0.0:
    raise ZeroDivisionError("weights sum to zero, cannot be normalized")
  coefficients = [float(w) / total_weight for w in weights]

  member_states = [member.state_dict() for member in members]
  averaged_state = {}
  with torch.no_grad():
    for key, reference in central_model.state_dict().items():
      if not reference.is_floating_point():
        # Integer/bool buffers (index tables, counters) are not meaningful to
        # average, and a float round-trip could truncate them; take the first
        # member's value unchanged.
        averaged_state[key] = member_states[0][key].detach().clone()
        continue
      # Accumulate on the CPU in float64, then cast back to the parameter's
      # dtype. Members may live on any device.
      accumulator = torch.zeros(reference.shape, dtype=torch.float64)
      for coefficient, state in zip(coefficients, member_states):
        contribution = state[key].detach().to(device='cpu', dtype=torch.float64)
        accumulator.add_(contribution, alpha=coefficient)
      averaged_state[key] = accumulator.to(reference.dtype)

  central_model.load_state_dict(averaged_state)
  return central_model


In [87]:
# Per-member averaging weights (a plain list, one entry per local model):
# each shard's share of the rows in the full training frame `df`.
weights_array = [df_l1.shape[0]/df.shape[0], 
                 df_l2.shape[0]/df.shape[0],
                 df_l3.shape[0]/df.shape[0]]

print(weights_array)

[0.5461496450027308, 0.2730748225013654, 0.1807755324959039]


In [88]:
# Prepare the list of member models, moving each one back to the CPU.
# .cpu() moves the module in place and returns it, so the trained_model_l*
# names now refer to CPU models as well.
members = [trained_model_l1.cpu(), 
           trained_model_l2.cpu(), 
           trained_model_l3.cpu()]

In [89]:
# Run federated averaging. BaseModel is a fresh classifier that receives the
# averaged state; FedAveraging returns that same object, so fedAvgModel and
# BaseModel are the same model.
BaseModel = BertClassifier()

fedAvgModel = FedAveraging(members=members,
                           weights=weights_array,
                           central_model=BaseModel)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  a = np.asanyarray(a)
  a = np.asanyarray(a)
  if sys.path[0] == '':


In [90]:
# Evaluate the federated-averaged model on the test set.
# The test frame, dataset and loader are rebuilt here (this time with batches
# of 8), rebinding the names used for the baseline evaluation above.

test_df = pd.read_csv('test.csv')
test_data = CustomDataset(test_df, tokenizer, 56)
test_data_loader = torch.utils.data.DataLoader(
    test_data, 
    batch_size=8,
    shuffle=True,
    num_workers=0
)

In [None]:
# Move the averaged model to the evaluation device (it was assembled from CPU models).
fedAvgModel.to(device)

In [92]:
# Collect test targets and predicted probabilities of the averaged model.
targets, outputs = TestLocalModel(test_data_loader, fedAvgModel)

In [93]:
# Threshold the predicted probabilities at 0.5 to obtain hard 0/1 labels.
predictions = (np.array(outputs) > 0.5).astype(int)

In [94]:
confusion_matrix(y_true=targets, y_pred=predictions)

array([[197,  20],
       [ 17,  90]])

In [95]:
print(classification_report(y_true=targets, y_pred=predictions))

              precision    recall  f1-score   support

         0.0       0.92      0.91      0.91       217
         1.0       0.82      0.84      0.83       107

    accuracy                           0.89       324
   macro avg       0.87      0.87      0.87       324
weighted avg       0.89      0.89      0.89       324

