In [None]:
%%capture
#!git clone https://github.com/vibhor98/GraphNLI.git
#!pip install networkx==1.9
!git clone https://github.com/UKPLab/sentence-transformers.git
!pip install sentence-transformers/

All of the code is based on [the authors' original repository](https://github.com/vibhor98/GraphNLI.git) on GitHub. However, there are a number of changes compared to the original version:
- in data preparation, the graph walk length is changed to the length suggested in the paper
- to ensure alignment of training and testing data in all three models, the data preparation for each model is combined together
- when creating the baseline model, the id is also stored so we can easily reference the sentences
- adding a random seed to ensure reproducibility
- We changed the training batch size of graph walk model from 16 to 12, since we don't have the resources.
- sentence_transformer.py from the [SentenceTransformer](https://github.com/UKPLab/sentence-transformers.git) package has been modified to allow saving the classifier that obtains the final (best) score.


The original code doesn't distinguish between a validation set and a test set, so we will also keep using validation set for the final accuracy score.

In [None]:
import csv
import logging
import math
import os
import pickle as pkl
import random
from typing import Union, Tuple, List, Iterable, Dict, Callable

import networkx as nx
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers import LoggingHandler, util, models, losses
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEBinaryAccuracyEvaluator
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator
from sentence_transformers.evaluation import LabelAccuracyEvaluator
from sentence_transformers.readers import InputExample
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from torch import nn, Tensor
from torch.utils.data import DataLoader
from transformers import AutoModel

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# please modify this if you intend to run the code
# Both values are joined to file names with plain string concatenation
# (e.g. `dataset_path + file`), so each must end with a trailing '/'.
dataset_path = 'drive/MyDrive/graphNLI/serializedGraphs/'  # directory holding the pickled graphs
file_saving_path = 'drive/MyDrive/graphNLI/outputs_new/'  # directory for generated CSVs and models

## Preparing Dataset


### Graph Walk functions from gen_dataset_graph_walks.py:
https://github.com/vibhor98/GraphNLI/blob/main/GraphNLI/gen_dataset_graph_walks.py

In [None]:
# biased random walk and weighted graph walk data preparation function

def biased_random_walk(sentences, data, node_id, child_edges, walk_len):
    """Fill `sentences` with the texts met on a biased random walk from `node_id`.

    At every step the candidate next nodes are the first target of the
    current node's out-edges (weight 0.75) and all nodes listed for it in
    `child_edges` (sharing a weight of 0.25 equally). `random.choices`
    treats the weights as relative, so when only one kind of neighbour
    exists it is chosen from that kind alone. The walk stops early when
    the current node has no candidates.

    :param sentences: Pre-sized list (at least `walk_len + 1` slots); it is
        modified in place and also returned. Slot 0 receives the start
        node's text, slot i the text of the node reached at step i.
    :param data: Graph object exposing `data.node[id]` (attribute dict with
        'text' and 'relation') and `data.edge[id]` (mapping of out-edge
        targets) -- presumably a networkx 1.x graph; confirm.
    :param node_id: Id of the start node.
    :param child_edges: Mapping from a node id to the ids of nodes whose
        first out-edge points at it.
    :param walk_len: Maximum number of steps to take.
    :return: `(sentences, label)`; label is 1 when the start node's
        'relation' is 1, 0 when it is -1, and -1 otherwise.
    """
    sentences[0] = data.node[node_id]['text']

    relation = data.node[node_id]['relation']
    if relation == 1:
        label = 1
    elif relation == -1:
        label = 0
    else:
        label = -1

    for i in range(1, walk_len + 1):
        choices = []
        probs = []

        edge = data.edge[node_id]
        if len(edge) > 0:
            # Only the first out-edge target is considered as the parent.
            choices.append(next(iter(edge)))
            probs.append(0.75)

        children = child_edges.get(node_id)
        if children:  # skip missing *and* empty child lists (avoids 0.25 / 0)
            choices.extend(children)
            probs.extend([0.25 / len(children)] * len(children))

        if not choices:
            break

        node_id = random.choices(choices, probs)[0]
        sentences[i] = data.node[node_id]['text']
    return sentences, label


# Weighted Root-seeking Graph Walk
def weighted_graph_walk(sentences, data, node_id, walk_len):
    """Collect the texts of `node_id` and of up to `walk_len` ancestors.

    Starting at `node_id`, the walk repeatedly follows the first out-edge
    of the current node and stores the reached node's text in the next
    slot of `sentences`. It ends after `walk_len` steps or as soon as a
    node without out-edges is reached.

    :param sentences: Pre-sized list, modified in place and returned.
    :param data: Graph object exposing `data.node[id]['text']` and
        `data.edge[id]` (mapping target id -> attribute dict with 'weight').
    :param node_id: Id of the start node.
    :param walk_len: Maximum number of steps.
    :return: `(sentences, label)`; label comes from the weight of the very
        first edge: 1 for weight 1, 0 for weight -1, otherwise -1 (also -1
        when the start node has no out-edge).
    """
    sentences[0] = data.node[node_id]['text']
    label = -1
    outgoing = data.edge[node_id]

    step = 1
    while step <= walk_len and len(outgoing.keys()) >= 1:
        ancestor_id = next(iter(outgoing.keys()))
        sentences[step] = data.node[ancestor_id]['text']

        # The label describes the start node's relation to its direct parent.
        if step == 1:
            first_weight = outgoing[ancestor_id]['weight']
            if first_weight == 1:
                label = 1
            elif first_weight == -1:
                label = 0

        outgoing = data.edge[ancestor_id]
        step += 1

    return sentences, label

### preparing all the datasets for three models

In [None]:
# Smoke test: load every serialized graph and touch the fields of its first
# node that the dataset preparation relies on. Nothing is collected here.

# List the graph files in this cell so it also runs on a fresh kernel.
files = os.listdir(dataset_path)

for file in files:
    # NOTE: pickle can execute arbitrary code while loading; only point
    # dataset_path at graph files you trust.
    with open(dataset_path + file, 'rb') as graph_file:
        data = pkl.load(graph_file)
    child_edges = {}
    for node_id in data.node.keys():

        # baseline data
        sentence2 = data.node[node_id]['text']
        edge = data.edge[node_id]

        if len(edge.keys()) >= 1:
            parent_node_id = list(edge.keys())[0]
            if edge[parent_node_id]['weight'] in (1, -1):
                sentence1 = data.node[parent_node_id]['text']
        # Only the first node of each graph is inspected.
        break

In [None]:
# initialize: list the graph files and start from empty sample/label lists

files = os.listdir(dataset_path)
dataset_samples_baseline, dataset_samples_biased, dataset_samples_weighted = [], [], []
labels_baseline, labels_biased, labels_weighted = [], [], []

In [None]:
# loop over files

# biased_random_walk draws neighbours with random.choices. Seed Python's RNG
# (same value as the NumPy seed used for the shuffle) so the biased-walk
# dataset is identical on every run of this cell.
random.seed(15)

for file in files:
    # NOTE: pickle can execute arbitrary code while loading; only point
    # dataset_path at graph files you trust.
    with open(dataset_path + file, 'rb') as graph_file:
        data = pkl.load(graph_file)

    # parent id -> ids of the nodes replying to it. Required for Random Walk.
    child_edges = {}
    for node_id in data.node.keys():
        sentence2 = data.node[node_id]['text']
        edge = data.edge[node_id]
        if len(edge.keys()) == 0:
            continue  # node without a parent: no pair, no child entry

        parent_node_id = list(edge.keys())[0]

        # baseline data: (parent text, node text, label) for weights 1 / -1
        weight = edge[parent_node_id]['weight']
        if weight == 1:
            sentence1 = data.node[parent_node_id]['text']
            dataset_samples_baseline.append([sentence1, sentence2, 1])
        elif weight == -1:
            sentence1 = data.node[parent_node_id]['text']
            dataset_samples_baseline.append([sentence1, sentence2, 0])

        child_edges.setdefault(parent_node_id, []).append(node_id)

    # preparing both biased_random_walk data and weighted_graph_walk data
    for node_id in data.node.keys():

        # Biased root-seeking Random Walk: start node + up to 3 steps.
        sentences_biased, label_biased = biased_random_walk([''] * 4, data, node_id, child_edges, 3)
        if label_biased != -1:
            sentences_biased.append(label_biased)
            dataset_samples_biased.append(sentences_biased)

        # Weighted root-seeking Graph Walk: start node + up to 4 ancestors.
        sentences_weighted, label_weighted = weighted_graph_walk([''] * 5, data, node_id, 4)
        if label_weighted != -1:
            sentences_weighted.append(label_weighted)
            dataset_samples_weighted.append(sentences_weighted)

In [None]:
# checking that it is indeed the same length
print(len(dataset_samples_baseline))
print(len(dataset_samples_biased))
print(len(dataset_samples_weighted))

# One shared index permutation is applied to all three lists when splitting,
# so a length mismatch must stop the notebook instead of going unnoticed.
assert (len(dataset_samples_baseline)
        == len(dataset_samples_biased)
        == len(dataset_samples_weighted)), "the three datasets are not aligned"

324373
324373
324373


In [None]:
# checking that it is indeed the same pairs: show sample 15 of each dataset
for samples in (dataset_samples_baseline, dataset_samples_biased, dataset_samples_weighted):
    print(samples[15])

["Bitcoin has a number of features which make it a valuable alternative to traditional currencies. A significant enough proportion of bitcoin's value can be attributed to these applications that its value is not entirely speculative.", "Bitcoin's [volatility](https://www.investopedia.com/articles/investing/052014/why-bitcoins-value-so-volatile.asp) makes it unsuitable for many of its more practical applications. If users cannot be assured that there is some level of stability in the value of their bitcoins, there is little point in possessing them for any reason other than speculating on their value for profit.", 0]
["Bitcoin's [volatility](https://www.investopedia.com/articles/investing/052014/why-bitcoins-value-so-volatile.asp) makes it unsuitable for many of its more practical applications. If users cannot be assured that there is some level of stability in the value of their bitcoins, there is little point in possessing them for any reason other than speculating on their value for 

### shuffling and saving the dataset

In [None]:
# create a random shuffle: seeded permutation, first 80% (rounded up) for
# training and the remainder for validation
np.random.seed(15)
n_samples = len(dataset_samples_baseline)
permutation = np.random.permutation(n_samples)
split_at = math.ceil(0.8 * n_samples)
training_idx = permutation[:split_at].tolist()
validation_idx = permutation[split_at:].tolist()

In [None]:
# saving baseline: write the train / validation pairs as CSV files
baseline_columns = ['sentence1', 'sentence2', 'label']
train_samples = [dataset_samples_baseline[idx] for idx in training_idx]
dev_samples = [dataset_samples_baseline[idx] for idx in validation_idx]

train_frame = pd.DataFrame(train_samples, columns=baseline_columns)
train_frame.to_csv(file_saving_path + 'train_graph_set.csv', index=False)
dev_frame = pd.DataFrame(dev_samples, columns=baseline_columns)
dev_frame.to_csv(file_saving_path + 'test_graph_set.csv', index=False)

print('#train samples:', len(train_samples))
print('#dev samples:', len(dev_samples))

##########################################

#train samples: 259499
#dev samples: 64874


In [None]:
# saving biased dataset: same index split as the baseline
biased_columns = ['sent1', 'sent2', 'sent3', 'sent4', 'label']
train_samples = [dataset_samples_biased[idx] for idx in training_idx]
dev_samples = [dataset_samples_biased[idx] for idx in validation_idx]

train_frame = pd.DataFrame(train_samples, columns=biased_columns)
train_frame.to_csv(file_saving_path + 'train_biased_walk_set.csv', index=False)
dev_frame = pd.DataFrame(dev_samples, columns=biased_columns)
dev_frame.to_csv(file_saving_path + 'test_biased_walk_set.csv', index=False)

print('#train samples:', len(train_samples))
print('#dev samples:', len(dev_samples))


#train samples: 259499
#dev samples: 64874


In [None]:
# saving weighted dataset: same index split as the baseline
weighted_columns = ['sent1', 'sent2', 'sent3', 'sent4', 'sent5', 'label']
train_samples = [dataset_samples_weighted[idx] for idx in training_idx]
dev_samples = [dataset_samples_weighted[idx] for idx in validation_idx]

train_frame = pd.DataFrame(train_samples, columns=weighted_columns)
train_frame.to_csv(file_saving_path + 'train_weighted_walk_set.csv', index=False)
dev_frame = pd.DataFrame(dev_samples, columns=weighted_columns)
dev_frame.to_csv(file_saving_path + 'test_weighted_walk_set.csv', index=False)

print('#train samples:', len(train_samples))
print('#dev samples:', len(dev_samples))

#train samples: 259499
#dev samples: 64874


### create extra dataset for indexing

In [None]:
# initialize: list the graph files again and reset all sample/label lists

files = os.listdir(dataset_path)
dataset_samples_baseline, dataset_samples_biased, dataset_samples_weighted = [], [], []
labels_baseline, labels_biased, labels_weighted = [], [], []

In [None]:
# loop over files: rebuild the baseline pairs, this time keeping node ids
for file in files:
    # NOTE: pickle can execute arbitrary code while loading; only point
    # dataset_path at graph files you trust.
    with open(dataset_path + file, 'rb') as graph_file:
        data = pkl.load(graph_file)

    for node_id in data.node.keys():

        # baseline data
        sentence2 = data.node[node_id]['text']
        edge = data.edge[node_id]

        if len(edge.keys()) >= 1:
            parent_node_id = list(edge.keys())[0]
            weight = edge[parent_node_id]['weight']
            if weight == 1:
                sentence1 = data.node[parent_node_id]['text']
                dataset_samples_baseline.append([parent_node_id, sentence1, node_id, sentence2, 1])
            elif weight == -1:
                sentence1 = data.node[parent_node_id]['text']
                dataset_samples_baseline.append([parent_node_id, sentence1, node_id, sentence2, 0])

In [None]:
# create a random shuffle: same seed and 80/20 rule as for the model datasets
np.random.seed(15)
n_samples = len(dataset_samples_baseline)
permutation = np.random.permutation(n_samples)
split_at = math.ceil(0.8 * n_samples)
training_idx = permutation[:split_at].tolist()
validation_idx = permutation[split_at:].tolist()

In [None]:
# saving baseline with node ids: reference files for looking sentences up
reference_columns = ['node_id_1', 'sentence_1', 'node_id_2', 'sentence_2', 'label']
train_samples = [dataset_samples_baseline[idx] for idx in training_idx]
dev_samples = [dataset_samples_baseline[idx] for idx in validation_idx]

# dtype=str: every column, ids included, is written as text
train_frame = pd.DataFrame(train_samples, columns=reference_columns, dtype=str)
train_frame.to_csv(file_saving_path + 'reference_train_set.csv', index=False)
dev_frame = pd.DataFrame(dev_samples, columns=reference_columns, dtype=str)
dev_frame.to_csv(file_saving_path + 'reference_test_set.csv', index=False)

print('#train samples:', len(train_samples))
print('#dev samples:', len(dev_samples))

##########################################

#train samples: 259499
#dev samples: 64874


## Baseline Model Training

### Training

In [None]:
# load baseline model data
# Paths come from `file_saving_path` so this cell follows the configuration
# cell instead of a second, hard-coded copy of the directory.
# Columns are iterated directly; per-row `iloc` lookups are far slower on a
# frame of this size and produce the same values.

trainset = pd.read_csv(file_saving_path + 'train_graph_set.csv')
train_samples = [
    InputExample(texts=[str(sent1), str(sent2)], label=int(label))
    for sent1, sent2, label in zip(trainset['sentence1'], trainset['sentence2'], trainset['label'])
]

devset = pd.read_csv(file_saving_path + 'test_graph_set.csv')
dev_samples = [
    InputExample(texts=[str(sent1), str(sent2)], label=int(label))
    for sent1, sent2, label in zip(devset['sentence1'], devset['sentence2'], devset['label'])
]
# Plain pairs and labels of the same validation rows, for predict() / sklearn.
test_samples = [[str(sent1), str(sent2)] for sent1, sent2 in zip(devset['sentence1'], devset['sentence2'])]
val_label = [int(label) for label in devset['label']]


In [None]:
# training config and load model
train_batch_size = 32
num_epochs = 4
# Derived from the configuration cell rather than a hard-coded directory.
model_save_path = file_saving_path + 'baseline'

# Define our CrossEncoder model. We use distilroberta-base as basis and set it up to predict 2 labels
model = CrossEncoder('distilroberta-base', num_labels=2)

# We wrap train_samples, which is a list of InputExample, in a pytorch DataLoader
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)

# During training, we use CESoftmaxAccuracyEvaluator to measure the accuracy on the dev set.
# The model emits two scores per pair, so accuracy has to be computed from the
# argmax over them; CEBinaryAccuracyEvaluator thresholds a single score per
# pair and is intended for num_labels=1 models.
evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(dev_samples, name='nli-dev')

# Warm up over 10% of the total number of training steps (batches per epoch * epochs).
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/316M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.weig

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [None]:
# Train the model
# The logits are (batch, num_labels), so the class axis is dim=1. Naming it
# explicitly matches what the implicit choice did for 2-D input and removes
# the "implicit dimension" warning.
# NOTE(review): if fit() applies CrossEntropyLoss to the activated values
# (its default for num_labels > 1, to my knowledge), the loss receives
# probabilities instead of raw logits -- confirm this is intended.
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          activation_fct=nn.Softmax(dim=1),
          evaluation_steps=10000,
          optimizer_params={'lr': 1e-5},
          warmup_steps=warmup_steps,
          output_path=model_save_path)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8110 [00:00<?, ?it/s]

  logits = activation_fct(model_predictions.logits)


Iteration:   0%|          | 0/8110 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8110 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8110 [00:00<?, ?it/s]

In [None]:
# evaluation of the test set data

# Trained baseline CrossEncoder, located via the configuration cell.
model_path = file_saving_path + 'baseline'

devset = pd.read_csv(file_saving_path + 'test_graph_set.csv')
test_samples = [[str(sent1), str(sent2)] for sent1, sent2 in zip(devset['sentence1'], devset['sentence2'])]
val_label = [int(label) for label in devset['label']]

model = CrossEncoder(model_path, num_labels=2)

# Softmax over the class axis (dim=1) turns the two logits into probabilities.
pred_prob = model.predict(test_samples, activation_fct=nn.Softmax(dim=1))

pred_labels = np.argmax(pred_prob, axis=1)

print('Precision:', precision_score(val_label, pred_labels))
print('Recall:', recall_score(val_label, pred_labels))
print('F1-score:', f1_score(val_label, pred_labels))

print('Classification Report')
print(classification_report(val_label, pred_labels))

  logits = activation_fct(model_predictions.logits)


Precision: 0.7995836802664447
Recall: 0.6862962301232803
F1-score: 0.7386212864147678
Classification Report
              precision    recall  f1-score   support

           0       0.79      0.87      0.83     36889
           1       0.80      0.69      0.74     27985

    accuracy                           0.79     64874
   macro avg       0.79      0.78      0.78     64874
weighted avg       0.79      0.79      0.79     64874



In [None]:
# accuracy_score computes accuracy, so label the printed value accordingly.
print('Accuracy:', accuracy_score(val_label, pred_labels))

F1-score: 0.7904707587014829


### generate baseline model predictions

In [None]:
# construct train set and dev set: sentence pairs as plain lists plus labels
trainset = pd.read_csv(file_saving_path+'train_graph_set.csv')
train_samples = [[str(first), str(second)]
                 for first, second in zip(trainset['sentence1'], trainset['sentence2'])]
train_labels = [int(value) for value in trainset['label']]

devset = pd.read_csv(file_saving_path+'test_graph_set.csv')
dev_samples = [[str(first), str(second)]
               for first, second in zip(devset['sentence1'], devset['sentence2'])]
dev_labels = [int(value) for value in devset['label']]

In [None]:
# Reload the trained baseline model (a CrossEncoder with two output labels)
# from the directory it was saved to during training.
model_path = file_saving_path + 'baseline'
model = CrossEncoder(model_path, num_labels=2)

In [None]:
# Peek at the first five sentence pairs as a quick sanity check.
train_samples[:5]

In [None]:
# NOTE(review): the evaluator is instantiated without arguments and the result
# is not used; the training cell further down passes a dataloader, a name and
# a softmax model. This looks like a leftover probe that will probably raise
# on a full run -- confirm and remove.
LabelAccuracyEvaluator()

In [None]:
# Show the first five rows of the id-annotated reference training set.
pd.read_csv(file_saving_path+"reference_train_set.csv").head(5)

Unnamed: 0,node_id_1,sentence_1,node_id_2,sentence_2,label
0,3371.146,All Muslims should be Sunni.,3371.152,The successor to the Prophet Mohammed should b...,0
1,19648.5,Younger people are not capable of voting and t...,19648.68,"The age of 16, however, is often considered an...",0
2,20564.6282,There is a priori no good reason why a sheer o...,20564.6472,"There must be a genuine reason to doubt, and j...",0
3,5904.581,They tend to be compensated in their families....,5904.583,This creates unequal opportunities for the gen...,0
4,3615.437,"To suggest that hard sciences, such as physics...",3615.439,The Higgs Boson is a model in our heads we are...,0


In [None]:
# make predictions
# Class probabilities for every train / validation pair. dim=1 is the class
# axis of the (batch, 2) logits; naming it avoids the implicit-dim warning.
train_pred_prob = model.predict(train_samples, activation_fct=nn.Softmax(dim=1))
test_pred_prob = model.predict(dev_samples, activation_fct=nn.Softmax(dim=1))

  logits = activation_fct(model_predictions.logits)


In [None]:
# save the scores: one CSV per split, columns are the two class scores
score_columns = ["score 0", "score 1"]
for scores, filename in ((train_pred_prob, 'baseline_train_score.csv'),
                         (test_pred_prob, 'baseline_test_score.csv')):
    pd.DataFrame(scores, columns=score_columns).to_csv(file_saving_path + filename)

## training of weighted root-seeking graph walk model

### Define SoftmaxLoss class

In [None]:
logger = logging.getLogger(__name__)


class SoftmaxLoss(nn.Module):
    """
    This loss function is a modification of loss used in S-BERT to train the SentenceTransformer
    model on NLI data. It adds a softmax classifier on top of two vectors: the embedding of the
    first sentence (v) and a weighted sum of the embeddings of all remaining sentences (u).
    :param model: SentenceTransformer model
    :param sentence_embedding_dimension: Dimension of your sentence embeddings
    :param num_labels: Number of different labels
    :param concatenation_sent_rep: Concatenate vectors u,v for the softmax classifier?
    :param concatenation_sent_difference: Add abs(u-v) for the softmax classifier?
    :param concatenation_sent_multiplication: Add u*v for the softmax classifier?
    :param loss_fct: Optional: Custom pytorch loss function. If not set, uses nn.CrossEntropyLoss()
    """
    def __init__(self,
                 model: "SentenceTransformer",
                 sentence_embedding_dimension: int,
                 num_labels: int,
                 concatenation_sent_rep: bool = True,
                 concatenation_sent_difference: bool = True,
                 concatenation_sent_multiplication: bool = False,
                 loss_fct: Callable = nn.CrossEntropyLoss()):
        super(SoftmaxLoss, self).__init__()
        self.model = model
        self.num_labels = num_labels
        self.concatenation_sent_rep = concatenation_sent_rep
        self.concatenation_sent_difference = concatenation_sent_difference
        self.concatenation_sent_multiplication = concatenation_sent_multiplication

        # The classifier input is the concatenation of the enabled pieces:
        # u and v (2 vectors), |u - v| (1 vector), u * v (1 vector).
        num_vectors_concatenated = 0
        if concatenation_sent_rep:
            num_vectors_concatenated += 2
        if concatenation_sent_difference:
            num_vectors_concatenated += 1
        if concatenation_sent_multiplication:
            num_vectors_concatenated += 1
        logger.info("Softmax loss: #Vectors concatenated: {}".format(num_vectors_concatenated))
        self.classifier = nn.Linear(num_vectors_concatenated * sentence_embedding_dimension, num_labels)
        self.loss_fct = loss_fct

    def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
        """Classify one batch of walks.

        :param sentence_features: One feature dict per walk position; each is passed to the model.
        :param labels: Target labels, or None to get predictions instead of a loss.
        :return: The loss when `labels` is given; otherwise `(reps, output)` where `output` is the
            raw classifier output and `reps` is the list of embeddings as left by
            `weighted_aggregate` (entries after the first are already scaled).
        """
        reps, features = self._embed_and_combine(sentence_features)
        output = self.classifier(features)

        if labels is not None:
            return self.loss_fct(output, labels.view(-1))
        return reps, output

    # Different aggregation strategies used to capture the neighboring context.
    def weighted_aggregate(self, reps):
        """Combine the embeddings of one walk into the pair (u, v).

        `v` is `reps[0]`. `u` is the sum of `reps[1:]`, where each entry takes 0.75 of the weight
        not yet handed out (0.75, 0.1875, ...). `reps` is modified in place: entries 1.. are
        replaced by their scaled versions. At least two entries are required, because the
        remaining ones are stacked before summing.

        :return: `(u, v)`
        """
        v = reps[0]
        prev_weight = 1
        for i in range(1, len(reps)):
            weight = 0.75 * prev_weight
            reps[i] = torch.mul(reps[i], weight)
            prev_weight = prev_weight - weight
        u = torch.sum(torch.stack(reps[1:]), dim=0)
        return u, v

    def half_forward(self, sentence_features: Iterable[Dict[str, Tensor]]):
        """Return the concatenated classifier input features without applying the classifier."""
        _, features = self._embed_and_combine(sentence_features)
        return features

    def _embed_and_combine(self, sentence_features: Iterable[Dict[str, Tensor]]):
        """Embed every sentence, aggregate the walk and build the classifier input.

        Shared by `forward` and `half_forward` so both use exactly the same features.

        :return: `(reps, features)` -- the embedding list and the concatenated feature tensor.
        """
        reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features]
        rep_a, rep_b = self.weighted_aggregate(reps)

        vectors_concat = []
        if self.concatenation_sent_rep:
            vectors_concat.append(rep_a)
            vectors_concat.append(rep_b)

        if self.concatenation_sent_difference:
            vectors_concat.append(torch.abs(rep_a - rep_b))

        if self.concatenation_sent_multiplication:
            vectors_concat.append(rep_a * rep_b)

        return reps, torch.cat(vectors_concat, 1)

### training

In [None]:
# Configuration for the weighted graph-walk model.
model_name = 'distilroberta-base'

train_batch_size = 12
graph_walk_len = 5  # number of sentence columns (sent1..sent5) read per sample
num_epochs = 4

# Filled by the data-loading cell below; reset here so a re-run starts empty.
train_samples = []
test_samples = []

# Directory the trained model is written to.
model_save_path = file_saving_path+ 'weighted_walk_full2'

In [None]:
# Token embeddings come from the RoBERTa-style transformer named in the config.
word_embedding_model = models.Transformer(model_name)

# Mean pooling over the tokens yields one fixed-size vector per sentence.
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim,
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# 5 for graph walk and 4 for random walk.
sentence_columns = ['sent' + str(position) for position in range(1, graph_walk_len+1)]

# Walks shorter than graph_walk_len have missing cells; read them as ''.
trainset = pd.read_csv(file_saving_path+'train_weighted_walk_set.csv')
trainset = trainset.fillna('')
train_samples.extend(
    InputExample(texts=list(walk), label=int(target))
    for walk, target in zip(zip(*(trainset[column] for column in sentence_columns)), trainset['label'])
)

devset = pd.read_csv(file_saving_path+'test_weighted_walk_set.csv')
devset = devset.fillna('')
test_samples.extend(
    InputExample(texts=list(walk), label=int(target))
    for walk, target in zip(zip(*(devset[column] for column in sentence_columns)), devset['label'])
)


Downloading config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/316M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [None]:
# Build the training dataloader, the loss (which owns the classifier head)
# and the dev-set evaluator.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=2)

# NOTE(review): the dev loader is shuffled as well; this should not be needed
# for computing accuracy -- confirm.
dev_dataloader = DataLoader(test_samples, shuffle=True, batch_size=train_batch_size)
# The evaluator receives the loss module so it can use its classifier.
dev_evaluator = LabelAccuracyEvaluator(dev_dataloader, name='sts-dev', softmax_model=train_loss)

# Warm up over 10% of the total number of training steps (batches per epoch * epochs).
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up

In [None]:
# Train the model: one objective (walk dataloader + softmax loss), evaluated
# on the dev set every 15000 steps, written to model_save_path.
model.fit(train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluation_steps=15000,
    output_path=model_save_path)


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/21625 [00:00<?, ?it/s]

Iteration:   0%|          | 0/21625 [00:00<?, ?it/s]

Iteration:   0%|          | 0/21625 [00:00<?, ?it/s]

Iteration:   0%|          | 0/21625 [00:00<?, ?it/s]

### prediction

In [None]:
# Configuration and data for predicting with the trained weighted-walk model.
model_name = 'distilroberta-base'

train_batch_size = 12
graph_walk_len = 5
num_epochs = 4

# The original `file_saving_path+ + '...'` applied a unary plus to a string
# and raised a TypeError; a single concatenation is what is meant.
model_save_path = file_saving_path + 'weighted_walk_full2'

# 5 for graph walk and 4 for random walk.
sentence_columns = ['sent' + str(j) for j in range(1, graph_walk_len + 1)]

# Both CSVs are read from `file_saving_path` (the train file used to be
# hard-coded). Missing cells of short walks become ''.
trainset = pd.read_csv(file_saving_path + 'train_weighted_walk_set.csv')
trainset = trainset.fillna('')
train_samples = [
    InputExample(texts=list(texts), label=int(label))
    for texts, label in zip(zip(*(trainset[col] for col in sentence_columns)), trainset['label'])
]

devset = pd.read_csv(file_saving_path + 'test_weighted_walk_set.csv')
devset = devset.fillna('')
test_samples = [
    InputExample(texts=list(texts), label=int(label))
    for texts, label in zip(zip(*(devset[col] for col in sentence_columns)), devset['label'])
]

In [None]:
# load the model saved during training and move it to the selected device
model = SentenceTransformer(model_save_path)
model.to(device)

# A fresh SoftmaxLoss is built around the loaded model; its classifier is
# newly initialised here and is replaced by the saved one in a later cell.
train_loss = SoftmaxLoss(model=model, 
                         sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=2)
train_loss.to(device)

SoftmaxLoss(
  (model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: RobertaModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  )
  (classifier): Linear(in_features=2304, out_features=2, bias=True)
  (loss_fct): CrossEntropyLoss()
)

In [None]:
# Load the classifier head stored next to the model and move it to the device.
# NOTE(review): presumably this file is written by the modified
# sentence_transformer.py mentioned in the introduction -- confirm.
# torch.load unpickles the file, so only load checkpoints you trust.
loaded_classifier = torch.load(model_save_path+'/best_trained_classifier_0.pt').to(device)

In [None]:
# Replace the freshly initialised classifier with the loaded one.
train_loss.classifier = loaded_classifier

In [None]:
# make predictions
# set up the dataloaders: no shuffling, so the outputs stay in CSV row order
train_dataloader = DataLoader(train_samples, shuffle=False, batch_size=12)
dev_dataloader = DataLoader(test_samples, shuffle=False, batch_size=12)
# The model's own collate function is used to build each batch.
train_dataloader.collate_fn = model.smart_batching_collate
dev_dataloader.collate_fn = model.smart_batching_collate
# One-shot iterators: once exhausted, this cell has to be re-run.
train_iterator = iter(train_dataloader)
dev_iterator = iter(dev_dataloader)

In [None]:
def _predict_logits(dataloader):
    """Return the raw classifier outputs for every batch of `dataloader`, stacked into one array.

    Gradients are disabled: this is inference only, so no autograd graph
    needs to be built or kept in memory.
    """
    batch_outputs = []
    with torch.no_grad():
        for features, _ in dataloader:
            features = [util.batch_to_device(batch, device) for batch in features]
            _, output = train_loss(features, None)
            batch_outputs.append(output.cpu().numpy())
    return np.concatenate(batch_outputs)


# Evaluation mode guarantees that dropout is switched off during prediction.
train_loss.eval()

# Iterating the dataloaders directly (instead of pre-built one-shot
# iterators) lets this cell be re-run without a StopIteration.
train_predictions = _predict_logits(train_dataloader)
test_predictions = _predict_logits(dev_dataloader)

In [None]:
# save the scores: one CSV per split, columns are the two classifier outputs
score_columns = ["score 0", "score 1"]
for scores, filename in ((train_predictions, 'weighted_walk_train_score2.csv'),
                         (test_predictions, 'weighted_walk_test_score2.csv')):
    pd.DataFrame(scores, columns=score_columns).to_csv(file_saving_path + filename)

In [None]:
# Validation accuracy of the weighted-walk model. Arguments follow the
# (y_true, y_pred) order used by the baseline evaluation.
accuracy_score(devset.label, test_predictions.argmax(axis=1))

0.8262478034343497