In [1]:
# Dependency installation commands, kept inert by wrapping them in a string
# literal: nothing is installed, and the cell simply evaluates to this string.
# To install, move the `!pip` lines out of the quotes into their own cell.
'''
!pip install bertviz
!pip install jupyterlab
!pip install ipywidgets
'''

'\n!pip install bertviz\n!pip install jupyterlab\n!pip install ipywidgets\n'

In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from spacy.lang.en import English
from splitbert.textsplit import text_segmentation
from splitbert.splitbert import SplitBertConcatEncoderModel
from splitbert.splitbert import SplitBertTransformerModel
from splitbert.splitbert import conduct_input_ids_and_attention_masks
from splitbert.splitbert import make_masks
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from bertviz import util
from bertviz.transformers_neuron_view import BertModel
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from bertviz import head_view, model_view

2023-08-22 11:24:17.638436: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


# Data Preparation

In [2]:
# Raw inputs: posts, comments, and the replies that carry the satisfaction scores.
post_df = pd.read_csv(
    '../predicting-satisfaction-using-graphs/csv/dataset/liwc_post.csv',
    encoding='UTF-8',
)
comment_df = pd.read_csv(
    '../predicting-satisfaction-using-graphs/csv/dataset/liwc_comment.csv',
    encoding='UTF-8',
)
reply_df = pd.read_csv(
    '../predicting-satisfaction-using-graphs/csv/dataset/avg_satisfaction_raw_0-999.csv',
    encoding='ISO-8859-1',
)

# Splitting mode per text source, in the order (post, comment, reply).
modes = [['seg', 'seg', 'snt']]

# Blank English pipeline whose only component is the sentencizer.
nlp = English()
nlp.add_pipe("sentencizer")

# Text columns, one entry per row of the corresponding frame.
post_contents = list(post_df['content'])
comment_bodies = list(comment_df['content'])
reply_contents = list(reply_df['replyContent'])

# satisfaction score (y): the continuous score is bucketed into three classes
#   score < 3.5 -> 0,  score < 5 -> 1,  anything else -> 2
satisfactions_float = list(reply_df['satisfy_composite'])
satisfactions = [
    0 if score < 3.5 else 1 if score < 5 else 2
    for score in satisfactions_float
]


def get_sequences(contents, mode):
    """Turn every text in `contents` into a list of string pieces.

    Args:
        contents: iterable of raw text strings.
        mode: 'all' keeps each text as a single piece; 'seg' splits the text
            into sentences with `nlp` and passes them to `text_segmentation`;
            any other value returns the sentences themselves.

    Returns:
        A list with one entry per input text. For 'all' and the sentence mode
        each entry is a list of strings; for 'seg' it is whatever
        `text_segmentation` returns for that text's sentences.
    """
    def split_sentences(text):
        # Sentence spans from the pipeline, converted to plain strings.
        return [str(sentence) for sentence in nlp(text).sents]

    if mode == 'all':
        return [[text] for text in contents]
    if mode == 'seg':
        return [text_segmentation(split_sentences(text)) for text in contents]
    # sentences
    return [split_sentences(text) for text in contents]


for mode in modes:
    print(mode)
    # Split every post / comment / reply according to the mode of its source.
    post_sequences = get_sequences(post_contents, mode[0])
    comment_sequences = get_sequences(comment_bodies, mode[1])
    reply_sequences = get_sequences(reply_contents, mode[2])

    # One row per example: running index, the three piece lists, class label, raw score.
    # zip() stops at the shortest input, so all statistics below cover aligned rows only.
    data = [
        [i, post, comment, reply, satisfaction, satisfaction_float]
        for i, (post, comment, reply, satisfaction, satisfaction_float) in enumerate(
            zip(post_sequences, comment_sequences, reply_sequences, satisfactions, satisfactions_float)
        )
    ]

    # Largest number of pieces seen for any single post / comment / reply.
    max_post = max((len(row[1]) for row in data), default=0)
    max_comment = max((len(row[2]) for row in data), default=0)
    max_reply = max((len(row[3]) for row in data), default=0)

    print(max_post, max_comment, max_reply)
    max_count = max(max_post, max_comment, max_reply)
    print(max_count)

    columns = ['index', 'post_contents', 'comment_contents', 'reply_contents', 'label', 'score']
    df = pd.DataFrame(data, columns=columns)

    # data split (train & test sets): 80% train, then the remainder halved into val / test
    idx_train, idx_remain = train_test_split(df.index.values, test_size=0.20, random_state=42)
    idx_val, idx_test = train_test_split(idx_remain, test_size=0.50, random_state=42)

    train_df = df.iloc[idx_train]
    val_df = df.iloc[idx_val]
    test_df = df.iloc[idx_test]

    # Size of the rarest class present in the training split.
    count_min_label = min(train_df['label'].value_counts())

    labels = [0, 1, 2]

    # Undersample every class down to the rarest one. The draws are seeded with the
    # same seed as the split so the balanced set is reproducible across runs, and the
    # pieces are concatenated once instead of growing from an empty frame (which is
    # deprecated in recent pandas and can change the resulting dtypes).
    balanced_parts = [
        train_df[train_df['label'] == label].sample(frac=1, random_state=42).iloc[:count_min_label]
        for label in labels
    ]
    train_sample_df = pd.concat(balanced_parts)

    # Shuffle so the classes are interleaved rather than stacked label by label.
    train_sample_df = train_sample_df.sample(frac=1, random_state=42)

['seg', 'seg', 'snt']
10 4 10
10


In [7]:
def prepare_model(target, mode, concat_mode, model, epoch):
    """Load a trained SplitBert checkpoint on CPU in eval mode.

    Relies on the notebook-level `labels`, `max_count`, `max_post` and
    `max_comment` computed in the data-preparation cell.

    Args:
        target: forwarded to SplitBertConcatEncoderModel as `target`
            (unused for the transformer variant).
        mode: name of the checkpoint sub-directory, e.g. 'seg_seg'.
        concat_mode: forwarded to SplitBertConcatEncoderModel as `concat_mode`
            (unused for the transformer variant).
        model: 'transformer' builds a SplitBertTransformerModel; any other
            value builds a SplitBertConcatEncoderModel.
        epoch: epoch number used in the checkpoint file name.

    Returns:
        Tuple `(device, model, tokenizer)`: the CPU device, the loaded network
        and the 'bert-base-uncased' tokenizer.
    """
    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    device = torch.device('cpu')

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    model_path = f'../predicting-satisfaction-using-graphs/splitbert/model/{mode}/epoch_{epoch}.model'

    # Keep the requested variant separate from the network object. The original
    # code overwrote `model` with the network, so its later `model == 'encoder'`
    # check compared a module with a string and never froze `.bert`.
    is_transformer = model == 'transformer'

    if is_transformer:
        net = SplitBertTransformerModel(num_labels=len(labels), embedding_size=384, max_sentences=10, max_len1=10,
                                        max_len2=4, device=device)
    else:  # encoder
        net = SplitBertConcatEncoderModel(num_labels=len(labels), embedding_size=384,
                                          max_len=max_count, max_post_len=max_post, max_comment_len=max_comment,
                                          device='cpu', target=target, concat_mode=concat_mode, output_attentions=True)
    net.load_state_dict(torch.load(model_path, map_location=device))
    net.to('cpu')
    net.eval()

    # The network is only used for inspection here, so no gradients are needed.
    for param in net.sbert.parameters():
        param.requires_grad = False

    if not is_transformer:
        # NOTE(review): assumes the encoder variant exposes a `.bert` sub-module,
        # as the original (unreachable) branch did — confirm against splitbert.
        for param in net.bert.parameters():
            param.requires_grad = False

    return device, net, tokenizer

In [8]:
# Load the epoch-8 checkpoint stored under 'seg_seg' using the encoder variant.
device, model, tokenizer = prepare_model(
    target='post_comment',
    mode='seg_seg',
    concat_mode='concat_all',
    model='encoder',
    epoch=8,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
def construct_input_ref_pair(targets):
    """Tokenize each list of text pieces and build an all-zero reference for it.

    Relies on the notebook-level `tokenizer` and `max_count`.

    Args:
        targets: iterable whose elements are lists of strings (the pieces of
            one document).

    Returns:
        Four parallel lists with one entry per element of `targets`:
        - input ids of shape (1, max_count, 256): every piece is padded or
          truncated to 256 tokens, and zero rows are appended until there are
          `max_count` pieces;
        - reference ids: zeros of the same shape as the input ids;
        - attention masks, padded the same way as the input ids;
        - a 1-element tensor holding the number of pieces before row padding.
    """
    input_ids_list, ref_input_ids_list, attention_masks_list, sentence_count_list = [], [], [], []

    for contents in targets:
        # `padding='max_length'` is the supported spelling of the deprecated
        # `pad_to_max_length=True`; both pad every piece to `max_length` tokens.
        result = tokenizer(contents, padding='max_length', truncation=True, max_length=256, return_tensors='pt')

        input_ids = result['input_ids']
        sentence_count_list.append(torch.tensor(len(input_ids)).unsqueeze(0))
        attention_masks = result['attention_mask']

        # (left, right, top, bottom): leave the token axis alone and append rows
        # at the bottom. NOTE(review): with more than `max_count` pieces the
        # amount is negative, which trims rows instead of padding — confirm
        # this cannot happen for the inputs used.
        pad = (0, 0, 0, max_count - len(input_ids))
        input_ids = nn.functional.pad(input_ids, pad, "constant", 0)
        attention_masks = nn.functional.pad(attention_masks, pad, "constant", 0)
        ref_input_ids = torch.zeros_like(input_ids)

        # Add the leading axis of size 1 to each result.
        input_ids_list.append(input_ids.unsqueeze(0))
        ref_input_ids_list.append(ref_input_ids.unsqueeze(0))
        attention_masks_list.append(attention_masks.unsqueeze(0))

    return input_ids_list, ref_input_ids_list, attention_masks_list, sentence_count_list

In [16]:
# Position of the example to inspect; also indexes `satisfactions` further down.
N = 2
post = post_sequences[N]
comment = comment_sequences[N]

In [47]:
# Tokenize the selected post and comment; index 0 of each result list is the
# post and index 1 is the comment.
input_ids, ref_input_ids, attention_masks, sentence_counts = construct_input_ref_pair([post, comment])

# One-hot encoding of the class label of example N.
target_class = torch.tensor(satisfactions[N])
one_hot_labels = torch.nn.functional.one_hot(target_class, num_classes=len(labels))

# Keyword arguments for the forward call, all tensors moved to `device`.
inputs = dict(
    labels=one_hot_labels.type(torch.float).to(device),
    input_ids1=input_ids[0].to(device),
    input_ids2=input_ids[1].to(device),
    attention_mask1=attention_masks[0].to(device),
    attention_mask2=attention_masks[1].to(device),
    sentence_count1=sentence_counts[0].to(device),
    sentence_count2=sentence_counts[1].to(device),
    mode='post_comment',
)

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

In [48]:
# Flatten the post pieces followed by the comment pieces into one flat list.
all_sentences = [post, comment]
all_tokens = [piece for sequence in all_sentences for piece in sequence]

# Position in `all_tokens` where the comment pieces begin.
comment_start = len(post)

print(len(all_tokens))
all_tokens

4


[" I am 30 years old and my girlfriend is 24 are together since 4 months. She has high level of depression and anxiety and she sees a therapist. When she is depressed, she starts insulting me and saying not nice things to me. She explained, that because of her depression, she can't control her feelings. I am very supportive to her and when she feels down, i always remind her that i love her and if she wants us to meet we can do it or have a video chat.",
 " Lately, if i say something nice, she says: you want this relationship more than me and you are desperate for a relationship, that's why you said that. When i give her space, she gets angry that I don't trust her. Any advise?",
 " Also, do people with depression insults the ones they love? Note: she wasn't like that for the first two months and then she changed, blaming it on that relationship scares her. Thanks",
 "Depression is not an excuse for this behavior, it is manipulative and unhealthy. Yeah, maybe she feels shitty, but that

# Encoder

In [49]:
# Take element [0][0] of the returned attentions and give it back a leading axis;
# presumably this is the first layer for the single example — TODO confirm.
first_attention = outputs.attentions[0][0]
attention = [first_attention.unsqueeze(0)]

In [50]:
# Render the attention heads, with `comment_start` marking where the second
# text (the comment) begins among the labels.
head_view(
    attention=attention,
    tokens=all_tokens,
    sentence_b_start=comment_start,
)

<IPython.core.display.Javascript object>

# Encoder-decoder

In [None]:
# Fourth element of the model output.
# NOTE(review): the next cell indexes this by string keys, but its recorded run
# failed because this value is a list — confirm what outputs[3] actually holds.
attentions = outputs[3]

In [37]:
# Encoder-decoder view: post pieces as encoder tokens, comment pieces as decoder tokens.
# NOTE(review): the recorded run of this cell raised
# "TypeError: list indices must be integers or slices, not str", i.e. `attentions`
# is a list rather than a mapping with these three keys. The model loaded in this
# notebook is the encoder variant, so this cell presumably does not apply — TODO
# confirm, then either remove it or index `attentions` by position.
head_view(
    encoder_attention=attentions['encoder_attentions'],
    decoder_attention=attentions['decoder_attentions'],
    cross_attention=attentions['cross_attentions'],
    encoder_tokens= all_tokens[:comment_start],
    decoder_tokens=all_tokens[comment_start:]
)

TypeError: list indices must be integers or slices, not str