In [1]:
'''
!pip install bertviz
!pip install jupyterlab
!pip install ipywidgets
'''

'\n!pip install bertviz\n!pip install jupyterlab\n!pip install ipywidgets\n'

In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from spacy.lang.en import English
from splitbert.textsplit import text_segmentation
from splitbert.splitbert import SplitBertConcatEncoderModel
from splitbert.splitbert import SplitBertTransformerModel
from splitbert.splitbert import conduct_input_ids_and_attention_masks
from splitbert.splitbert import make_masks
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from bertviz import util
from bertviz.transformers_neuron_view import BertModel
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from bertviz import head_view, model_view

2023-08-15 20:19:57.054972: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


# Data Preparation

In [2]:
post_df = pd.read_csv('../predicting-satisfaction-using-graphs/csv/dataset/liwc_post.csv', encoding='UTF-8')
comment_df = pd.read_csv('../predicting-satisfaction-using-graphs/csv/dataset/liwc_comment.csv', encoding='UTF-8')
reply_df = pd.read_csv('../predicting-satisfaction-using-graphs/csv/dataset/avg_satisfaction_raw_0-999.csv', encoding='ISO-8859-1')

modes = [['seg', 'seg', 'snt']]

nlp = English()
nlp.add_pipe("sentencizer")

# satisfaction score (y)
satisfactions_float = list(reply_df['satisfy_composite'])
satisfactions = []

for s in satisfactions_float:
    if s < 3.5:
        satisfactions.append(0)
    elif s < 5:
        satisfactions.append(1)
    else:
        satisfactions.append(2)

reply_contents = list(reply_df['replyContent'])
post_contents = list(post_df['content'])
comment_bodies = list(comment_df['content'])


def get_sequences(contents, mode):
    sequences = []

    if mode == 'all':
        for content in contents:
            sequences.append([content])
    elif mode == 'seg':
        for content in contents:
            sentences = list(map(lambda x: str(x), list(nlp(content).sents)))
            sequences.append(text_segmentation(sentences))
    else:  # sentences
        for content in contents:
            sequences.append(list(map(lambda x: str(x), list(nlp(content).sents))))

    return sequences


for mode in modes:
    print(mode)
    post_sequences = get_sequences(post_contents, mode[0])
    comment_sequences = get_sequences(comment_bodies, mode[1])
    reply_sequences = get_sequences(reply_contents, mode[2])

    data = []
    max_post, max_comment, max_reply = 0, 0, 0
    i = 0
    for post, comment, reply, satisfaction, satisfaction_float in zip(post_sequences, comment_sequences,
                                                                          reply_sequences, satisfactions,
                                                                          satisfactions_float):
        if len(post) > max_post:
            max_post = len(post)
        if len(comment) > max_comment:
            max_comment = len(comment)
        if len(reply) > max_reply:
            max_reply = len(reply)

        data.append([i, post, comment, reply, satisfaction, satisfaction_float])
        i += 1

    print(max_post, max_comment, max_reply)
    max_count = max(max_post, max_comment, max_reply)
    print(max_count)

    columns = ['index', 'post_contents', 'comment_contents', 'reply_contents', 'label', 'score']
    df = pd.DataFrame(data, columns=columns)

    # data split (train & test sets)
    idx_train, idx_remain = train_test_split(df.index.values, test_size=0.20, random_state=42)
    idx_val, idx_test = train_test_split(idx_remain, test_size=0.50, random_state=42)

    train_df = df.iloc[idx_train]
    val_df = df.iloc[idx_val]
    test_df = df.iloc[idx_test]

    count_min_label = min(train_df['label'].value_counts())

    labels = [0, 1, 2]

    train_sample_df = pd.DataFrame([], columns=columns)

    for label in labels:
        tmp = train_df[train_df['label'] == label]
        tmp_sampled = tmp.sample(frac=1).iloc[:count_min_label]
        train_sample_df = pd.concat([train_sample_df, tmp_sampled])

    train_sample_df = train_sample_df.sample(frac=1)

['seg', 'seg', 'snt']
10 4 10
10


In [3]:
def prepare_model(target, mode, model, epoch):
    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    device = torch.device('cpu')

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    model_path = f'../predicting-satisfaction-using-graphs/splitbert/model/{mode}/epoch_{epoch}.model'
    
    if model == 'transformer':
        model = SplitBertTransformerModel(num_labels=len(labels), embedding_size=384, max_sentences=10, max_len1=10,
                                          max_len2=4, device=device)
    else:
        model = SplitBertConcatEncoderModel(num_labels=len(labels), embedding_size=384, 
                                            max_len=max_count, device='cpu', target=target, output_attentions=True)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to('cpu')
    model.eval()

    for param in model.sbert.parameters():
        param.requires_grad = False
    
    if model == 'encoder':
        for param in model.bert.parameters():
            param.requires_grad = False
        
    return device, model, tokenizer

In [4]:
device, model, tokenizer = prepare_model('post_comment', 'seg_seg', 'transformer', 7)

In [5]:
def construct_input_ref_pair(targets):
    input_ids_list, ref_input_ids_list, attention_masks_list, sentence_count_list = [], [], [], []
    
    for contents in targets:
        result = tokenizer(contents, pad_to_max_length=True, truncation=True, max_length=256, return_tensors='pt')
        
        input_ids = result['input_ids']
        sentence_count_list.append(torch.tensor(len(input_ids)).unsqueeze(0))
        attention_masks = result['attention_mask']
        
        pad = (0, 0, 0, max_count-len(input_ids))
        input_ids = nn.functional.pad(input_ids, pad, "constant", 0)
        attention_masks = nn.functional.pad(attention_masks, pad, "constant", 0)
        ref_input_ids = torch.zeros_like(input_ids)

        input_ids_list.append(input_ids.unsqueeze(0))
        ref_input_ids_list.append(ref_input_ids.unsqueeze(0))
        attention_masks_list.append(attention_masks.unsqueeze(0))
    
    return input_ids_list, ref_input_ids_list, attention_masks_list, sentence_count_list

In [6]:
for N in range(1000):
    post, comment = post_sequences[N], comment_sequences[N]

    if len(post) >= 7 and len(comment) > 2:
        break

In [8]:
input_ids, ref_input_ids, attention_masks, sentence_counts = construct_input_ref_pair([post, comment])
    
one_hot_labels = torch.nn.functional.one_hot(torch.tensor(label), num_classes=len(labels))
inputs = {'labels': one_hot_labels.type(torch.float).to(device),
          'input_ids1': input_ids[0].to(device),
          'input_ids2': input_ids[1].to(device),
          'attention_mask1': attention_masks[0].to(device),
          'attention_mask2': attention_masks[1].to(device),
          'sentence_count1': sentence_counts[0].to(device),
          'sentence_count2': sentence_counts[1].to(device),
          'mode': 'post_comment'
         }
    
with torch.no_grad():
    attention = model(**inputs)

ValueError: Expected input batch_size (1) to match target batch_size (3).

In [147]:
attention[0].shape

torch.Size([1, 10, 10])

In [148]:
viz_attention = [attention[0].unsqueeze(0)]
print(viz_attention)

[tensor([[[[0.0911, 0.0959, 0.1063, 0.1124, 0.1015, 0.1009, 0.0903, 0.0897,
           0.1043, 0.1076],
          [0.0951, 0.0949, 0.1011, 0.1091, 0.1042, 0.1104, 0.0996, 0.0878,
           0.0951, 0.1027],
          [0.1005, 0.1008, 0.1037, 0.1097, 0.1011, 0.1046, 0.0938, 0.0884,
           0.0950, 0.1025],
          [0.1018, 0.1032, 0.0971, 0.1044, 0.1011, 0.1099, 0.1039, 0.0878,
           0.0930, 0.0978],
          [0.1034, 0.1091, 0.1046, 0.1060, 0.0979, 0.0996, 0.0922, 0.0886,
           0.0960, 0.1026],
          [0.0974, 0.1057, 0.1026, 0.1064, 0.0976, 0.0991, 0.0985, 0.0870,
           0.0997, 0.1059],
          [0.0946, 0.1048, 0.1068, 0.1072, 0.0937, 0.0961, 0.0930, 0.0911,
           0.1030, 0.1096],
          [0.0902, 0.0929, 0.1029, 0.1126, 0.1027, 0.1078, 0.0932, 0.0866,
           0.1039, 0.1073],
          [0.0973, 0.0974, 0.1028, 0.1059, 0.0979, 0.1044, 0.0942, 0.0949,
           0.0990, 0.1063],
          [0.0985, 0.1009, 0.1023, 0.1070, 0.1022, 0.1080, 0.0960, 0.089

In [149]:
all_sentences = [post, comment]
all_tokens = [item for all_sentences in all_sentences for item in all_sentences]
print(len(all_tokens))

comment_start = len(post)
all_tokens

10


[" I know I'm depressed. I have been many times before.",
 " I've had a really bad few months. Lost two close relatives, been constantly sick and moved recently.",
 " I'm isolating myself more and more. I think I am starting to use facebook as an excuse to not go out and actually see people. I have a hard time making myself leave the house to socialize even when I'm not depressed because I feel so socially awkward. I don't want to go out. Making myself go to the mailbox or the store is hard.",
 " I just don't know what to do. I mean I know I should call my doctor but I don't do it.",
 " I don't know if anyone around me knows I'm depressed. I don't like crying in front of people and if I talked about it I would probably cry.",
 " I know they would be there for me if I told them but I don't know. I don't know what to do. Its all been too hard.",
 " I always tell myself I will be better when the sunshine comes. But this time I'm just not. :'(",
 ' Oh, yikes! That kind of life celebration.

In [150]:
head_view(viz_attention, all_tokens, comment_start)

<IPython.core.display.Javascript object>