In [7]:
import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import r2_score
import csv

In [8]:
data_df = pd.read_csv('../predicting-satisfaction-using-graphs/seeker_satisfaction_1000_thread.csv', encoding='ISO-8859-1')
# post_titles = list(data_df['post_title'])
post_contents = list(data_df['post_content'])
comment_bodies = list(data_df['comment_body'])
satisfactions = list(data_df['satisfaction'])

print(satisfactions.index(max(satisfactions)))

922


In [9]:
data = []

for content, body, satisfaction in zip(post_contents, comment_bodies, satisfactions):
    if content != '[deleted]' and content != '[removed]' and body != '[deleted]' and body != '[removed]':
        data.append([content + ' ' + body, satisfaction])
        # data.append([content + '[SEP]' + body, satisfaction])
        
data

[['Third year college student. My depression has always stemmed largely from social anxiety, wishing I had friends but not knowing how to make them. I spent two years in dorms and never spoke to any of my neighbors and didn\'t really meet anybody.\n\nLast night, for the first time in my life, I was invited to a party (not personally mind you; I was a part of a group that got collectively invited). I drank a little bit, but got tipsy quickly, and not knowing my limits I stopped. I was completely sober within the first hour. \n\nBesides introductions and an occasional greeting, the only people I actually spoke to were the people I already knew. I then spent a few hours sitting in a corner and observing drunk antics.\n\nIt was pretty weird, observing two people go from complete strangers, to being in one another\'s arms at the end of the night. Looks so easy.\n\nIs this the shit that I\'ve been "missing" for all of my life? In which case, wow. Good riddance, I\'ll take my depression over 

In [11]:
df = pd.DataFrame(data, columns=['contents', 'label'])

test_size = 0.2
seed = 42
inputs_train, inputs_test, labels_train, labels_test = train_test_split(df.index.values,
                                                                        df.label.values,
                                                                        test_size=test_size,
                                                                        random_state=seed)

df['data_type'] = ['not_set'] * df.shape[0]

df.loc[inputs_train, 'data_type'] = 'train'
df.loc[inputs_test, 'data_type'] = 'test'

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type == 'train'].contents.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=512,
    return_tensors='pt'
)

encoded_data_test = tokenizer.batch_encode_plus(
    df[df.data_type == 'test'].contents.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=512,
    return_tensors='pt'
)

# BertTokenizer.build_inputs_with_special_tokens()

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(labels_train, dtype=torch.float32)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(labels_test, dtype=torch.float32)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

batch_size = 3
# batch_size = 32

dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

dataloader_test = DataLoader(dataset_test,
                                   sampler=SequentialSampler(dataset_test),
                                   batch_size=batch_size)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels=1,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU.")
else:
    print("No GPU available, using the CPU instead.")
    device = torch.device("cpu")

model.to(device)

optimizer = AdamW(model.parameters(),
                  lr=2e-5,  # learning rate.
                  eps=1e-8)  # learning rate가 0으로 나눠지는 것을 방지하기 위한 epsilon 값.

epochs = 200

# learning rate decay를 위한 scheduler. (linear 이용)
# lr이 0부터 optimizer에서 설정한 lr까지 linear하게 warmup 됐다가 다시 0으로 linear 하게 감소.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train) * epochs)

loss_function = nn.MSELoss()


def evaluate(dataloader_val):
    model.eval()
    loss_val_total = 0

    predictions, true_vals = [], []

    for batch in dataloader_val:
        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2],
                  }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total / len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals


training_result = []
val_values = []

for epoch in tqdm(range(1, epochs + 1)):
    evaluation_result = []
    model.train()
    loss_train_total = 0
    # print(dataloader_train)
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    i = 0

    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2],
                  }

        # print(inputs['labels'])
        # batch_inputs, batch_masks, batch_labels = tuple(b.to(device) for b in batch)
        # print(batch_labels[0][0])
        outputs = model(**inputs)
        print(f'{model.bert.embeddings.word_embeddings.weight}')
        # print(outputs[1])
        loss = outputs[0]
        print(f'i : {i}, loss : {loss}')
        
        your_file = open('test.csv', 'ab')
        np.savetxt(model.bert.embeddings.word_embeddings.weight.numpy())
        your_file.close()
        
        # r2 = my_r2_score(outputs[1], batch[2])
        # print(r2)
        # print('======================')
        loss_train_total += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})
        i += 1

    # torch.save(model.state_dict(), f'data_volume/inf_macro_finetuned_BERT_epoch_{epoch}.model')
    tqdm.write(f'\nEpoch {epoch}')
    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')
    val_loss, predictions, true_vals = evaluate(dataloader_test)
    evaluation_result.append([val_loss, predictions, torch.tensor(true_vals)])
    tqdm.write(f'Validation loss: {val_loss}')

    true_vals = evaluation_result[0][2].tolist()
    # print(true_vals)
    predict = sum(evaluation_result[0][1].tolist(), [])
    # print(predict)
    tqdm.write(f'R^2 score: {r2_score(true_vals, predict)}')

    pred_df = pd.DataFrame(predictions)
    pred_df.to_csv(f'../predicting-satisfaction-using-graphs/csv/whitespace/batch_{batch_size}_lr_2e-5/epoch_{epoch}_predicted_vals.csv')

    training_result.append([epoch, loss_train_avg, val_loss, r2_score(true_vals, predict)])


fields = ['epoch', 'training_loss', 'validation_loss', 'r^2_score']
with open(f'../predicting-satisfaction-using-graphs/csv/whitespace/batch_{batch_size}_lr_2e-5/training_result.csv', 'w', newline='') as f:
    # using csv.writer method from CSV package
    write = csv.writer(f)

    write.writerow(fields)
    write.writerows(training_result)


true_df = pd.DataFrame(true_vals)
true_df.to_csv(f'../predicting-satisfaction-using-graphs/csv/true_vals.csv')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model f

No GPU available, using the CPU instead.


  0%|          | 0/200 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/151 [00:00<?, ?it/s]

Parameter containing:
tensor([[-0.0102, -0.0615, -0.0265,  ..., -0.0199, -0.0372, -0.0098],
        [-0.0117, -0.0600, -0.0323,  ..., -0.0168, -0.0401, -0.0107],
        [-0.0198, -0.0627, -0.0326,  ..., -0.0165, -0.0420, -0.0032],
        ...,
        [-0.0218, -0.0556, -0.0135,  ..., -0.0043, -0.0151, -0.0249],
        [-0.0462, -0.0565, -0.0019,  ...,  0.0157, -0.0139, -0.0095],
        [ 0.0015, -0.0821, -0.0160,  ..., -0.0081, -0.0475,  0.0753]],
       requires_grad=True)
i : 0, loss : 20.07868194580078
Parameter containing:
tensor([[-0.0102, -0.0615, -0.0265,  ..., -0.0199, -0.0372, -0.0098],
        [-0.0117, -0.0600, -0.0323,  ..., -0.0168, -0.0401, -0.0107],
        [-0.0198, -0.0627, -0.0326,  ..., -0.0165, -0.0420, -0.0032],
        ...,
        [-0.0218, -0.0556, -0.0135,  ..., -0.0043, -0.0151, -0.0249],
        [-0.0462, -0.0565, -0.0019,  ...,  0.0157, -0.0139, -0.0095],
        [ 0.0015, -0.0821, -0.0160,  ..., -0.0081, -0.0475,  0.0753]],
       requires_grad=True)
i 

KeyboardInterrupt: 