# Preparation

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split

from torch.utils.data import TensorDataset, DataLoader
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import lr_scheduler

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [4]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [16]:
pheme = pd.read_csv('./data/_PHEME_text_AVGw2v.csv').drop(['token'],axis=1)
pheme_y = pd.read_csv('./data/_PHEME_target.csv').target
pheme_sparse = pd.read_csv('./data/_PHEME_sparse.csv')
pheme_event = pd.read_csv('./data/_PHEME_text.csv').Event

ext = pd.read_csv('./data/_PHEMEext_text_AVGw2v.csv').drop(['token'],axis=1)
ext_y = pd.read_csv('./data/_PHEMEext_text.csv').target
ext_sparse = pd.read_csv('./data/_PHEMEext_sparse.csv')
ext_event = pd.read_csv('./data/_PHEMEext_text.csv').Event

rhi = pd.read_csv('./data/_RHI_text_AVGw2v.csv').drop(['token'],axis=1)
rhi_y = pd.read_csv('./data/_RHI_target.csv')

In [17]:
pheme_sparse.verified = pheme_sparse.verified.replace({True: 1, False: 0}) 
ext_sparse.verified = ext_sparse.verified.replace({True: 1, False: 0}) 
pheme_sparse.has_question = pheme_sparse.has_question.replace({True: 1, False: 0}) 
ext_sparse.has_question = ext_sparse.has_question.replace({True: 1, False: 0}) 
pheme_sparse.has_exclaim = pheme_sparse.has_exclaim.replace({True: 1, False: 0}) 
ext_sparse.has_exclaim = ext_sparse.has_exclaim.replace({True: 1, False: 0}) 
pheme_sparse.has_period = pheme_sparse.has_period.replace({True: 1, False: 0}) 
ext_sparse.has_period = ext_sparse.has_period.replace({True: 1, False: 0}) 

In [24]:
pheme_sparse.shape[1]

27

# Torch

## BERTCLF 노트북을 참고한 방법 - Train

In [31]:
train_inputs = torch.tensor(pheme_sparse.values).float()
val_inputs = torch.tensor(ext_sparse.values).float()

# Convert other data types to torch.Tensor
train_labels = torch.tensor(pheme_y.values).float()
val_labels = torch.tensor(ext_y.values).float()

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [32]:
numFeature = train_inputs.size()[1]
print("Length of the Inputs are:\nTrain\t%s\nVal\t%s" %(train_inputs.size(), val_inputs.size()))

Length of the Inputs are:
Train	torch.Size([5802, 27])
Val	torch.Size([485, 27])


In [56]:
import random

class SparseMLPClassifier(torch.nn.Module):
  def __init__(self, freeze_model=False, numFeature=numFeature):
    """
    In the constructor we construct three nn.Linear instances that we will use
    in the forward pass.
    """
    super(SparseMLPClassifier, self).__init__()

    D_in, H, D_out = numFeature, 50, 1

    self.classifier = nn.Sequential(
        nn.Linear(D_in, H),
        # nn.BatchNorm1d(H),
        nn.ReLU(),
        # nn.Dropout(0.2),
        nn.Linear(H, D_out),
        # nn.ReLU(),
        # nn.Dropout(0.2),
        nn.Sigmoid()
    )

    if freeze_model:
        for param in self.classifier.parameters():
            param.requires_grad = False

  def forward(self, input):
    result = self.classifier(input)
    return result

In [57]:
# from torch.nn.optim import Adam

def initialize_model(epochs=300):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    model = SparseMLPClassifier(numFeature=numFeature)

    # Tell PyTorch to run the model on GPU
    model.to(device)

    # Create the optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.1, weight_decay=0.1)

    criterion = nn.BCELoss()
    # loss = nn.CrossEntropyLoss()

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    # scheduler = get_linear_schedule_with_warmup(optimizer,
    #                                             num_warmup_steps=0, # Default value
    #                                             num_training_steps=total_steps)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=0.1, gamma= 0.99)       

    return model, optimizer, criterion, scheduler
    # return bert_classifier, optimizer, scheduler


In [68]:
import random
import time

def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    val_loss_list = []
    val_acc_list = []
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        # print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            b_input, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input)

            # Compute loss and accumulate the loss values
            # torch.argmax(logits(i)) for logit, i in enumerate(logits)
            
            logits = logits.flatten()
            loss = criterion(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 100 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                # print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            val_loss_list.append(val_loss)
            val_acc_list.append(val_accuracy)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            # print(val_loss_list)
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
    # print(val_loss_list)
    # print(np.range(1,epochs+1))
    fig = plt.figure(figsize=(18,10))
    plt.plot(np.arange(0,epochs),val_loss_list)
    fig2 = plt.figure(figsize=(18,10))
    plt.plot(np.arange(0,epochs),val_acc_list)
    print("Training complete!")

def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input, b_labels = tuple(t.to(device) for t in batch)
        # print("\n\ninputs are:\n",b_input)
        # print("\n\nlabels are:\n",b_labels)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input)
            print("\n\nLogits are:",logits)
            # logits = logits.flatten()

        # Compute loss
        loss = criterion(logits, b_labels.unsqueeze(0))
        # loss = criterion(logits.flatten(), b_labels)
        print("\n\Losses are:",loss)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1)

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

import torch.nn.functional as F


In [59]:
set_seed(42)    # Set seed for reproducibility
model, optimizer, criterion, scheduler = initialize_model(epochs=100)
criterion = nn.BCELoss()
# criterion = nn.BCEWithLogitsLoss()
# criterion = nn.CrossEntropyLoss()

In [60]:
ext_sparse.head(3)
# pheme_sparse

Unnamed: 0,URLcount,Noun,Verb,Adjective,Pronoun,FirstPersonPronoun,SecondPersonPronoun,ThirdPersonPronoun,Adverb,Numeral,...,word_count,has_question,has_exclaim,has_period,capital_ratio,tweet_count,listed_count,friends_count,follow_ratio,verified
0,1,2,2,1,0,0,0,0,0,0,...,8,0,0,1,0.101449,15.31189,148,652,14.414223,0
1,0,9,6,3,0,2,0,0,3,0,...,25,0,0,1,0.027027,8.991522,1623,176,18.897987,1
2,0,7,4,1,1,0,0,1,0,0,...,20,0,0,1,0.02521,16.344851,2165,144,17.825921,0


In [69]:
train(model, train_dataloader, val_dataloader, epochs=1, evaluation=True)

Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------


Logits are: tensor([[0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [1.6081e-16],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00]])


ValueError: Using a target size (torch.Size([1, 32])) that is different to the input size (torch.Size([32, 1])) is deprecated. Please ensure they have the same size.

In [None]:
torch.save(model.state_dict(), './MLP_Sparse_CLF_ord.pt')

## Test

In [None]:
PATH = './MLP_W2V_to_Sigmoid_ord.pt'
# bn_state_dict = torch.load('./BERT_raw_to_fine_tune_ord.pt')
# bert_classifier.load_state_dict(bn_state_dict)
model.load_state_dict(torch.load(PATH))

In [370]:
from sklearn.metrics import accuracy_score, roc_curve, auc

def evaluate_roc(probs, y_true):
    """
    - Print AUC and accuracy on the test set
    - Plot ROC
    @params    probs (np.array): an array of predicted probabilities with shape (len(y_true), 2)
    @params    y_true (np.array): an array of the true values with shape (len(y_true),)
    """
    preds = probs[:, 1]
    fpr, tpr, threshold = roc_curve(y_true, preds)
    roc_auc = auc(fpr, tpr)
    print(f'AUC: {roc_auc:.4f}')
       
    # Get accuracy over the test set
    y_pred = np.where(preds >= 0.5, 1, 0)
    accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy: {accuracy*100:.2f}%')
    
    # Plot ROC AUC
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

In [388]:
import torch.nn.functional as F

def model_predict(model, test_dataloader):
    """Perform a forward pass on the trained BERT model to predict probabilities
    on the test set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    all_logits = []

    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to GPU
        b_input,b_labels = tuple(t.to(device) for t in batch)
        # print(b_input)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input)
        all_logits.append(logits)
    
    # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)

    # Apply softmax to calculate probabilities
    probs = F.softmax(all_logits, dim=1).cpu()

    return probs

In [425]:
# Compute predicted probabilities on the test set
probs = model_predict(model, val_dataloader)
probs = probs.numpy()
# print(probs)
# print(type(probs))

# # Evaluate the Bert classifier
# print(len(probs))
target = torch.tensor(ext_y)
target = target.unsqueeze(dim=1).numpy()
# print(target)
# evaluate_roc(probs, target)

IndexError: index 1 is out of bounds for axis 1 with size 1

In [None]:
model.eval()
correct = 0
total = 0
test_loss = 0
outputs_list = []
y_list = []

with torch.no_grad():
    val_loss = 0

    for i, data in enumerate(test_loader):
        x, y = data
        x, y = x.float(), y.long()
        outputs = task1_model(x)
        loss = criterion(outputs, y.unsqueeze(1).float())
        _, predicted = torch.max(outputs.data, 1)
        outputs_list.append(predicted[:])
        total += y.size(0)
        correct += (predicted == y).double().sum().item()
        val_loss += loss.item()
        y_list.append(y)

print('Accuracy of the test dataset is: %d %%' % (100 * correct / total))
print("Loss of validation set: {:.5f}".format((val_loss / test_size)))
acc = (100 * correct / total)

## adf

In [None]:
task1_model.eval()
correct = 0
total = 0
test_loss = 0
outputs_list = []
y_list = []

with torch.no_grad():
    val_loss = 0

    for i, data in enumerate(task1_test_dataloader):
        x, y = data
        x, y = x.float(), y.long()
        outputs = task1_model(x)
        loss = criterion(outputs, y.unsqueeze(1).float())
        _, predicted = torch.max(outputs.data, 1)
        outputs_list.append(predicted[:])
        total += y.size(0)
        correct += (predicted == y).double().sum().item()
        val_loss += loss.item()
        y_list.append(y)

print('Accuracy of the test dataset is: %d %%' % (100 * correct / total))
print("Loss of validation set: {:.5f}".format((val_loss / test_size)))
acc = (100 * correct / total)

In [39]:
print(task1_model.parameters)

<bound method Module.parameters of FC_net(
  (fc1): Linear(in_features=792, out_features=130, bias=True)
  (fc2): Linear(in_features=130, out_features=60, bias=True)
  (fc3): Linear(in_features=60, out_features=1, bias=True)
  (drop_2): Dropout(p=0.2, inplace=False)
)>


In [40]:
import torch.optim as optim

criterion = nn.BCELoss()
# optimizer = optim.SGD(task1_model.parameters(), lr=0.001, momentum=0.9)
optimizer = optim.Adam(task1_model.parameters(), lr=0.01)

In [41]:
train_loss = []
train_accuracy = []

prev_loss = 10
PATH = "./state_dict_BERT_fc.pt"
best_acc = 10.0
num_epochs = 10

val_corrects_list = []
val_loss_list = []

In [42]:
for epoch in range(num_epochs):  # loop over the dataset multiple times
    print('Epoch {}/{}'.format(epoch, num_epochs - 1))
    print('-' * 10)

    running_loss = 0.0
    running_corrects = 0
    task1_model.train()  # Set model to training mode
    for i, data in enumerate(task1_train_dataloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        # inputs, labels = inputs.float(), labels.long()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = task1_model(inputs)

        labels = labels.unsqueeze(1).float()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()    

        running_loss += loss.item()
        running_corrects += torch.sum(outputs == labels.data)
        # print(running_corrects)

    epoch_loss = running_loss / train_size
    epoch_acc = running_corrects.double() / train_size
    train_loss.append(epoch_loss)
    train_accuracy.append(epoch_acc)

    print('Train) Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))

    # if epoch_loss < best_acc:
    #     # print("prev_loss: {:.5f}".format(prev_loss))
    #     # print("loss: {:.5f}".format(loss))
    #     print("Saving the best model w/ loss {:.4f}".format(epoch_loss))
    #     torch.save(task1_model.state_dict(),PATH)
    #     best_acc = epoch_loss

Epoch 0/9
----------
Train) Loss: 0.3190 Acc: 0.0042
Epoch 1/9
----------
Train) Loss: 5.4705 Acc: 0.6325
Epoch 2/9
----------
Train) Loss: 5.7149 Acc: 0.6568
Epoch 3/9
----------
Train) Loss: 5.7213 Acc: 0.6568
Epoch 4/9
----------
Train) Loss: 5.7149 Acc: 0.6568
Epoch 5/9
----------
Train) Loss: 5.7213 Acc: 0.6568
Epoch 6/9
----------
Train) Loss: 5.7149 Acc: 0.6568
Epoch 7/9
----------
Train) Loss: 5.7149 Acc: 0.6568
Epoch 8/9
----------
Train) Loss: 5.7213 Acc: 0.6568
Epoch 9/9
----------
Train) Loss: 5.7213 Acc: 0.6568


In [43]:
task1_model.eval()
correct = 0
total = 0
test_loss = 0
outputs_list = []
y_list = []

with torch.no_grad():
    val_loss = 0

    for i, data in enumerate(task1_test_dataloader):
        x, y = data
        x, y = x.float(), y.long()
        outputs = task1_model(x)
        loss = criterion(outputs, y.unsqueeze(1).float())
        _, predicted = torch.max(outputs.data, 1)
        outputs_list.append(predicted[:])
        total += y.size(0)
        correct += (predicted == y).double().sum().item()
        val_loss += loss.item()
        y_list.append(y)

print('Accuracy of the test dataset is: %d %%' % (100 * correct / total))
print("Loss of validation set: {:.5f}".format((val_loss / test_size)))
acc = (100 * correct / total)

Accuracy of the test dataset is: 68 %
Loss of validation set: 5.20883
