### On top of the Embeddedings gained from Bertweet, I added simple Neural network to 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split

from torch.utils.data import TensorDataset, DataLoader
import torchvision
import torchvision.transforms as transforms

In [3]:
train_avg = pd.read_csv('./train_avg.csv')
test_avg = pd.read_csv('./test_avg.csv')

train_doc = pd.read_csv('./train_doc.csv')
test_doc = pd.read_csv('./test_doc.csv')

valid_avg = pd.read_csv('./valid_avg.csv')
valid_doc = pd.read_csv('./valid_doc.csv')

df_bertweet = pd.read_csv('./df_bertweet.csv')
df_valid_bertweet = pd.read_csv('./df_valid_bertweet.csv')

In [4]:
for dataset in [train_avg, test_avg, train_doc, test_doc, valid_avg, valid_doc, df_bertweet, df_valid_bertweet]:
    dataset.drop(['text','text_token','urls', 'urls_expanded','user_url'], axis=1, inplace=True)

### Data with BERTweet Embedding consists of 797 Dimensions: which are 767 Embeddings and 30 additional features

### Other Dataset consists of 328 dimensions

In [5]:
print("Shape of the Train and test data of Averaged Word2Vec: {}/{}".format(train_avg.shape, test_avg.shape))
print("Shape of the Train and test data of Doc2vec: {}/{}".format(train_doc.shape, test_doc.shape))
print("\nShape of the validation data of Avg: {}".format(valid_avg.shape))
print("Shape of the validation data of Doc2vec: {}".format(valid_doc.shape))
print("\nShape of the data w/ BERTweet: {}".format(df_bertweet.shape))
print("\nShape of the validation data w/ BERTweet: {}".format(df_valid_bertweet.shape))

Shape of the Train and test data of Averaged Word2Vec: (4912, 329)/(890, 329)
Shape of the Train and test data of Doc2vec: (4912, 329)/(890, 329)

Shape of the validation data of Avg: (328, 329)
Shape of the validation data of Doc2vec: (328, 329)

Shape of the data w/ BERTweet: (5802, 797)

Shape of the validation data w/ BERTweet: (390, 797)


In [7]:
# Dropping Target values from the dataset
train_y = train_avg.isRumor
test_y = test_avg.isRumor
valid_y = valid_avg.isRumor
df_bertweet_y = df_bertweet.isRumor
df_valid_bertweet_y = df_valid_bertweet.isRumor
for dataset in [train_avg, test_avg, train_doc, test_doc, valid_avg, valid_doc, df_bertweet, df_valid_bertweet]:
    dataset.drop(['isRumor'], axis=1, inplace=True)

In [8]:
print(train_y.shape, test_y.shape, valid_y.shape, df_bertweet_y.shape, df_valid_bertweet_y.shape)

(4912,) (890,) (328,) (5802,) (390,)


In [28]:
# Base dataset are the baseline feature set to be inputted to the model
# Here, 4 features are dropped for their lack of predictive power

train_avg_base = train_avg.drop(['hasURL', 'hasUserURL', 'isNotOnlyText', 'char_count'],axis=1)
test_avg_base = test_avg.drop(['hasURL', 'hasUserURL', 'isNotOnlyText', 'char_count'],axis=1)
valid_avg_base = valid_avg.drop(['hasURL', 'hasUserURL', 'isNotOnlyText', 'char_count'],axis=1)
train_doc_base = train_doc.drop(['hasURL', 'hasUserURL', 'isNotOnlyText', 'char_count'],axis=1)
test_doc_base = test_doc.drop(['hasURL', 'hasUserURL', 'isNotOnlyText', 'char_count'],axis=1)
valid_doc_base = valid_doc.drop(['hasURL', 'hasUserURL', 'isNotOnlyText', 'char_count'],axis=1)
bertweet_base = df_bertweet.drop(['hasURL', 'hasUserURL', 'isNotOnlyText', 'char_count'],axis=1)
bertweet_valid_base = df_valid_bertweet.drop(['hasURL', 'hasUserURL', 'isNotOnlyText', 'char_count'],axis=1)

In [29]:
bertweet_base

Unnamed: 0,Noun,Verb,Adjective,Pronoun,FirstPersonPronoun,SecondPersonPronoun,ThirdPersonPronoun,Adverb,Numeral,Conjunction_inj,...,758,759,760,761,762,763,764,765,766,767
0,6,3,0,0,0,0,0,0,0,2,...,0.080576,-0.049174,0.014364,0.135755,0.120841,-0.019499,-0.092559,0.037895,-0.026119,-0.037888
1,2,1,2,0,0,0,0,0,0,1,...,0.065754,-0.008325,0.039999,0.267778,0.049828,0.036786,-0.076806,0.090258,-0.006316,0.135195
2,3,4,8,0,0,0,0,1,0,2,...,0.037640,-0.017665,0.026855,0.044829,0.037694,-0.000555,-0.025542,0.087989,0.146719,0.105573
3,5,5,0,0,0,0,0,0,0,0,...,-0.013915,-0.057492,0.044381,0.163093,-0.009258,0.032880,-0.051174,0.089870,0.100986,-0.036360
4,7,2,0,0,0,0,0,0,0,2,...,-0.001629,-0.001269,0.042457,0.149599,0.114457,0.021848,-0.135230,0.047658,0.151301,-0.004796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5797,2,2,1,1,0,1,0,0,0,2,...,0.029064,0.093296,0.016626,0.213296,-0.138641,0.002770,0.038687,-0.118342,0.048587,-0.007331
5798,5,2,1,1,1,0,0,0,0,2,...,-0.060785,0.063776,0.005694,0.109999,0.124272,0.051577,0.152288,0.192712,-0.036597,0.031372
5799,4,3,2,1,0,0,1,0,0,1,...,0.101352,0.036456,0.062657,0.156872,0.106964,0.037552,-0.016714,0.061652,0.222777,-0.101658
5800,7,3,2,0,0,0,0,0,0,3,...,-0.019152,-0.143544,0.130950,0.113090,0.227164,-0.087948,-0.073185,-0.042964,0.112651,-0.094290


In [30]:
# 위의 데이터들을 torchTensor로 변환한뒤 Unsqueeze한다.
# 이후 TensorDataset를 생성한다. (X, y 값을 담은 텐서들을 인자로 넘겨줌)

tensor_x = torch.Tensor(np.array(bertweet_base))
tensor_y = torch.Tensor(np.array(df_bertweet_y))

tensor_x = tensor_x.unsqueeze(1)
tensor_y = tensor_y.unsqueeze(1)

task1_dataset = TensorDataset(tensor_x,tensor_y)

In [31]:
# 훈련 데이터 / 테스트 데이터를 torch.utils.data.random_split()를 통해서 나눠준다
input_len = len(task1_dataset)
test_ratio = 0.1
test_size = int(input_len * test_ratio)
train_size = input_len - test_size

print("Length of the Inputs are: ",input_len, train_size, test_size)

train_data, test_data = torch.utils.data.random_split(task1_dataset, (train_size, test_size))
print(len(train_data), len(test_data))

Length of the Inputs are:  5802 5222 580
5222 580


In [32]:
# 위에서 생성한 훈련/테스트 데이터를 각각 DataLoader를 호출해 데이터 로더를 생성한다.
# 참고로 이 코드에서는 task1_dataset -> tensor_x/y -> train_avg_base/train_y를 사용하고 있다.
task1_train_dataloader = DataLoader(train_data, batch_size=6, shuffle=True, num_workers=2)
task1_test_dataloader = DataLoader(test_data, batch_size=6, shuffle=True, num_workers=2)

In [38]:
# FC_net을 생성 -> 
class FC_net(nn.Module):
    def __init__(self):
        super(FC_net, self).__init__() # 1*20
        self.fc1 = nn.Linear(792, 130) # 420
        self.fc2 = nn.Linear(130, 60)
        self.fc3 = nn.Linear(60, 1)

        self.drop_2 = nn.Dropout(0.2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x
        
task1_model = FC_net()

In [39]:
print(task1_model.parameters)

<bound method Module.parameters of FC_net(
  (fc1): Linear(in_features=792, out_features=130, bias=True)
  (fc2): Linear(in_features=130, out_features=60, bias=True)
  (fc3): Linear(in_features=60, out_features=1, bias=True)
  (drop_2): Dropout(p=0.2, inplace=False)
)>


In [40]:
import torch.optim as optim

criterion = nn.BCELoss()
# optimizer = optim.SGD(task1_model.parameters(), lr=0.001, momentum=0.9)
optimizer = optim.Adam(task1_model.parameters(), lr=0.01)

In [41]:
train_loss = []
train_accuracy = []

prev_loss = 10
PATH = "./state_dict_BERT_fc.pt"
best_acc = 10.0
num_epochs = 10

val_corrects_list = []
val_loss_list = []

In [42]:
for epoch in range(num_epochs):  # loop over the dataset multiple times
    print('Epoch {}/{}'.format(epoch, num_epochs - 1))
    print('-' * 10)

    running_loss = 0.0
    running_corrects = 0
    task1_model.train()  # Set model to training mode
    for i, data in enumerate(task1_train_dataloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        # inputs, labels = inputs.float(), labels.long()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = task1_model(inputs)

        labels = labels.unsqueeze(1).float()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()    

        running_loss += loss.item()
        running_corrects += torch.sum(outputs == labels.data)
        # print(running_corrects)

    epoch_loss = running_loss / train_size
    epoch_acc = running_corrects.double() / train_size
    train_loss.append(epoch_loss)
    train_accuracy.append(epoch_acc)

    print('Train) Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))

    # if epoch_loss < best_acc:
    #     # print("prev_loss: {:.5f}".format(prev_loss))
    #     # print("loss: {:.5f}".format(loss))
    #     print("Saving the best model w/ loss {:.4f}".format(epoch_loss))
    #     torch.save(task1_model.state_dict(),PATH)
    #     best_acc = epoch_loss

Epoch 0/9
----------
Train) Loss: 0.3190 Acc: 0.0042
Epoch 1/9
----------
Train) Loss: 5.4705 Acc: 0.6325
Epoch 2/9
----------
Train) Loss: 5.7149 Acc: 0.6568
Epoch 3/9
----------
Train) Loss: 5.7213 Acc: 0.6568
Epoch 4/9
----------
Train) Loss: 5.7149 Acc: 0.6568
Epoch 5/9
----------
Train) Loss: 5.7213 Acc: 0.6568
Epoch 6/9
----------
Train) Loss: 5.7149 Acc: 0.6568
Epoch 7/9
----------
Train) Loss: 5.7149 Acc: 0.6568
Epoch 8/9
----------
Train) Loss: 5.7213 Acc: 0.6568
Epoch 9/9
----------
Train) Loss: 5.7213 Acc: 0.6568


In [43]:
task1_model.eval()
correct = 0
total = 0
test_loss = 0
outputs_list = []
y_list = []

with torch.no_grad():
    val_loss = 0

    for i, data in enumerate(task1_test_dataloader):
        x, y = data
        x, y = x.float(), y.long()
        outputs = task1_model(x)
        loss = criterion(outputs, y.unsqueeze(1).float())
        _, predicted = torch.max(outputs.data, 1)
        outputs_list.append(predicted[:])
        total += y.size(0)
        correct += (predicted == y).double().sum().item()
        val_loss += loss.item()
        y_list.append(y)

print('Accuracy of the test dataset is: %d %%' % (100 * correct / total))
print("Loss of validation set: {:.5f}".format((val_loss / test_size)))
acc = (100 * correct / total)

Accuracy of the test dataset is: 68 %
Loss of validation set: 5.20883
