import libraries

In [29]:
import pandas as pd
import numpy as np
import zipfile
import re
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

unzip and load csv files

In [30]:
path_to_zip = '/content/drive/My Drive/data/public_data.zip'

In [31]:
with zipfile.ZipFile(path_to_zip) as myzip:
    myzip.extractall()

In [32]:
path_to_csv = '/content/train.csv'

In [33]:
train_df = pd.read_csv(path_to_csv, sep='\t')

calculate number of each classes

In [7]:
print('true: ', sum(train_df.iloc[:, 1] == 'true'), 'false: ',sum(train_df.iloc[:, 1] != 'true'))

true:  19919 false:  3964


create new balanced dataset from old

In [34]:
train_df_0 = train_df.loc[train_df.iloc[:, 1] == 'true', :].sample(3964)
train_df_1 = train_df.loc[train_df.iloc[:, 1] == 'fake', :]
train_balanced = pd.concat([train_df_0, train_df_1]).iloc[:, :]
train_df, valid_df = train_test_split(train_balanced, test_size=0.2, random_state=42)


train/valid splitting

In [8]:
x_train, x_test = train_test_split(train_df, test_size=0.2, random_state = 1)

load multilingual bert from https://github.com/UKPLab/sentence-transformers

In [35]:
!pip install -U sentence-transformers

Requirement already up-to-date: sentence-transformers in /usr/local/lib/python3.6/dist-packages (0.3.6)


In [36]:
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('distiluse-base-multilingual-cased')

drop rows with nan 

In [70]:
train_df = train_df.dropna()
valid_df = valid_df.dropna()

define dataset and dataloader

In [39]:
class Text_Dataset(object):
    """An abstract class representing a Dataset.
    All other datasets should subclass it. All subclasses should override
    ``__len__``, that provides the size of the dataset, and ``__getitem__``,
    supporting integer indexing in range from 0 to len(self) exclusive.
    """
    def __init__(self, df):
      self.data = df
    
    def __getitem__(self, index):
      text = self.data.iloc[index, 0]
      label = int(self.data.iloc[index, 1] == 'true')
      return text, label

    def __len__(self):
      return self.data.shape[0]

In [105]:
train_dataset = Text_Dataset(train_df)
test_dataset = Text_Dataset(valid_df)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

define model bert + 2 layer softmax classifier

In [42]:
class bert_classifier(torch.nn.Module):
    def __init__(self, D_in, H, D_out, embedder):
        super(bert_classifier, self).__init__()
        self.embedder = embedder
        self.linear1 = torch.nn.Linear(in_features=D_in, out_features = H)
        self.linear2 = torch.nn.Linear(in_features=H, out_features=D_out)

    def forward(self, x):
        emb = torch.Tensor(self.embedder.encode(x))
        h_relu = F.relu(self.linear1(emb))
        y_pred = self.linear2(h_relu)
        return y_pred

initialise classifier and optimizer

In [None]:
N, D_in, H, D_out = 16, 512, 256, 2

# Construct our model by instantiating the class defined above
classifier = bert_classifier(D_in, H, D_out, embedder)
classifier = classifier.to(device)

optimizer = torch.optim.Adam([
                {'params': classifier.linear1.parameters()},
                {'params': classifier.linear2.parameters()},
                {'params': list(classifier.embedder.parameters()), 'lr': 5e-5}], lr=1e-3)
n_epoch = 16
device = 'cpu'

trainig

In [116]:
loss_hist = []
acc_hist = []

for _epoch in range(n_epoch):
    classifier.train()
    loss_accum = 0
    acc_accum = 0
    for _i_step, batch in enumerate(train_loader):
        x = batch[0]
        y = torch.tensor(batch[1])

        optimizer.zero_grad()

        predictions = classifier(x)
        loss = nn.functional.cross_entropy(predictions.cpu(), y)
        loss.backward()
        optimizer.step()

        loss_accum += loss.item()
        _, preds = torch.max(predictions, 1)
        acc = sum(y == preds).double() / y.shape[0]
        acc_accum += acc.item()
    acc_accum_mean = acc_accum / (_i_step + 1)
    loss_accum_mean = loss_accum / (_i_step + 1)
    loss_hist.append(loss_accum_mean)
    acc_hist.append(acc_accum_mean)

    print('Epoch: ', _epoch+1) 
    print('Loss: ', loss_accum_mean, 'acc: ', acc_accum_mean)


  # Remove the CWD from sys.path while we load stuff.


Epoch:  1
Loss:  0.018103461047389337 acc:  0.9988979848866498
Epoch:  2
Loss:  0.01251998608728099 acc:  1.0


save model

In [121]:
torch.save(classifier, '/content/drive/My Drive/classifier_final.pth')

eval model on valid data

In [118]:
    classifier.eval()
    preds_all = torch.Tensor().long().to(device)
    labels_all = torch.Tensor().long().to(device)
    with torch.no_grad():
        for _i_step, batch in enumerate(test_loader):
            x = batch[0]
            y = torch.tensor(batch[1])

            probs = classifier(x)
            _, preds = torch.max(probs, 1)

            labels_all = torch.cat((labels_all, y), 0)
            preds_all = torch.cat((preds_all, preds.long()), 0)

    acc = sum(labels_all == preds_all).double() / preds_all.shape[0]


  import sys


In [120]:
f1_score(labels_all.numpy(), preds_all.numpy(), average='macro')

0.9287433894462541

make prediction

In [124]:
path_to_csv = '/content/test.csv'
test_df = pd.read_csv(path_to_csv, sep='\t')
test_score_dataset = Text_Dataset(test_df)
test_score_dataloader = DataLoader(test_score_dataset, batch_size=16, shuffle=True)
classifier.eval()
preds_all = torch.Tensor().long().to(device)
labels_all = torch.Tensor().long().to(device)
with torch.no_grad():
    for _i_step, batch in enumerate(test_score_dataloader):
        x = batch[0]
        y = torch.tensor(batch[1])

        probs = classifier(x)
        _, preds = torch.max(probs, 1)

        preds_all = torch.cat((preds_all, preds.long()), 0)


  # This is added back by InteractiveShellApp.init_path()


In [144]:
for i in range(test_df.shape[0]):
    if preds_all[i] == 0:
        test_df.iloc[i, 1] = 'fake'
    else:
         test_df.iloc[i, 1] = 'true'

In [147]:
test_df.to_csv('result.csv')