In [1]:
from transformers import ElectraModel
from transformers import ElectraTokenizer
import pandas as pd
import torch
import torch.nn as nn
import numpy as np

from pan20 import fake
from torch.nn import DataParallel
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [2]:
### Load dataset

# Tweets come from the project loader; the ground truth is turned into an
# author -> label lookup and attached to every row of the frame.
df = fake.load_data()
truth = dict((record['author'], record['label']) for record in fake.load_truth())
df['label'] = df['author'].apply(lambda author: truth[author])

In [3]:
# Load the pretrained tokenizer for the 'google/electra-base-discriminator'
# checkpoint (the same checkpoint name the model cell uses).
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

In [4]:
# encoded_data = tokenizer.batch_encode_plus(df['tweet'], pad_to_max_length=True, add_special_tokens=True)

In [5]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [6]:
# Load the pretrained ELECTRA-base discriminator and place it on `device`.
model = ElectraModel.from_pretrained('google/electra-base-discriminator').to(device)
#model = DataParallel(model)
# Switch to inference mode (disables dropout); as the last expression of the
# cell this also echoes the module structure.
model.eval()

ElectraModel(
  (embeddings): ElectraEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
    

In [None]:
# with torch.no_grad():

#     input_ids = torch.tensor(encoded_data["input_ids"])
#     outputs = model(input_ids)
#     last_hidden_states = outputs[0]
#     sentencevec = last_hidden_states[:,0,:]

In [7]:
from torch.utils.data import Dataset

class PanDataset(Dataset):
    """Map-style dataset yielding ``(token_ids, label)`` tensor pairs.

    Every row of ``data`` is unpacked as ``(text, label)``, so the frame must
    hold exactly those two columns, in that order. The text is split with
    ``tokenizer`` and wrapped in ``[CLS]`` / ``[SEP]`` markers before being
    converted to ids.
    """

    def __init__(self, data, tokenizer):
        """Keep references to the frame and the tokenizer; nothing is precomputed."""
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the token-id tensor and the label tensor for row ``idx``."""
        row = self.data.iloc[idx, :].values
        text, label = row
        label_tensor = torch.tensor(label)

        # [CLS] + word pieces + [SEP], then mapped to vocabulary ids.
        pieces = ["[CLS]", *self.tokenizer.tokenize(text), "[SEP]"]
        token_ids = self.tokenizer.convert_tokens_to_ids(pieces)

        return torch.tensor(token_ids), label_tensor

In [8]:
def collate_fn(batch):
    """Collate ``(token_ids, label)`` samples into padded batch tensors.

    Returns a tuple ``(tokens_tensors, masks_tensors, label_ids)``:
    token ids right-padded with 0 to the longest sequence in the batch, a
    long-typed mask that is 1 wherever the token id is non-zero, and the
    stacked labels (``None`` when the first sample carries no label).
    """
    sequences = [sample[0] for sample in batch]

    # Labels are optional; only the first sample is inspected to decide.
    has_labels = batch[0][1] is not None
    label_ids = torch.stack([sample[1] for sample in batch]) if has_labels else None

    tokens_tensors = pad_sequence(sequences, batch_first=True)

    # Mask marks every position whose id differs from the padding value 0.
    masks_tensors = torch.zeros(tokens_tensors.shape, dtype=torch.long).masked_fill(
        tokens_tensors != 0, 1
    )

    return tokens_tensors, masks_tensors, label_ids

In [9]:
# Remove the 'author' column before handing the frame to PanDataset, which
# unpacks every row as (text, label).
# `columns=` replaces the positional axis argument of `drop('author', 1)`,
# which newer pandas releases no longer accept.
df_test = df.drop(columns='author')

In [10]:
# Wrap the author-less frame; rows are tokenized lazily on item access.
dataset = PanDataset(data=df_test, tokenizer=tokenizer)

In [11]:
# One tweet per batch, in the original row order (no shuffling), so the
# extracted vectors line up with the rows of the frame.
testloader = DataLoader(
    dataset=dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn,
)

In [12]:
# Accumulator for the per-tweet sentence vectors filled by the encoding loop.
u_vectors = list()

In [13]:
# Encode every tweet and keep the hidden state at sequence position 0
# (the position of the "[CLS]" token added by PanDataset).
# The accumulator is reset here so that re-running this cell does not append
# a second copy of every vector.
u_vectors = []

with torch.no_grad():  # inference only, no autograd bookkeeping needed
    for batch in testloader:
        tokens_tensors, masks_tensors = batch[:2]
        outputs = model(input_ids=tokens_tensors.to(device),
                        token_type_ids=None,
                        attention_mask=masks_tensors.to(device))

        # outputs[0] is indexed [batch, position, hidden]; take position 0 of
        # every sequence and store one 1-D array per tweet, so the layout of
        # u_vectors is the same whatever batch size the loader uses.
        cls_vectors = outputs[0][:, 0, :].cpu().numpy()
        u_vectors.extend(cls_vectors)

In [14]:
# Sanity check: number of sentence vectors collected by the encoding loop.
len(u_vectors)

30000

In [43]:
# Debugging check left in the notebook: u_vectors is a plain Python list
# (of NumPy arrays), not a single array.
type(u_vectors)

list

In [15]:
# Author-level features: element-wise sum over each run of consecutive tweets.
# Assumes the rows are ordered so that every author owns one contiguous run of
# TWEETS_PER_AUTHOR tweets -- TODO confirm against the loader.
# The range is derived from len(u_vectors) instead of a hard-coded 30000 so a
# different dataset size neither drops vectors nor sums over empty slices.
TWEETS_PER_AUTHOR = 100

sum_vectors = [
    np.sum(u_vectors[start:start + TWEETS_PER_AUTHOR], axis=0)
    for start in range(0, len(u_vectors), TWEETS_PER_AUTHOR)
]

In [16]:
# Author-level features: element-wise mean over each run of consecutive tweets.
# Assumes the rows are ordered so that every author owns one contiguous run of
# TWEETS_PER_AUTHOR tweets -- TODO confirm against the loader.
# The range is derived from len(u_vectors) instead of a hard-coded 30000 so a
# different dataset size neither drops vectors nor averages empty slices.
TWEETS_PER_AUTHOR = 100

avg_vectors = [
    np.mean(u_vectors[start:start + TWEETS_PER_AUTHOR], axis=0)
    for start in range(0, len(u_vectors), TWEETS_PER_AUTHOR)
]

In [20]:
# Spot check: author id stored at index label 100 of the frame.
df.author[100]

'8vp74g6kssomu1a6akix6y3hqy6552t7'

In [84]:
## get original information

# One row per author: take every TWEETS_PER_AUTHOR-th row *by position*, the
# same positional order in which the sentence vectors were produced, and keep
# only the author id and its label.
# Assumes each author owns one contiguous run of TWEETS_PER_AUTHOR rows --
# TODO confirm against the loader.
TWEETS_PER_AUTHOR = 100

new_df = (
    df.iloc[::TWEETS_PER_AUTHOR][['author', 'label']]
    .reset_index(drop=True)
)

In [18]:
# Sanity check: number of author-level rows.
len(new_df)

300

In [19]:
# Class balance of the author-level labels.
new_df['label'].value_counts()

1    150
0    150
Name: label, dtype: int64

In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

In [55]:
from sklearn.linear_model import LogisticRegression

# 5-fold cross-validated logistic regression on both author-level feature
# sets; prints the per-fold scores followed by their mean.
for heading, features in (("Vectors by sum:", sum_vectors),
                          ("Vectors by avg:", avg_vectors)):
    clf = LogisticRegression()
    scores = cross_val_score(clf, features, new_df['label'], cv=5)
    print(heading, scores)
    print(heading, np.mean(scores))



Vectors by sum: [0.6        0.63333333 0.61666667 0.63333333 0.55      ]
Vectors by sum: 0.6066666666666667
Vectors by avg: [0.68333333 0.7        0.68333333 0.56666667 0.65      ]
Vectors by avg: 0.6566666666666665




In [89]:
# Predicted probability of class 1 (column 1 of `proba`) for every author.
# Iterates over `proba` itself instead of a hard-coded range(300), so the
# length always matches the number of predictions.
# NOTE(review): `proba` is created by the SVM cell further down the notebook;
# on a fresh kernel that cell has to run before this one.
is_spreader = [row[1] for row in proba]

In [90]:
# New frame (new_df itself is left untouched) with the class-1 probability
# attached as a 'probability' column.
new_df_all = new_df.assign(probability=is_spreader)

In [91]:
# Hard prediction per author derived from the probabilities: class 0 only when
# its probability is strictly larger than class 1's; ties go to class 1.
# Iterates over `proba` itself instead of a hard-coded range(300).
pred_s = [0 if row[1] < row[0] else 1 for row in proba]

In [92]:
# Accuracy of the probability-derived predictions: the list is compared
# element-wise with the label column and the share of matches is returned.
# NOTE(review): this can differ slightly from the cross_val_score accuracy of
# the same SVM, presumably because probability estimates and hard predictions
# are produced differently -- confirm.
np.mean(pred_s == new_df_all['label'])

0.6566666666666666

In [22]:
from sklearn.naive_bayes import GaussianNB

# 5-fold cross-validated Gaussian naive Bayes on both author-level feature
# sets; prints the per-fold scores followed by their mean.
clf = GaussianNB()
s_scores = cross_val_score(clf, sum_vectors, new_df['label'], cv=5)
a_scores = cross_val_score(clf, avg_vectors, new_df['label'], cv=5)

for heading, fold_scores in (("Vectors by sum:", s_scores),
                             ("Vectors by avg:", a_scores)):
    print(heading, fold_scores)
    print(heading, np.mean(fold_scores))

Vectors by sum: [0.68333333 0.65       0.58333333 0.55       0.58333333]
Vectors by sum: 0.6100000000000001
Vectors by avg: [0.68333333 0.65       0.58333333 0.55       0.58333333]
Vectors by avg: 0.6100000000000001


In [77]:
from sklearn import svm

# Linear SVM on the summed vectors: per-fold accuracy and its mean.
svc = svm.SVC(C=0.9, kernel='linear')
s_scores = cross_val_score(svc, sum_vectors, new_df['label'], cv=5)
print("Vectors by sum:", s_scores)
print("Vectors by sum:", np.mean(s_scores))

# Sigmoid-kernel SVM on the averaged vectors.
# - probability=True makes the estimator fit an additional, randomised
#   calibration step; random_state pins it so `proba` is reproducible.
# - the former degree=1 argument was dropped: it only applies to the 'poly'
#   kernel and is ignored by 'sigmoid'.
svc = svm.SVC(C=10000, kernel='sigmoid', gamma=0.00009,
              probability=True, random_state=0)
a_scores = cross_val_score(svc, avg_vectors, new_df['label'], cv=5)
# Out-of-fold class probabilities, one row per author: [P(class 0), P(class 1)].
proba = cross_val_predict(svc, avg_vectors, new_df['label'], cv=5, method='predict_proba')
print("Vectors by avg:", a_scores)
print("Vectors by avg:", np.mean(a_scores))

Vectors by sum: [0.6        0.65       0.61666667 0.53333333 0.53333333]
Vectors by sum: 0.5866666666666667
Vectors by avg: [0.68333333 0.68333333 0.75       0.55       0.63333333]
Vectors by avg: 0.66


In [93]:
# Write the author-level frame (including the 'probability' column) to CSV
# without the index; the file name embeds the SVM cross-validation accuracy
# recorded in this notebook (0.66).
new_df_all.to_csv('electra_base_svm_0.66.csv', index=False)