In [1]:
from transformers import BertModel
from transformers import BertTokenizer
import pandas as pd
import torch
import torch.nn as nn
import numpy as np

from pan20 import fake
from torch.nn import DataParallel
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [2]:
### Load dataset

df = fake.load_data()
truth = {x['author']: x['label'] for x in fake.load_truth()}
df['label'] = df.author.apply(lambda x: truth[x])

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

In [4]:
# encoded_data = tokenizer.batch_encode_plus(df['tweet'], pad_to_max_length=True, add_special_tokens=True)

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
model= BertModel.from_pretrained('bert-large-uncased')
#model = DataParallel(model)
model = model.to(device)
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [None]:
# with torch.no_grad():

#     input_ids = torch.tensor(encoded_data["input_ids"])
#     outputs = model(input_ids)
#     last_hidden_states = outputs[0]
#     sentencevec = last_hidden_states[:,0,:]

In [6]:
from torch.utils.data import Dataset

class PanDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):

        text, label = self.data.iloc[idx, :].values
        label_tensors = torch.tensor(label)
        
        word_pieces = ["[CLS]"]
        tokens = self.tokenizer.tokenize(text)
        word_pieces += tokens + ["[SEP]"]
        
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensors = torch.tensor(ids)
        
        return tokens_tensors, label_tensors

In [7]:
def collate_fn(batch):
    
    tokens_tensors = [data[0] for data in batch]
    if batch[0][1] is not None:
        label_ids = torch.stack([label[1] for label in batch])
    else:
        label_ids = None

    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
        
    masks_tensors = torch.zeros(tokens_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)
    
    return tokens_tensors, masks_tensors, label_ids

In [8]:
df_test = df.drop('author', 1)

In [9]:
dataset = PanDataset(df_test, tokenizer)

In [10]:
testloader = DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

In [11]:
u_vectors = []

In [12]:
with torch.no_grad():
    for data in testloader:
        tokens_tensors, masks_tensors = data[:2]
        tokens_tensors = tokens_tensors.to(device)
        masks_tensors = masks_tensors.to(device)
        outputs = model(input_ids=tokens_tensors,
                        token_type_ids=None,
                        attention_mask=masks_tensors)
        
        u_vectors.append(outputs[0][:,0,:].cpu().squeeze().numpy())

        del tokens_tensors
        del masks_tensors

In [13]:
len(u_vectors)

30000

In [15]:
type(u_vectors)

list

In [14]:
sum_vectors = []

for i in range(0, 30000, 100):
    #sum_vectors.append(list(map(sum, zip(*u_vectors[i:i+100]))))
    sum_vectors.append(np.sum(u_vectors[i:i+100], axis=0))

In [15]:
avg_vectors = []

for i in range(0, 30000, 100):
    #sum_vectors.append(list(map(sum, zip(*u_vectors[i:i+100]))))
    avg_vectors.append(np.average(u_vectors[i:i+100], axis=0))

In [18]:
df.author[100]

'8vp74g6kssomu1a6akix6y3hqy6552t7'

In [16]:
## get original information

new_df = {
    'author': [],
    'label': []}

for i in range(0, 30000, 100):
    new_df['author'].append(df.author[i])
    new_df['label'].append(df.label[i])


new_df = pd.DataFrame(new_df)

In [16]:
new_df['label'].value_counts()

1    150
0    150
Name: label, dtype: int64

In [17]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

In [23]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
scores = cross_val_score(clf, sum_vectors, new_df['label'], cv=5)
print("Vectors by sum:", scores)
print("Vectors by sum:", np.mean(scores))

clf = LogisticRegression()
scores = cross_val_score(clf, avg_vectors, new_df['label'], cv=5)
proba = cross_val_predict(clf, avg_vectors, new_df['label'], cv=5, method='predict_proba')
print("Vectors by avg:", scores)
print("Vectors by avg:", np.mean(scores))



Vectors by sum: [0.71666667 0.7        0.71666667 0.63333333 0.7       ]
Vectors by sum: 0.6933333333333334




Vectors by avg: [0.71666667 0.66666667 0.76666667 0.66666667 0.66666667]
Vectors by avg: 0.6966666666666665




In [104]:
is_spreader = []
for i in range(300):
    is_spreader.append(proba[i][1])

In [105]:
new_df_all = new_df.assign(probability=is_spreader)

In [106]:
pred_s = []
for i in range(300):
    if proba[i][0]>proba[i][1]:
        pred_s.append(0)
    else:
        pred_s.append(1)

In [107]:
np.mean(pred_s == new_df_all['label'])

0.7

In [24]:
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
s_scores = cross_val_score(clf, sum_vectors, new_df['label'], cv=5)
print("Vectors by sum:", s_scores)
print("Vectors by sum:", np.mean(s_scores))

clf = GaussianNB()
a_scores = cross_val_score(clf, avg_vectors, new_df['label'], cv=5)
print("Vectors by avg:", a_scores)
print("Vectors by avg:", np.mean(a_scores))

Vectors by sum: [0.73333333 0.65       0.68333333 0.6        0.65      ]
Vectors by sum: 0.6633333333333333
Vectors by avg: [0.73333333 0.65       0.68333333 0.6        0.65      ]
Vectors by avg: 0.6633333333333333


In [24]:
from sklearn import svm

svc = svm.SVC(C=0.9, kernel='linear')
s_scores = cross_val_score(svc, sum_vectors, new_df['label'], cv=5)
print("Vectors by sum:", s_scores)
print("Vectors by sum:", np.mean(s_scores))

svc = svm.SVC(C=1000, degree=1, kernel='sigmoid', gamma=0.0009, probability=True)
#svc = svm.SVC(C=1000, degree=100, kernel='rbf', gamma=0.0009, probability=True)
a_scores = cross_val_score(svc, avg_vectors, new_df['label'], cv=5)
proba = cross_val_predict(svc, avg_vectors, new_df['label'], cv=5, method='predict_proba')
print("Vectors by avg:", a_scores)
print("Vectors by avg:", np.mean(a_scores))

Vectors by sum: [0.7        0.7        0.73333333 0.66666667 0.7       ]
Vectors by sum: 0.7
Vectors by avg: [0.71666667 0.68333333 0.73333333 0.7        0.68333333]
Vectors by avg: 0.7033333333333334


In [108]:
new_df_all.to_csv('bert_large_svm_0.7.csv', index=False)

In [48]:
new_df_all.to_csv('bert_encoded.csv', index=False)

In [92]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200, max_depth=5)
s_scores = cross_val_score(rf, sum_vectors, new_df['label'], cv=5)
print("Vectors by sum:", s_scores)
print("Vectors by sum:", np.mean(s_scores))

rf = RandomForestClassifier(n_estimators=200, max_depth=5)
a_scores = cross_val_score(rf, avg_vectors, new_df['label'], cv=5)
print("Vectors by avg:", a_scores)
print("Vectors by avg:", np.mean(a_scores))

Vectors by sum: [0.68333333 0.68333333 0.75       0.66666667 0.66666667]
Vectors by sum: 0.69
Vectors by avg: [0.68333333 0.68333333 0.7        0.7        0.6       ]
Vectors by avg: 0.6733333333333333


In [19]:
## input all data for training

from sklearn import svm

svc = svm.SVC(C=1000, degree=1, kernel='sigmoid', gamma=0.0009, probability=True)
#svc = svm.SVC(C=1000, degree=100, kernel='rbf', gamma=0.0009, probability=True)
svc.fit(avg_vectors, new_df['label'])

SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=1, gamma=0.0009, kernel='sigmoid',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [20]:
np.mean(new_df['label']== svc.predict(avg_vectors))

0.89

In [21]:
import joblib

joblib.dump(svc, 'pan20/fake/bert-large-sigmoid.model')

['pan20/fake/bert-large-sigmoid.model']

In [25]:
proba

array([[0.29617567, 0.70382433],
       [0.5       , 0.5       ],
       [0.57803379, 0.42196621],
       [0.58956482, 0.41043518],
       [0.31739315, 0.68260685],
       [0.39742677, 0.60257323],
       [0.40111351, 0.59888649],
       [0.54975952, 0.45024048],
       [0.58789156, 0.41210844],
       [0.62249247, 0.37750753],
       [0.48036478, 0.51963522],
       [0.50686205, 0.49313795],
       [0.68985105, 0.31014895],
       [0.72996014, 0.27003986],
       [0.58573696, 0.41426304],
       [0.53677975, 0.46322025],
       [0.25408683, 0.74591317],
       [0.47761078, 0.52238922],
       [0.38404175, 0.61595825],
       [0.2666539 , 0.7333461 ],
       [0.49039304, 0.50960696],
       [0.73768677, 0.26231323],
       [0.30264598, 0.69735402],
       [0.78917744, 0.21082256],
       [0.68784775, 0.31215225],
       [0.89308839, 0.10691161],
       [0.19344285, 0.80655715],
       [0.3244257 , 0.6755743 ],
       [0.46457124, 0.53542876],
       [0.56614084, 0.43385916],
       [0.