In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
path = 'Constraint_Train.csv'

In [3]:
df = pd.read_csv(path)
df.head(3)

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake


In [4]:
len(df)

6420

In [5]:
df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
real,3360
fake,3060


Окей, классы примерно сбалансированы

In [6]:
from nltk.tokenize import TweetTokenizer
import nltk
tk = TweetTokenizer(preserve_case=False)

In [7]:
tweets = [tk.tokenize(tweet) for tweet in df.tweet]

# TopicModelling + LogisticRegression

## Make a topic modelling

In [8]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim import similarities

In [9]:
dictionary = Dictionary(tweets)
gensim_corpus = [dictionary.doc2bow(tweet) for tweet in tweets]

In [10]:
tf_idf = TfidfModel(corpus=gensim_corpus)
tf_idf_corpus = tf_idf[gensim_corpus]

In [11]:
from gensim.models import lsimodel

In [12]:
lsi_matrix = lsimodel.LsiModel(corpus=tf_idf_corpus, id2word=dictionary, num_topics=300)

topics = np.array([lsi_matrix[i] for i in gensim_corpus])

## Prepare data

In [13]:
X = topics[:, :, 1]
y = (df.label == 'real').astype('int')

## Work with model

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=12345)

In [16]:
lr = LogisticRegression(random_state=12345)
lr.fit(X_train, y_train)

yhat = lr.predict(X_test)

In [17]:
print(classification_report(y_test, yhat))

              precision    recall  f1-score   support

           0       0.92      0.93      0.93       895
           1       0.94      0.93      0.93      1031

    accuracy                           0.93      1926
   macro avg       0.93      0.93      0.93      1926
weighted avg       0.93      0.93      0.93      1926



# w2v + CNN

## Get text embeddings

In [18]:
from gensim.models import Word2Vec

In [19]:
%%time
word_vect_size = 300
tweet_model = Word2Vec(tweets, vector_size=word_vect_size, min_count=5, window=5, epochs=15)

CPU times: user 7.57 s, sys: 5.65 s, total: 13.2 s
Wall time: 2.46 s


In [20]:
tweet_model.wv.most_similar('wuhan')

[('china', 0.878233015537262),
 ('virology', 0.8698071837425232),
 ('chinese', 0.8117592334747314),
 ('military', 0.8057734966278076),
 ('originated', 0.7962672114372253),
 ('created', 0.7806059718132019),
 ('dead', 0.7690945267677307),
 ('são', 0.7648612260818481),
 ('institute', 0.7645329833030701),
 ('paulo', 0.7627395987510681)]

In [21]:
from collections import Counter

In [22]:
lens = Counter(len(tweet) for tweet in tweets)

In [23]:
def get_embeddings(tokens, max_len=100):
    result = []
    for i in range(max_len):
        if i < len(tokens):
            word = tokens[i]
            if word in tweet_model.wv:
                result.append(tweet_model.wv[word])
            else:
                result.append(np.zeros(word_vect_size))
        else:
            result.append(np.zeros(word_vect_size))
    return result

In [24]:
tweet_embeddings = np.array([get_embeddings(tweet) for tweet in tqdm(tweets)])

100%|██████████| 6420/6420 [00:01<00:00, 5270.97it/s]


In [25]:
tweet_embeddings.shape

(6420, 100, 300)

## Prepare data

In [26]:
import torch

In [27]:
X_train, X_test, y_train, y_test = train_test_split(tweet_embeddings, y, test_size=.3, random_state=12345)

In [28]:
in_data = torch.tensor(X_train).float()
targets = torch.tensor(y_train.values).float()

in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test.values).float()

## Build a model

In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [30]:
class my_CNN(nn.Module):
    def __init__(self, word_vect_size, batch_size):
        super(my_CNN, self).__init__()
        self.batch_size = batch_size
        self.conv2 = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=(2, word_vect_size))
        self.conv3 = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=(3, word_vect_size))
        self.conv4 = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=(4, word_vect_size))
        self.drop = nn.Dropout(.3)
        self.softmax = nn.Softmax(-1)
        self.out = nn.Linear(6, 1)
        # self.sigm = nn.Sigmoid()

    def pooling(self, x):
        pool = nn.MaxPool2d((x.shape[2], 1))(x).reshape((x.shape[0], 2))
        # res = torch.cat([pool[0], pool[1]])
        return pool

    def forward(self, x):
        # self.x = x
        x = x.reshape((x.shape[0], 1, x.shape[1], word_vect_size))
        x1 = self.pooling(F.relu(self.conv2(x)))
        x2 = self.pooling(F.relu(self.conv3(x)))
        x3 = self.pooling(F.relu(self.conv4(x)))
        concat_ = torch.cat((x1, x2, x3), dim=1)
        solved = self.out(self.softmax(self.drop(concat_)))

        return solved

net = my_CNN(300, 16)

In [31]:
print(net)

my_CNN(
  (conv2): Conv2d(1, 2, kernel_size=(2, 300), stride=(1, 1))
  (conv3): Conv2d(1, 2, kernel_size=(3, 300), stride=(1, 1))
  (conv4): Conv2d(1, 2, kernel_size=(4, 300), stride=(1, 1))
  (drop): Dropout(p=0.3, inplace=False)
  (softmax): Softmax(dim=-1)
  (out): Linear(in_features=6, out_features=1, bias=True)
)


## Define learning algorythm

In [32]:
optimizer = optim.NAdam(net.parameters(), lr=0.01)
criterion = nn.BCEWithLogitsLoss()

In [33]:
def train_epoch(model, X_train, y_train, batch_size=16, criterion=None, optimizer=None):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = X_train[i:i+batch_size]
        batch_y = y_train[i:i+batch_size]
        optimizer.zero_grad()
        output = model(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer.step()
    return loss

def train_model(model, in_data, targets, n_epochs=40, criterion=criterion, optimizer=optimizer):
    for ep in range(1, n_epochs+1):
        print(f'Training epoch {ep}')
        loss = train_epoch(model, in_data, targets, criterion=criterion, optimizer=optimizer)
        print('Loss function result:', loss.data.tolist())

def eval_model(model, in_data_test, targets_test):
    with torch.no_grad():
        output = model(in_data_test).squeeze(1)

    result = np.array((output > 0.5) == targets_test).astype(int)
    print(classification_report(targets_test, result))

## Train and eval model

In [34]:
train_model(net, in_data, targets)

Training epoch 1


100%|██████████| 281/281 [00:02<00:00, 134.82it/s]


Loss function result: 0.45328661799430847
Training epoch 2


100%|██████████| 281/281 [00:02<00:00, 123.45it/s]


Loss function result: 0.39669105410575867
Training epoch 3


100%|██████████| 281/281 [00:02<00:00, 123.96it/s]


Loss function result: 0.48157167434692383
Training epoch 4


100%|██████████| 281/281 [00:02<00:00, 125.03it/s]


Loss function result: 0.3018181324005127
Training epoch 5


100%|██████████| 281/281 [00:02<00:00, 129.49it/s]


Loss function result: 0.5316572189331055
Training epoch 6


100%|██████████| 281/281 [00:02<00:00, 132.88it/s]


Loss function result: 0.3860326409339905
Training epoch 7


100%|██████████| 281/281 [00:02<00:00, 128.54it/s]


Loss function result: 0.5292941927909851
Training epoch 8


100%|██████████| 281/281 [00:02<00:00, 123.10it/s]


Loss function result: 0.2994154989719391
Training epoch 9


100%|██████████| 281/281 [00:02<00:00, 133.35it/s]


Loss function result: 0.6257075071334839
Training epoch 10


100%|██████████| 281/281 [00:02<00:00, 123.42it/s]


Loss function result: 0.5919832587242126
Training epoch 11


100%|██████████| 281/281 [00:02<00:00, 123.91it/s]


Loss function result: 0.29603487253189087
Training epoch 12


100%|██████████| 281/281 [00:02<00:00, 109.81it/s]


Loss function result: 0.5589989423751831
Training epoch 13


100%|██████████| 281/281 [00:02<00:00, 109.21it/s]


Loss function result: 0.46713122725486755
Training epoch 14


100%|██████████| 281/281 [00:02<00:00, 109.25it/s]


Loss function result: 0.5273116230964661
Training epoch 15


100%|██████████| 281/281 [00:02<00:00, 129.62it/s]


Loss function result: 0.5859057307243347
Training epoch 16


100%|██████████| 281/281 [00:02<00:00, 135.52it/s]


Loss function result: 0.4303719997406006
Training epoch 17


100%|██████████| 281/281 [00:02<00:00, 133.83it/s]


Loss function result: 0.5293723940849304
Training epoch 18


100%|██████████| 281/281 [00:02<00:00, 129.22it/s]


Loss function result: 0.42361336946487427
Training epoch 19


100%|██████████| 281/281 [00:02<00:00, 122.45it/s]


Loss function result: 0.43287625908851624
Training epoch 20


100%|██████████| 281/281 [00:02<00:00, 136.11it/s]


Loss function result: 0.5594396591186523
Training epoch 21


100%|██████████| 281/281 [00:02<00:00, 137.84it/s]


Loss function result: 0.3284465968608856
Training epoch 22


100%|██████████| 281/281 [00:02<00:00, 136.64it/s]


Loss function result: 0.4505043625831604
Training epoch 23


100%|██████████| 281/281 [00:02<00:00, 134.93it/s]


Loss function result: 0.3835628628730774
Training epoch 24


100%|██████████| 281/281 [00:02<00:00, 126.26it/s]


Loss function result: 0.36022406816482544
Training epoch 25


100%|██████████| 281/281 [00:02<00:00, 115.88it/s]


Loss function result: 0.11595787107944489
Training epoch 26


100%|██████████| 281/281 [00:02<00:00, 134.90it/s]


Loss function result: 0.27766457200050354
Training epoch 27


100%|██████████| 281/281 [00:02<00:00, 137.54it/s]


Loss function result: 0.3457057476043701
Training epoch 28


100%|██████████| 281/281 [00:02<00:00, 135.21it/s]


Loss function result: 0.48455366492271423
Training epoch 29


100%|██████████| 281/281 [00:02<00:00, 135.05it/s]


Loss function result: 0.36052390933036804
Training epoch 30


100%|██████████| 281/281 [00:02<00:00, 122.00it/s]


Loss function result: 0.3704860508441925
Training epoch 31


100%|██████████| 281/281 [00:02<00:00, 132.31it/s]


Loss function result: 0.11072523891925812
Training epoch 32


100%|██████████| 281/281 [00:02<00:00, 133.31it/s]


Loss function result: 0.21962495148181915
Training epoch 33


100%|██████████| 281/281 [00:02<00:00, 131.59it/s]


Loss function result: 0.4505220353603363
Training epoch 34


100%|██████████| 281/281 [00:02<00:00, 128.01it/s]


Loss function result: 0.13483406603336334
Training epoch 35


100%|██████████| 281/281 [00:02<00:00, 131.87it/s]


Loss function result: 0.2658945620059967
Training epoch 36


100%|██████████| 281/281 [00:02<00:00, 130.63it/s]


Loss function result: 0.2964777946472168
Training epoch 37


100%|██████████| 281/281 [00:02<00:00, 133.19it/s]


Loss function result: 0.13124389946460724
Training epoch 38


100%|██████████| 281/281 [00:02<00:00, 137.93it/s]


Loss function result: 0.19545602798461914
Training epoch 39


100%|██████████| 281/281 [00:02<00:00, 137.10it/s]


Loss function result: 0.32014307379722595
Training epoch 40


100%|██████████| 281/281 [00:02<00:00, 136.40it/s]

Loss function result: 0.20660655200481415





In [35]:
eval_model(net, in_data_test, targets_test)

              precision    recall  f1-score   support

         0.0       0.42      0.13      0.19       895
         1.0       0.53      0.85      0.65      1031

    accuracy                           0.51      1926
   macro avg       0.48      0.49      0.42      1926
weighted avg       0.48      0.51      0.44      1926



# w2v + 2way LSTM

Lets leave out our last embeddings

In [36]:
X = tweet_embeddings
y = np.array((df.label == 'real')).astype(int)

In [37]:
X.shape[0] == y.shape[0]

True

## Prepare data

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=15)

In [39]:
in_data = torch.tensor(X_train).float()
targets = torch.tensor(y_train).float()

in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

## Build model

In [40]:
class seryozny_RNN(nn.Module):
    def __init__(self):
        super(seryozny_RNN, self).__init__()
        # self.lstm1 = nn.LSTM(300, 150, 2, dropout=.25)
        self.lstm2 = nn.LSTM(300, 150, bidirectional=True)

        self.pool = nn.MaxPool2d(kernel_size=(1, 300))

        self.dense1 = nn.Linear(100, 1)
        # self.dense2 = nn.Linear(50, 25)
        # self.dense3 = nn.Linear(25, 1)
        self.drop = nn.Dropout(.5)
        self.sigm = nn.Sigmoid()
        # self.res_solver = nn.Linear(100, 1)

    def forward(self, x):
        h0 = torch.randn(2, 100, 150)
        c0 = torch.randn(2, 100, 150)

        # embed, (h1, c1) = self.lstm1(x, (h0, c0))
        embed1, (h2, c2) = self.lstm2(x, (h0, c0))

        x_out = self.pool(embed1).reshape((x.shape[0], 100))

        x_out = self.dense1(x_out)
        # x_out = self.sigm(self.res_solver(x_out.reshape(16, 100)))

        return self.sigm(x_out)
        #

net2 = seryozny_RNN()

## Train and eval data

In [41]:
optimizer1 = optim.Adam(net2.parameters(), lr=0.01)
criterion1 = nn.BCEWithLogitsLoss()

In [42]:
train_model(net2, in_data, targets, n_epochs=40, criterion=criterion1, optimizer=optimizer1)

Training epoch 1


100%|██████████| 281/281 [00:04<00:00, 59.24it/s]


Loss function result: 0.5388667583465576
Training epoch 2


100%|██████████| 281/281 [00:04<00:00, 58.92it/s]


Loss function result: 0.5370286703109741
Training epoch 3


100%|██████████| 281/281 [00:04<00:00, 61.09it/s]


Loss function result: 0.5319088101387024
Training epoch 4


100%|██████████| 281/281 [00:04<00:00, 60.32it/s]


Loss function result: 0.5358630418777466
Training epoch 5


100%|██████████| 281/281 [00:04<00:00, 61.86it/s]


Loss function result: 0.5273175835609436
Training epoch 6


100%|██████████| 281/281 [00:04<00:00, 61.11it/s]


Loss function result: 0.5294984579086304
Training epoch 7


100%|██████████| 281/281 [00:04<00:00, 57.00it/s]


Loss function result: 0.5293110013008118
Training epoch 8


100%|██████████| 281/281 [00:05<00:00, 54.93it/s]


Loss function result: 0.5327244400978088
Training epoch 9


100%|██████████| 281/281 [00:05<00:00, 53.89it/s]


Loss function result: 0.5303908586502075
Training epoch 10


100%|██████████| 281/281 [00:05<00:00, 54.17it/s]


Loss function result: 0.5278099179267883
Training epoch 11


100%|██████████| 281/281 [00:05<00:00, 52.12it/s]


Loss function result: 0.5303411483764648
Training epoch 12


100%|██████████| 281/281 [00:05<00:00, 47.37it/s]


Loss function result: 0.5306784510612488
Training epoch 13


100%|██████████| 281/281 [00:05<00:00, 48.89it/s]


Loss function result: 0.5199688076972961
Training epoch 14


100%|██████████| 281/281 [00:05<00:00, 46.99it/s]


Loss function result: 0.5310918688774109
Training epoch 15


100%|██████████| 281/281 [00:06<00:00, 46.47it/s]


Loss function result: 0.5304362177848816
Training epoch 16


100%|██████████| 281/281 [00:06<00:00, 44.22it/s]


Loss function result: 0.5302203893661499
Training epoch 17


100%|██████████| 281/281 [00:06<00:00, 43.38it/s]


Loss function result: 0.5304835438728333
Training epoch 18


100%|██████████| 281/281 [00:06<00:00, 41.35it/s]


Loss function result: 0.5352168679237366
Training epoch 19


100%|██████████| 281/281 [00:06<00:00, 41.33it/s]


Loss function result: 0.530795156955719
Training epoch 20


100%|██████████| 281/281 [00:07<00:00, 39.35it/s]


Loss function result: 0.5315490365028381
Training epoch 21


100%|██████████| 281/281 [00:07<00:00, 39.07it/s]


Loss function result: 0.5505733489990234
Training epoch 22


100%|██████████| 281/281 [00:07<00:00, 36.82it/s]


Loss function result: 0.5360298752784729
Training epoch 23


100%|██████████| 281/281 [00:07<00:00, 36.77it/s]


Loss function result: 0.528559148311615
Training epoch 24


100%|██████████| 281/281 [00:07<00:00, 36.43it/s]


Loss function result: 0.5331501364707947
Training epoch 25


100%|██████████| 281/281 [00:07<00:00, 35.40it/s]


Loss function result: 0.5397530794143677
Training epoch 26


100%|██████████| 281/281 [00:07<00:00, 35.33it/s]


Loss function result: 0.5340901613235474
Training epoch 27


100%|██████████| 281/281 [00:08<00:00, 33.98it/s]


Loss function result: 0.5244712233543396
Training epoch 28


100%|██████████| 281/281 [00:08<00:00, 33.79it/s]


Loss function result: 0.5293012261390686
Training epoch 29


100%|██████████| 281/281 [00:08<00:00, 33.76it/s]


Loss function result: 0.5297016501426697
Training epoch 30


100%|██████████| 281/281 [00:08<00:00, 32.59it/s]


Loss function result: 0.5284325480461121
Training epoch 31


100%|██████████| 281/281 [00:08<00:00, 32.21it/s]


Loss function result: 0.5298836827278137
Training epoch 32


100%|██████████| 281/281 [00:08<00:00, 32.51it/s]


Loss function result: 0.5288249254226685
Training epoch 33


100%|██████████| 281/281 [00:09<00:00, 31.20it/s]


Loss function result: 0.5301772952079773
Training epoch 34


100%|██████████| 281/281 [00:08<00:00, 31.32it/s]


Loss function result: 0.5301278829574585
Training epoch 35


100%|██████████| 281/281 [00:08<00:00, 31.93it/s]


Loss function result: 0.5316230058670044
Training epoch 36


100%|██████████| 281/281 [00:08<00:00, 31.28it/s]


Loss function result: 0.5201858878135681
Training epoch 37


100%|██████████| 281/281 [00:09<00:00, 30.29it/s]


Loss function result: 0.5259977579116821
Training epoch 38


100%|██████████| 281/281 [00:09<00:00, 30.96it/s]


Loss function result: 0.5307155251502991
Training epoch 39


100%|██████████| 281/281 [00:09<00:00, 29.73it/s]


Loss function result: 0.5297655463218689
Training epoch 40


100%|██████████| 281/281 [00:09<00:00, 29.90it/s]

Loss function result: 0.5302874445915222





In [43]:
eval_model(net2, in_data_test, targets_test)

              precision    recall  f1-score   support

         0.0       0.35      0.08      0.12       910
         1.0       0.51      0.87      0.65      1016

    accuracy                           0.50      1926
   macro avg       0.43      0.47      0.39      1926
weighted avg       0.43      0.50      0.40      1926

