In [1]:
import pandas as pd
import numpy as np
import torch
import transformers as trf

In [2]:
import torch.nn as nn
import torch.nn.functional as F

In [3]:
import re

### Load DistilBERT model

In [4]:
model_class = trf.DistilBertModel
tokenizer_class = trf.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

In [5]:
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

In [6]:
tokenizer.encode('today is a good day')

[101, 2651, 2003, 1037, 2204, 2154, 102]

In [7]:
# Load the DistilBERT encoder from a local pickle of the whole module object.
# NOTE(review): 'distill_bert0.h5' looks like the output of the commented-out
# `torch.save(model, 'distill_bert0.h5')` cell further down, with `model` built by the
# commented-out from_pretrained line below. On a fresh checkout the file will not exist
# unless those two lines are run once first -- confirm.
# NOTE(review): torch.load unpickles arbitrary Python objects; only load trusted files.
#model = model_class.from_pretrained(pretrained_weights)
dBERT_model = torch.load('distill_bert0.h5')

In [8]:
dBERT_model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

In [None]:
# torch.save(model, 'distill_bert0.h5')

### Sanity check

In [23]:
try_x = torch.tensor(tokenizer.encode('today is a good day'))
try_x

tensor([ 101, 2651, 2003, 1037, 2204, 2154,  102])

In [24]:
try_x = try_x.reshape(1,-1)

In [25]:
try_x

tensor([[ 101, 2651, 2003, 1037, 2204, 2154,  102]])

In [27]:
dBERT_model(try_x)

BaseModelOutput(last_hidden_state=tensor([[[-0.0024, -0.1809,  0.1915,  ..., -0.0416,  0.4621,  0.0992],
         [-0.2186,  0.0181, -0.1486,  ..., -0.4835,  0.5390, -0.5484],
         [-0.4523, -0.1693,  0.4058,  ..., -0.5069,  0.2973,  0.3459],
         ...,
         [ 0.0654,  0.1925,  0.5155,  ..., -0.2969,  0.2309, -0.7778],
         [-0.1178, -0.1854,  0.1412,  ...,  0.0207,  0.1747, -0.9925],
         [ 0.8455,  0.1028, -0.3025,  ...,  0.0966, -0.4139, -0.3374]]],
       grad_fn=<NativeLayerNormBackward>), hidden_states=None, attentions=None)

### Load data + Build models

In [11]:
import pickle

In [12]:
# Load the five tweet collections (positive ... negative) used as training data.
# NOTE(review): pickle.load executes arbitrary code from the file; only use trusted files.
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


pos_data = _read_pickle('train_test_data_v2/pos_tweets.pkl')
semi_pos_data = _read_pickle('train_test_data_v2/semi_pos_tweets.pkl')
neu_data = _read_pickle('train_test_data_v2/neutral_augmented.pkl')
semi_neg_data = _read_pickle('train_test_data_v2/semi_neg_tweets.pkl')
neg_data = _read_pickle('train_test_data_v2/neg_tweets.pkl')

In [13]:
len(pos_data)

13410

In [14]:
def random_select(tweets, filter_ratio=1):
    """Randomly subsample `tweets`, keeping each item with probability 1/filter_ratio.

    One `np.random.randint(filter_ratio)` draw is made per item, in order, and the
    item is kept when the draw equals 0. With the default `filter_ratio=1` every
    item is kept. The input size and the output size are printed.

    Args:
        tweets: iterable with a length (e.g. a list of tweet strings).
        filter_ratio: positive integer; roughly one in `filter_ratio` items survives.

    Returns:
        A new list with the surviving items in their original order.
    """
    print(len(tweets))
    kept = [tweet for tweet in tweets if np.random.randint(filter_ratio) == 0]
    print(len(kept))
    return kept

# Keep roughly half of the positive tweets (each one survives when randint(2) == 0).
# NOTE(review): this overwrites pos_data in place, so re-running the cell subsamples the
# already-reduced list again, and no random seed is set in the visible cells. The label
# vector built later uses len(pos_data), so it must not change after embeddings are computed.
pos_data = random_select(pos_data, 2)

13410
6700


In [15]:
len(pos_data)

6700

In [16]:
print(len(semi_pos_data))
print(len(neu_data))
print(len(semi_neg_data))
print(len(neg_data))

5451
6849
4597
5542


In [17]:
def get_embeddings(model, sentence_list):
    """Encode each sentence separately and collect its first-token hidden state.

    Every sentence is tokenized with the notebook-level `tokenizer`, run through
    `model` as a batch of one under `torch.no_grad()`, and position 0 of
    `last_hidden_state` (the vector `make_prediction` calls `cls`) is kept.
    Progress is printed every 500 rows.

    Args:
        model: callable returning an object with a `last_hidden_state` tensor.
        sentence_list: integer-indexable sequence of strings.

    Returns:
        A list with one NumPy array of shape (1, hidden_size) per sentence.
    """
    n_rows = len(sentence_list)
    cls_rows = []
    for row in range(n_rows):
        if row % 500 == 0:
            print('On row %d' % row)
        token_ids = tokenizer.encode(sentence_list[row])
        batch = torch.tensor(token_ids).reshape(1, -1)
        with torch.no_grad():
            cls_vector = model(batch).last_hidden_state[0][0]
        cls_rows.append(np.array(cls_vector).reshape(1, -1))
    return cls_rows

In [18]:
# Concatenate the five classes in a fixed order (pos, semi-pos, neutral, semi-neg, neg)
# and strip URLs. The label vector `y` built later repeats each class label by these same
# list lengths in this same order, so the order here must not change.
# NOTE(review): remove_url is not defined in any cell visible in this notebook (only
# `import re` hints at it); on a fresh kernel this cell raises NameError -- restore its definition.
tweets_data = pos_data + semi_pos_data + neu_data + semi_neg_data + neg_data
tweets_data = [remove_url(tweet) for tweet in tweets_data]
print(len(tweets_data))

29139


In [20]:
tweets_data[1]

'Please wear a mask! Not because you are sick, but also protect yourself and others! #wearamask #COVID19  '

In [21]:
# Get sentence embeddings
emb_list = get_embeddings(dBERT_model, tweets_data)

On row 0
On row 500
On row 1000
On row 1500
On row 2000
On row 2500
On row 3000
On row 3500
On row 4000
On row 4500
On row 5000
On row 5500
On row 6000
On row 6500
On row 7000
On row 7500
On row 8000
On row 8500
On row 9000
On row 9500
On row 10000
On row 10500
On row 11000
On row 11500
On row 12000
On row 12500
On row 13000
On row 13500
On row 14000
On row 14500
On row 15000
On row 15500
On row 16000
On row 16500
On row 17000
On row 17500
On row 18000
On row 18500
On row 19000
On row 19500
On row 20000
On row 20500
On row 21000
On row 21500
On row 22000
On row 22500
On row 23000
On row 23500
On row 24000
On row 24500
On row 25000
On row 25500
On row 26000
On row 26500
On row 27000
On row 27500
On row 28000
On row 28500
On row 29000


In [22]:
embeddings = np.concatenate((emb_list),axis=0)
embeddings.shape

(29139, 768)

In [24]:
#with open('emb.npy','wb') as f:
#    np.save(f, embeddings)

In [6]:
#with open('tweet_emb.npy','rb') as f:
#    embeddings = np.load(f)

In [26]:
# Build embedding-to-score PyTorch model
class ScoreNet(nn.Module):
    """One linear layer (768 -> 1) followed by tanh, producing a bounded score."""

    def __init__(self):
        super().__init__()
        # The attribute name ends up in the state_dict keys, so it must stay `linear`.
        self.linear = nn.Linear(in_features=768, out_features=1)

    def forward(self, x):
        """Map inputs whose last dimension is 768 to tanh-squashed scores."""
        logits = self.linear(x)
        return logits.tanh()


class TwoLayerScoreNet(nn.Module):
    """Two-layer MLP (768 -> 128 -> 1) with ReLU in between and tanh on the output."""

    def __init__(self):
        super().__init__()
        # Attribute names are state_dict keys; creation order fixes the seeded init.
        self.linear1 = nn.Linear(in_features=768, out_features=128)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(in_features=128, out_features=1)

    def forward(self, x):
        """Map inputs whose last dimension is 768 to tanh-squashed scores."""
        hidden = self.relu(self.linear1(x))
        return self.linear2(hidden).tanh()

### Split train test data

In [27]:
shuffled_index = np.arange(len(embeddings))
np.random.shuffle(shuffled_index)
print(shuffled_index[:10])

[27528 28383 13349  8833 15029 10871 28427 17342 28805  2576]


In [28]:
split = int(0.8*len(embeddings))
train_index = shuffled_index[:split]
test_index = shuffled_index[split:]

print(train_index.shape)
print(test_index.shape)

(23311,)
(5828,)


In [29]:
X = embeddings
y = np.array(([1]*len(pos_data) + [0.5]*len(semi_pos_data) + [0]*len(neu_data) 
              + [-0.5]*len(semi_neg_data) + [-1]*len(neg_data)))

In [30]:
print(X.shape)
print(y.shape)

(29139, 768)
(29139,)


In [31]:
X_train, y_train, X_test, y_test = X[train_index], y[train_index], X[test_index], y[test_index]

In [32]:
X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train).reshape(-1,1)
X_test = torch.tensor(X_test)
y_test = torch.tensor(y_test).reshape(-1,1)

In [33]:
X_train.shape

torch.Size([23311, 768])

### Training

In [34]:
def train(model, epochs=5000, lr=1e-3, X=None, y=None):
    """Fit `model` with full-batch Adam on a summed squared-error loss.

    Each iteration runs the whole training set through the model, so one
    iteration is one epoch. The loss is printed about ten times over the run
    and 'DONE' is printed at the end. The model is updated in place.

    Args:
        model: torch module whose parameters are optimised.
        epochs: number of full-batch optimisation steps.
        lr: Adam learning rate.
        X: optional input tensor; defaults to the notebook-level `X_train`.
        y: optional target tensor; defaults to the notebook-level `y_train`.
            It should have the same shape as the model output, otherwise the
            subtraction broadcasts.

    Returns:
        None.
    """
    # Fall back to the notebook globals so existing train(model, epochs, lr) calls still work.
    inputs = X_train if X is None else X
    targets = y_train if y is None else y

    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

    # The original used int(epochs/10) as the modulus, which is 0 for epochs < 10
    # and raised ZeroDivisionError; clamp the reporting interval to at least 1.
    report_every = max(1, epochs // 10)

    for iteration in range(epochs):
        y_out = model(inputs)
        loss = (y_out - targets).pow(2).sum()

        if iteration % report_every == 0:
            print('On epoch %d: ' % iteration, 'Loss: %d' % loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print('DONE')

In [38]:
snet = ScoreNet()
train(snet, 5000, 2*1e-3)

On epoch 0:  Loss: 12534
On epoch 500:  Loss: 2903
On epoch 1000:  Loss: 2617
On epoch 1500:  Loss: 2488
On epoch 2000:  Loss: 2412
On epoch 2500:  Loss: 2363
On epoch 3000:  Loss: 2331
On epoch 3500:  Loss: 2310
On epoch 4000:  Loss: 2296
On epoch 4500:  Loss: 2287
DONE


In [40]:
train(snet, 5000, 2*1e-3)

On epoch 0:  Loss: 2290
On epoch 500:  Loss: 2279
On epoch 1000:  Loss: 2277
On epoch 1500:  Loss: 2274
On epoch 2000:  Loss: 2271
On epoch 2500:  Loss: 2268
On epoch 3000:  Loss: 2267
On epoch 3500:  Loss: 2265
On epoch 4000:  Loss: 2263
On epoch 4500:  Loss: 2262
DONE


In [None]:
### 

In [57]:
snet_v2 = TwoLayerScoreNet()
train(snet_v2, 1000, 1e-3)

On epoch 0:  Loss: 11710
On epoch 100:  Loss: 2993
On epoch 200:  Loss: 2245
On epoch 300:  Loss: 1968
On epoch 400:  Loss: 1803
On epoch 500:  Loss: 1701
On epoch 600:  Loss: 1591
On epoch 700:  Loss: 1504
On epoch 800:  Loss: 1432
On epoch 900:  Loss: 1391
DONE


In [59]:
train(snet_v2, 1000, 1e-4)

On epoch 0:  Loss: 1330
On epoch 100:  Loss: 1291
On epoch 200:  Loss: 1272
On epoch 300:  Loss: 1248
On epoch 400:  Loss: 1222
On epoch 500:  Loss: 1193
On epoch 600:  Loss: 1163
On epoch 700:  Loss: 1132
On epoch 800:  Loss: 1100
On epoch 900:  Loss: 1070
DONE


In [None]:
## alternatively, use sklearn linear regression

In [42]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge # linear classifier with L2 regularization

In [43]:
LRclassifier = LinearRegression()

In [44]:
LRclassifier.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [45]:
LRclassifier.score(X_train, y_train) # R^2 score of predicted and actual

0.7907537159600979

In [46]:
LRclassifier.score(X_test, y_test) # R^2 score of predicted and actual

0.7777609448987791

In [47]:
Ridge_classifier = Ridge(alpha=1) # alpha=1 gives the best result, compared to alpha=0.1 or 10

In [48]:
Ridge_classifier.fit(X_train, y_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)

In [49]:
Ridge_classifier.score(X_train, y_train)

0.7903840948098128

In [50]:
Ridge_classifier.score(X_test, y_test)

0.7786628584490942

### Evaluation

In [36]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score

In [41]:
# best
print('performance on training data: ', r2_score(y_train, snet(X_train).detach()))
print('performance on test data: ', r2_score(y_test, snet(X_test).detach()))

performance on training data:  0.8070095105173132
performance on test data:  0.795529087896512


In [60]:
print('performance on training data: ', r2_score(y_train, snet_v2(X_train).detach()))
print('performance on test data: ', r2_score(y_test, snet_v2(X_test).detach()))

performance on training data:  0.9111099758831454
performance on test data:  0.8503452651247398


In [51]:
print('performance on training data: ', r2_score(y_train, Ridge_classifier.predict(X_train)))
print('performance on test data: ', r2_score(y_test, Ridge_classifier.predict(X_test)))

performance on training data:  0.7903840948098129
performance on test data:  0.7786628584490942


In [52]:
print('performance on training data: ', r2_score(y_train, LRclassifier.predict(X_train)))
print('performance on test data: ', r2_score(y_test, LRclassifier.predict(X_test)))

performance on training data:  0.7907537159600979
performance on test data:  0.7777609448987791


In [61]:
# Flatten the (N, 1) score tensor from the two-layer model into a list of Python floats.
y_pred = snet_v2(X_test).detach().reshape(-1).tolist()
print(y_pred[:10])

[0.9287555813789368, 0.4493280351161957, -0.4090578556060791, -0.8656989932060242, 0.7675151824951172, 0.2942424714565277, 0.6786667108535767, 0.9472098350524902, -0.6245620250701904, -0.7595279216766357]


In [62]:
def score_to_class(scores):
    """Bucket continuous scores into five class ids.

    Mapping (checked top to bottom):
        score >  0.6  -> 0 (positive)
        score >  0.2  -> 1 (semi-positive)
        score >= -0.2 -> 2 (neutral)
        score >= -0.6 -> 3 (semi-negative)
        otherwise     -> 4 (negative)

    Args:
        scores: iterable of numbers.

    Returns:
        List of class ids, one per score, in input order.
    """
    def _bucket(score):
        if score > 0.6:
            return 0  # positive
        if score > 0.2:
            return 1  # semi-positive
        if score >= -0.2:
            return 2  # neutral
        if score >= -0.6:
            return 3  # semi-negative
        return 4  # negative

    return [_bucket(score) for score in scores]

In [63]:
def score_to_class_rough(scores, threshold=0.25):
    """Bucket continuous scores into three coarse class ids.

    Mapping:
        score >   threshold -> 0 (positive or semi-positive)
        score >= -threshold -> 1 (neutral)
        otherwise           -> 2 (negative or semi-negative)

    Args:
        scores: iterable of numbers.
        threshold: cutoff between neutral and the two polar classes.

    Returns:
        List of class ids, one per score, in input order.
    """
    def _bucket(score):
        if score > threshold:
            return 0  # positive or semi-positive
        if score >= -threshold:
            return 1  # neutral
        return 2  # negative or semi-negative

    return [_bucket(score) for score in scores]

In [70]:
cutoff = 0.25 # cutoff between positive and neutral (and between neutral and negative)

y_pred_class = score_to_class_rough(y_pred, cutoff)
y_test_class = score_to_class_rough([val.item() for val in list(y_test)], cutoff)

In [71]:
conf_mat = confusion_matrix(y_test_class, y_pred_class)
print(conf_mat, '\n')

# Correct predictions are the diagonal of the confusion matrix.
correct_count = conf_mat.trace()
print('accuracy: ', round(correct_count/len(y_test_class),4))

[[2323  155    5]
 [ 250  828  254]
 [   9  131 1873]] 

accuracy:  0.862


### Save model

In [72]:
for name, param in snet_v2.named_parameters():
    if param.requires_grad:
        print(name, param.data)

linear1.weight tensor([[-0.0725,  0.0076,  0.0362,  ..., -0.0614, -0.1010,  0.0308],
        [-0.1231, -0.0637, -0.0137,  ..., -0.0846,  0.0417, -0.0195],
        [-0.0375,  0.0009,  0.0017,  ..., -0.0264, -0.0401, -0.0260],
        ...,
        [-0.0559, -0.0550,  0.0466,  ..., -0.0745,  0.0075,  0.0189],
        [ 0.1420,  0.0347, -0.0662,  ..., -0.0181, -0.0549, -0.0399],
        [ 0.0042, -0.0030, -0.0250,  ...,  0.0006,  0.0322, -0.0294]])
linear1.bias tensor([-0.0107, -0.0102,  0.0274, -0.0259,  0.0266,  0.0050,  0.0221,  0.0081,
        -0.0262,  0.0292,  0.0154, -0.0187, -0.0189,  0.0166, -0.0121, -0.0321,
         0.0097,  0.0155,  0.0028, -0.0012, -0.0020,  0.0196,  0.0039,  0.0045,
        -0.0247,  0.0020,  0.0034,  0.0026, -0.0117,  0.0240,  0.0165,  0.0202,
        -0.0037,  0.0112,  0.0207,  0.0197,  0.0362, -0.0344,  0.0056,  0.0136,
         0.0304, -0.0191,  0.0186,  0.0129, -0.0015, -0.0084,  0.0190,  0.0174,
        -0.0161, -0.0137,  0.0194,  0.0312, -0.0264, -0.01

In [74]:
# save model
# torch.save(snet.state_dict(), 'OneLayerScoreModel.pth')
# torch.save(snet_v2.state_dict(), 'TwoLayerScoreModel.pth')

In [75]:
# test loading model
# test_model = ScoreNet()
# test_model.load_state_dict(torch.load('OneLayerScoreModel.pth'))
# test_model(X_train[0:1])

tensor([[-0.7261]], grad_fn=<TanhBackward>)

### Test on real, unseen tweets

In [84]:
def make_prediction(sentence, score_model):
    """Score a single raw sentence with the DistilBERT encoder plus a score head.

    The sentence is lower-cased, passed through `remove_url`, tokenized with the
    notebook-level `tokenizer` and encoded by the notebook-level `dBERT_model`.
    The first-token hidden state, reshaped to (1, 768), is fed to `score_model`.

    Args:
        sentence: raw text string.
        score_model: callable mapping a (1, 768) tensor to a single-element tensor.

    Returns:
        The predicted score as a Python float.
    """
    sentence = remove_url(sentence.lower())
    x = torch.tensor(tokenizer.encode(sentence)).reshape(1, -1)
    # Pure inference: run the score head inside no_grad as well, so no autograd graph
    # is built and kept alive for every prediction (it was previously outside the block).
    with torch.no_grad():
        cls = dBERT_model(x).last_hidden_state[0][0].reshape(1, 768)
        pred = score_model(cls)
    return pred.item()

In [203]:
def show_tweet_and_pred(df, classifier_model, num_tweets=10, show_index=False, show_label=False,
                        threshold=0.3):
    """Print randomly chosen tweets from `df` together with their predicted score.

    Rows are drawn with replacement via `np.random.randint`. Each tweet is scored
    with `make_prediction(tweet, classifier_model)`.

    Args:
        df: DataFrame with a 'tweet' column (and an 'index' column if `show_index`).
        classifier_model: score model forwarded to `make_prediction`.
        num_tweets: how many tweets to print.
        show_index: also print the row's 'index' column value before the tweet.
        show_label: print a text label next to the score: 'positive' above
            `threshold`, 'negative' below `-threshold`, 'neutral/irrelevant' otherwise.
        threshold: cutoff used for the text label (default 0.3, the previous fixed value).

    Returns:
        None.
    """
    for _ in range(num_tweets):
        # The random draw is a row position, so look it up positionally; label lookup
        # (df['tweet'][i]) fails when the frame does not have a default 0..n-1 index.
        rand_pos = np.random.randint(len(df))
        tweet = df['tweet'].iloc[rand_pos]

        if show_index:
            print(df['index'].iloc[rand_pos])

        print(tweet)
        score = make_prediction(tweet, classifier_model)
        if show_label:
            if score > threshold:
                category = 'positive'
            elif score < -threshold:
                category = 'negative'
            else:
                category = 'neutral/irrelevant'
            print('prediction: ', category, ' ', score, '\n')

        else:
            print('prediction: ', score,'\n')

In [86]:
df_sd = pd.read_csv('labeled v1/socialdistance_labeled.csv',usecols=[1,2,3])
df_sd.head()

Unnamed: 0,date,tweet,label
0,2020-02-16,How best can we protect ourselves from #COVID1...,2
1,2020-02-25,@anoush941 Plan to #SocialDistance for 4 weeks...,0
2,2020-02-25,#COVID19 Plan to #SocialDistance for 4 weeks o...,0
3,2020-02-27,All the info on the Coronavirus has mentioned ...,0
4,2020-03-06,Please listen and share. And wash your hands! ...,0


In [87]:
sd_uncertain = df_sd[df_sd['label']<=1]
sd_uncertain = sd_uncertain.reset_index()
sd_uncertain.head()

Unnamed: 0,index,date,tweet,label
0,1,2020-02-25,@anoush941 Plan to #SocialDistance for 4 weeks...,0
1,2,2020-02-25,#COVID19 Plan to #SocialDistance for 4 weeks o...,0
2,3,2020-02-27,All the info on the Coronavirus has mentioned ...,0
3,4,2020-03-06,Please listen and share. And wash your hands! ...,0
4,5,2020-03-07,@medicalaxioms Please spread hygiene measures ...,0


In [209]:
show_tweet_and_pred(sd_uncertain, snet, 10, True, True)

11799
There aren’t real people who think kids can social distance, right? Mono was going around the 10th grade this year, and I still saw them sharing drinks. Not to mention little kids like mine, that just lick each other for now reason. #SocialDistance #ShelterInPlace
prediction:  positive   0.3401376008987427 

9059
How Amazon Has Continued to Exploit Workers During the Pandemic  https://t.co/I4inSAEesh If #coronavirusfear was gone would you still protest or would you accept the risk and quietly ask for changes in system? Bad timing! #doctors&amp;nurses can't #socialdistance from patients.
prediction:  neutral/irrelevant   0.21334707736968994 

6997
@b_base haha sometimes it's funny to see how people post their opinion on critical topics like an expert at Facebook. #SocialDistance yourself from them, stay safe :)
prediction:  neutral/irrelevant   -0.1244615912437439 

13177
07 - Orange and Gold Face Mask  https://t.co/U3W7RMffqj via @zazzle #orange #gold #checkers #abstract #drawing

In [212]:
show_tweet_and_pred(sd_uncertain, snet_v2, 10, True, True)

11331
#GreenMask instead #N95  from #India 🇮🇳 #Masks4AllChallenge  #SocialDistance  #COVID19  https://t.co/0MLAh9nVGt
prediction:  neutral/irrelevant   0.08014827221632004 

26023
Hi Americans, please take a good look of the map below. Instant self-gratification will harm you &amp; others. American adults for once, grow up so you'll have holidays to celebrate w/ your children in 2021. #WearAMask #SocialDistance #StayAtHome Stop #Covid_19
prediction:  positive   0.8073064684867859 

17205
Stay SAFE !!  Stay FRESH‼️ #TheBarbersLounge  #Cam  #Bril  #Father  #Son  #SocialDistance  #Covid19  #GenerationalGrooming @ The Barbers Lounge  https://t.co/mRrj0rKu34
prediction:  positive   0.5199558138847351 

13681
 https://t.co/kOPnhklDlS #Killer #Viruses🦠 can #Kill⚰️#COVID19 ☣️ has #Killed #Celebrities, #Politicians, Reg #Persons, #Rich &amp; #Poor, #Famous or #Unfamous #People. #Wear #NanoFiber #Masks 😷 #FaceShields 🥽 &amp; #Gloves 🧤 Keep your #SocialDistance 👣 #Protect #Family &amp; #Friends 👨

In [89]:
df_other = pd.read_csv('labeled v1/negative_labeled.csv', usecols=[1,2,3])
df_other.head()

Unnamed: 0,date,tweet,label
0,2020-01-06,@HoustonTexuhz @ReemBoi25 Hey @ReemBoi25 make ...,0
1,2020-01-08,@EricCHenry_ @speedboy_te75 Can’t w8 to see U ...,0
2,2020-01-10,HELLO FLAMIN EVERYBODY! #RoadtoPartner #NO...,0
3,2020-01-14,Free Hot Cheeto Bags for Everybody! Anybody wa...,0
4,2020-01-22,SUPER SMASH BROS ULTIMATE! LETS XXTRA FLAMIN G...,0


In [90]:
other_uncertain = df_other[df_other['label']<=1].reset_index()
other_uncertain.head()

Unnamed: 0,index,date,tweet,label
0,0,2020-01-06,@HoustonTexuhz @ReemBoi25 Hey @ReemBoi25 make ...,0
1,1,2020-01-08,@EricCHenry_ @speedboy_te75 Can’t w8 to see U ...,0
2,2,2020-01-10,HELLO FLAMIN EVERYBODY! #RoadtoPartner #NO...,0
3,3,2020-01-14,Free Hot Cheeto Bags for Everybody! Anybody wa...,0
4,4,2020-01-22,SUPER SMASH BROS ULTIMATE! LETS XXTRA FLAMIN G...,0


In [214]:
show_tweet_and_pred(other_uncertain, snet_v2, 10, True, True)

30138
@TheSolariReport "COVID is a mild, self-limiting disease 97% of the time unless you're a kid-- then you can't get it at all." - Dr. Tim O'Shea   https://t.co/JP9gveYthk  #COVID19 #SCAMdemic
prediction:  negative   -0.9551659822463989 

2801
#gop #democrats #republicans #nomask #redstates  This is why people who had to actually work that shit show are now infected.  I wonder if all the VIP participants have health insurance thru the government.  Socialism for the privilege.
prediction:  negative   -0.5231501460075378 

13426
one year ago more or less #CertificateOfVaccinationID #Controlavirus #BillAndMelindaGatesFoundation #Plandemic #WHO #IMF #Rockefeller #OrderFollowers #MindControl #Mindfulness #Awake #alternativeHEALING #ScientificDictatorship #Fear #GlobalEconomicCollapse  https://t.co/ziTcx1Vqek
prediction:  neutral/irrelevant   -0.01744580641388893 

10808
Listen Up! #Plandemic @realDonaldTrump  https://t.co/ZeilS8pyJj
prediction:  neutral/irrelevant   -0.12470266222953796 