In [None]:
# importing required libraries

import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
import nltk
# WordNet data is required by the WordNetLemmatizer created further down.
nltk.download('wordnet')
# Stop-word lists are read by clean_text through stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Load the tweet dataset; later cells read its `text` column.
# header=0 takes the column names from the first row of the CSV.

df = pd.read_csv('covid19_tweets.csv',header=0) 


In [None]:
# Load the COVID-19 fact statements (WHO) from the Excel workbook and preview
# the first rows. Later cells read the `Facts` column as the reference text.

df_facts = pd.read_excel('Covid-19_Facts.xlsx')
df_facts.head()

Unnamed: 0,Organisation,Facts
0,WHO,Vitamin¬†and mineral supplements cannot cure CO...
1,WHO,Studies show hydroxychloroquine does not have ...
2,WHO,Is dexamethasone a treatment for all COVID-19 ...
3,WHO,People should NOT wear masks while exercising
4,WHO,Water or swimming does not transmit the COVID-...


In [None]:
# Create the WordNet lemmatizer instance that clean_text uses.
# Nothing is lemmatised in this cell; the work happens when clean_text is applied.

from nltk.stem import WordNetLemmatizer
wordnet=WordNetLemmatizer()

In [None]:
# Cleaning the data to remove unnecessary links and emoticons

# Patterns are raw strings (no invalid escape sequences) compiled once rather
# than on every call.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
# Other schemes (ftp://...) and truncated link remnants such as "https".
_LEFTOVER_LINK_RE = re.compile(r'\w+://\S+|\bhttp\S*', re.IGNORECASE)
_WHITESPACE_RE = re.compile(r'\s+')
_BRACKETED_RE = re.compile(r'\[.*?\]')
_TAG_RE = re.compile(r'<.*?>+')
_MENTION_OR_SYMBOL_RE = re.compile(r'@[A-Za-z0-9]+|[^0-9A-Za-z \t]')
_RETWEET_RE = re.compile(r'^\s*rt\b', re.IGNORECASE)
_DIGIT_WORD_RE = re.compile(r'\w*\d\w*')

_ENGLISH_STOP_WORDS = None  # filled lazily by _default_stop_words()


def _default_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_text(text, stop_words=None, lemmatizer=None):
    '''Normalise one tweet/fact into a lowercase, space-separated token string.

    Steps, in order: remove links, collapse all whitespace (newlines,
    non-breaking spaces, ...) to single spaces, remove text in square brackets
    and HTML-like tags, remove @mentions and every remaining non-alphanumeric
    character, remove a leading retweet marker ("RT"), lowercase, remove words
    containing digits, drop stop words and lemmatise what is left.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN for a missing cell) yields ''.
    stop_words : container of str, optional
        Tokens to drop (checked before lemmatisation). Defaults to NLTK's
        English stop words, which are loaded once and cached.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to the notebook-level ``wordnet`` lemmatizer.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    '''
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = _default_stop_words()
    if lemmatizer is None:
        lemmatizer = wordnet

    text = _URL_RE.sub('', text)
    text = _LEFTOVER_LINK_RE.sub('', text)
    # Turn every whitespace run into one space *before* stripping symbols, so
    # words separated only by a newline or non-breaking space are not glued
    # together (e.g. "Vitamin\xa0and" used to become "vitaminand").
    text = _WHITESPACE_RE.sub(' ', text)
    # Brackets and tags must go before the symbol strip below; once the
    # delimiters themselves are stripped these patterns can never match.
    text = _BRACKETED_RE.sub(' ', text)
    text = _TAG_RE.sub(' ', text)
    # Intra-word punctuation is deleted (not spaced) on purpose: "COVID-19"
    # becomes "COVID19", which the digit filter below then drops as a whole.
    text = _MENTION_OR_SYMBOL_RE.sub('', text)
    # Case-insensitive so that the usual upper-case "RT" prefix is removed too.
    text = _RETWEET_RE.sub('', text)
    text = text.lower()
    text = _DIGIT_WORD_RE.sub('', text)

    tokens = [lemmatizer.lemmatize(word) for word in text.split()
              if word not in stop_words]
    return ' '.join(tokens)

In [None]:
# Applying to the data

# Run every tweet and every fact through the same cleaning routine.
X = df["text"].apply(clean_text)
Y = df_facts["Facts"].apply(clean_text)

In [None]:
# Sample corpus
# Descriptive aliases for the cleaned tweets and the cleaned fact statements.
Tweets, Statements = X, Y

In [None]:
Tweets

0         smelled scent hand sanitizers today someone pa...
1                 hey wouldnt made sense player pay respect
2                     trump never claimed hoax claim effort
3         one gift give appreciation simple thing always...
4             july medium bulletin novel coronavirusupdates
                                ...                        
179103      thanks nominating wearamask challengei nominate
179104                                    year insanity lol
179105    powerful painting juan lucena tribute grandpar...
179106      student test positive major university abc news
179107                                             stop see
Name: text, Length: 179108, dtype: object

In [None]:
Statements

0              vitaminand mineral supplement cannot cure
1      study show hydroxychloroquine clinical benefit...
2                        dexamethasone treatment patient
3                            people wear mask exercising
4                          water swimming transmit virus
                             ...                        
220    handling preparing food wash hand vegetable fr...
221                     staying home help prevent spread
222    person infected everyone may contact person id...
223    may asked share contact information visit plac...
224    without contact tracing effort continue spread...
Name: Facts, Length: 225, dtype: object

In [None]:
# Adjust pandas' display limits so the text columns shown below are readable.
# NOTE(review): pandas documents `None` as the "unlimited" value for these
# options; confirm that 0 behaves as intended on the installed pandas version.
pd.set_option('display.max_colwidth', 0)
pd.set_option('display.max_columns', 0)

In [None]:
# Converting them back to dataframe

# Each cleaned Series becomes a one-column frame named after the Series
# (`text` for tweets, `Facts` for statements).
Tweets_df = Tweets.to_frame()
Statements_df = Statements.to_frame()

In [None]:
Tweets_df

Unnamed: 0,text
0,smelled scent hand sanitizers today someone past would think intoxicated
1,hey wouldnt made sense player pay respect
2,trump never claimed hoax claim effort
3,one gift give appreciation simple thing always around
4,july medium bulletin novel coronavirusupdates
...,...
179103,thanks nominating wearamask challengei nominate
179104,year insanity lol
179105,powerful painting juan lucena tribute grandparent died covid grandc
179106,student test positive major university abc news


In [None]:
# Plain Python list of cleaned tweets, in row order, for the sentence encoder.
A = Tweets_df["text"].tolist()

In [None]:
# Plain Python list of cleaned fact statements, in row order, for the sentence encoder.
B = Statements_df["Facts"].tolist()

In [None]:
# df1 = pd.MultiIndex.from_product(
#     [Tweets_df["text"], Statements_df["Facts"]], names=["text", "Facts"]
# ).to_frame(index=False)

In [None]:
 # installing transformers 
 
  !pip install -U sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/35/aa/f672ce489063c4ee7a566ebac1b723c53ac0cea19d9e36599cc241d8ed56/sentence-transformers-1.0.4.tar.gz (74kB)
[K     |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 81kB 8.1MB/s 
[?25hCollecting transformers<5.0.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2.0MB 30.7MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.2MB 56.8MB/s 

In [None]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import models, losses
# import pandas as pd
# import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Using BERT pre trained models
# Loads the 'bert-base-nli-mean-tokens' sentence-embedding checkpoint; it is
# downloaded on first use (see the progress bar in the output below).
# NOTE(review): this checkpoint is reportedly marked as deprecated upstream —
# confirm against the model card and consider a newer model.

model = SentenceTransformer('bert-base-nli-mean-tokens')

HBox(children=(FloatProgress(value=0.0, max=405234788.0), HTML(value='')))




In [None]:
# Encode tweets (A) and facts (B) with identical settings so that both sets of
# embeddings come from the same configuration.
encode_kwargs = dict(batch_size=8, show_progress_bar=True)
A_text_embeddings = model.encode(A, **encode_kwargs)
B_text_embeddings = model.encode(B, **encode_kwargs)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=22389.0, style=ProgressStyle(description_wi‚Ä¶




HBox(children=(FloatProgress(value=0.0, description='Batches', max=29.0, style=ProgressStyle(description_width‚Ä¶




In [None]:
# A cell only echoes its last expression, so both shapes are returned together:
# (tweet embeddings shape, fact embeddings shape).
np.shape(A_text_embeddings), np.shape(B_text_embeddings)

(225, 768)

In [None]:
A_text_embeddings[0]

array([ 3.71261239e-01,  4.94284749e-01,  5.28250873e-01,  1.25490129e-01,
       -1.95825323e-01, -1.14478163e-01,  5.69078982e-01, -1.42466202e-01,
        5.03396213e-01, -1.18997961e-01,  4.07801010e-02,  3.26957643e-01,
        1.51620090e-01,  2.67742783e-01, -1.28355443e-01,  6.42317414e-01,
        1.29928142e-01, -9.29021761e-02,  6.24966919e-01, -2.32564896e-01,
        1.65548325e-01, -5.51271677e-01,  4.14417744e-01, -2.61129867e-02,
        3.57317865e-01,  3.52871090e-01, -2.56159991e-01,  5.33772558e-02,
       -1.39991868e+00, -2.29175925e-01,  9.43815708e-02,  3.68511498e-01,
        3.66777927e-03, -3.95773113e-01,  1.34592876e-01,  1.57031822e+00,
        7.86763430e-02, -2.66254917e-02,  2.45391205e-01,  2.28516147e-01,
        3.26052196e-02,  3.16924065e-01,  3.06848943e-01,  6.32425845e-02,
       -4.67146933e-01, -1.84816137e-01,  5.07143438e-01,  6.30993843e-01,
       -1.80072844e-01, -5.13520420e-01,  4.94085029e-02, -5.52765965e-01,
        1.16256189e+00,  

In [None]:
# Getting the similarities

# Rows correspond to tweets (A) and columns to facts (B).
similarities = cosine_similarity(A_text_embeddings, B_text_embeddings)
print(f'pairwise dense output:\n {similarities}\n')

pairwise dense output:
 [[0.41258898 0.5422849  0.4967702  ... 0.5283385  0.5178247  0.31125844]
 [0.6191432  0.25010973 0.27053082 ... 0.24927363 0.248626   0.4974145 ]
 [0.5645153  0.09320036 0.09619252 ... 0.12042828 0.1544315  0.5230565 ]
 ...
 [0.2107704  0.30434352 0.41242522 ... 0.36252284 0.29131812 0.31220505]
 [0.0651368  0.3362767  0.30201337 ... 0.40784326 0.38861966 0.20137346]
 [0.5320057  0.16285202 0.3507428  ... 0.21896815 0.12727202 0.42562562]]



In [None]:
# getting the top 200 similarities with maximum score


TOP_K = 200  # number of highest-scoring (tweet, fact) pairs to report

similarities = cosine_similarity(A_text_embeddings, B_text_embeddings)

# Pick the TOP_K best pairs with one partition instead of rescanning the whole
# matrix on every iteration. `similarities` is never written to, so later cells
# that reuse it still see the real scores.
flat_scores = similarities.ravel()
k = min(TOP_K, flat_scores.size)

list_of_lists = []
if k > 0:
    # k-th largest score: every pair scoring at least this much is a candidate.
    threshold = np.partition(flat_scores, -k)[-k]
    candidates = np.flatnonzero(flat_scores >= threshold)
    # Highest score first; ties are broken by row-major position in the matrix,
    # i.e. the same order as repeatedly taking the first maximum.
    ranked = candidates[np.lexsort((candidates, -flat_scores[candidates]))][:k]
    tweet_rows, fact_cols = np.unravel_index(ranked, similarities.shape)

    for tweet_row, fact_col in zip(tweet_rows, fact_cols):
        pair_score = similarities[tweet_row, fact_col]
        # Positional lookup: matrix rows/columns follow the row order of the
        # original (uncleaned) tweets and facts.
        tweet = df['text'].iloc[tweet_row]
        fact = df_facts['Facts'].iloc[fact_col]
        print(pair_score," Tweet: ",tweet," Facts: ",fact)
        list_of_lists.append([pair_score, tweet, fact])


0.9881723  Tweet:  Follow advice from your national health authority on what to do if you have #COVID19 symptoms. 
In some situations,‚Ä¶ https://t.co/cBCdHj4BhA  Facts:  Follow advice from your national health authority on what to do if you have COVID-19 symptoms.‚Äã‚Äã
0.9881722  Tweet:  Follow advice from your national health authority on what to do if you have #COVID19 symptoms. 

In some situations‚Ä¶ https://t.co/VZ1NwDJmic  Facts:  Follow advice from your national health authority on what to do if you have COVID-19 symptoms.‚Äã‚Äã
0.9827492  Tweet:  Exposing yourself to the sun or temperatures higher than 25¬∞C DOES NOT protect you from COVID-19
#covid19 #poll  Facts:  Exposing yourself to the sun or temperatures higher than 25¬∞C DOES NOT protect you from COVID-19
0.9691514  Tweet:  FACT: The #COVID19 virus CANNOT be spread through mosquito bites. Read more here: https://t.co/q3PKyqYNNw‚Ä¶ https://t.co/phnc4y5O6T  Facts:  The COVID-19 virus CANNOT be spread through mosquito bit

In [None]:
list_of_lists

[[0.9881723,
  'Follow advice from your national health authority on what to do if you have #COVID19 symptoms. \nIn some situations,‚Ä¶ https://t.co/cBCdHj4BhA',
  'Follow advice from your national health authority on what to do if you have COVID-19 symptoms.\u200b\u200b'],
 [0.9881722,
  'Follow advice from your national health authority on what to do if you have #COVID19 symptoms. \n\nIn some situations‚Ä¶ https://t.co/VZ1NwDJmic',
  'Follow advice from your national health authority on what to do if you have COVID-19 symptoms.\u200b\u200b'],
 [0.9827492,
  'Exposing yourself to the sun or temperatures higher than 25¬∞C DOES NOT protect you from COVID-19\n#covid19 #poll',
  'Exposing yourself to the sun or temperatures higher than 25¬∞C DOES NOT protect you from COVID-19'],
 [0.9691514,
  'FACT: The #COVID19 virus CANNOT be spread through mosquito bites. Read more here: https://t.co/q3PKyqYNNw‚Ä¶ https://t.co/phnc4y5O6T',
  'The COVID-19 virus CANNOT be spread through mosquito bites'

In [None]:
# Getting the output dataset
# One row per entry of list_of_lists: [score, original tweet, original fact].

df_final = pd.DataFrame(list_of_lists,columns=['Similarity_Score','Tweets','Facts'])

In [None]:
df_final

Unnamed: 0,Similarity_Score,Tweets,Facts
0,0.988172,"Follow advice from your national health authority on what to do if you have #COVID19 symptoms. \nIn some situations,‚Ä¶ https://t.co/cBCdHj4BhA",Follow advice from your national health authority on what to do if you have COVID-19 symptoms.‚Äã‚Äã
1,0.988172,Follow advice from your national health authority on what to do if you have #COVID19 symptoms. \n\nIn some situations‚Ä¶ https://t.co/VZ1NwDJmic,Follow advice from your national health authority on what to do if you have COVID-19 symptoms.‚Äã‚Äã
2,0.982749,Exposing yourself to the sun or temperatures higher than 25¬∞C DOES NOT protect you from COVID-19\n#covid19 #poll,Exposing yourself to the sun or temperatures higher than 25¬∞C DOES NOT protect you from COVID-19
3,0.969151,FACT: The #COVID19 virus CANNOT be spread through mosquito bites. Read more here: https://t.co/q3PKyqYNNw‚Ä¶ https://t.co/phnc4y5O6T,The COVID-19 virus CANNOT be spread through mosquito bites
4,0.964506,"Stay home if you sick, come over if you thick.\n\n#COVID19",Stay home if you are sick
...,...,...,...
195,0.861873,#COVID19 US ‚Äòfailures‚Äô are holding back search for coronavirus drugs https://t.co/euto1vtPjt,"If we stop following the key protective measures, coronavirus can come rushing back.‚Äã"
196,0.861759,Ending this #COVID19 thread\nMy isolation ended and recovering. \nStay safe. Stay at home.,Protect yourself and others. Do your part. Stay home.
197,0.861551,Much of the fear and panic associated with #COVID19 has been a result of misinformation spread by bots and augmente‚Ä¶ https://t.co/n5QAprqMPt,"When we‚Äôre worried or afraid, we can begin to automatically divide the world into ‚Äòus‚Äô vs ‚Äòthem‚Äô. That‚Äôs how stigma starts."
198,0.861527,@Mark52645278 @BriongloidBoy @aoifemcl @JackHoJo @dlabanyi @IrishTimes Grandparents should stay home. Get someone e‚Ä¶ https://t.co/l1ChGF2hhS,Protect yourself and others. Do your part. Stay home.


In [None]:
# For each tweet row: the fact column indices ordered from lowest to highest score.
similarities_sorted = np.argsort(similarities)

In [None]:
similarities_sorted

array([[ 15,  81, 191, ..., 138,  13,   7],
       [ 74,  81,  51, ..., 149, 129,  62],
       [217, 173, 212, ...,  20,  29,  62],
       ...,
       [ 20,  93,   3, ..., 136, 200, 110],
       [ 93,  80,  82, ...,  90, 101, 135],
       [ 74, 112, 184, ...,  17,  82,  80]])

In [None]:
# Export the matched pairs as a JSON array with one object per row.
df_final.to_json('New_Result1.json', orient='records')

In [None]:
# Export the matched pairs as CSV. No index argument is passed, so the
# DataFrame index is written as the first, unnamed column.
df_final.to_csv('Result.csv')

In [None]:
# Mount Google Drive inside the Colab runtime.
# NOTE(review): none of the visible cells read from or write to /content/drive
# (data files and exports use relative paths) — confirm this mount is needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# For every tweet (a row of `similarities`) record the index of its most similar
# fact and the corresponding score.
# `similarities_sorted` lists each row's fact indices in ascending score order,
# so the LAST column is the best match. The matrix is tweets x facts, so there
# is no self-match to skip; taking [-2] would report the second-best fact.
id_1 = np.arange(len(similarities_sorted))
id_2 = similarities_sorted[:, -1]
score = similarities[id_1, id_2]

In [None]:
# One row per tweet: tweet index, matched fact index, similarity score.
index_df = pd.DataFrame(
    dict(id_1=id_1, id_2=id_2, score=score)
)

In [None]:
index_df

Unnamed: 0,id_1,id_2,score
0,0,13,0.693195
1,1,129,0.796213
2,2,29,0.683392
3,3,18,0.719261
4,4,55,0.595881
...,...,...,...
179103,179103,140,0.627856
179104,179104,17,0.613379
179105,179105,200,0.508737
179106,179106,101,0.494292


In [None]:
def most_similar(doc_id, similarities_sorted, matrix, texts=None, top_n=None):
    """Print the documents ranked closest to ``doc_id``.

    Parameters
    ----------
    doc_id : int
        Row of the score matrix (and position in ``texts``) to compare against.
    similarities_sorted : 2-D array-like
        Pairwise score matrix; row ``doc_id`` is ranked. Despite the name it
        must contain scores, not pre-sorted indices. Entry ``ix`` of the row is
        reported against ``texts[ix]``.
    matrix : str
        ``'Cosine Similarity'`` (higher is closer) or ``'Euclidean Distance'``
        (lower is closer). Also used as the label in the printed output.
    texts : sequence of str, optional
        Texts the indices refer to. Defaults to ``Tweets_df["text"]``.
    top_n : int, optional
        Print at most this many matches. ``None`` (default) prints all of them.

    Raises
    ------
    ValueError
        If ``matrix`` is not one of the supported measures (previously this
        failed later with an unbound local variable).
    """
    row = similarities_sorted[doc_id]
    if matrix == 'Cosine Similarity':
        similar_ix = np.argsort(row)[::-1]
    elif matrix == 'Euclidean Distance':
        similar_ix = np.argsort(row)
    else:
        raise ValueError(
            f"Unsupported matrix {matrix!r}; expected 'Cosine Similarity' "
            f"or 'Euclidean Distance'"
        )

    # Positional access regardless of whether a list, array or Series was given.
    texts = np.asarray(Tweets_df["text"] if texts is None else texts, dtype=object)

    print (f'Tweets: {texts[doc_id]}')
    print ('\n')
    print (f'Similar Tweets using {matrix}:')

    shown = 0
    for ix in similar_ix:
        if ix == doc_id:
            # Skip the query document itself.
            continue
        if top_n is not None and shown >= top_n:
            break
        print('\n')
        print (f'Tweets: {texts[ix]}')
        print (f'{matrix} : {row[ix]}')
        shown += 1

In [None]:
# NOTE(review): B_text_embeddings holds the raw fact embeddings produced by
# model.encode(B), not a similarity matrix. most_similar therefore ranks the
# embedding components of fact 0, which is why the "Cosine Similarity" values
# printed below exceed 1. Confirm the intended matrix before relying on this output.
most_similar(0,B_text_embeddings,'Cosine Similarity')

Tweets: smelled scent hand sanitizers today someone past would think intoxicated


Similar Tweets using Cosine Similarity:


Tweets: maharashtra india new confirmed case reported last hour info via mohfw
Cosine Similarity : 2.6561906337738037


Tweets: change work general recruiting specifically via recruiting
Cosine Similarity : 2.1582958698272705


Tweets: drkawana minister fishery marine resource whats hold please release announce name
Cosine Similarity : 1.990936517715454


Tweets: iit kharagpurs portable device test covid r covidtesting covid
Cosine Similarity : 1.8002917766571045


Tweets: hahahahha morning humor russia russia russia schiff russiancollusion interference potus
Cosine Similarity : 1.7707929611206055


Tweets: coronavirus testing fiasco st mirrenhave pledged undertake urgent review testing proced
Cosine Similarity : 1.7689961194992065


Tweets: know station like wwmt sleep night
Cosine Similarity : 1.732478380203247


Tweets: two student tested positive coronavirus 