<a href="https://colab.research.google.com/github/vidulakamat/FB_Posts_SC_Analysis/blob/main/Count_of_People_Receiving_Antibodies_CrossEncoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [6]:
import re
import spacy
import time

In [38]:
from kneed import KneeLocator

In [7]:
pip install -U sentence-transformers

Requirement already up-to-date: sentence-transformers in /usr/local/lib/python3.6/dist-packages (0.4.1.2)


In [8]:
pip install kneed



In [9]:
df = pd.read_excel('SC_Posts_1.xls', header=None)
df.columns = ['post']

df.shape

(17946, 1)

In [10]:
df = df[(df['post'].str.find('FALSE')==-1)]
df = df[(df['post'].str.find('Comments')==-1)]
df.shape

(12274, 1)

In [11]:
# function to preprocess speech
def clean(text):
    
    # removing paragraph numbers
    text = re.sub('[0-9]+.\t','',str(text))
    
    #text.replace('[^a-zA-Z]', '')
    #text.replace('[^\w\s]', '')
    #text = re.sub('[^a-zA-Z]', ' ', text)
    #text = re.sub(r'\s+', ' ', text)
    
    # removing new line characters
    text = re.sub('\n ','',str(text))
    text = re.sub('\n',' ',str(text))
    # removing apostrophes
    text = re.sub("'s",'',str(text))
    # removing hyphens
    text = re.sub("-",' ',str(text))
    text = re.sub("— ",'',str(text))
    # removing quotation marks
    text = re.sub('\"','',str(text))
    # removing any reference to outside text
    text = re.sub("[\(\[].*?[\)\]]", "", str(text))
    
    return text

# preprocessing speeches
df['post_clean'] = df['post'].apply(clean)
df.shape

(12274, 2)

In [12]:
def preprocess_sent(tweet):
    processed_tweet = tweet
    processed_tweet.replace('[^a-zA-Z]', '')
    processed_tweet.replace('[^\w\s]', '')
    processed_tweet = re.sub('[^a-zA-Z]', ' ', processed_tweet)
    processed_tweet = re.sub(r'\s+', ' ', processed_tweet)
    return(processed_tweet)

df['Processed_Sent'] = df['post_clean'].apply(lambda x: preprocess_sent(x))
df.head()

Unnamed: 0,post,post_clean,Processed_Sent
0,Shannon Murray Gormley,Shannon Murray Gormley,Shannon Murray Gormley
2,Just wanted to let anyone know if you are able...,Just wanted to let anyone know if you are able...,Just wanted to let anyone know if you are able...
4,Alicia Gottschalk,Alicia Gottschalk,Alicia Gottschalk
5,My hubby had it last week and it has done wond...,My hubby had it last week and it has done wond...,My hubby had it last week and it has done wond...
7,Laura Sutera,Laura Sutera,Laura Sutera


In [13]:
df = df[df['Processed_Sent'].str.strip().str.len()>0]
df.shape

(11557, 3)

In [14]:
# load english language model
nlp = spacy.load('en_core_web_sm')

In [15]:
# function to retrieve FIRST person name from post

def get_person_name(text):
    
    doc = nlp(text)
    
    sent = []
    
    persons_count = 0
    person_names = []
    prev_person_name_index = -1
    token_count = 0
    
    for token in doc:
        if (token.ent_type_=='PERSON'):
          persons_count+= 1
          if(prev_person_name_index == -1):
            person_names.append(token.text)
            prev_person_name_index = token_count
          else:
            if((token_count-prev_person_name_index)==1):
              person_names.append(token.text)
              prev_person_name_index = token_count
        token_count += 1

    person_name = ' '.join(person_names)

    return(person_name)

In [16]:
# function for checking if post contains only person names
def check_only_person_names(text):
    
    doc = nlp(text)
    
    sent = []
    
    persons_count = 0
    for token in doc:
        if (token.ent_type_=='PERSON'):
          persons_count+= 1

    return((persons_count == len(doc)))

In [17]:
df['Person_Name'] = df['Processed_Sent'].apply(lambda x: get_person_name(x))
df['Is_Only_Person_Name'] = df['Processed_Sent'].apply(lambda x: check_only_person_names(x))

In [18]:
df.head()

Unnamed: 0,post,post_clean,Processed_Sent,Person_Name,Is_Only_Person_Name
0,Shannon Murray Gormley,Shannon Murray Gormley,Shannon Murray Gormley,Shannon Murray Gormley,True
2,Just wanted to let anyone know if you are able...,Just wanted to let anyone know if you are able...,Just wanted to let anyone know if you are able...,,False
4,Alicia Gottschalk,Alicia Gottschalk,Alicia Gottschalk,Alicia Gottschalk,True
5,My hubby had it last week and it has done wond...,My hubby had it last week and it has done wond...,My hubby had it last week and it has done wond...,,False
7,Laura Sutera,Laura Sutera,Laura Sutera,Laura Sutera,True


In [19]:
df.sample(20)

Unnamed: 0,post,post_clean,Processed_Sent,Person_Name,Is_Only_Person_Name
12899,"That said, I'm not gonna lie: I thought ""bamla...","That said, I'm not gonna lie: I thought bamlan...",That said I m not gonna lie I thought bamlaniv...,,False
2301,Shantell Sheffield,Shantell Sheffield,Shantell Sheffield,,False
1479,Thank you for sharing your story. ??????,Thank you for sharing your story. ??????,Thank you for sharing your story,,False
3528,Tina Fedasz Addo I edited my response above I ...,Tina Fedasz Addo I edited my response above I ...,Tina Fedasz Addo I edited my response above I ...,Tina Fedasz,False
8659,Hope my answers got counted,Hope my answers got counted,Hope my answers got counted,,False
10155,Daniel Chulpayev,Daniel Chulpayev,Daniel Chulpayev,Daniel Chulpayev,True
3493,Rita Bissell,Rita Bissell,Rita Bissell,Rita Bissell,True
11224,Sarah Lisa Kniseley,Sarah Lisa Kniseley,Sarah Lisa Kniseley,Sarah Lisa Kniseley,True
3650,Kimberly Parker,Kimberly Parker,Kimberly Parker,Kimberly Parker,True
12841,How can one be eligible?,How can one be eligible?,How can one be eligible,,False


In [20]:
# Total number of people present in post (those who posted/commented + those referred in post by others)

total_number_of_people = len(set(list(df['Person_Name'])))
total_number_of_people

2845

In [21]:
# Removing those entries where only person names are present in post

df = df[df['Is_Only_Person_Name']==False]

In [22]:
# Removing duplicate posts

df = df.drop_duplicates(subset = ["Processed_Sent"])

In [23]:
# Final posts to be considered for similarities and further processing

df.shape

(5290, 5)

In [24]:
df.sample(10)

Unnamed: 0,post,post_clean,Processed_Sent,Person_Name,Is_Only_Person_Name
16900,Sallyann Holland I lived in NJ too. I'm origin...,Sallyann Holland I lived in NJ too. I'm origin...,Sallyann Holland I lived in NJ too I m origina...,Sallyann Holland,False
16768,Plus like... Some people in this thread almost...,Plus like... Some people in this thread almost...,Plus like Some people in this thread almost di...,,False
9405,That headache I had to go to er they gave me s...,That headache I had to go to er they gave me s...,That headache I had to go to er they gave me s...,,False
12683,Hang in there !!! I had very bad symptoms post...,Hang in there !!! I had very bad symptoms post...,Hang in there I had very bad symptoms post vac...,,False
527,Randi Gilbert DeMinno she has leukemia. She is...,Randi Gilbert DeMinno she has leukemia. She is...,Randi Gilbert DeMinno she has leukemia She is ...,Randi Gilbert DeMinno,False
3792,WHO.INT,WHO.INT,WHO INT,,False
5092,Godspeed ! ??,Godspeed ! ??,Godspeed,,False
16857,Eileen Tarrell I'm glad to hear that. Wishing ...,Eileen Tarrell I'm glad to hear that. Wishing ...,Eileen Tarrell I m glad to hear that Wishing y...,,False
100,I said from day 1 they should be giving them! ...,I said from day 1 they should be giving them! ...,I said from day they should be giving them RN ...,,False
1618,About,About,About,,False


In [25]:
# Column to be utilized is Processed_Sent

In [26]:
unique_posts = list(df['Processed_Sent'])
len(unique_posts)

5290

In [27]:
from sentence_transformers import CrossEncoder
model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=612.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=17565609.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=543.0, style=ProgressStyle(description_…




In [28]:
queries = ["received Infusion", "received antibodies", "received regeneron", "received eli lilly", "received bamlanivimab"]
passages = list(df['Processed_Sent'])

In [30]:
matching_queries_dict = {}
matching_posts_count = 0

#Search in a loop for the individual queries
for query in queries:
    matching_tuples = []
    start_time = time.time()

    #Concatenate the query and all passages and predict the scores for the pairs [query, passage]
    model_inputs = [[query, passage] for passage in passages]
    scores = model.predict(model_inputs)

    #Sort the scores in decreasing order
    results = [{'input': inp, 'score': score} for inp, score in zip(model_inputs, scores)]
    results = sorted(results, key=lambda x: x['score'], reverse=True)

    print("Query:", query)
    print("Search took {:.2f} seconds".format(time.time() - start_time))

    for hit in results:
        if(float(("{:.2f}".format(hit['score']))) > 0):
          matching_tuple = (hit['input'][1], float(("{:.2f}".format(hit['score']))))
          matching_tuples.append(matching_tuple)

    matching_queries_dict[query] = matching_tuples
    matching_posts_count += len(matching_tuples)
    print("Number of Matching Posts:", len(matching_tuples))
    print("==========")

print('\n')
print('For ',len(queries),' Queries, Total Matching Posts: ',matching_posts_count)

Query: received Infusion
Search took 27.51 seconds
Number of Matching Posts: 280
Query: received antibodies
Search took 27.15 seconds
Number of Matching Posts: 293
Query: received regeneron
Search took 27.40 seconds
Number of Matching Posts: 196
Query: received eli lilly
Search took 27.86 seconds
Number of Matching Posts: 30
Query: received bamlanivimab
Search took 28.71 seconds
Number of Matching Posts: 82


For  5  Queries, Total Matching Posts:  881


In [37]:
matched_posts_dfs = []
for query_matched in matching_queries_dict.keys():
  matched_tuples = matching_queries_dict[query_matched]
  matched_tuple_df = pd.DataFrame({'Match_Count':([x for x in range(len(matched_tuples))]), 'Processed_Sent':[i[0] for i in matched_tuples], 'Score':[i[1] for i in matched_tuples]})
  matched_tuple_df.sort_values(by=['Score'], ascending=False, inplace=True)
  matched_posts_dfs.append(matched_tuple_df)

In [39]:
# Find knee in every df and collect all matching posts (Processed_Sent) in a list

matching_posts_list = []

matched_df_count = 0
for matched_post_df in matched_posts_dfs:
  x = matched_post_df['Match_Count']
  y = matched_post_df['Score']
  elbow = -1
  curve_type_direction_list = [('convex','decreasing'), ('convex''increasing'), ('concave','decreasing'), ('concave','increasing')]
  for combination in curve_type_direction_list:
    kn = KneeLocator(x, y, curve=combination[0], direction=combination[1])
    if((kn.knee > 0) & (kn.knee < (len(x)-1))):
      elbow = kn.knee
      print('For DF: ', matched_df_count, ', Eblow at: ', elbow, ' for ', combination[0], '-', combination[1])
      break
  if(elbow > -1):
    current_matching_posts = list(matched_post_df['Processed_Sent'][:elbow])
    matching_posts_list.extend(current_matching_posts)
  matched_df_count += 1

print('\n')
print('Total Number of Matching Posts after Elbow: ', len(matching_posts_list))

For DF:  0 , Eblow at:  23  for  convex - decreasing
For DF:  1 , Eblow at:  31  for  convex - decreasing
For DF:  2 , Eblow at:  16  for  convex - decreasing
For DF:  3 , Eblow at:  5  for  convex - decreasing
For DF:  4 , Eblow at:  10  for  convex - decreasing


Total Number of Matching Posts after Elbow:  85


In [41]:
matching_posts_list = list(set(matching_posts_list))
print('Total Number of Unique Matching Posts: ', len(matching_posts_list))

Total Number of Unique Matching Posts:  65


In [42]:
list(matching_posts_list)

['Ruth Adelman mine was separated by mos and I haven t had antibodies since July',
 'Robyn Snyder monoclonal antibodies ',
 'Praise God prayers answered ',
 'He had antibody treatment day two God please protect him ',
 'Jessica Garza Monoclonal antibodies are laboratory made proteins that mimic the immune system ability to fight off harmful antigens such as viruses Bamlanivimab is a monoclonal antibody that is specifically directed against the spike protein of SARS CoV designed to block the virus attachment and entry into human cells ',
 'Diane Sater he received the one produced by Regeneron Eli Lilly has bamlanivimab available also which I believe is being more widely used right now',
 'Today I received the monoclonal antibodies treatment for Covid complication prevention The government has spent an unbelievable amount of money on these treatments and they re going widely underutilized partially due to the lack of public awareness ',
 'I received this but I m curious some MDs are sayi

In [48]:
unique_posts_tuples = []

for x in matching_posts_list:
  post_count = 0
  for y in df['Processed_Sent']:
    if(x == y):
      unique_posts_tuple = ((list(df['post'])[post_count]), x)
      unique_posts_tuples.append(unique_posts_tuple)
    post_count += 1

unique_posts_df = pd.DataFrame({'Post':[i[0] for i in unique_posts_tuples], 'Processed_Sent':[i[1] for i in unique_posts_tuples]})
unique_posts_df.head()

Unnamed: 0,Post,Processed_Sent
0,Ruth Adelman mine was separated by 9mos and I ...,Ruth Adelman mine was separated by mos and I h...
1,Robyn Snyder monoclonal antibodies.,Robyn Snyder monoclonal antibodies
2,Praise God ...prayers answered!,Praise God prayers answered
3,He had antibody treatment day two. God please...,He had antibody treatment day two God please p...
4,Jessica Garza Monoclonal antibodies are labora...,Jessica Garza Monoclonal antibodies are labora...


In [51]:
unique_posts_df.to_csv('Unique_Matching_Posts_Antidodies_Taken.csv', index=False, columns=unique_posts_df.columns)