In [1]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
DetectorFactory.seed = 0

def is_english(text):
    try:
        if detect(text) != "en":
            return False
    except LangDetectException:
        return False
    return True

def scrape_tweets(keyword_search:str, since_date:str, until_date:str):
    tweets = []
    for j, tweet in enumerate(sntwitter.TwitterSearchScraper(keyword_search+' since:'+since_date+' until:'+until_date).get_items()):
#        if j>100000:
#           break
        tweets.append([tweet.date, tweet.id, tweet.content, tweet.user.username])
    tweets_df = pd.DataFrame(tweets, columns=['Datetime', 'Tweet Id', 'Text', 'Username'])
    tweets_df['language'] =  tweets_df['Text'].apply(is_english)
    tweets_df = tweets_df[tweets_df['language'] == True].reset_index()
    return tweets_df

In [2]:
tweets_df = scrape_tweets("maribavir", "2022-01-01", "2022-07-07")
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133 entries, 0 to 132
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype              
---  ------    --------------  -----              
 0   index     133 non-null    int64              
 1   Datetime  133 non-null    datetime64[ns, UTC]
 2   Tweet Id  133 non-null    int64              
 3   Text      133 non-null    object             
 4   Username  133 non-null    object             
 5   language  133 non-null    bool               
dtypes: bool(1), datetime64[ns, UTC](1), int64(2), object(2)
memory usage: 5.5+ KB


In [3]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133 entries, 0 to 132
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype              
---  ------    --------------  -----              
 0   index     133 non-null    int64              
 1   Datetime  133 non-null    datetime64[ns, UTC]
 2   Tweet Id  133 non-null    int64              
 3   Text      133 non-null    object             
 4   Username  133 non-null    object             
 5   language  133 non-null    bool               
dtypes: bool(1), datetime64[ns, UTC](1), int64(2), object(2)
memory usage: 5.5+ KB


In [4]:
from transformers import pipeline

tweets_df['transplant'] = ''
tweets_df['maribavir'] = ''
tweets_df['health_care'] = ''
tweets_df['burden'] = ''
tweets_df['care'] = ''
tweets_df['financial'] = ''

classifier = pipeline("zero-shot-classification")

for tweets, row in tweets_df.iterrows():
    #print(tweets)
    a = classifier(
        tweets_df['Text'][tweets],
        candidate_labels=["transplant", "maribavir", "health care", "burden", "care", "financial"],)
    
    a = pd.DataFrame.from_dict(a)
    
    tweets_df['transplant'][tweets] = a.iloc[0,2]
    tweets_df['maribavir'][tweets] = a.iloc[1,2]
    tweets_df['health_care'][tweets] = a.iloc[2,2]
    tweets_df['burden'][tweets] = a.iloc[3,2]
    tweets_df['care'][tweets] = a.iloc[4,2]
    tweets_df['financial'][tweets] = a.iloc[5,2]

No model was supplied, defaulted to roberta-large-mnli (https://huggingface.co/roberta-large-mnli)
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at roberta-large-mnli.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_df['transplant'][1] = a.iloc[0,2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_df['maribavir'][1] = a.iloc[1,2]
A value is tryi

In [5]:
tweets_df

Unnamed: 0,index,Datetime,Tweet Id,Text,Username,language,transplant,maribavir,health_care,burden,care,financial
0,0,2022-06-28 12:47:05+00:00,1541765127061573634,@CATN0IRS idk man probably adderall and maribavir,mrcls_world,True,,,,,,
1,1,2022-06-26 20:36:45+00:00,1541158548343869440,"In this study of 10 SOT on #maribavir, only ~1...",TransplantIDNet,True,0.833913,0.09746,0.02993,0.025963,0.007052,0.005682
2,2,2022-06-23 13:24:25+00:00,1539962581049348096,"As we use maribavir, we need more experiences ...",MichaelGIsonMD,True,,,,,,
3,3,2022-06-23 12:26:00+00:00,1539947880542265344,Interesting letter-to-the-editor which address...,TheTxIDjournal,True,,,,,,
4,4,2022-06-22 18:02:16+00:00,1539670117667176455,"Gilead’s Lenacapavir, Takeda’s Maribavir Due F...",PharmaPinkSheet,True,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
128,139,2022-01-14 15:00:04+00:00,1482004597111603202,Feature Friday - Resident @themTORyouknow epis...,accpimtrprn,True,,,,,,
129,140,2022-01-14 05:51:15+00:00,1481866479171739648,Letermovir and maribavir for pan‐resistant cyt...,GarbineLizeaga,True,,,,,,
130,141,2022-01-08 08:00:44+00:00,1479724741036445696,The #FDA has approved the first drug for #adul...,GaneshManjeri,True,,,,,,
131,142,2022-01-07 16:19:17+00:00,1479487815226470401,Cancer therapies lead way as FDA drug approval...,ABsteward,True,,,,,,
