# Notebook for Iterative Filtering on US-Twitter data 

In [None]:
import numpy as np
import pandas as pd
from TTMonitor.preprocess import *
from TTMonitor.TwitterLDA import TwitterLDA
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import pyLDAvis.gensim
from nltk.corpus import stopwords
stopwords.words("english")
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
stop_words = ENGLISH_STOP_WORDS
from datetime import date

# needed to use the TTMonitor module
class SkTokenizer:
    """Thin adapter exposing scikit-learn's default tokenizer as ``tokenize``.

    An instance is passed to ``TwitterLDA`` through the ``tokenizer`` entry of
    its keyword arguments.
    """

    def __init__(self):
        # build_tokenizer() returns a callable that splits a string into a
        # sequence of tokens using the vectorizer's default settings.
        self.tokenize = TfidfVectorizer().build_tokenizer()

# Setup

In [None]:
# Month abbreviation(s) to analyze. Each tweet is kept only if the second
# whitespace-separated token of its 'created_at' field is in this list.
month = ["Nov"]

In [None]:
# Run labels; both become part of the output file names.
COUNTRY = "US"
NAME = "nov_conspi_monthly_5_topics"

# Extra tokens that are added to the stop-word set, i.e. omitted when the
# topic model is fitted.
conf_stopwords = ["amp", "000"]

# Keyword arguments forwarded to parse_tweets().
preprocess_params = dict(include_hashtags=True)

In [None]:
# Whitelist used by filter_tweets(..., include=True).
# This is the accumulated dictionary after several rounds of iterative
# filtering: the words found in each iteration were simply appended.
# The entries are matched as-is, so unusual spellings are kept on purpose.
# NOTE: 'quarantinelife' is also present in exclude_keywords.
include_keywords = (
    # (non-conspiracy) COVID related keywords
    [
        'covid',
        'covid19',
        'coronavirus',
        'covid__19',
        'covid_19',
        'pandemic',
        'covidー19',
        'covd',
        'sars-cov-2',
        'wearamask',
        'quarantine',
        'stayhome',
        'coviddays',
        'cdc',
        'wuhancoronavirus',
        'wuhanlockdown',
        'socialdistancingnow',
        'panicbuy',
        '14dayquarantine',
        'duringmy14dayquarantine',
        'inmyquarantinesurvivalkit',
        'coronakindness',
        'quarantinelife',
        'stayhomechallenge',
        'dontbeaspreader',
        'lockdown',
        'shelteringinplace',
        'sheltering',
        'staysafestayhome',
        'flattenthecurve',
        'quarentinelife',
        'saferathome',
        'stayathome',
        'epitwitter',
    ]
    # potentially conspiracy related keywords
    + [
        'thestorm',
        'fooked',
        'thegreatawakeningworldwide',
        'justiceforjay',
        'msm',
        'dirtyjoe',
        'obamagate',
        'lockthemallup',
        'pizzagate',
        'pedogateglobal',
        'treason',
        'qanon2020',
        'antifaterrorists',
        'russiacollusion',
        'arrestthemboth',
        'corruption',
        'doitqarmy',
        'fightback',
        'herdhurdheard',
        'savethechildren',
        'darktolight',
        'nomask',
        'criticalthinker',
        'nomasks',
        'wearethenews',
        'digitalsoldiers',
        'scamdemic',
        'wedonotconsent',
        'idonotconsent',
        'epstein',
        'truthmatters',
        'fuckbillgates',
        'wearetherevolution',
        'conspiracy',
        'saveourchildren',
        'stoptheabuse',
        'greatawakening',
        'wearethenewsnow',
        'pedowood',
        'pedogate',
        'pedogatisrealwwg1wga',
        'wwg1wwa',
        'qanon',
        'qnn',
        'whereshunter',
        'wakeupamerica',
        'theplantosavetheworld',
        'deepstatetakedown',
        'qajf',
        'justiceiscoming',
        'thedamisbreaking',
        'disclosure',
        'sheepnomore',
        'savehumanity',
        'questioneverything',
        'themaskshavefallen',
        'bilderberg',
        'bio-engineered',
        'plandemic',
        'indoctornation',
        'virushoax',
        'hoax',
        'fakenews',
        'fakescience',
        'scamdemic2020hospitalsarenotfull',
        'democratscamdemic',
        'covidhoax',
        'covid1984',
        'covidvaccineispoison',
        'qarmy',
        'trumprussia',
        'traitor',
        'exposebillgates',
        'arrestbillgates',
        'arrestfauci',
        'firefauci',
        'covidcult',
        'destroydnc',
        'voterfraud',
        'electionfraud',
        'draintheswamp',
        'nwo',
        'newworldorder',
        'illuminati',
        'ftsn',
        'chemtrails',
        'freemason',
        'freemasonry',
        'masonic',
        'thegreatreset',
        'flatearth',
        'earthisflat',
        'lizardpeople',
        'covidiot',
        'covididiot',
        'coronaswimwewr',
        'viruscoronavirus',
        'kungflu',
        'coronapocalypse',
        'chinesevirus',
    ]
    # added in last round of iteration
    + [
        'alexjones',
        'infowars',
        'russiahoax',
    ]
)

In [None]:
# Blacklist used by filter_tweets(..., include=False).
# This is the accumulated list after several rounds of iterative filtering:
# the words found in each iteration were simply appended.
# Order and spelling are unchanged; each row merely bunches neighbouring
# entries that appear to belong together.
# NOTE: 'quarantinelife' is also present in include_keywords.
exclude_keywords = [
    'cannabis', 'CannabisCommunity', 'weed', 'orlandoweed', 'weedporn',
    'ubranstreetphotography', 'streetphotographer', 'photodocumentary',
    'deliciousness', 'cheese', 'garlic',
    'photography',
    'nfl', 'vikings', 'football',
    'beach', 'gym', 'gyms',
    'nba', 'nbabubble', 'nbafinals', 'lakers', 'championship',
    'happynewyear', 'happynewyear2021', 'bye2020', 'newyear',
    'newyearseve', 'newyear2021nye2020', 'nye', 'welcome2021',
    'music', 'rap', 'hiphop', 'sundayvibes', 'trap',
    'quarantinelife', 'influencer', 'resistancebands', 'fitness',
    'xboxshare', 'gears5', 'gridiron', 'batman',
]

# Preprocessing

In [None]:
# Prepare LDA

# Extend the base English stop words with the project-specific ones; the
# merged set is handed over as an immutable frozenset.
stop_words_adjust = set(stop_words) | set(conf_stopwords)

tokenizer = SkTokenizer()

# Keyword arguments for the TwitterLDA constructor.
lda_params = {
    "stop_words": frozenset(stop_words_adjust),
    "tokenizer": tokenizer,
    "enrich": True,
    "doc_threshold": 10,
    "similarity_threshold": 0.8,
}

# Keyword arguments for TwitterLDA.fit().
fit_params = {
    "n_topics": 5,
    "n_jobs": 31,
    "no_below": 5,
    "no_above": 0.8,
    "passes": 200,
    "chunksize": 100,
}

In [None]:
# Read the raw tweets from disk.
data = read_datafiles('../../Twitter_US_11_01_2021/load_in_data')

# Keep only tweets whose 'created_at' month token (second whitespace-separated
# field) is one of the configured months.
data = [tweet for tweet in data if tweet['created_at'].split()[1] in month]

# Parse the remaining tweets with the configured preprocessing options.
data = parse_tweets(data, **preprocess_params)

# Apply the whitelist first (include=True), then the blacklist (include=False).
for _keyword_list, _include_flag in (
    (include_keywords, True),
    (exclude_keywords, False),
):
    data = filter_tweets(data, _keyword_list, include=_include_flag)

# LDA estimation

In [None]:
# using TTMonitor module for LDA estimation
# Build the model wrapper from the filtered tweets and the constructor options.
tlda = TwitterLDA(data, **lda_params)
# Fit with the settings collected in fit_params.
tlda.fit(**fit_params)
# classify_tweets() returns two values; their exact contents are defined by
# TTMonitor (presumably per-tweet topic assignments and scores — TODO confirm).
topics, scores = tlda.classify_tweets()
# Print the model's coherence attribute.
print(tlda.coherence)
# Last expression of the cell, so the notebook displays its return value.
tlda.get_topics()

In [None]:
# save the results

# `datetime` is imported here because the notebook's import cell only brings
# in `date` from the datetime module; without this, `datetime.now()` relies on
# whatever the star import happens to expose.
from datetime import datetime

# File-name prefix: timestamp (yymmddHHMM) + run name + country code.
identifier = datetime.now().strftime("%y%m%d%H%M") + "_" + NAME + "_" + COUNTRY

# The gensim objects are stored separately from the full TwitterLDA instance.
gensim_data = {'model': tlda.model, 'dictionary': tlda.doc_id2bigram, 'corpus': tlda.corpus_bi}

# Context managers guarantee that both pickle files are flushed and closed,
# even if pickling raises.
with open(identifier + "_topics_len_" + str(len(data)) + ".pkl", "wb") as model_file:
    pickle.dump(tlda, model_file)
with open(identifier + "_gensim_data.pkl", "wb") as gensim_file:
    pickle.dump(gensim_data, gensim_file)

# Visualization

In [None]:
# Enable automatic display of pyLDAvis output inside the notebook.
pyLDAvis.enable_notebook()

# Short aliases for the fitted TwitterLDA object and its gensim artefacts.
m = tlda
lda, dictionary, corpus = m.model, m.doc_id2bigram, m.corpus_bi

# Prepare the visualization data, keeping the model's own topic order.
p = pyLDAvis.gensim.prepare(
    lda,
    corpus,
    dictionary,
    mds='mmds',
    sort_topics=False,
)

# Write the result to an HTML file named after the run identifier.
pyLDAvis.save_html(p, identifier + ".html")