In [50]:
import sys, os, pickle
import tweepy as tw
import pandas as pd

In [51]:
def twitter_auth():
    try:
        consumer_key = os.environ['TWITTER_API_KEY']
        consumer_secret = os.environ['TWITTER_API_SECRET']
        access_token = os.environ['TWITTER_API_ACCESS_TOKEN']
        access_secret=os.environ['TWITTER_API_ACCESS_TOKEN_SECRET']
    except KeyError:
        sys.stderr.write("Environment variable not set\n")
        sys.exit(1)
    
    auth = tw.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    
    return auth

In [52]:
def get_twitter_client():
    auth = twitter_auth()
    client = tw.API(auth, wait_on_rate_limit=True)
    return client

In [53]:
def get_users(filename=None):
    
    user_list = []
    
    if not filename:
        return None
    
    
    with open(filename) as f:
        for line in f:
            user_list.append(line.strip())
    
    return user_list

In [54]:
users = get_users('users.txt')

In [55]:
# users = ['LynAldenContact']

In [69]:
DATE_SINCE = '2022-06-22'
NUM_TWEETS = 1000

In [64]:
# client = get_twitter_client()
# user_tweets_dict = {}
# for user in users:
#     user_tweets_dict[user] = []

# user_tweets_dict

In [70]:
client = get_twitter_client()
user_tweets_dict = {}
for user in users:
    user_tweets_dict[user] = []

for user in users:
    
    tweets = client.user_timeline(screen_name=user,
                                 count=NUM_TWEETS,
                                 include_rts=True,
                                 tweet_mode='extended'
                                 )
    
    for tweet in tweets:
        user_tweets_dict[user].append(tweet.full_text)

user_tweets_dict

{'PeterSchiff': ['@Convertbond @EconguyRosie @Nouriel You forgot about me.  https://t.co/J4IG9nZTRC',
  "The Fed's balance sheet just expanded for the third week in a row in June. The rise of $1.9 billion increased the size of the Fed's balance sheet to $8.934 trillion. I wonder when the #Fed will stop creating #inflation by ending QE and actually start fighting it by beginning QT.",
  '@DAK1067 Based on the high inflation we already had the 10-year average is already above 2%.',
  '@goldexchangenyc Yes, but then it will happen later as apposed to sooner, and they can blame it on currency speculators and corporate greed.',
  'The #Fed can either bring #inflation down to 2%, cause a financial crisis, bank failures, stock, real estate and bond market crashes, the U.S. government to default on Treasury debt and slash Social Security benefits, while #unemployment soars, or it can tolerate high inflation.',
  "@RiskPack Not for me. It still doesn't come up at all.",
  "@pascalcharpent4 It's

In [66]:
len(user_tweets_dict['MacroAlf'])

100

In [None]:
user_tweets_dict['LynAldenContact']

In [67]:
# Pickle tweets

with open('user_tweets.txt', 'wb') as file:
    pickle.dump(user_tweets_dict, file)

In [68]:
with open('user_tweets.txt', 'rb') as file:
    data_dict = pickle.load(file)

In [26]:
len(data_dict)

3

In [71]:
# Apply first round of data cleaning

import re
import string

def clean_text_round1(text):
    '''Make text lowercase and remove punctuation'''
    
    text = text.lower()
    text = re.sub('\[.*?\]','', text)
    text = re.sub('["%s"]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub(r'http\S+', '', text)
    
    return text

data_clean_1 = {}
for user, _ in data_dict.items():
    data_clean_1[user] = []

for user, tweet_list in data_dict.items():
    for tweet in tweet_list:
        data_clean_1[user].append(clean_text_round1(tweet))

data_clean_1
        

{'PeterSchiff': ['convertbond econguyrosie nouriel you forgot about me  ',
  'the feds balance sheet just expanded for the third week in a row in june the rise of  billion increased the size of the feds balance sheet to  trillion i wonder when the fed will stop creating inflation by ending qe and actually start fighting it by beginning qt',
  ' based on the high inflation we already had the  average is already above ',
  'goldexchangenyc yes but then it will happen later as apposed to sooner and they can blame it on currency speculators and corporate greed',
  'the fed can either bring inflation down to  cause a financial crisis bank failures stock real estate and bond market crashes the us government to default on treasury debt and slash social security benefits while unemployment soars or it can tolerate high inflation',
  'riskpack not for me it still doesnt come up at all',
  ' its on rumble it some up first with only  views',
  'anjomfaramarz its from two days ago',
  'csuhartanto

In [76]:
# Apply round 2 of data cleaning
import emoji

def clean_text_round2(text):
    
    text = emoji.get_emoji_regexp().sub(r'', text)
    text = text.replace("\n", "")
    
    return text

data_clean = {}

for user, _ in data_clean_1.items():
    data_clean[user] = []

for user, tweet_list in data_clean_1.items():
    
    for tweet in tweet_list:
        data_clean[user].append(clean_text_round2(tweet))
    
data_clean['LynAldenContact']

  text = emoji.get_emoji_regexp().sub(r'', text)


['',
 'only chance i can see to reach those numbers would be to change slr and unload it on the commercial banking system',
 'nfturbo treasury markets and credit markets going illiquid seizing upcrashing markets also reduces tax receipts and increases deficits treasury issuance while the fed would be net seller of treasuries',
 'does anyone think the fed will be successful at reducing their balance sheet by over  trilliongood luck to them is all ill say ',
 'louisphdsb yes',
 'fedwire performs about  million transactions per year averaging about  million per transaction resulting in roughly  quadrillion usd in annual gross settlement value ',
 'replyingape   for lack of good money people monetize other assets and monetizing assets with a lot of necessary utility is deleterious for societywhen people perceive that their money is devaluing or that it is in a risky jurisdiction theyd rather get it in something else',
 'the gld etf was launched in  it has outperformed us treasury bond etfs

In [77]:
# Pickle cleaned tweets

with open('user_cleaned_tweets.txt', 'wb') as file:
    pickle.dump(data_clean, file)

In [78]:
with open('user_cleaned_tweets.txt', 'rb') as file:
    data_cleaned_dict = pickle.load(file)

In [79]:
data_cleaned_dict['PeterSchiff']

['convertbond econguyrosie nouriel you forgot about me  ',
 'the feds balance sheet just expanded for the third week in a row in june the rise of  billion increased the size of the feds balance sheet to  trillion i wonder when the fed will stop creating inflation by ending qe and actually start fighting it by beginning qt',
 ' based on the high inflation we already had the  average is already above ',
 'goldexchangenyc yes but then it will happen later as apposed to sooner and they can blame it on currency speculators and corporate greed',
 'the fed can either bring inflation down to  cause a financial crisis bank failures stock real estate and bond market crashes the us government to default on treasury debt and slash social security benefits while unemployment soars or it can tolerate high inflation',
 'riskpack not for me it still doesnt come up at all',
 ' its on rumble it some up first with only  views',
 'anjomfaramarz its from two days ago',
 'csuhartanto yes if you add my name 

In [81]:
df_dict = {}

for user, tweet_list in data_cleaned_dict.items():
    df_dict[user] = pd.DataFrame(tweet_list)


df_dict['LynAldenContact']

Unnamed: 0,0
0,
1,only chance i can see to reach those numbers w...
2,nfturbo treasury markets and credit markets go...
3,does anyone think the fed will be successful a...
4,louisphdsb yes
...,...
95,stablecoins are centralized
96,francispouliot esixtyone none of this creates ...
97,share your evidence for who it is happy to se...
98,francispouliot good times create weak menill l...


In [82]:
df_dict['PeterSchiff']

Unnamed: 0,0
0,convertbond econguyrosie nouriel you forgot ab...
1,the feds balance sheet just expanded for the t...
2,based on the high inflation we already had th...
3,goldexchangenyc yes but then it will happen la...
4,the fed can either bring inflation down to ca...
...,...
95,acertivoo cnbc no everyone who bought bitcoin ...
96,now that bitcoin has collapsed below and ethe...
97,pres joebiden claims high inflation isnt his f...
98,incomesharks if they dont sell eventually they...


In [83]:
for user, df in df_dict.items():
    df.columns=['tweets']

In [84]:
df_dict['PeterSchiff']

Unnamed: 0,tweets
0,convertbond econguyrosie nouriel you forgot ab...
1,the feds balance sheet just expanded for the t...
2,based on the high inflation we already had th...
3,goldexchangenyc yes but then it will happen la...
4,the fed can either bring inflation down to ca...
...,...
95,acertivoo cnbc no everyone who bought bitcoin ...
96,now that bitcoin has collapsed below and ethe...
97,pres joebiden claims high inflation isnt his f...
98,incomesharks if they dont sell eventually they...


In [95]:
data_cleaned_dict['PeterSchiff']

['convertbond econguyrosie nouriel you forgot about me  ',
 'the feds balance sheet just expanded for the third week in a row in june the rise of  billion increased the size of the feds balance sheet to  trillion i wonder when the fed will stop creating inflation by ending qe and actually start fighting it by beginning qt',
 ' based on the high inflation we already had the  average is already above ',
 'goldexchangenyc yes but then it will happen later as apposed to sooner and they can blame it on currency speculators and corporate greed',
 'the fed can either bring inflation down to  cause a financial crisis bank failures stock real estate and bond market crashes the us government to default on treasury debt and slash social security benefits while unemployment soars or it can tolerate high inflation',
 'riskpack not for me it still doesnt come up at all',
 ' its on rumble it some up first with only  views',
 'anjomfaramarz its from two days ago',
 'csuhartanto yes if you add my name 

In [97]:
# Create Document-Term Matrix for each tweet

from sklearn.feature_extraction.text import CountVectorizer

dtm_dict = {}

for user, _ in df_dict.items():
    dtm_dict[user] = []

    
    
for user, tweet_list in data_cleaned_dict.items():
    
    for tweet in tweet_list:
        try:
            cv = CountVectorizer(stop_words='english')
            split_tweet = tweet.split()
            print(user, split_tweet)
            data_cv = cv.fit_transform(split_tweet)
            data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())

            dtm_dict[user].append(data_dtm)

        except ValueError as err:
            # for empty tweets / tweets only containing stop words
            pass
# for user, df in df_dict.items():

#     cv = CountVectorizer(stop_words='english')
#     data_cv = cv.fit_transform(df.tweets)
#     data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())

#     dtm_dict[user] = data_dtm

PeterSchiff ['convertbond', 'econguyrosie', 'nouriel', 'you', 'forgot', 'about', 'me']
PeterSchiff ['the', 'feds', 'balance', 'sheet', 'just', 'expanded', 'for', 'the', 'third', 'week', 'in', 'a', 'row', 'in', 'june', 'the', 'rise', 'of', 'billion', 'increased', 'the', 'size', 'of', 'the', 'feds', 'balance', 'sheet', 'to', 'trillion', 'i', 'wonder', 'when', 'the', 'fed', 'will', 'stop', 'creating', 'inflation', 'by', 'ending', 'qe', 'and', 'actually', 'start', 'fighting', 'it', 'by', 'beginning', 'qt']
PeterSchiff ['based', 'on', 'the', 'high', 'inflation', 'we', 'already', 'had', 'the', 'average', 'is', 'already', 'above']
PeterSchiff ['goldexchangenyc', 'yes', 'but', 'then', 'it', 'will', 'happen', 'later', 'as', 'apposed', 'to', 'sooner', 'and', 'they', 'can', 'blame', 'it', 'on', 'currency', 'speculators', 'and', 'corporate', 'greed']
PeterSchiff ['the', 'fed', 'can', 'either', 'bring', 'inflation', 'down', 'to', 'cause', 'a', 'financial', 'crisis', 'bank', 'failures', 'stock', 're



MacroAlf ['michaelaarouet', 'because', 'of', 'the', 'negative', 'carry', 'in', 'shorting', 'btps', 'drawing', 'a', 'line', 'in', 'the', 'sand', 'and', 'saying', '“you', 'shouldn’t', 'cross', 'this', 'line”', 'would', 'probably', 'work', 'okaysh', 'without', 'having', 'to', 'expand', 'the', 'bs', 'by', 'muchstill', 'not', 'an', 'iron', 'clad', 'strategy…']
MacroAlf ['ill', 'talk', 'about', 'that', 'soon']
MacroAlf ['nickgiva', 'andreassteno', 'had', 'a', 'good', 'ride', 'there', 'nick', 'had', 'a', 'monthly', 'move', 'my', 'way', 'in', 'a', 'short', 'period', 'of', 'time', 'nothing', 'compared', 'to', 'your', 'schatz', 'trade', 'but', 'stilli', 'hate', 'sitting', 'on', 'a', 'negative', 'carry', 'trade', 'with', 'a', 'binary', 'outcome', 'ahead', 'of', 'me']
MacroAlf ['michaelaarouet', 'if', 'they', 'can', 'come', 'up', 'with', 'a', 'credible', 'backstop', 'id', 'say', 'first', 'round', 'effect', 'is', 'eurusd', 'rally', 'on', 'restored', 'credibility', 'and', 'focus', 'on', 'rate', 'hik



In [100]:
dtm_dict['LynAldenContact'][1]

Unnamed: 0,credit,deficits,fed,going,illiquid,increases,issuance,markets,net,nfturbo,receipts,reduces,seizing,seller,tax,treasuries,treasury,upcrashing
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
6,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [102]:
# Pickle cleaned tweets

with open('user_dtm.txt', 'wb') as file:
    pickle.dump(dtm_dict, file)