In [1]:
import sys, os, pickle
import tweepy as tw
import pandas as pd

In [2]:
def twitter_auth():
    try:
        consumer_key = os.environ['TWITTER_API_KEY']
        consumer_secret = os.environ['TWITTER_API_SECRET']
        access_token = os.environ['TWITTER_API_ACCESS_TOKEN']
        access_secret=os.environ['TWITTER_API_ACCESS_TOKEN_SECRET']
    except KeyError:
        sys.stderr.write("Environment variable not set\n")
        sys.exit(1)
    
    auth = tw.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    
    return auth

In [3]:
def get_twitter_client():
    auth = twitter_auth()
    client = tw.API(auth, wait_on_rate_limit=True)
    return client

In [17]:
SEARCH_TERM = "$CHPT"
date_since = '2022-06-24'
NUM_TWEETS = 1000

In [18]:
client = get_twitter_client()
tweets = tw.Cursor(client.search_tweets, q=SEARCH_TERM, lang='en', since=date_since).items(NUM_TWEETS)
saved_tweets = {}

for tweet in tweets:
    saved_tweets[tweet.user.screen_name] = tweet.text

Unexpected parameter: since
Unexpected parameter: since
Unexpected parameter: since


In [19]:
saved_tweets

{'hunter62046908': "$CHPT ~'Top analyst price target today. https://t.co/OsCHYh8F00",
 'Gambiste1': 'RT @GinoDeLaRose: ◦#LetsGo $SPY $AMC $GME $MULN $HYMC $BBIG $RDBX  $SNDL $TWTR $CALA $NILE $BBBY $LCID $HSTO $GGPI $CENN $CEI $OP $CHPT $DW…',
 'GinoDeLaRose': '◦#LetsGo $SPY $AMC $GME $MULN $HYMC $BBIG $RDBX  $SNDL $TWTR $CALA $NILE $BBBY $LCID $HSTO $GGPI $CENN $CEI $OP… https://t.co/iksnegEUIr',
 'EssaronyMoha': '$CHPT 👌✌️ https://t.co/6p5lEO4n4r',
 'shortvolumes': 'Short sale volume (not short interest) for $NET at 2022-06-23 is 59%. https://t.co/JxgEjyWt5B $JNPR 66% $TCRT 58% $LUV 58% $CHPT 55%',
 'trackfunds': 'Increase in No. Funds Holding:\n\n$VALE = 57\n$DTE = 39\n$BA = 33\n$CHPT = 17\n$RTM = 17\n$CUZ = 7\n$JBLU = 5\n\nSee Which Fu… https://t.co/H4KFezdO03',
 'SurrenB': 'RT @AimanBbt: Tough market recently, not many nice daily setups, played $CHPT for potential momentum which sadly failed! -0.5R\n\n@BearBullTr…',
 'RustyCas': '@RevShark Not sure who’ll be building them, but EV’

In [7]:
# Pickle tweets

with open('tweets.txt', 'wb') as file:
    pickle.dump(saved_tweets, file)

In [8]:
with open('tweets.txt', 'rb') as file:
    data = pickle.load(file)

In [9]:
len(data)

107

In [10]:
# Apply first round of data cleaning

import re
import string

def clean_text_round1(text):
    '''Make text lowercase and remove punctuation'''
    
    text = text.lower()
    text = re.sub('\[.*?\]','', text)
    text = re.sub('["%s"]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub(r'http\S+', '', text)
    
    return text

data_clean = {}

for user, tweet in data.items():
    data_clean[user] = clean_text_round1(tweet)

In [11]:
data_clean

{'hunter62046908': 'chpt top analyst price target today ',
 'Gambiste1': 'rt isentiment idex ok 👇👇👇 \n\nchpt blnk muln evgo 📈 ',
 'GinoDeLaRose': '◦letsgo spy amc gme muln hymc bbig rdbx  sndl twtr cala nile bbby lcid hsto ggpi cenn cei op… ',
 'EssaronyMoha': 'chpt 👌✌️ ',
 'shortvolumes': ' short sale volume not short interest for chpt is   w  xhb  wve  mcd ',
 'trackfunds': 'no of funds increasing holding\no  \ngnrc  \nchpt  \nbros  \ncns  \natip  \ncut  \nsee w… ',
 'SurrenB': 'rt aimanbbt tough market recently not many nice daily setups played chpt for potential momentum which sadly failed \n\nbearbulltr…',
 'RustyCas': 'revshark not sure who’ll be building them but ev’s are inevitable don’t hear much talk about ev infrastructure chpt',
 'MC_OptionTrades': 'chpt makes an outsized move  the  option straddle expiring tomorrow implies a ± move ',
 'tiedyejamber': 'rt explorertx form  on deck for sirc form has been in the hands of auditors expect the filing any day now undervalued is a

In [12]:
# Round 2 cleaning

import emoji

def clean_text_round2(text):
    
    text = emoji.get_emoji_regexp().sub(r'', text)
    text = text.replace('\n', "")
    
    return text

data_clean_new = {}

for user, tweet in data_clean.items():
    data_clean_new[user] = [clean_text_round2(tweet)]
    
data_clean_new

  text = emoji.get_emoji_regexp().sub(r'', text)


{'hunter62046908': ['chpt top analyst price target today '],
 'Gambiste1': ['rt isentiment idex ok  chpt blnk muln evgo  '],
 'GinoDeLaRose': ['◦letsgo spy amc gme muln hymc bbig rdbx  sndl twtr cala nile bbby lcid hsto ggpi cenn cei op… '],
 'EssaronyMoha': ['chpt  '],
 'shortvolumes': [' short sale volume not short interest for chpt is   w  xhb  wve  mcd '],
 'trackfunds': ['no of funds increasing holdingo  gnrc  chpt  bros  cns  atip  cut  see w… '],
 'SurrenB': ['rt aimanbbt tough market recently not many nice daily setups played chpt for potential momentum which sadly failed bearbulltr…'],
 'RustyCas': ['revshark not sure who’ll be building them but ev’s are inevitable don’t hear much talk about ev infrastructure chpt'],
 'MC_OptionTrades': ['chpt makes an outsized move  the  option straddle expiring tomorrow implies a ± move '],
 'tiedyejamber': ['rt explorertx form  on deck for sirc form has been in the hands of auditors expect the filing any day now undervalued is an underst…']

In [13]:
data_df = pd.DataFrame.from_dict(data_clean_new).transpose()
data_df.columns = ['tweets']
data_df

Unnamed: 0,tweets
hunter62046908,chpt top analyst price target today
Gambiste1,rt isentiment idex ok chpt blnk muln evgo
GinoDeLaRose,◦letsgo spy amc gme muln hymc bbig rdbx sndl ...
EssaronyMoha,chpt
shortvolumes,short sale volume not short interest for chpt...
...,...
Snakecase_,chpt from zarickman
zarickman,snakecase chpt
BullishCesar100,mrmikeinvesting youre not into chpt
TraderJike,chpt chart suggesting a possible move down to


In [14]:
data_df.to_pickle('clean_corpus.pkl')