In [1]:
import pandas as pd
import re
import emoji
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import MinMaxScaler
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('../raw_data/twitter_1.csv')

In [3]:
df.rename(columns={'rawContent': 'tweet'}, inplace=True)

In [4]:
def convert_to_datetime(df):
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S')
    return df

In [5]:
def convert_emojis(string):
    text = emoji.demojize(string)
    text = text.replace(":", "").replace("_", " ")
    return text

In [6]:
def clean_tweet(tweet):
    
    # Remove url
    tweet = re.sub(r'https?:\/\/\S+', '', tweet)
    tweet = re.sub(r'http?:\/\/\S+', '', tweet)
    
    # Remove \n
    tweet = re.sub(r'\n', '', tweet)
    
    # Remove @
    tweet = re.sub(r'@[A-Za-z0-9]+', '', tweet)
    
    # Remove #
    tweet = re.sub(r'#', '', tweet)
    
    # Remove RT
    tweet = re.sub(r'RT[\s]+', '', tweet)
    
    # Convert to lower case
    tweet = tweet.lower()
    
    # Remove "#&'()*/:;@[\]^`{|}~"
    tweet = re.sub(r'[#&\'\(\)\*\+\/:;@\[\]\^`{|}~]', '', tweet)
    
    # Remove double space
    tweet = re.sub(r'\s\s+', ' ', tweet)
    
    return tweet

In [7]:
def min_max_normalization(df):
    df = df.copy()
    
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
    scaler = MinMaxScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])
    
    return df

In [8]:
df = convert_to_datetime(df)
df['tweet'] = df['tweet'].apply(clean_tweet)
df['tweet'] = df['tweet'].apply(convert_emojis)
df = min_max_normalization(df)

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


In [9]:
list1 = ['AAPL', 'Apple Inc.', 'Appl', 
         'AMC', 'AMC Entertainment Holdings Inc.', 'AAMC', 
         'AC', 'AMZN', 'Amazon.com Inc.', 'Amazn', 'AMD', 
         'Advanced Micro Devices Inc.', 'AMED', 
         'BB', 'BlackBerry Ltd.', 'Blacberry', 
         'BBBY', 'Bed Bath & Beyond Inc.', 'Bedbathbeyond', 
         'BTC', 'Bitcoin', 'bit coin', 'BYND', 'Beyond Meat Inc.', 'Bynd', 
         'CLNE', 'Clean Energy Fuels Corp.', 'Clnenergy', 
         'CMG', 'Chipotle Mexican Grill Inc.', 'Chipotle', 'COST', 
         'Costco Wholesale Corp.', 'Coscto', 'CRSR', 'Corsair Gaming Inc.', 
         'Corsair', 'DIS', 'Walt Disney Co.', 'Disney', 'DOGE', 'Dogecoin', 
         'Doge coin', 'ET', 'Energy Transfer LP', 'Energy Transfer', 'F', 
         'Ford Motor Co.', 'Ford', 'FB', 'Facebook Inc.', 'Fb', 'GME', 
         'GameStop Corp.', 'Gamestop', 'GOOG', 'Alphabet Inc.', 'Google', 
         'HD', 'Home Depot Inc.', 'Home Depo', 'INTC', 'Intel Corp.', 
         'Intell', 'JNJ', 'Johnson & Johnson', 'Johnsohn', 'KO', 
         'Coca-Cola Co.', 'Coca Cola', 'LULU', 'Lululemon Athletica Inc.', 
         'Lululemn', 'MCD', "McDonald's Corp.", "McDonalds", 'MGM', 
         'MGM Resorts International', 'Mgmresorts', 'MSFT', 'Microsoft Corp.', 
         'Micrsoft', 'MU', 'Micron Technology Inc.', 'Micron', 
         'NIO', 'NIO Inc.', 'Nio', 'NVDA', 'NVIDIA Corp.', 'Nivida', 
         'PFE', 'Pfizer Inc.', 'Pifzer', 'PINS', 'Pinterest Inc.', 
         'Pintrst', 'PLTR', 'Palantir Technologies Inc.', 'Palintir', 
         'QQQ', 'Invesco QQQ Trust', 'InvescoQQQ', 'RBLX', 
         'Roblox Corp.', 'Robloks', 'RIOT', 'Riot Blockchain Inc.', 
         'Riott', 'ROKU', 'Roku Inc.', 'Rokue', 'SNDL', 
         'Sundial Growers Inc.', 'Sundail', 'SPCE', 
         'Virgin Galactic Holdings Inc.', 'Spacex', 'SQ', 'Square Inc.', 
         'Squar', 'T', 'AT&T Inc.', 'Att', 'TSLA', 'Tesla Inc.', 
         'Teslla', 'TWTR', 'Twitter Inc.', 'Twiter', 'UBER', 
         'Uber Technologies Inc.', 'Ube', 'UPST', 'Upstart Holdings Inc.', 
         'Upstartholdings', 'V', 'Visa Inc.', 'Visa', 'WMT', 'Walmart Inc.', 
         'Walmrt', 'XOM', 'Exxon Mobil Corp.', 'Exxonmobil']
list2 = ['AAL', 'American Airlines Group Inc.', 'Americanairlines',
        'ABNB', 'Airbnb Inc.', 'AirBnB',
        'ACB', 'Aurora Cannabis Inc.', 'Auroracannabis',
        'AMRN', 'Amarin Corp. plc', 'Amerin',
        'ARKK', 'ARK Innovation ETF', 'ARKinovation',
        'BABA', 'Alibaba Group Holding Ltd.', 'Alibaba',
        'BA', 'Boeing Co.', 'Boing',
        'BAC', 'Bank of America Corp.', 'Bankofamerica',
        'BIDU', 'Baidu Inc.', 'Bido',
        'BILI', 'Bilibili Inc.', 'Billibili',
        'BLNK', 'Blink Charging Co.', 'Blinkcharg',
        'BMY', 'Bristol Myers Squibb Co.', 'Bristolmyers',
        'BRK.A', 'Berkshire Hathaway Inc.', 'Berkshira',
        'CCL', 'Carnival Corp.', 'Carnival',
        'CGC', 'Canopy Growth Corp.', 'Canopygrowth',
        'CHWY', 'Chewy Inc.', 'Chewycom',
        'CSCO', 'Cisco Systems Inc.', 'Cicsco',
        'CVS', 'CVS Health Corp.', 'Cvshealth',
        'DAL', 'Delta Air Lines Inc.', 'Deltaairlines',
        'DDOG', 'Datadog Inc.', 'DataDog',
        'DISCA', 'Discovery Inc. - Class A', 'Discovera',
        'DKNG', 'DraftKings Inc.', 'DraftKings',
        'ENPH', 'Enphase Energy Inc.', 'Enphase',
        'EQT', 'EQT Corp.', 'Eqtcorporation',
        'FCEL', 'FuelCell Energy Inc.', 'Fuelcell',
        'FSLY', 'Fastly Inc.', 'Fastly',
        'GE', 'General Electric Co.', 'GeneralElectric',
        'GM', 'General Motors Co.', 'GeneralMotors',
        'GOLD', 'Barrick Gold Corp.', 'Goldmining',
        'GPRO', 'GoPro Inc.', 'Gopro',
        'GRWG', 'GrowGeneration Corp.', 'Growgen',
        'HPE', 'Hewlett Packard Enterprise Co.', 'Hewlettpackard',
        'IBB', 'iShares NASDAQ Biotechnology ETF', 'iSharesNasdaqBiotechnology',
        'IBKR', 'Interactive Brokers Group Inc.', 'InterectiveBrokers',
        'INO', 'Inovio Pharmaceuticals Inc.', 'Inovio',
        'JD', 'JD.com Inc.', 'JD',
        'JMIA', 'Jumia Technologies AG', 'Jumia',
        'JPM', 'JPMorgan Chase & Co.', 'JPmorgan',
        'KHC', 'Kraft Heinz Co.', 'KraftHeinz',
        'LIT', 'Global X Lithium & Battery Tech ETF', 'GlobalXlithium',
        'MARA', 'Marathon Digital Holdings Inc.', 'MarathonDigital',
        'MRNA', 'Moderna Inc.', 'Modernna',
        'NCLH', 'Norwegian Cruise Line Holdings Ltd.', 'Norwegiancruiseline',
        'NET', 'Cloudflare Inc.', 'CloudFlare',
        'NFLX', 'Netflix Inc.', 'Netlfix',
        'NKE', 'Nike Inc.', 'Nike', 'NKLA', 'Nikola Corp.', 'Nicola', 'NOK', 'Nokia Corp.', 'Noka', 'O', 'Realty Income Corp.', 'Realtyincome', 'OGI', 'OrganiGram Holdings Inc.', 'Organigram', 'OTRK', 'Ontrak Inc.', 'OnTrack', 'PDD', 'Pinduoduo Inc.', 'Pinduoduo', 'PENN', 'Penn National Gaming Inc.', 'PennNational']


In [10]:
keywords = []

for word in list1:
    word = clean_tweet(word)
    word = word.replace('inc.', '').replace('ltd.', '').replace('co.', '').replace('corp.', '').replace('.com', '')
    keywords.append(word)

for word in list2:
    word = clean_tweet(word)
    word = word.replace('inc.', '').replace('ltd.', '').replace('co.', '').replace('corp.', '').replace('.com', '')
    keywords.append(word)

In [11]:
top50 = pd.read_csv('../raw_data/top50.csv')

In [12]:
cashtags = top50['cashtags'].tolist()
hashtags = top50['hashtags'].tolist()

In [13]:
for cashtag in cashtags:
    cashtag = clean_tweet(cashtag)
    cashtag = re.sub(r'[^\w\s]', '', cashtag)
    cashtag = cashtag.replace(' ', '')
    
    if cashtag not in keywords:
        keywords.append(cashtag)

In [14]:
for hashtag in hashtags:
    hashtag = clean_tweet(hashtag)
    hashtag = re.sub(r'[^\w\s]', '', hashtag)
    hashtag = hashtag.replace(' ', '')
    
    if hashtag not in keywords:
        keywords.append(hashtag)

In [15]:
for keyword in keywords:
    if len(keyword) < 2:
        keywords.remove(keyword)

In [16]:
df_filtered = df[df['tweet'].str.contains(keyword).any() | df['cashtags'].notnull() | df['hashtags'].notnull()]

In [17]:
df_filtered

Unnamed: 0,url,date,tweet,username,likeCount,replyCount,retweetCount,quoteCount,cashtags,hashtags,viewCount,1min,60min,1day,1week,stock_symbol,current_value
0,https://twitter.com/whale_alert/status/1477067...,2022-01-01 00:00:29+00:00,unlocked unlocked unlocked unlocked unlocked u...,whale_alert,0.003971,0.001852,0.004138,0.001049,,['XRP'],,,,,,,
1,https://twitter.com/whale_alert/status/1477067...,2022-01-01 00:00:29+00:00,unlocked unlocked unlocked unlocked unlocked u...,whale_alert,0.005253,0.003598,0.004562,0.002832,,['XRP'],,,,,,,
4,https://twitter.com/alphatrends/status/1477070...,2022-01-01 00:15:26+00:00,"for the year, bitcoin was up 46.5%the average ...",alphatrends,0.000555,0.000285,0.000658,0.000000,,['Bitcoin'],,,,,,,
5,https://twitter.com/whale_alert/status/1477071...,2022-01-01 00:17:20+00:00,police car light police car light police car l...,whale_alert,0.000621,0.000997,0.000752,0.000210,,"['WBTC', 'Binance']",,,,,,,
6,https://twitter.com/whale_alert/status/1477072...,2022-01-01 00:21:40+00:00,police car light police car light police car l...,whale_alert,0.000786,0.000819,0.000894,0.000105,,"['WBTC', 'Binance']",,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139463,https://twitter.com/NekozTek/status/1608946370...,2022-12-30 22:01:22+00:00,$btc positive correlation w sampp 500 = record...,NekozTek,0.000330,0.000321,0.000329,0.000000,['BTC'],['Bitcoin'],0.001325,,,,,,
139474,https://twitter.com/whale_alert/status/1608957...,2022-12-30 22:45:56+00:00,police car light police car light police car l...,whale_alert,0.000700,0.000178,0.000658,0.000000,,"['USDC', 'Coinbase']",0.021086,,,,,,
139475,https://twitter.com/whale_alert/status/1608957...,2022-12-30 22:45:57+00:00,police car light police car light police car l...,whale_alert,0.000727,0.000178,0.000376,0.000000,,"['USDC', 'Coinbase']",0.022362,,,,,,
139483,https://twitter.com/CryptoWizardd/status/16089...,2022-12-30 23:25:39+00:00,safelanding,CryptoWizardd,0.000040,0.000107,0.000000,0.000000,,['safelanding'],0.000794,,,,,,


In [41]:
small_df = df_filtered.sample(n=1000, random_state=1)

In [42]:
def sentiment_analysis(df):
    from transformers import BertTokenizer, BertForSequenceClassification, pipeline

    finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
    tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
    nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
    
    sentiment_labels = []  # create empty list to store sentiment labels
    
    for i in range(len(df)):
        tweet = df.iloc[i]['tweet']
        result = nlp(tweet)[0]
        sentiment_label = result['label']
        sentiment_labels.append(sentiment_label)  # add sentiment label to list
        
    df['sentiment'] = sentiment_labels  # assign list to new column
    
    return df 

In [43]:
sentiment_analysis(small_df)

Unnamed: 0,url,date,tweet,username,likeCount,replyCount,retweetCount,quoteCount,cashtags,hashtags,viewCount,1min,60min,1day,1week,stock_symbol,current_value,sentiment
24270,https://twitter.com/johnscharts/status/1497494...,2022-02-26 08:52:40+00:00,$mos breakout on expanding volume with strong ...,johnscharts,0.000112,0.000036,0.000000,0.000105,['MOS'],,,,,,,,,Positive
2632,https://twitter.com/PeterLBrandt/status/147943...,2022-01-07 12:55:17+00:00,in 1980 gold topped at $873 $gc finvestors kep...,PeterLBrandt,0.005266,0.002957,0.005690,0.001888,['GC_F'],,,,,,,,,Neutral
101900,https://twitter.com/NekozTek/status/1576303616...,2022-10-01 20:10:43+00:00,playing this range for $matic.i dont expect a ...,NekozTek,0.000324,0.000499,0.000941,0.000000,['MATIC'],,,,,,,,,Neutral
51956,https://twitter.com/CheddarFlow/status/1524043...,2022-05-10 15:06:40+00:00,$spy $239m dark pool printvery light activity ...,CheddarFlow,0.000178,0.000036,0.000094,0.000000,"['SPY', 'SPY']",,,,,,,,,Negative
126538,https://twitter.com/Jake__Wujastyk/status/1596...,2022-11-27 16:18:31+00:00,$tsla tsla both perspectives below.bear case-p...,Jake__Wujastyk,0.000714,0.000285,0.000705,0.000210,['TSLA'],['TSLA'],,,,,,,,Neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85501,https://twitter.com/eWhispers/status/155841004...,2022-08-13 11:08:04+00:00,earnings for the week $wmt $tgt $hd $zim $li $...,eWhispers,0.004394,0.000748,0.013168,0.007446,"['WMT', 'TGT', 'HD', 'ZIM', 'LI', 'NIU', 'SE',...",['earnings'],,,,,,,,Neutral
106905,https://twitter.com/whale_alert/status/1580790...,2022-10-14 05:21:21+00:00,"fire 95,352,674 busd 95,362,021 usd burned at ...",whale_alert,0.000568,0.000321,0.000235,0.000000,,"['BUSD', 'Binance']",,,,,,,,Neutral
63628,https://twitter.com/alphatrends/status/1535651...,2022-06-11 15:52:56+00:00,one worduglybitcoin,alphatrends,0.000998,0.000000,0.000894,0.000315,,['Bitcoin'],,,,,,,,Neutral
101480,https://twitter.com/CheddarFlow/status/1575864...,2022-09-30 15:06:46+00:00,$spy $2.4m 0dte calleyes strike 360 expiration...,CheddarFlow,0.000720,0.000570,0.000376,0.000420,['SPY'],,,,,,,,,Neutral


In [44]:
small_df['sentiment'].value_counts()

Neutral     811
Positive    131
Negative     58
Name: sentiment, dtype: int64

In [45]:
small_df.shape

(1000, 18)