In [None]:
import torch
import pandas as pd
import re
import pickle
import emoji
import preprocessor
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from nltk.corpus import stopwords
from sklearn.preprocessing import MinMaxScaler
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

In [None]:
# Location of the tweets CSV, relative to the notebook's working directory.
TWEETS_CSV = 'raw_data/tweets.csv'
data = pd.read_csv(TWEETS_CSV)

In [None]:
# Drop the listed metadata columns and expose the tweet text under the
# shorter column name 'tweet'. Both steps return new frames, so `data`
# itself is left untouched.
df = (
    data
    .drop(columns=['media', 'inReplyToUser', 'mentionedUsers', 'lang', 'source', 'location'])
    .rename(columns={'rawContent': 'tweet'})
)

In [None]:
def convert_to_datetime(df):
    """Parse the 'date' column of ``df`` into pandas datetimes.

    The column is expected to hold text in the form ``YYYY-MM-DD HH:MM:SS``.
    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame with a 'date' column.

    Returns:
        The frame that was passed in, with 'date' converted.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    parsed_dates = pd.to_datetime(df['date'], format=timestamp_format)
    df['date'] = parsed_dates
    return df

In [None]:
def convert_emojis(string):
    """Spell out the emojis in ``string`` as plain words.

    The text is first passed through ``emoji.demojize``; afterwards every
    ':' is deleted and every '_' becomes a space. This applies to the whole
    string, not only to the parts that came from an emoji.

    Args:
        string (str): text that may contain emoji characters.

    Returns:
        str: the converted text.
    """
    demojized = emoji.demojize(string)
    # One pass over the characters: ':' is dropped, '_' is mapped to ' '.
    return demojized.translate(str.maketrans({':': None, '_': ' '}))

In [None]:
def clean_tweet(tweet):
    """Normalise raw tweet text.

    Steps, in order:
      1. remove ``http://`` / ``https://`` URLs (any letter case);
      2. turn line breaks into spaces so neighbouring words stay separated;
      3. remove ``@mentions`` made of letters, digits and underscores;
      4. remove the retweet marker ``RT`` when it is a word of its own;
      5. lower-case the text;
      6. delete the characters ``# & ' ( ) * + / : ; @ [ ] ^ ` { | } ~``;
      7. collapse whitespace runs into one space and trim both ends.

    Args:
        tweet (str): raw text.

    Returns:
        str: the cleaned, lower-cased text.
    """
    # Remove url. Must run before lower-casing and punctuation removal,
    # which would otherwise leave fragments such as "httpst.coabc".
    tweet = re.sub(r'https?://\S+', '', tweet, flags=re.IGNORECASE)

    # Replace line breaks with a space instead of deleting them, so that
    # "money\nbuys" does not become "moneybuys".
    tweet = re.sub(r'[\r\n]+', ' ', tweet)

    # Remove @mentions. \w includes '_', so "@some_user" goes away completely
    # rather than leaving "_user" behind.
    tweet = re.sub(r'@\w+', '', tweet)

    # Remove RT only when it starts at a word boundary; without \b the
    # pattern also eats the tail of words such as "SMART ".
    tweet = re.sub(r'\bRT\s+', '', tweet)

    # Convert to lower case
    tweet = tweet.lower()

    # Remove "#&'()*+/:;@[\]^`{|}~" (this also covers the hashtag sign)
    tweet = re.sub(r'[#&\'()*+/:;@\[\]^`{|}~]', '', tweet)

    # Collapse whitespace left behind by the removals and trim the ends
    tweet = re.sub(r'\s+', ' ', tweet).strip()

    return tweet

In [None]:
def min_max_normalization(df):
    """Return a copy of ``df`` whose float64/int64 columns are min-max scaled.

    The columns are rescaled with a default-configured ``MinMaxScaler``; all
    other columns are carried over unchanged. The input frame is never
    modified.

    Args:
        df: DataFrame to normalise.

    Returns:
        A new DataFrame. When ``df`` has no float64/int64 columns, or has no
        rows, the copy is returned as is.
    """
    scaled = df.copy()

    num_cols = scaled.select_dtypes(include=['float64', 'int64']).columns.tolist()

    # MinMaxScaler refuses input with zero features or zero samples, so there
    # is nothing to fit in either case.
    if not num_cols or scaled.empty:
        return scaled

    scaled[num_cols] = MinMaxScaler().fit_transform(scaled[num_cols])

    return scaled

In [None]:
# Preprocessing, in order: parse dates, clean the tweet text, spell out
# emojis, then rescale the numeric columns.
df = convert_to_datetime(df)
cleaned_tweets = df['tweet'].apply(clean_tweet)
df['tweet'] = cleaned_tweets.apply(convert_emojis)
df = min_max_normalization(df)

In [None]:
# First batch of watch-list terms, kept as one flat list of strings.
# Entries appear to come in groups of ticker symbol, company name and an
# informal or misspelled variant -- TODO confirm; note that the AMC group has
# four entries ('AMC', ..., 'AAMC', 'AC'), so the list cannot be read blindly
# in strides of three.
# Each entry is later passed through clean_tweet() and added to `keywords`.
list1 = ['AAPL', 'Apple Inc.', 'Appl', 'AMC', 'AMC Entertainment Holdings Inc.', 'AAMC', 'AC', 'AMZN', 'Amazon.com Inc.', 'Amazn', 'AMD', 'Advanced Micro Devices Inc.', 'AMED', 'BB', 'BlackBerry Ltd.', 'Blacberry', 'BBBY', 'Bed Bath & Beyond Inc.', 'Bedbathbeyond', 'BTC', 'Bitcoin', 'bit coin', 'BYND', 'Beyond Meat Inc.', 'Bynd', 'CLNE', 'Clean Energy Fuels Corp.', 'Clnenergy', 'CMG', 'Chipotle Mexican Grill Inc.', 'Chipotle', 'COST', 'Costco Wholesale Corp.', 'Coscto', 'CRSR', 'Corsair Gaming Inc.', 'Corsair', 'DIS', 'Walt Disney Co.', 'Disney', 'DOGE', 'Dogecoin', 'Doge coin', 'ET', 'Energy Transfer LP', 'Energy Transfer', 'F', 'Ford Motor Co.', 'Ford', 'FB', 'Facebook Inc.', 'Fb', 'GME', 'GameStop Corp.', 'Gamestop', 'GOOG', 'Alphabet Inc.', 'Google', 'HD', 'Home Depot Inc.', 'Home Depo', 'INTC', 'Intel Corp.', 'Intell', 'JNJ', 'Johnson & Johnson', 'Johnsohn', 'KO', 'Coca-Cola Co.', 'Coca Cola', 'LULU', 'Lululemon Athletica Inc.', 'Lululemn', 'MCD', "McDonald's Corp.", "McDonalds", 'MGM', 'MGM Resorts International', 'Mgmresorts', 'MSFT', 'Microsoft Corp.', 'Micrsoft', 'MU', 'Micron Technology Inc.', 'Micron', 'NIO', 'NIO Inc.', 'Nio', 'NVDA', 'NVIDIA Corp.', 'Nivida', 'PFE', 'Pfizer Inc.', 'Pifzer', 'PINS', 'Pinterest Inc.', 'Pintrst', 'PLTR', 'Palantir Technologies Inc.', 'Palintir', 'QQQ', 'Invesco QQQ Trust', 'InvescoQQQ', 'RBLX', 'Roblox Corp.', 'Robloks', 'RIOT', 'Riot Blockchain Inc.', 'Riott', 'ROKU', 'Roku Inc.', 'Rokue', 'SNDL', 'Sundial Growers Inc.', 'Sundail', 'SPCE', 'Virgin Galactic Holdings Inc.', 'Spacex', 'SQ', 'Square Inc.', 'Squar', 'T', 'AT&T Inc.', 'Att', 'TSLA', 'Tesla Inc.', 'Teslla', 'TWTR', 'Twitter Inc.', 'Twiter', 'UBER', 'Uber Technologies Inc.', 'Ube', 'UPST', 'Upstart Holdings Inc.', 'Upstartholdings', 'V', 'Visa Inc.', 'Visa', 'WMT', 'Walmart Inc.', 'Walmrt', 'XOM', 'Exxon Mobil Corp.', 'Exxonmobil']
# Second batch of watch-list terms, in the same flat layout as list1: each
# group appears to be ticker symbol, company name, informal or misspelled
# variant (TODO confirm).
# Each entry is later passed through clean_tweet() and added to `keywords`.
list2 = ['AAL', 'American Airlines Group Inc.', 'Americanairlines',
        'ABNB', 'Airbnb Inc.', 'AirBnB',
        'ACB', 'Aurora Cannabis Inc.', 'Auroracannabis',
        'AMRN', 'Amarin Corp. plc', 'Amerin',
        'ARKK', 'ARK Innovation ETF', 'ARKinovation',
        'BABA', 'Alibaba Group Holding Ltd.', 'Alibaba',
        'BA', 'Boeing Co.', 'Boing',
        'BAC', 'Bank of America Corp.', 'Bankofamerica',
        'BIDU', 'Baidu Inc.', 'Bido',
        'BILI', 'Bilibili Inc.', 'Billibili',
        'BLNK', 'Blink Charging Co.', 'Blinkcharg',
        'BMY', 'Bristol Myers Squibb Co.', 'Bristolmyers',
        'BRK.A', 'Berkshire Hathaway Inc.', 'Berkshira',
        'CCL', 'Carnival Corp.', 'Carnival',
        'CGC', 'Canopy Growth Corp.', 'Canopygrowth',
        'CHWY', 'Chewy Inc.', 'Chewycom',
        'CSCO', 'Cisco Systems Inc.', 'Cicsco',
        'CVS', 'CVS Health Corp.', 'Cvshealth',
        'DAL', 'Delta Air Lines Inc.', 'Deltaairlines',
        'DDOG', 'Datadog Inc.', 'DataDog',
        'DISCA', 'Discovery Inc. - Class A', 'Discovera',
        'DKNG', 'DraftKings Inc.', 'DraftKings',
        'ENPH', 'Enphase Energy Inc.', 'Enphase',
        'EQT', 'EQT Corp.', 'Eqtcorporation',
        'FCEL', 'FuelCell Energy Inc.', 'Fuelcell',
        'FSLY', 'Fastly Inc.', 'Fastly',
        'GE', 'General Electric Co.', 'GeneralElectric',
        'GM', 'General Motors Co.', 'GeneralMotors',
        'GOLD', 'Barrick Gold Corp.', 'Goldmining',
        'GPRO', 'GoPro Inc.', 'Gopro',
        'GRWG', 'GrowGeneration Corp.', 'Growgen',
        'HPE', 'Hewlett Packard Enterprise Co.', 'Hewlettpackard',
        'IBB', 'iShares NASDAQ Biotechnology ETF', 'iSharesNasdaqBiotechnology',
        'IBKR', 'Interactive Brokers Group Inc.', 'InterectiveBrokers',
        'INO', 'Inovio Pharmaceuticals Inc.', 'Inovio',
        'JD', 'JD.com Inc.', 'JD',
        'JMIA', 'Jumia Technologies AG', 'Jumia',
        'JPM', 'JPMorgan Chase & Co.', 'JPmorgan',
        'KHC', 'Kraft Heinz Co.', 'KraftHeinz',
        'LIT', 'Global X Lithium & Battery Tech ETF', 'GlobalXlithium',
        'MARA', 'Marathon Digital Holdings Inc.', 'MarathonDigital',
        'MRNA', 'Moderna Inc.', 'Modernna',
        'NCLH', 'Norwegian Cruise Line Holdings Ltd.', 'Norwegiancruiseline',
        'NET', 'Cloudflare Inc.', 'CloudFlare',
        'NFLX', 'Netflix Inc.', 'Netlfix',
        'NKE', 'Nike Inc.', 'Nike', 'NKLA', 'Nikola Corp.', 'Nicola', 'NOK', 'Nokia Corp.', 'Noka', 'O', 'Realty Income Corp.', 'Realtyincome', 'OGI', 'OrganiGram Holdings Inc.', 'Organigram', 'OTRK', 'Ontrak Inc.', 'OnTrack', 'PDD', 'Pinduoduo Inc.', 'Pinduoduo', 'PENN', 'Penn National Gaming Inc.', 'PennNational']


In [None]:
# Fragments removed from every cleaned entry, in this order, so that
# 'apple inc.' ends up as the keyword 'apple'.
suffixes = ('inc.', 'ltd.', 'co.', 'corp.', '.com')

keywords = []

# list1 and list2 get identical treatment, so they share one loop.
for word in list1 + list2:
    word = clean_tweet(word)
    for suffix in suffixes:
        word = word.replace(suffix, '')

    # Removing a suffix leaves trailing or doubled blanks ('apple ',
    # 'amarin  plc'); such a keyword would only match when followed by a
    # space, so normalise the whitespace before storing it.
    word = ' '.join(word.split())

    # Skip entries that cleaned down to nothing and entries already present
    # (e.g. "McDonald's Corp." and 'McDonalds' both become 'mcdonalds').
    if word and word not in keywords:
        keywords.append(word)

In [None]:
# Location of the top-50 CSV, relative to the notebook's working directory.
TOP50_CSV = 'raw_data/top50.csv'
top50 = pd.read_csv(TOP50_CSV)

In [None]:
# Pull the two tag columns of top50 out as plain Python lists.
cashtags, hashtags = (top50[column].tolist() for column in ('cashtags', 'hashtags'))

In [None]:
# Add every cashtag to `keywords` in normalised form: cleaned like a tweet,
# then stripped of every character that is neither a word character nor
# whitespace, then stripped of spaces.
for cashtag in cashtags:
    cashtag = re.sub(r'[^\w\s]', '', clean_tweet(cashtag)).replace(' ', '')

    # Already known keywords are not added a second time.
    if cashtag in keywords:
        continue
    keywords.append(cashtag)

In [None]:
# Add every hashtag to `keywords` in normalised form: cleaned like a tweet,
# then stripped of every character that is neither a word character nor
# whitespace, then stripped of spaces.
for hashtag in hashtags:
    hashtag = re.sub(r'[^\w\s]', '', clean_tweet(hashtag)).replace(' ', '')

    # Already known keywords are not added a second time.
    if hashtag in keywords:
        continue
    keywords.append(hashtag)

In [None]:
# Drop keywords shorter than two characters.
# The loop walks over a snapshot (list(keywords)): removing from the very list
# being iterated shifts the remaining items left, which makes the iterator
# skip the element right after each removal.
for keyword in list(keywords):
    if len(keyword) < 2:
        keywords.remove(keyword)

In [None]:
# Build one alternation out of every keyword. re.escape keeps characters such
# as '.' (e.g. in 'brk.a') literal, and blank entries are skipped because an
# empty alternative would match every tweet.
keyword_pattern = '|'.join(re.escape(term) for term in keywords if term.strip())

# Row-wise mask: True where the tweet text contains at least one keyword.
# na=False counts missing tweet text as "no match".
# NOTE(review): this is substring matching, so two-letter tickers also match
# inside longer words -- consider word boundaries if that is too permissive.
if keyword_pattern:
    has_keyword = df['tweet'].str.contains(keyword_pattern, regex=True, na=False)
else:
    has_keyword = pd.Series(False, index=df.index)

# Keep a row when it mentions a keyword or carries any cashtag / hashtag.
# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger SettingWithCopyWarning.
df_filtered = df[has_keyword | df['cashtags'].notnull() | df['hashtags'].notnull()].copy()

In [None]:
# Show the filtered frame as this cell's output.
df_filtered

In [None]:
def sentiment_analysis(df):
    """Label every tweet in ``df`` with the 'yiyanghkust/finbert-tone' model.

    Each value of the 'tweet' column is classified on its own and the label
    of its top prediction is stored, row by row, in a new 'sentiment' column.
    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a 'tweet' column of strings.

    Returns:
        The frame that was passed in, with the 'sentiment' column added.
    """
    # Nothing to classify: add the (empty) column and skip loading the model.
    if len(df) == 0:
        df['sentiment'] = pd.Series(dtype='object')
        return df

    finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
    tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
    nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)

    # Collect one label per row first and assign the column once. Assigning
    # df['sentiment'] = <label> inside the loop would broadcast that single
    # label to every row, leaving all rows with the last tweet's label.
    labels = [nlp(text)[0]['label'] for text in df['tweet']]
    df['sentiment'] = labels

    return df
  

In [None]:
# Classify the filtered tweets. The call adds a 'sentiment' column to
# df_filtered in place; the returned frame is shown as this cell's output.
sentiment_analysis(df_filtered)