In [1]:
import numpy as np
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

In [2]:
# Load the raw tweet export. The path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='../raw_data/twitter.csv')

In [3]:
# Rename the 'rawContent' column to 'tweet' (rename is a no-op if the column is absent).
data.rename(columns={'rawContent': 'tweet'}, inplace=True)
# Add placeholder columns, each initialised to the empty string.
# 'stock' is filled in further down; nothing in this notebook fills the price columns.
for placeholder_column in (
    'stock',
    'price_at_tweet',
    'price_1_min',
    'after_1_hour',
    'after_1_day',
    'after_1_week',
    'after_1_month',
):
    data[placeholder_column] = ''

In [4]:
def convert_to_datetime(data):
    """Parse the ``date`` column of ``data`` into pandas datetimes.

    The column is overwritten on the DataFrame that is passed in (no copy is
    made), and that same DataFrame object is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``date`` column holding ``YYYY-MM-DD HH:MM:SS`` strings.

    Returns
    -------
    pandas.DataFrame
        The input frame, with ``date`` converted.

    Raises
    ------
    ValueError
        Raised by ``pd.to_datetime`` when a value does not match the format.
    """
    parsed_dates = pd.to_datetime(data['date'], format='%Y-%m-%d %H:%M:%S')
    data['date'] = parsed_dates
    return data

In [5]:
def clean_tweet(tweet):
    """Strip Twitter-specific noise from a tweet's text.

    Removes URLs, @-mentions, '#' characters (the hashtag word itself is
    kept), stand-alone 'RT' retweet markers and newlines, then collapses runs
    of whitespace and trims the ends.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (e.g. the NaN pandas uses for a
        missing cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned text; '' for non-string input.
    """
    # Missing text arrives as float NaN / None; re.sub would raise TypeError on it.
    if not isinstance(tweet, str):
        return ''
    # Remove urls ('https?' already covers both http and https)
    tweet = re.sub(r'https?://\S+', ' ', tweet)
    # Remove \n
    tweet = re.sub(r'\n', ' ', tweet)
    # Remove @mentions; handles may contain underscores, so '_' must be in the class
    tweet = re.sub(r'@[A-Za-z0-9_]+', ' ', tweet)
    # Remove # (keep the tag text)
    tweet = re.sub(r'#', ' ', tweet)
    # Remove the retweet marker only when 'RT' is a word of its own, so that
    # words merely ending in 'RT' (e.g. 'SMART money') are left intact
    tweet = re.sub(r'\bRT\b\s+', ' ', tweet)
    # Collapse whitespace runs left behind by the removals and trim the ends
    tweet = re.sub(r'\s\s+', ' ', tweet)
    return tweet.strip()

In [6]:
# Clean the text of every tweet (URLs, mentions, '#', 'RT', extra whitespace).
data['tweet'] = data['tweet'].apply(clean_tweet)
# Parse the 'date' strings into datetimes (independent of the text cleaning above).
data = convert_to_datetime(data)

In [7]:
# Ticker symbol -> list of names/aliases (official company names plus what
# look like common misspellings) used to spot the company in tweet text.
# Each alias list is compiled into one case-sensitive, unanchored regex
# alternation (see stocks_patterns), and the dict keys are the symbols that
# get_stocks_mentioned searches for. Dict order matters downstream: when a
# tweet has no cashtag, the first symbol found in this order becomes 'stock'.
symbols_list = {
    'AAPL': ['Apple', 'Appl', 'Apple Inc.'],
    'AMC': ['AMC Entertainment Holdings Inc.', 'AAMC'],
    'AMZN': ['Amazon.com Inc.', 'Amazn', 'Amazon'],
    'AMD': ['Advanced Micro Devices Inc.', 'AMED'],
    'BB': ['BlackBerry Ltd.', 'Blacberry'],
    'BBBY': ['Bed Bath & Beyond Inc.', 'Bedbathbeyond'],
    'BTC': ['Bitcoin', 'bit coin'],
    'BYND': ['Beyond Meat Inc.', 'Bynd'],
    'CLNE': ['Clean Energy Fuels Corp.', 'Clnenergy'],
    'CMG': ['Chipotle Mexican Grill Inc.', 'Chipotle'],
    'COST': ['Costco Wholesale Corp.', 'Coscto'],
    'CRSR': ['Corsair Gaming Inc.', 'Corsair'],
    'DIS': ['Walt Disney Co.', 'Disney'],
    'DOGE': ['Dogecoin', 'Doge coin'],
    'ET': ['Energy Transfer LP', 'Energy Transfer'],
    'FB': ['Facebook Inc.', 'Fb'],
    'GME': ['GameStop Corp.', 'Gamestop'],
    'GOOG': ['Alphabet Inc.', 'Google'],
    'HD': ['Home Depot Inc.', 'Home Depo'],
    'INTC': ['Intel Corp.', 'Intell'],
    'JNJ': ['Johnson & Johnson', 'Johnsohn'],
    'KO': ['Coca-Cola Co.', 'Coca Cola'],
    'LULU': ['Lululemon Athletica Inc.', 'Lululemn'],
    'MCD': ["McDonald's Corp.", "McDonalds"],
    'MGM': ['MGM Resorts International', 'Mgmresorts'],
    'MSFT': ['Microsoft Corp.', 'Micrsoft'],
    'MU': ['Micron Technology Inc.', 'Micron'],
    'NIO': ['NIO Inc.', 'Nio'],
    'NVDA': ['NVIDIA Corp.', 'Nivida'],
    'PFE': ['Pfizer Inc.', 'Pifzer'],
    'PINS': ['Pinterest Inc.', 'Pintrst'],
    'PLTR': ['Palantir Technologies Inc.', 'Palintir'],
    'QQQ': ['Invesco QQQ Trust', 'InvescoQQQ'],
    'RBLX': ['Roblox Corp.', 'Robloks'],
    'RIOT': ['Riot Blockchain Inc.', 'Riott'],
    'ROKU': ['Roku Inc.', 'Rokue'],
    'SNDL': ['Sundial Growers Inc.', 'Sundail'],
    # NOTE(review): 'Spacex' is mapped to SPCE (Virgin Galactic) — presumably
    # deliberate because the two get confused; confirm.
    'SPCE': ['Virgin Galactic Holdings Inc.', 'Spacex'],
    'SQ': ['Square Inc.', 'Squar'],
    'TSLA': ['Tesla Inc.', 'Teslla'],
    'TWTR': ['Twitter Inc.', 'Twiter'],
    # NOTE(review): 'Ube' is a prefix of 'Uber'; because the alias regex is
    # unanchored, "Uber" is rewritten to "UBERr" — confirm this alias is intended.
    'UBER': ['Uber Technologies Inc.', 'Ube'],
    'UPST': ['Upstart Holdings Inc.', 'Upstartholdings'],
    'WMT': ['Walmart Inc.', 'Walmrt'],
    'XOM': ['Exxon Mobil Corp.', 'Exxonmobil'],
    'AAL': ['American Airlines Group Inc.', 'Americanairlines'],
    'ABNB': ['Airbnb Inc.', 'AirBnB'],
    'ACB': ['Aurora Cannabis Inc.', 'Auroracannabis'],
    'AMRN': ['Amarin Corp. plc', 'Amerin'],
    'ARKK': ['ARK Innovation ETF', 'ARKinovation'],
    'BABA': ['Alibaba Group Holding Ltd.', 'Alibaba'],
    'BA': ['Boeing Co.', 'Boing'],
    'BAC': ['Bank of America Corp.', 'Bankofamerica'],
    'BIDU': ['Baidu Inc.', 'Bido'],
    'BILI': ['Bilibili Inc.', 'Billibili'],
    'BLNK': ['Blink Charging Co.', 'Blinkcharg'],
    'BMY': ['Bristol Myers Squibb Co.', 'Bristolmyers'],
    'BRK.A': ['Berkshire Hathaway Inc.', 'Berkshira'],
    'CCL': ['Carnival Corp.', 'Carnival'],
    'CGC': ['Canopy Growth Corp.', 'Canopygrowth'],
    'CHWY': ['Chewy Inc.', 'Chewycom'],
    'CSCO': ['Cisco Systems Inc.', 'Cicsco'],
    'CVS': ['CVS Health Corp.', 'Cvshealth'],
    'DAL': ['Delta Air Lines Inc.', 'Deltaairlines'],
    'DDOG': ['Datadog Inc.', 'DataDog'],
    'DISCA': ['Discovery Inc. - Class A', 'Discovera'],
    'DKNG': ['DraftKings Inc.', 'DraftKings'],
    'ENPH': ['Enphase Energy Inc.', 'Enphase'],
    'EQT': ['EQT Corp.', 'Eqtcorporation'],
    'FCEL': ['FuelCell Energy Inc.', 'Fuelcell'],
    'FSLY': ['Fastly Inc.', 'Fastly'],
    'GE': ['General Electric Co.', 'GeneralElectric'],
    'GM': ['General Motors Co.', 'GeneralMotors'],
    'GOLD': ['Barrick Gold Corp.', 'Goldmining'],
    'GPRO': ['GoPro Inc.', 'Gopro'],
    'GRWG': ['GrowGeneration Corp.', 'Growgen'],
    'HPE': ['Hewlett Packard Enterprise Co.', 'Hewlettpackard'],
    'IBB': ['iShares NASDAQ Biotechnology ETF', 'iSharesNasdaqBiotechnology'],
    'IBKR': ['Interactive Brokers Group Inc.', 'InterectiveBrokers'],
    'INO': ['Inovio Pharmaceuticals Inc.', 'Inovio'],
    'JD': ['JD.com Inc.', 'JD'],
    'JMIA': ['Jumia Technologies AG', 'Jumia'],
    'JPM': ['JPMorgan Chase & Co.', 'JPmorgan'],
    'KHC': ['Kraft Heinz Co.', 'KraftHeinz'],
    'LIT': ['Global X Lithium & Battery Tech ETF', 'GlobalXlithium'],
    'MARA': ['Marathon Digital Holdings Inc.', 'MarathonDigital'],
    'MRNA': ['Moderna Inc.', 'Modernna'],
    'NCLH': ['Norwegian Cruise Line Holdings Ltd.', 'Norwegiancruiseline'],
    'NET': ['Cloudflare Inc.', 'CloudFlare'],
    'NFLX': ['Netflix Inc.', 'Netlfix'],
    'NKE': ['Nike Inc.', 'Nike'],
    'NKLA': ['Nikola Corp.', 'Nicola'],
    'NOK': ['Nokia Corp.', 'Noka'],
    'OGI': ['OrganiGram Holdings Inc.', 'Organigram'],
    'OTRK': ['Ontrak Inc.', 'OnTrack'],
    'PDD': ['Pinduoduo Inc.', 'Pinduoduo'],
    'PENN': ['Penn National Gaming Inc.', 'PennNational'],
    'NQ': ['NASDAQ', 'Nasdaq'],
    'SPY': ['S&P 500 ETF Trust', 'SPDRS&P500', 'SPDR', 'SPY'],
    'ES-F': ['S&P 500 E-Mini Futures', 'S&P500E-Mini', 'S&P500E', 'S&P500', 'S&P'],
    'DJIA': ['Dow Jones Industrial Average', 'DowJones', 'DowJonesIndustrialAverage', 'DowJonesIndustrial']
}

In [8]:
# Reverse lookup: alias -> ticker symbol. If the same alias were listed under
# two tickers, the later entry in symbols_list would win.
# (Not referenced by the cells visible in this notebook.)
inv_stocks = {value: key for key, values in symbols_list.items() for value in values}

# One compiled regex per ticker, matching any of that ticker's aliases.
# Alternatives are tried left to right and the first that matches wins, so the
# aliases are sorted longest-first. Otherwise a short alias shadows a longer
# one that starts with it: 'Apple' before 'Apple Inc.' turned "Apple Inc."
# into "AAPL Inc.", and 'DowJones' before 'DowJonesIndustrialAverage' left
# "DJIAIndustrialAverage". sorted() is stable, so equal-length aliases keep
# their listed order.
# Matching stays case-sensitive and unanchored, exactly as before.
stocks_patterns = {
    key: re.compile(
        "("
        + "|".join(re.escape(value) for value in sorted(values, key=len, reverse=True))
        + ")"
    )
    for key, values in symbols_list.items()
}

In [9]:
def replace_stock_names(tweet):
    """Replace company names/aliases in ``tweet`` with their ticker symbols.

    Every compiled alias pattern in the module-level ``stocks_patterns``
    mapping is applied in turn (in the mapping's iteration order); each match
    is replaced by the ticker the pattern belongs to.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        The text with every alias match replaced by its ticker.
    """
    text = tweet
    for symbol, alias_pattern in stocks_patterns.items():
        text = alias_pattern.sub(symbol, text)
    return text
# Rewrite every tweet so that company names/aliases appear as ticker symbols.
data['tweet'] = data['tweet'].apply(replace_stock_names)

In [10]:
def get_stocks_mentioned(tweet):
    """Return the ticker symbols that appear in ``tweet``.

    The symbols checked are the keys of the module-level ``stocks_patterns``
    mapping, and the result follows that mapping's iteration order.

    A symbol counts as mentioned only when it is not embedded in a longer run
    of capital letters/digits, so 'BB' is not reported for 'BBBY', 'DIS' for
    'DISCA' or 'ET' for 'MARKET'. A lowercase neighbour is tolerated, because
    replace_stock_names can leave a ticker fused to the lowercase remainder
    of a word when an alias matched only part of it.

    Parameters
    ----------
    tweet : str
        Tweet text, normally already passed through replace_stock_names.

    Returns
    -------
    list of str
        Ticker symbols found; empty when there are none.
    """
    stocks_mentioned = []
    for key in stocks_patterns:
        # re.escape: symbols such as 'BRK.A' contain regex metacharacters and
        # must be matched literally ('.' used to match any character).
        ticker_pattern = r'(?<![A-Z0-9])' + re.escape(key) + r'(?![A-Z0-9])'
        if re.search(ticker_pattern, tweet):
            stocks_mentioned.append(key)
    return stocks_mentioned
# Record, for every tweet, the list of ticker symbols found in its text.
data['stocks_without_cashtag'] = data['tweet'].apply(get_stocks_mentioned)

In [11]:
def clean_cashtag(column):
    """Turn a column of stringified cashtag lists into real Python lists.

    Each value is expected to look like ``"['AAPL', 'TSLA']"``. Brackets,
    spaces and single quotes are stripped and the remainder is split on
    commas.

    Parameters
    ----------
    column : iterable
        Raw cashtag values (strings, NaN or None).

    Returns
    -------
    list of list of str
        One list per input value. Missing values (NaN/None) and values with
        no tags ('[]', '') give an empty list.
    """
    cleaned_column = []
    for value in column:
        # Missing cashtags: float NaN or None
        if value is None or (isinstance(value, float) and np.isnan(value)):
            cleaned_column.append([])
            continue
        # Remove brackets, spaces and quotes from the string form of the value
        cleaned_value = str(value).strip('[]').replace(' ', '').replace("'", '')
        # ''.split(',') yields [''], so drop empty tokens: otherwise a tweet
        # without cashtags looks like it has one (empty) cashtag downstream.
        cleaned_column.append([tag for tag in cleaned_value.split(',') if tag])
    return cleaned_column
# Replace the raw 'cashtags' values with parsed lists of ticker strings.
data['cashtags'] = clean_cashtag(column=data['cashtags'])

In [12]:
# Choose one ticker per tweet: the first cashtag when there is one, otherwise
# the first ticker found in the text, otherwise NaN.
def _pick_stock(row):
    """Return the ticker to store in 'stock' for one row (NaN when none is available)."""
    if len(row['cashtags']) > 0:
        return row['cashtags'][0]
    if len(row['stocks_without_cashtag']) > 0:
        return row['stocks_without_cashtag'][0]
    return np.nan


data['stock'] = data.apply(_pick_stock, axis=1)

In [13]:
# Keep only the tweets that could be linked to a ticker ('stock' is not missing).
data = data.dropna(subset=['stock'])

In [14]:
# Write the processed tweets to 'data.csv' in the working directory, without the index column.
data.to_csv(path_or_buf='data.csv', index=False)