In [194]:
import warnings
# Install a global "ignore" filter so no Python warning is shown for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used
# below — confirm that a blanket ignore (rather than a category-specific one) is intended.
warnings.filterwarnings("ignore")

In [195]:
import pandas as pd
import os

stock_directory = 'data/price/raw'
stock_data = {}

# Build a {symbol: price DataFrame} mapping from every CSV file in the price directory.
# sorted() makes the load order (and therefore the dict order) deterministic,
# because os.listdir() returns entries in arbitrary order.
csv_files = sorted(file for file in os.listdir(stock_directory) if file.endswith('.csv'))
for file in csv_files:
    # splitext removes only the trailing ".csv", so a dotted symbol such as
    # "BRK.A.csv" keeps its full name instead of being truncated to "BRK".
    symbol = os.path.splitext(file)[0]
    df = pd.read_csv(os.path.join(stock_directory, file))
    df['Date'] = pd.to_datetime(df['Date'])
    # Forward-fill missing values; DataFrame.ffill() replaces the deprecated
    # fillna(method='ffill', inplace=True) spelling.
    df = df.ffill()
    stock_data[symbol] = df

In [196]:
# Display the price frame that was loaded for the WFC symbol.
stock_data['WFC']

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2012-09-04,34.040001,34.130001,33.730000,33.799999,29.324499,14717600
1,2012-09-05,33.840000,34.009998,33.740002,33.750000,29.281118,16121100
2,2012-09-06,33.980000,34.950001,33.970001,34.840000,30.226799,32143100
3,2012-09-07,34.959999,35.189999,34.740002,35.000000,30.365608,26773900
4,2012-09-10,34.869999,34.939999,34.590000,34.590000,30.009895,19616700
...,...,...,...,...,...,...,...
1253,2017-08-28,51.950001,52.020000,51.419998,51.630001,51.630001,12425900
1254,2017-08-29,51.209999,51.500000,51.130001,51.419998,51.419998,10715500
1255,2017-08-30,51.459999,51.740002,51.200001,51.360001,51.360001,11427500
1256,2017-08-31,51.000000,51.240002,50.869999,51.070000,51.070000,25231100


In [197]:
import os
import json

tweet_directory = 'data/tweet/raw'
tweets_data = {}
# Minimum number of entries (tweet files) a stock folder must contain to be kept.
tweet_threshold = 500

# Build a {stock folder name: tweet DataFrame} mapping. Every file inside a stock
# folder is read as JSON Lines (one tweet object per line).
for stock_folder in os.listdir(tweet_directory):
    stock_path = os.path.join(tweet_directory, stock_folder)
    if not os.path.isdir(stock_path):
        continue

    all_tweets = []
    count = 0  # number of entries seen in this stock folder
    for tweet_file in os.listdir(stock_path):
        count += 1
        file_path = os.path.join(stock_path, tweet_file)
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines carry no tweet; skip them instead of reporting a JSON error.
                    continue
                try:
                    tweet_json = json.loads(line)
                    tweet_data = {
                        'Date': pd.to_datetime(tweet_json['created_at']),
                        'Text': tweet_json['text'],
                        'User': tweet_json['user']['screen_name'],
                        'Followers': tweet_json['user']['followers_count'],
                        'Friends': tweet_json['user']['friends_count']
                    }
                except json.JSONDecodeError:
                    print(f"Error decoding JSON in file {file_path}")
                    continue
                except (KeyError, TypeError) as exc:
                    # A record without the expected fields should not abort the whole load.
                    print(f"Skipping tweet with missing field ({exc!r}) in file {file_path}")
                    continue
                all_tweets.append(tweet_data)

    # The threshold is applied to the number of files in the folder, not to the
    # number of individual tweets collected from them.
    if count >= tweet_threshold:
        tweets_data[stock_folder] = pd.DataFrame(all_tweets)

In [198]:
# Restrict the price data to symbols that also have a tweet frame, keeping the
# existing ordering of stock_data.
symbols_with_tweets = [symbol for symbol in stock_data if symbol in tweets_data]
stock_data = {symbol: stock_data[symbol] for symbol in symbols_with_tweets}

In [199]:
# Number of symbols currently held in stock_data.
len(stock_data)

27

In [200]:
# Reduce every tweet timestamp to its calendar date. Each per-stock frame is
# modified in place, so the dictionary entries themselves do not change.
for stock in tweets_data:
    df = tweets_data[stock]
    df['Date'] = pd.to_datetime(df['Date']).dt.date

In [201]:
# Collapse each stock's tweets to one row per date; every other column becomes
# the list of that date's values.
list_columns = ('Text', 'User', 'Followers', 'Friends')
for stock in list(tweets_data):
    df = tweets_data[stock]
    per_day = df.groupby('Date').agg({column: list for column in list_columns})
    grouped = per_day.reset_index()

    tweets_data[stock] = grouped

In [202]:
# Count the dates that appear in both the BA price frame and the BA tweet frame.
# Both Date columns are first converted to pandas datetimes so the values compare equal.
ba_prices = stock_data['BA']
ba_prices['Date'] = pd.to_datetime(ba_prices['Date'])
ba_tweets = tweets_data['BA']
ba_tweets['Date'] = pd.to_datetime(ba_tweets['Date'])

price_days = set(ba_prices['Date'])
tweet_days = set(ba_tweets['Date'])
common_dates = pd.Series(list(price_days & tweet_days))

common_dates_count = len(common_dates)

print("Count of common dates:", common_dates_count)

Count of common dates: 373


In [203]:
# Attach each stock's tweet columns to its price rows, matching on Date.
# how='left' keeps every price row; rows with no matching tweet date get NaN
# in the tweet columns.
for stock in stock_data.keys():
    if stock in tweets_data:

        stock_data[stock]['Date'] = pd.to_datetime(stock_data[stock]['Date'])
        tweets_data[stock]['Date'] = pd.to_datetime(tweets_data[stock]['Date'])

        # Drop tweet columns left behind by an earlier run of this cell. Without this,
        # re-running the cell would merge them a second time and pandas would emit
        # suffixed duplicates (Text_x / Text_y), breaking later ['Text'] lookups.
        tweet_columns = [column for column in tweets_data[stock].columns if column != 'Date']
        price_frame = stock_data[stock].drop(columns=tweet_columns, errors='ignore')

        merged_data = price_frame.merge(tweets_data[stock], on='Date', how='left')

        stock_data[stock] = merged_data

In [204]:
# Keep only the BA rows whose Text entry is not missing, then display them.
ba_frame = stock_data['BA']
has_tweets = ba_frame['Text'].notna()
filtered_data = ba_frame.loc[has_tweets]
filtered_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Text,User,Followers,Friends
333,2014-01-02,136.009995,137.250000,135.509995,136.669998,123.228493,3366700,[ARROM&amp;¨%#@$BA http://t.co/2Osw4Xl9GF],[bowsjauregui],[1748],[835]
334,2014-01-03,137.059998,138.500000,137.050003,137.619995,124.085045,3177400,[$BA Jan. 3 Premarket Briefing: 10 Things You ...,[CordiaBranam],[156],[0]
335,2014-01-06,139.399994,139.759995,137.800003,138.410004,124.797371,4196500,[$BA [video] Why Would Zynga Take Bitcoin? htt...,"[MonteMose, CordiaBranam, sfef84, stockwire24]","[3, 153, 225, 931]","[0, 0, 201, 1]"
336,2014-01-07,138.580002,141.100006,138.500000,140.509995,126.690819,4238500,[The Boeing Company (BA): Boeing logs record n...,"[stockwire24, TheStockHerald, stocknews77]","[932, 9, 207]","[1, 0, 6]"
337,2014-01-08,140.690002,141.399994,139.360001,140.820007,126.970329,4236100,[The Boeing Company : Boeing stays top planema...,[stockwire24],[935],[1]
...,...,...,...,...,...,...,...,...,...,...,...
891,2016-03-22,135.139999,136.100006,134.779999,135.119995,129.024033,4989000,[RSALAZAR: OTC $TPAC with NYSE $BA https://t.c...,[WallStreetPenni],[11017],[2328]
892,2016-03-23,134.990005,135.000000,132.740005,132.860001,126.865990,4361200,"[First Citizens Bank &amp; Trust Co. Sells 7,0...",[AmericanBanking],[6044],[2296]
895,2016-03-29,129.830002,130.929993,128.179993,130.880005,124.975327,4910200,[$BA $AAMRQ:\n\nEgypt Hijacking Without Passen...,"[ProVesting, _aerospace, _aerospace, _aerospac...","[730, 40, 40, 39, 730, 40, 40]","[6, 3, 3, 3, 6, 28, 28]"
896,2016-03-30,131.479996,132.729996,128.020004,128.580002,122.779083,5949000,[Dow Analysts Forecast 44.21% More Gain With I...,"[SA_IncomeInvest, InvestorPlace, KanikaSikka, ...","[1819, 5986, 529, 529, 2, 9, 9, 200, 200]","[31, 657, 874, 874, 22, 1, 1, 27, 27]"
