In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import os

# Folder scanned for price files; every "*.csv" in it is loaded and keyed by its file name.
stock_directory = '../data/price/raw'
stock_data = {}

csv_files = [file for file in os.listdir(stock_directory) if file.endswith('.csv')]
for file in csv_files:
    # Strip only the final extension so a dotted ticker such as "BRK.A.csv"
    # keeps its full symbol instead of collapsing to "BRK".
    symbol = os.path.splitext(file)[0]
    df = pd.read_csv(os.path.join(stock_directory, file))
    df['Date'] = pd.to_datetime(df['Date'])
    # Carry the last known value forward over gaps. DataFrame.ffill() is the
    # supported spelling of the deprecated fillna(method='ffill').
    df = df.ffill()
    stock_data[symbol] = df

In [4]:
import os
import json

tweet_directory = '../data/tweet/raw'
tweets_data = {}
# Minimum number of tweet *files* a stock folder must contain to be kept.
tweet_threshold = 500

for stock_folder in os.listdir(tweet_directory):
    stock_path = os.path.join(tweet_directory, stock_folder)
    if os.path.isdir(stock_path):
        all_tweets = []
        count = 0  # files seen in this folder (not the number of tweets)
        for tweet_file in os.listdir(stock_path):
            count += 1
            file_path = os.path.join(stock_path, tweet_file)
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # Blank lines are not tweets; skip them quietly instead of
                        # reporting them as JSON errors.
                        continue
                    try:
                        tweet_json = json.loads(line)
                        tweet_data = {
                            'Date': pd.to_datetime(tweet_json['created_at']),
                            'Text': tweet_json['text'],
                            'User': tweet_json['user']['screen_name'],
                            'Followers': tweet_json['user']['followers_count'],
                            'Friends': tweet_json['user']['friends_count']
                        }
                        all_tweets.append(tweet_data)
                    except json.JSONDecodeError:
                        print(f"Error decoding JSON in file {file_path}")
                    except (KeyError, TypeError, ValueError) as err:
                        # Valid JSON that lacks an expected field or carries an
                        # unparsable date: drop that record, keep loading the rest.
                        print(f"Skipping malformed tweet in file {file_path}: {err!r}")
        # The threshold is applied to the file count, not to len(all_tweets).
        if count >= tweet_threshold:
            tweets_data[stock_folder] = pd.DataFrame(all_tweets)

In [5]:
# Keep only the symbols that also have a tweet DataFrame; insertion order is preserved.
stock_data = {
    symbol: stock_data[symbol]
    for symbol in stock_data
    if symbol in tweets_data
}

In [6]:
# From here on `stock_data` is the AAPL price DataFrame rather than the
# symbol -> DataFrame dict. The name is rebound in place, so this cell cannot be
# run a second time without first rebuilding the dict.
stock_data = stock_data['AAPL']
stock_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2012-09-04,95.108574,96.448570,94.928574,96.424286,87.121140,91973000
1,2012-09-05,96.510002,96.621429,95.657143,95.747147,86.509338,84093800
2,2012-09-06,96.167145,96.898575,95.828575,96.610001,87.288956,97799100
3,2012-09-07,96.864288,97.497147,96.538574,97.205711,87.827171,82416600
4,2012-09-10,97.207146,97.612854,94.585716,94.677139,85.542564,121999500
...,...,...,...,...,...,...,...
1253,2017-08-28,160.139999,162.000000,159.929993,161.470001,161.470001,25966000
1254,2017-08-29,160.100006,163.119995,160.000000,162.910004,162.910004,29516900
1255,2017-08-30,163.800003,163.889999,162.610001,163.350006,163.350006,27269600
1256,2017-08-31,163.639999,164.520004,163.479996,164.000000,164.000000,26785100


In [7]:
# From here on `tweets_data` is the AAPL tweet DataFrame rather than the
# symbol -> DataFrame dict. The name is rebound in place, so this cell cannot be
# run a second time without first rebuilding the dict.
tweets_data = tweets_data['AAPL']
tweets_data

Unnamed: 0,Date,Text,User,Followers,Friends
0,2015-12-29 09:33:17+00:00,"AAPL Apple, Inc. Bid Size\nhttps://t.co/Z2nP2D...",DennyT14,875,847
1,2015-12-29 18:29:22+00:00,RT @CNNMoney: What was your favorite app of th...,cyprusbiz,4841,5105
2,2015-12-29 04:13:13+00:00,"$AAPL Investor Opinions Updated Monday, Decemb...",bosocial,603,142
3,2015-12-29 17:28:49+00:00,RT @LearnBonds: Apple Inc. App Store Was Just ...,AppleNewsOutlet,99,22
4,2015-12-29 19:03:15+00:00,$AAPL I don't mean to spoil it but rather put ...,DayTradeTK,34,0
...,...,...,...,...,...
20845,2015-03-16 19:26:21+00:00,@The_Real_Fly Apple $AAPL,TheArmoTrader,8364,984
20846,2015-03-15 23:59:08+00:00,RT @TheStreet: Each Apple Watch Edition would ...,xavierpol,170,357
20847,2015-03-16 01:18:34+00:00,Weekend Review VIDEO (3/15): $SPY $IWM $QQQ $T...,JustinPulitzer,3890,236
20848,2015-03-16 01:20:05+00:00,Free Stock Options Tutorial #payoff diagrams!!...,stocknugget,2959,1437


In [8]:
from transformers import pipeline

# Load the tweet sentiment model once. A generic SST-2 pipeline used to be built
# first and was overwritten on the very next line, which only cost an extra model load.
sentiment_analyzer = pipeline("sentiment-analysis", model="StephanAkkerman/FinTwitBERT-sentiment")

# For a single text the pipeline result is indexed as result[0]['label'];
# keep just that label string for every tweet.
tweets_data['sentiment'] = tweets_data['Text'].apply(
    lambda text: sentiment_analyzer(text)[0]['label']
)

print(tweets_data.head())

2024-11-05 16:57:50.095223: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-05 16:57:50.112975: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-05 16:57:50.118436: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-05 16:57:50.132782: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Special tokens have been added in the vocabulary, mak

                       Date  \
0 2015-12-29 09:33:17+00:00   
1 2015-12-29 18:29:22+00:00   
2 2015-12-29 04:13:13+00:00   
3 2015-12-29 17:28:49+00:00   
4 2015-12-29 19:03:15+00:00   

                                                Text             User  \
0  AAPL Apple, Inc. Bid Size\nhttps://t.co/Z2nP2D...         DennyT14   
1  RT @CNNMoney: What was your favorite app of th...        cyprusbiz   
2  $AAPL Investor Opinions Updated Monday, Decemb...         bosocial   
3  RT @LearnBonds: Apple Inc. App Store Was Just ...  AppleNewsOutlet   
4  $AAPL I don't mean to spoil it but rather put ...       DayTradeTK   

   Followers  Friends sentiment  
0        875      847   NEUTRAL  
1       4841     5105   NEUTRAL  
2        603      142   NEUTRAL  
3         99       22   NEUTRAL  
4         34        0   BULLISH  


In [10]:
tweets_data

Unnamed: 0,Date,Text,User,Followers,Friends,sentiment
0,2015-12-29 09:33:17+00:00,"AAPL Apple, Inc. Bid Size\nhttps://t.co/Z2nP2D...",DennyT14,875,847,NEUTRAL
1,2015-12-29 18:29:22+00:00,RT @CNNMoney: What was your favorite app of th...,cyprusbiz,4841,5105,NEUTRAL
2,2015-12-29 04:13:13+00:00,"$AAPL Investor Opinions Updated Monday, Decemb...",bosocial,603,142,NEUTRAL
3,2015-12-29 17:28:49+00:00,RT @LearnBonds: Apple Inc. App Store Was Just ...,AppleNewsOutlet,99,22,NEUTRAL
4,2015-12-29 19:03:15+00:00,$AAPL I don't mean to spoil it but rather put ...,DayTradeTK,34,0,BULLISH
...,...,...,...,...,...,...
20845,2015-03-16 19:26:21+00:00,@The_Real_Fly Apple $AAPL,TheArmoTrader,8364,984,NEUTRAL
20846,2015-03-15 23:59:08+00:00,RT @TheStreet: Each Apple Watch Edition would ...,xavierpol,170,357,NEUTRAL
20847,2015-03-16 01:18:34+00:00,Weekend Review VIDEO (3/15): $SPY $IWM $QQQ $T...,JustinPulitzer,3890,236,NEUTRAL
20848,2015-03-16 01:20:05+00:00,Free Stock Options Tutorial #payoff diagrams!!...,stocknugget,2959,1437,NEUTRAL


In [13]:
# Persist the labelled tweets. The DataFrame index is written too, which is why
# an extra "Unnamed: 0" column appears when this file is read back.
tweets_data.to_csv('../data/tweets-sentiment-AAPL.csv')

In [28]:
# Reload the previously saved sentiment labels; the index that was written to the
# file comes back as a regular "Unnamed: 0" column.
tweets_data = pd.read_csv('../data/tweets-sentiment-AAPL.csv')
tweets_data

Unnamed: 0.1,Unnamed: 0,Date,Text,User,Followers,Friends,sentiment
0,0,2015-12-29 09:33:17+00:00,"AAPL Apple, Inc. Bid Size\nhttps://t.co/Z2nP2D...",DennyT14,875,847,NEUTRAL
1,1,2015-12-29 18:29:22+00:00,RT @CNNMoney: What was your favorite app of th...,cyprusbiz,4841,5105,NEUTRAL
2,2,2015-12-29 04:13:13+00:00,"$AAPL Investor Opinions Updated Monday, Decemb...",bosocial,603,142,NEUTRAL
3,3,2015-12-29 17:28:49+00:00,RT @LearnBonds: Apple Inc. App Store Was Just ...,AppleNewsOutlet,99,22,NEUTRAL
4,4,2015-12-29 19:03:15+00:00,$AAPL I don't mean to spoil it but rather put ...,DayTradeTK,34,0,BULLISH
...,...,...,...,...,...,...,...
20845,20845,2015-03-16 19:26:21+00:00,@The_Real_Fly Apple $AAPL,TheArmoTrader,8364,984,NEUTRAL
20846,20846,2015-03-15 23:59:08+00:00,RT @TheStreet: Each Apple Watch Edition would ...,xavierpol,170,357,NEUTRAL
20847,20847,2015-03-16 01:18:34+00:00,Weekend Review VIDEO (3/15): $SPY $IWM $QQQ $T...,JustinPulitzer,3890,236,NEUTRAL
20848,20848,2015-03-16 01:20:05+00:00,Free Stock Options Tutorial #payoff diagrams!!...,stocknugget,2959,1437,NEUTRAL


In [29]:
# Reduce tweet timestamps to calendar dates so they line up with the daily price rows.
# (The former conversion of the integer row index to dates was removed: the index
# is replaced by reset_index(drop=True) right below, so it never had any effect.)
tweets_data['Date'] = pd.to_datetime(tweets_data['Date']).dt.date
# Drop the index column that came back from the CSV. errors='ignore' keeps the
# cell re-runnable once that column is already gone.
tweets_data = tweets_data.reset_index(drop=True).drop(columns='Unnamed: 0', errors='ignore')
tweets_data

Unnamed: 0,Date,Text,User,Followers,Friends,sentiment
0,2015-12-29,"AAPL Apple, Inc. Bid Size\nhttps://t.co/Z2nP2D...",DennyT14,875,847,NEUTRAL
1,2015-12-29,RT @CNNMoney: What was your favorite app of th...,cyprusbiz,4841,5105,NEUTRAL
2,2015-12-29,"$AAPL Investor Opinions Updated Monday, Decemb...",bosocial,603,142,NEUTRAL
3,2015-12-29,RT @LearnBonds: Apple Inc. App Store Was Just ...,AppleNewsOutlet,99,22,NEUTRAL
4,2015-12-29,$AAPL I don't mean to spoil it but rather put ...,DayTradeTK,34,0,BULLISH
...,...,...,...,...,...,...
20845,2015-03-16,@The_Real_Fly Apple $AAPL,TheArmoTrader,8364,984,NEUTRAL
20846,2015-03-15,RT @TheStreet: Each Apple Watch Edition would ...,xavierpol,170,357,NEUTRAL
20847,2015-03-16,Weekend Review VIDEO (3/15): $SPY $IWM $QQQ $T...,JustinPulitzer,3890,236,NEUTRAL
20848,2015-03-16,Free Stock Options Tutorial #payoff diagrams!!...,stocknugget,2959,1437,NEUTRAL


In [30]:
# Earliest and latest calendar day present in the tweets' 'Date' column.
tweets_start_date = tweets_data['Date'].min()
tweets_end_date = tweets_data['Date'].max()

In [31]:
# Renumber the rows from zero without keeping the old index as a column.
stock_data = stock_data.reset_index(drop=True)

# Prices are daily, so plain calendar dates are enough (and match the tweets' 'Date').
stock_data['Date'] = pd.to_datetime(stock_data['Date']).dt.date
stock_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2012-09-04,95.108574,96.448570,94.928574,96.424286,87.121140,91973000
1,2012-09-05,96.510002,96.621429,95.657143,95.747147,86.509338,84093800
2,2012-09-06,96.167145,96.898575,95.828575,96.610001,87.288956,97799100
3,2012-09-07,96.864288,97.497147,96.538574,97.205711,87.827171,82416600
4,2012-09-10,97.207146,97.612854,94.585716,94.677139,85.542564,121999500
...,...,...,...,...,...,...,...
1253,2017-08-28,160.139999,162.000000,159.929993,161.470001,161.470001,25966000
1254,2017-08-29,160.100006,163.119995,160.000000,162.910004,162.910004,29516900
1255,2017-08-30,163.800003,163.889999,162.610001,163.350006,163.350006,27269600
1256,2017-08-31,163.639999,164.520004,163.479996,164.000000,164.000000,26785100


In [32]:
# Left join: every price row is kept, repeated once per tweet posted on that date;
# dates without tweets get NaN in the tweet columns.
merged_data = pd.merge(stock_data, tweets_data, how='left', on='Date')

In [33]:
# Encode the model's string labels as numbers so they can be aggregated per day.
# Values that are not keys of the mapping (including the NaN of days without
# tweets) come out as NaN, so re-running this cell on already-encoded data
# turns the whole column into NaN.
sentiment_mapping = {'NEUTRAL': 0, 'BULLISH': 1, 'BEARISH': -1}
merged_data['sentiment'] = merged_data['sentiment'].map(sentiment_mapping)

In [34]:
# Keep only the trading days inside the period covered by tweets (both ends inclusive).
merged_data = merged_data[
    merged_data['Date'].between(tweets_start_date, tweets_end_date)
]

In [35]:
def _majority_sentiment(scores):
    """Return the most frequent sentiment score of one day, or 0 (neutral) if there is none.

    The mode is computed once per group (it used to be computed twice). Series.mode()
    ignores NaN, so a day without tweets yields an empty result and maps to 0.
    NOTE(review): on a tie the first mode is taken, which is the lowest score
    because mode() returns its values sorted - confirm this bias is acceptable.
    """
    modes = scores.mode()
    return modes[0] if not modes.empty else 0


# Collapse the one-row-per-tweet frame to one row per trading day. The price
# columns are identical within a day, so the first value is representative.
aggregated_data = (
    merged_data
    .groupby('Date')
    .agg({
        **dict.fromkeys(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], 'first'),
        'sentiment': _majority_sentiment,
    })
    .reset_index()
)

# Defensive only: the helper already returns 0 for days without tweets.
aggregated_data['sentiment'] = aggregated_data['sentiment'].fillna(0)

# Save the processed dataset
aggregated_data.to_csv('../data/processed_stock_data_aapl.csv', index=False)

In [36]:
aggregated_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,sentiment
0,2013-12-31,79.167145,80.182854,79.142860,80.145714,74.571281,55771100,0.0
1,2014-01-02,79.382858,79.575714,78.860001,79.018570,73.522530,58671200,1.0
2,2014-01-03,78.980003,79.099998,77.204285,77.282860,71.907555,98116900,-1.0
3,2014-01-06,76.778572,78.114288,76.228569,77.704285,72.299644,103152700,1.0
4,2014-01-07,77.760002,77.994286,76.845711,77.148575,71.782608,79302300,1.0
...,...,...,...,...,...,...,...,...
561,2016-03-24,105.470001,106.250000,104.889999,105.669998,102.653854,26133000,0.0
562,2016-03-28,106.000000,106.190002,105.059998,105.190002,102.187561,19411400,-1.0
563,2016-03-29,104.889999,107.790001,104.879997,107.680000,104.606491,31190100,-1.0
564,2016-03-30,108.650002,110.419998,108.599998,109.559998,106.432831,45601100,1.0


In [1]:
####################### Saving the data for top 10 stocks #########################

In [6]:
import warnings
import pandas as pd
import os
import json
from transformers import pipeline
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [3]:
def load_stock_data(stock_directory):
    """Load every ``*.csv`` price file of a directory into a dict of DataFrames.

    Args:
        stock_directory: Path of the folder to scan. Files that do not end in
            ``.csv`` are ignored; each CSV must contain a ``Date`` column.

    Returns:
        dict mapping the file name without its ``.csv`` extension (used as the
        stock symbol) to a DataFrame whose ``Date`` column is parsed to
        datetimes and whose missing values are forward-filled.
    """
    stock_data = {}
    for file in os.listdir(stock_directory):
        if not file.endswith('.csv'):
            continue
        # Strip only the final extension so a dotted ticker such as "BRK.A.csv"
        # keeps its full symbol instead of colliding with other "BRK.*" files.
        symbol = os.path.splitext(file)[0]
        df = pd.read_csv(os.path.join(stock_directory, file))
        df['Date'] = pd.to_datetime(df['Date'])
        # DataFrame.ffill() is the supported spelling of the deprecated
        # fillna(method='ffill').
        stock_data[symbol] = df.ffill()
    return stock_data

In [4]:
def load_tweet_data(tweet_directory):
    """Load tweets stored as JSON lines, one sub-folder per stock.

    Every sub-directory of ``tweet_directory`` is treated as one stock; each file
    inside it is read line by line, one JSON document per line.

    Args:
        tweet_directory: Path of the folder containing the per-stock sub-folders.

    Returns:
        A ``(tweets_data, tweets_count)`` tuple:
        - ``tweets_data`` maps a folder name to a DataFrame with the columns
          ``Date``, ``Text``, ``User``, ``Followers`` and ``Friends``; folders
          without any usable tweet are left out.
        - ``tweets_count`` maps every folder name to the number of tweets that
          were parsed successfully (0 for empty folders).
    """
    tweets_count = {}
    tweets_data = {}

    for stock_folder in os.listdir(tweet_directory):
        stock_path = os.path.join(tweet_directory, stock_folder)
        if not os.path.isdir(stock_path):
            continue

        all_tweets = []
        for tweet_file in os.listdir(stock_path):
            file_path = os.path.join(stock_path, tweet_file)
            if not os.path.isfile(file_path):
                # A nested directory cannot be opened as a tweet file.
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # Blank lines are not tweets; skip them quietly instead
                        # of reporting them as JSON errors.
                        continue
                    try:
                        tweet_json = json.loads(line)
                        all_tweets.append({
                            'Date': pd.to_datetime(tweet_json['created_at']),
                            'Text': tweet_json['text'],
                            'User': tweet_json['user']['screen_name'],
                            'Followers': tweet_json['user']['followers_count'],
                            'Friends': tweet_json['user']['friends_count']
                        })
                    except json.JSONDecodeError:
                        print(f"Error decoding JSON in file {file_path}")
                    except (KeyError, TypeError, ValueError) as err:
                        # Valid JSON that lacks an expected field or carries an
                        # unparsable date: drop that record, keep loading the rest.
                        print(f"Skipping malformed tweet in file {file_path}: {err!r}")

        # One list entry per parsed tweet, so its length is the tweet count.
        tweets_count[stock_folder] = len(all_tweets)
        if all_tweets:
            tweets_data[stock_folder] = pd.DataFrame(all_tweets)

    return tweets_data, tweets_count

In [5]:
# Loaded pipelines keyed by model name, so the model is loaded once per kernel
# session instead of once per processed stock.
_SENTIMENT_ANALYZER_CACHE = {}


def _get_sentiment_analyzer(model_name="StephanAkkerman/FinTwitBERT-sentiment"):
    """Return the sentiment pipeline for `model_name`, loading it on first use only."""
    if model_name not in _SENTIMENT_ANALYZER_CACHE:
        _SENTIMENT_ANALYZER_CACHE[model_name] = pipeline("sentiment-analysis", model=model_name)
    return _SENTIMENT_ANALYZER_CACHE[model_name]


def _dominant_sentiment(scores):
    """Return the most frequent score of one day, or 0 (neutral) when there is none.

    Series.mode() ignores NaN, so a day without tweets yields an empty result.
    NOTE(review): on a tie the first mode is taken, which is the lowest score
    because mode() returns its values sorted - confirm this bias is acceptable.
    """
    modes = scores.mode()
    return modes[0] if not modes.empty else 0


def process_stock(symbol, stock_df, tweets_df, output_directory, sentiment_analyzer=None):
    """Label one stock's tweets and merge their daily sentiment with its prices.

    Args:
        symbol: Stock symbol; used only to build the output file names.
        stock_df: Price DataFrame with ``Date``, ``Open``, ``High``, ``Low``,
            ``Close``, ``Adj Close`` and ``Volume`` columns.
        tweets_df: Tweet DataFrame with at least ``Date`` and ``Text`` columns.
            It is copied, so the caller's frame is left untouched.
        output_directory: Existing folder that receives the two CSV files.
        sentiment_analyzer: Optional callable that takes one text and returns a
            sequence whose first item has a ``'label'`` key. When omitted, the
            FinTwitBERT pipeline is used and reused across calls.

    Returns:
        DataFrame with one row per trading day inside the tweets' date range:
        the price columns plus ``sentiment`` (1 bullish, -1 bearish, 0 neutral
        or no tweets).

    Side effects:
        Writes ``tweets-sentiment-<symbol>.csv`` (labelled tweets, index
        included) and ``processed_stock_data_<symbol lowercased>.csv``.
    """
    if sentiment_analyzer is None:
        sentiment_analyzer = _get_sentiment_analyzer()

    # Work on a private copy: the function adds a column and rewrites 'Date'.
    tweets_df = tweets_df.copy()
    tweets_df['sentiment'] = tweets_df['Text'].apply(
        lambda text: sentiment_analyzer(text)[0]['label']
    )

    # Save raw tweets with sentiment (before dates are reduced to calendar days)
    tweets_df.to_csv(os.path.join(output_directory, f'tweets-sentiment-{symbol}.csv'))

    # Reduce both sides to calendar dates so they can be joined day by day
    tweets_df['Date'] = pd.to_datetime(tweets_df['Date']).dt.date
    stock_df = stock_df.reset_index()
    stock_df['Date'] = pd.to_datetime(stock_df['Date']).dt.date
    if 'index' in stock_df.columns:
        stock_df = stock_df.drop(columns=['index'])

    tweets_start_date = tweets_df['Date'].min()
    tweets_end_date = tweets_df['Date'].max()

    # Left join: one row per (trading day, tweet); days without tweets keep NaN
    merged_data = stock_df.merge(tweets_df, on='Date', how='left')
    sentiment_mapping = {'NEUTRAL': 0, 'BULLISH': 1, 'BEARISH': -1}
    merged_data['sentiment'] = merged_data['sentiment'].map(sentiment_mapping)

    # Keep only the trading days covered by tweets
    in_tweet_range = (
        (merged_data['Date'] >= tweets_start_date)
        & (merged_data['Date'] <= tweets_end_date)
    )
    merged_data = merged_data[in_tweet_range]

    # Collapse to one row per day; prices are identical within a day
    aggregated_data = merged_data.groupby('Date').agg({
        'Open': 'first',
        'High': 'first',
        'Low': 'first',
        'Close': 'first',
        'Adj Close': 'first',
        'Volume': 'first',
        'sentiment': _dominant_sentiment
    }).reset_index()

    # Defensive only: _dominant_sentiment already returns 0 for days without tweets
    aggregated_data['sentiment'] = aggregated_data['sentiment'].fillna(0)

    # Save processed data
    aggregated_data.to_csv(
        os.path.join(output_directory, f'processed_stock_data_{symbol.lower()}.csv'),
        index=False
    )

    return aggregated_data

In [7]:
# Set up directories
stock_directory = '../data/price/raw'
tweet_directory = '../data/tweet/raw'
output_directory = '../data'

# Create output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Load data
print("Loading stock data...")
stock_data = load_stock_data(stock_directory)
print("Loading tweet data...")
tweets_data, tweets_count = load_tweet_data(tweet_directory)

# Get top 10 stocks by tweet count
top_stocks = dict(sorted(tweets_count.items(), key=lambda x: x[1], reverse=True)[:10])
print("\nTop 10 stocks by tweet count:")
for symbol, count in top_stocks.items():
    print(f"{symbol}: {count} tweets")

# Process each stock
processed_data = {}
for symbol in tqdm(top_stocks.keys()):
    if symbol in stock_data and symbol in tweets_data:
        processed_data[symbol] = process_stock(
            symbol, 
            stock_data[symbol], 
            tweets_data[symbol], 
            output_directory
        )
    else:
        print(f"Skipping {symbol} - missing stock or tweet data")

Loading stock data...
Loading tweet data...

Top 10 stocks by tweet count:
AAPL: 20850 tweets
FB: 12028 tweets
GOOG: 7607 tweets
AMZN: 7213 tweets
T: 5651 tweets
D: 5084 tweets
MSFT: 3601 tweets
BABA: 3432 tweets
PCLN: 2600 tweets
BAC: 2569 tweets


  0%|          | 0/10 [00:00<?, ?it/s]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 10%|█         | 1/10 [35:00<5:15:01, 2100.17s/it]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 20%|██        | 2/10 [55:38<3:32:27, 1593.46s/it]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 30%|███       | 3/10 [1:08:47<2:22:59, 1225.70s/it]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 40%|████      | 4/10 [1:21:14<1:43:42, 1037.11s/it]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 50%|█████     | 5/10 [1:30:45<1:12:24, 868.80s/it] Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained