In [4]:
import pandas as pd

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Date window as ISO "YYYY-MM-DD" strings.
# NOTE(review): not referenced by the cells visible here - presumably the data
# collection range; confirm against the scraping notebook.
start, end = "2022-01-01", "2024-01-01"

# Ticker symbols handled by this notebook; the per-stock loop iterates over these.
stocks = ['AAPL', 'META', 'NVDA']

# Per-ticker list of related terms, keyed by ticker symbol.
# NOTE(review): presumably search keywords for news collection - confirm.
related_terms = {
    'AAPL': ['AAPL', 'Apple'],
    'META': ['META', 'Metaverse'],
    'NVDA': ['NVDA', 'NVIDIA'],
}

# Company URL slugs for Investing.com, keyed by ticker symbol.
url_names = {
    'AAPL': 'apple-computer-inc',
    'META': 'facebook-inc',
    'NVDA': 'nvidia-corp',
}

In [22]:
import numpy as np

def generate_sentiment_cnt_features(sentiment_df):
    """Count articles per day for each sentiment label.

    Parameters
    ----------
    sentiment_df : pandas.DataFrame
        Must contain a 'published date' column (the grouping key) and a
        'sentiment_score' column whose labels 0 / 1 / 2 are reported as the
        negative / neutral / positive counts respectively.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'published date' with the columns
        'published date', 'neg_sentiment_cnt', 'neu_sentiment_cnt' and
        'pos_sentiment_cnt'. All three count columns are always present;
        a label that never occurs in the input yields a column of zeros.
    """
    label_to_column = {0: 'neg_sentiment_cnt',
                       1: 'neu_sentiment_cnt',
                       2: 'pos_sentiment_cnt'}

    # rows: published date, columns: sentiment label, values: number of articles
    counts = (
        sentiment_df.groupby('published date')['sentiment_score']
        .value_counts()
        .unstack(-1)
    )

    # unstack() only creates columns for labels that actually occur. Add the
    # missing ones (as NaN, zero-filled below) so every stock gets the same
    # schema even if, e.g., it has no negative articles at all.
    counts = counts.reindex(columns=counts.columns.union(list(label_to_column)))

    daily_sentiment_cnt = (
        counts.reset_index()
        .rename(columns=label_to_column)
        .fillna(0)  # a (day, label) pair with no articles means a count of 0
    )
    return daily_sentiment_cnt

def generate_sentiment_prob_features(sentiment_df):
    """Average the negative / neutral / positive probabilities per day.

    Groups `sentiment_df` by 'published date', takes the mean of the
    'neg_sentiment_prob', 'neu_sentiment_prob' and 'pos_sentiment_prob'
    columns, and returns a frame with 'published date' as a regular column
    and the three means renamed with a 'mean_' prefix.
    """
    renamed = {
        'neg_sentiment_prob': 'mean_neg_sentiment_prob',
        'neu_sentiment_prob': 'mean_neu_sentiment_prob',
        'pos_sentiment_prob': 'mean_pos_sentiment_prob',
    }
    prob_columns = list(renamed)
    return (
        sentiment_df.groupby('published date')[prob_columns]
        .mean()
        .reset_index()
        .rename(columns=renamed)
    )

def _parse_sentiment_probs(raw):
    """Split a stringified probability list such as '[0.1, 0.2, 0.7]' into floats.

    Square brackets are removed, the remainder is split on commas, and the
    first three fields are returned as a (neg, neu, pos) tuple of floats.
    Raises IndexError if there are fewer than three fields and ValueError if
    a field is not a valid float literal.
    """
    fields = raw.replace('[', '').replace(']', '').strip().split(',')
    return float(fields[0]), float(fields[1]), float(fields[2])


def generate_sentiment_features(sentiment_df):
    """Build the per-day sentiment feature table from article-level rows.

    Parameters
    ----------
    sentiment_df : pandas.DataFrame
        Article-level rows with at least the columns 'published date',
        'sentiment_score' and 'sentiment_probs'. 'sentiment_probs' holds a
        string of three comma-separated probabilities in square brackets,
        ordered negative, neutral, positive.
        NOTE(review): rows are grouped on the raw 'published date' value, so
        it is assumed to already be day-granular - confirm upstream.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'published date'; the per-day sentiment counts joined with
        the per-day mean probabilities.

    The input frame is left unmodified: the three probability columns are
    added to a copy instead of being written into the caller's DataFrame.
    """
    # Parse each probability string once instead of once per output column.
    parsed = [_parse_sentiment_probs(raw) for raw in sentiment_df['sentiment_probs']]

    # assign() returns a new frame, so the caller's DataFrame is not mutated
    # (and no SettingWithCopyWarning when a filtered slice is passed in).
    with_probs = sentiment_df.assign(
        neg_sentiment_prob=[probs[0] for probs in parsed],
        neu_sentiment_prob=[probs[1] for probs in parsed],
        pos_sentiment_prob=[probs[2] for probs in parsed],
    )

    daily_sentiment_cnt = generate_sentiment_cnt_features(with_probs)
    daily_sentiment_mean_probs = generate_sentiment_prob_features(with_probs)

    # Both tables have one row per 'published date'; join them on that key.
    daily_sentiment_features = daily_sentiment_cnt.set_index('published date').join(
        daily_sentiment_mean_probs.set_index('published date')
    )
    return daily_sentiment_features

In [29]:
for stock in stocks:
    # --- News sentiment: load, de-duplicate on headline, drop incomplete rows ---
    csv = f"/content/drive/My Drive/WPI/Senior Year/CS539 (ML)/{stock}_sentiment.csv"
    df = pd.read_csv(csv)
    print(len(df))                                # rows as loaded
    print(int(df.duplicated(['title']).sum()))    # rows repeating an earlier title
    df = df.drop_duplicates(['title']).dropna()
    print(len(df))                                # rows kept for feature generation

    # Aggregate article-level rows into one feature row per published date.
    sentiment_df = generate_sentiment_features(df)

    # --- Price data for the same ticker ---
    csv = f"/content/drive/My Drive/WPI/Senior Year/CS539 (ML)/{stock}_stock.csv"
    stock_df = pd.read_csv(csv)

    # --- Join: expose the date index as a 'Date' column to match the stock CSV ---
    sentiment_df = sentiment_df.reset_index().rename(columns={'published date': 'Date'})
    # Default (inner) merge: only dates present in both tables are kept.
    merged_df = stock_df.merge(sentiment_df, on=['Date'])
    print(len(merged_df))

    merged_df.to_csv(f"/content/drive/My Drive/WPI/Senior Year/CS539 (ML)/{stock}_stock_with_sentiment.csv", index=False)




3224
1076
2104
488
