In [None]:
# Install the MongoDB driver in a single step.
# - %pip (instead of !pip) installs into the environment of the running kernel.
# - The "srv" extra is what enables mongodb+srv:// connection strings; dnspython is
#   listed explicitly as well so the dependency stays visible.
%pip install -q "pymongo[srv]" dnspython



In [None]:
import pandas as pd
from pymongo import MongoClient

In [None]:
# Make Google Drive visible to this Colab runtime. Every CSV path used further
# down in the notebook lives underneath this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


# Loading the stock and sentiment data CSVs

In [None]:
# Read the two input CSVs from the mounted Drive folder:
#   stock_df     <- stocks.csv
#   sentiment_df <- stock_data_with_sentiment.csv
stock_data_path = "/content/drive/MyDrive/Final_DF/stocks.csv"
sentiment_data_path = "/content/drive/MyDrive/Final_DF/stock_data_with_sentiment.csv"

stock_df, sentiment_df = (
    pd.read_csv(csv_path) for csv_path in (stock_data_path, sentiment_data_path)
)

# Determine Association Between Stocks and Articles
For each article, choose the stock that its sentiment is attributed to: the key with the largest value in the article's `stock_names_freq` dictionary.

In [None]:
import ast


def _pick_most_relevant_stock(raw_freq):
    """Return the key with the largest value in one `stock_names_freq` cell.

    Parameters
    ----------
    raw_freq : str
        Text of a dict literal (stock -> value), parsed with `ast.literal_eval`.
        NOTE(review): the values are presumably mention counts - confirm against
        the code that produced the CSV.

    Returns
    -------
    The key whose value is largest (on ties, the first such key in dict order),
    or None when the cell is missing or the dict is empty.
    """
    if pd.isna(raw_freq):
        return None
    freq_by_stock = ast.literal_eval(raw_freq)  # parse once and reuse for both max() and its key
    if not freq_by_stock:
        # max() raises ValueError on an empty dict; leave such articles unassigned instead.
        return None
    return max(freq_by_stock, key=freq_by_stock.get)


sentiment_df['most_relevant_stock'] = sentiment_df['stock_names_freq'].apply(_pick_most_relevant_stock)

# Creating a dataframe of daily features for the entire AI sector



In [None]:
import numpy as np  # NOTE(review): not used in this cell; left in place in case other cells rely on it

# --- Article-level preparation ---------------------------------------------
# Parse the publication timestamp, drop any timezone information, and keep the
# calendar date as the key for the daily aggregation below.
sentiment_df['publishedAt'] = pd.to_datetime(sentiment_df['publishedAt']).dt.tz_localize(None)
sentiment_df['publishedAt_date'] = sentiment_df['publishedAt'].dt.date


def _parse_sentiment_probs(raw_probs):
    """Convert one '[neg, neu, pos]' string into a list of its first three floats.

    Raises ValueError when the string holds fewer than three comma-separated
    values or when a value is not a valid float.
    """
    fields = raw_probs.replace('[', '').replace(']', '').strip().split(',')
    if len(fields) < 3:
        raise ValueError(f"expected three sentiment probabilities, got {raw_probs!r}")
    return [float(field) for field in fields[:3]]


# Split sentiment_probs into one column per class (neg, neutral, pos).
# Each string is parsed once instead of once per output column.
prob_columns = ['neg_sentiment_prob', 'neu_sentiment_prob', 'pos_sentiment_prob']
parsed_probs = pd.DataFrame(
    sentiment_df['sentiment_probs'].map(_parse_sentiment_probs).tolist(),
    index=sentiment_df.index,
    columns=prob_columns,
)
for prob_column in prob_columns:
    sentiment_df[prob_column] = parsed_probs[prob_column]

# --- Daily aggregation over all articles -------------------------------------
# Number of articles per day for each sentiment_score value (0 -> neg, 1 -> neu, 2 -> pos).
# A score that never occurs on a given day yields NaN after unstack, hence fillna(0).
daily_sentiment_cnt = (
    sentiment_df.groupby('publishedAt_date')['sentiment_score']
    .value_counts()
    .unstack(-1)
    .rename(columns={0: 'neg_sentiment_cnt',
                     1: 'neu_sentiment_cnt',
                     2: 'pos_sentiment_cnt'})
    .fillna(0)
    .reset_index()
)

# Mean probability of each class across all articles published that day.
daily_sentiment_mean_probs = (
    sentiment_df.groupby('publishedAt_date')[prob_columns]
    .mean()
    .rename(columns={'neg_sentiment_prob': 'mean_neg_sentiment_prob',
                     'neu_sentiment_prob': 'mean_neu_sentiment_prob',
                     'pos_sentiment_prob': 'mean_pos_sentiment_prob'})
    .reset_index()
)

# One row per day: counts joined with mean probabilities on the date.
daily_sector_sentiment_features = (
    daily_sentiment_cnt.set_index('publishedAt_date')
    .join(daily_sentiment_mean_probs.set_index('publishedAt_date'))
)
print(daily_sector_sentiment_features)



# Creating a dataframe of daily features for each company in the AI sector

In [None]:
import numpy as np  # NOTE(review): not used in this cell; left in place in case other cells rely on it

# This cell relies on columns that earlier cells already add to sentiment_df:
#   - 'most_relevant_stock'  (stock/article association cell)
#   - 'publishedAt_date' and the three '*_sentiment_prob' columns (sector-level cell)
# They are not recomputed here; fail early with a clear message if one is missing.
company_keys = ['publishedAt_date', 'most_relevant_stock']
prob_columns = ['neg_sentiment_prob', 'neu_sentiment_prob', 'pos_sentiment_prob']
missing_columns = [
    column
    for column in company_keys + ['sentiment_score'] + prob_columns
    if column not in sentiment_df.columns
]
if missing_columns:
    raise KeyError(f"sentiment_df is missing {missing_columns}; run the earlier cells first")

# Number of articles per (day, stock) for each sentiment_score value
# (0 -> neg, 1 -> neu, 2 -> pos). Scores absent for a (day, stock) pair become
# NaN after unstack, hence fillna(0).
daily_sentiment_cnt = (
    sentiment_df.groupby(company_keys)['sentiment_score']
    .value_counts()
    .unstack(-1)
    .rename(columns={0: 'neg_sentiment_cnt',
                     1: 'neu_sentiment_cnt',
                     2: 'pos_sentiment_cnt'})
    .fillna(0)
    .reset_index()
)

# Mean probability of each class across the articles of one (day, stock) pair.
daily_sentiment_mean_probs = (
    sentiment_df.groupby(company_keys)[prob_columns]
    .mean()
    .rename(columns={'neg_sentiment_prob': 'mean_neg_sentiment_prob',
                     'neu_sentiment_prob': 'mean_neu_sentiment_prob',
                     'pos_sentiment_prob': 'mean_pos_sentiment_prob'})
    .reset_index()
)

# One row per (day, stock): counts joined with mean probabilities on both keys.
daily_company_sentiment_features = (
    daily_sentiment_cnt.set_index(company_keys)
    .join(daily_sentiment_mean_probs.set_index(company_keys))
)
print(daily_company_sentiment_features)

                                      neg_sentiment_cnt  neu_sentiment_cnt  \
publishedAt_date most_relevant_stock                                         
2024-03-03       AAPL                               0.0                3.0   
                 AI                                 2.0                2.0   
                 META                               0.0                1.0   
                 ORCL                               0.0                1.0   
                 TER                                0.0                1.0   
...                                                 ...                ...   
2024-04-02       SNPS                               0.0                0.0   
                 SQ                                 0.0                1.0   
                 TER                                0.0                1.0   
                 TSLA                               7.0                1.0   
                 ZS                                 0.0         

In [None]:
# Persist the per-day, per-company sentiment table to Drive (Drive is mounted by
# the mount cell near the top of the notebook).
# The frame is still indexed by (publishedAt_date, most_relevant_stock) here, so
# both keys are written as the leading columns of the CSV.
sentiment_data_path = "/content/drive/My Drive/WPI/Senior Year/CS539 (ML)/sentiment.csv"
daily_company_sentiment_features.to_csv(path_or_buf=sentiment_data_path)

In [None]:
# Merge daily sentiment features and stock features into one dataframe

# Prepare the sentiment features: move the (publishedAt_date, most_relevant_stock)
# keys out of the index and into ordinary columns.
# Guarded so that re-running this cell does not reset an already-flat frame again,
# which would add a stray 'index' column.
if isinstance(daily_company_sentiment_features.index, pd.MultiIndex):
    daily_company_sentiment_features = daily_company_sentiment_features.reset_index()
# print(daily_company_sentiment_features)

# Prepare stock data: give the date column the same name as in the sentiment table.
stock_df = stock_df.rename(columns={'Date': 'publishedAt_date'})
print(stock_df)



In [None]:
import pandas as pd

# Merge the stock table with the DAILY per-company sentiment features.
# Working copies are used so that sentiment_df, stock_df and
# daily_company_sentiment_features keep their column names and the earlier
# cells stay re-runnable.

# Sentiment side: make sure the keys are columns, then use the stock table's key names.
sentiment_features = daily_company_sentiment_features
if 'publishedAt_date' not in sentiment_features.columns:
    # Keys still live in the index; flatten them.
    sentiment_features = sentiment_features.reset_index()
sentiment_features = sentiment_features.rename(
    columns={'most_relevant_stock': 'Ticker', 'publishedAt_date': 'Date'}
)

# Stock side: the date column may already have been renamed to 'publishedAt_date'
# by the preparation cell; rename() is a no-op if it is still called 'Date'.
stock_features = stock_df.rename(columns={'publishedAt_date': 'Date'})

# Bring both date keys to the same dtype. The sentiment dates are datetime.date
# objects while dates read from CSV are strings; those never compare equal, so
# merging them as-is would match nothing.
# NOTE(review): assumes the stock dates carry no timezone - confirm against stocks.csv.
sentiment_features['Date'] = pd.to_datetime(sentiment_features['Date']).dt.normalize()
stock_features['Date'] = pd.to_datetime(stock_features['Date']).dt.normalize()

# Inner merge: keeps only (Date, Ticker) pairs present in both tables.
merged_df = pd.merge(stock_features, sentiment_features, on=['Date', 'Ticker'])

# Display the first few rows of the merged dataframe
print("\nMerged Data:")
print(merged_df.head())



Merged Data:
         Date Ticker   Adj Close       Close         High         Low  \
0  2024-03-04   AAPL  175.100006  175.100006   176.899994  173.789993   
1  2024-03-04   ADBE  567.940002  567.940002   576.250000  564.099976   
2  2024-03-04     AI   34.070000   34.070000    37.540001   33.810001   
3  2024-03-04   AMZN  177.580002  177.580002   180.139999  177.490005   
4  2024-03-04   ASML  998.039978  998.039978  1007.679993  987.750000   

         Open    Volume  Avg Closing Price  Unnamed: 0  index  \
0  176.149994  81510100         215.063628           6      6   
1  572.849976   2556400         215.063628           7      7   
2   36.470001  14609400         215.063628           8      8   
3  177.529999  37381500         215.063628           9      9   
4  992.599976   1433000         215.063628          10     10   

   neg_sentiment_cnt  neu_sentiment_cnt  pos_sentiment_cnt  \
0                0.0                5.0                8.0   
1                0.0            

In [None]:
# Drop the helper columns 'index' and 'Unnamed: 0' from the merged dataframe.
# They look like leftovers of a repeated reset_index() / a CSV round trip and are
# not always present, so errors='ignore' keeps this cell from raising KeyError
# when they are absent (and makes it safe to re-run).
merged_df = merged_df.drop(columns=['index', 'Unnamed: 0'], errors='ignore')

# Display the first few rows of the cleaned merged dataframe to confirm the columns are gone
print("\nCleaned Merged Data:")
print(merged_df.head())



Cleaned Merged Data:
         Date Ticker   Adj Close       Close         High         Low  \
0  2024-03-04   AAPL  175.100006  175.100006   176.899994  173.789993   
1  2024-03-04   ADBE  567.940002  567.940002   576.250000  564.099976   
2  2024-03-04     AI   34.070000   34.070000    37.540001   33.810001   
3  2024-03-04   AMZN  177.580002  177.580002   180.139999  177.490005   
4  2024-03-04   ASML  998.039978  998.039978  1007.679993  987.750000   

         Open    Volume  Avg Closing Price  neg_sentiment_cnt  \
0  176.149994  81510100         215.063628                0.0   
1  572.849976   2556400         215.063628                0.0   
2   36.470001  14609400         215.063628                4.0   
3  177.529999  37381500         215.063628                1.0   
4  992.599976   1433000         215.063628                0.0   

   neu_sentiment_cnt  pos_sentiment_cnt  mean_neg_sentiment_prob  \
0                5.0                8.0                 0.000808   
1           

In [None]:
# Save the final dataset: the cleaned stock + sentiment merge.
# NOTE(review): this cell used to write daily_company_sentiment_features, which an
# earlier cell already saves to sentiment.csv, so merged_df was never persisted.
# Confirm that the merged frame is the intended content of stock_final.csv.
# index=False: merged_df only has a positional index, which would otherwise come
# back as an 'Unnamed: 0' column when the file is read again.
sentiment_data_path = "/content/drive/MyDrive/Final_DF/stock_final.csv"
merged_df.to_csv(sentiment_data_path, index=False)