In [1]:
import json
import csv
from datetime import datetime

# Load the JSON data
with open('reddit-data.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Prepare CSV file
with open('reddit-data.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)

    # Write header
    headers = [
        'id', 'title', 'description', 'upvotes', 'downvotes', 'subreddit',
        'date', 'num_comments', 'comments_sample', 'date_parsed'
    ]
    writer.writerow(headers)

    # Process each post
    for post in data:
        # Flatten comments - take first few and count total
        comments = post.get('comments', [])
        num_comments = len(comments)
        comments_sample = " | ".join(comments[:3])  # Take first 3 comments as sample

        # Parse date for better analysis
        date_str = post.get('date', '')
        try:
            date_parsed = datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S').strftime('%Y-%m-%d %H:%M:%S')
        except:
            date_parsed = ''

        # Write row
        row = [
            post.get('id', ''),
            post.get('title', '')[:200],  # Truncate long titles
            post.get('description', '')[:300],  # Truncate long descriptions
            post.get('upvotes', 0),
            post.get('downvotes', 0),
            post.get('subreddit', ''),
            date_str,
            num_comments,
            comments_sample,
            date_parsed
        ]
        writer.writerow(row)

print("CSV file created successfully!")

CSV file created successfully!


add sentiment score to csv by finBERT

In [2]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
from datetime import datetime
import matplotlib.pyplot as plt
# import ast  # For safely evaluating string literals
import torch

# Load FinBERT model and tokenizer
finbert = BertForSequenceClassification.from_pretrained('ProsusAI/finbert', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [3]:
def get_finbert_score(text):
    if not isinstance(text, str):
        text = str(text) if text is not None else ""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = finbert(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1).squeeze()
        score = (-1 * probs[0]) + (0 * probs[1]) + (1 * probs[2])
        return score.item()


In [4]:
# Load the CSV we created earlier
df = pd.read_csv('reddit-data.csv')


In [5]:

# Analyze sentiment for titles
print("Analyzing sentiment for titles...")
df['title_sentiment'] = df['title'].apply(lambda x: get_finbert_score(x))


# Analyze sentiment for descriptions
print("Analyzing sentiment for descriptions...")
df['desc_sentiment'] = df['description'].apply(lambda x: get_finbert_score(x))


print("Analyzing sentiment for comments...")
df['comments_sentiment'] = df['comments_sample'].apply(lambda x: get_finbert_score(x))
df.head()

# Create weighted sentiment score (title 40%, description 30%, comments 30%)
df['sentiment'] = (
    df['title_sentiment'] * 0.4 +
    df['desc_sentiment'] * 0.3 +
    df['comments_sentiment'] * 0.3
)

# Save the new CSV
df.to_csv("reddit-data(finbert).csv", index=False)
print("Saved as reddit-data(finbert).csv")

Analyzing sentiment for titles...
Analyzing sentiment for descriptions...
Analyzing sentiment for comments...
Saved as reddit-data(finbert).csv


In [6]:
display(df.head())

Unnamed: 0,id,title,description,upvotes,downvotes,subreddit,date,num_comments,comments_sample,date_parsed,title_sentiment,desc_sentiment,comments_sentiment,sentiment
0,1j0w73o,Rate My Portfolio - r/Stocks Quarterly Thread ...,Please use this thread to discuss your portfol...,90,0,stocks,2025-03-01T05:00:42,5,100% S&P and bricking it | 100% 6-month tbills...,2025-03-01 05:00:42,0.914923,0.759563,0.89453,0.862197
1,1k0g9jq,"r/Stocks Daily Discussion Wednesday - Apr 16, ...",These daily discussions run from Monday to Fri...,4,0,stocks,2025-04-16T05:30:32,5,[deleted] | Business leaders [survey](https://...,2025-04-16 05:30:32,0.918951,0.91998,0.000552,0.64374
2,1k090ot,if you guys think this is close to being over ...,***US PLANS TO USE TARIFF NEGOTIATIONS TO ISO...,5160,0,stocks,2025-04-15T21:47:31,5,"Hi, you're on r/Stocks, please make sure your ...",2025-04-15 21:47:31,0.620303,-0.037889,0.853145,0.492698
3,1k0bs3n,WH: China now faces up to 245% imports,\nWhite House: China now faces up to a 245% ta...,2559,0,stocks,2025-04-16T00:17:56,5,"Hi, you're on r/Stocks, please make sure your ...",2025-04-16 00:17:56,-0.806567,-0.441198,0.890332,-0.187887
4,1k07y4h,Bloomberg reporting that Goldman Sachs adjuste...,[https://www.bloomberg.com/news/articles/2025-...,3356,0,stocks,2025-04-15T20:51:59,5,"""Trade wars are good and easy to win.""\n\n\-Do...",2025-04-15 20:51:59,0.005843,0.092783,0.010675,0.033375


In [7]:
# read two files and combine

import pandas as pd

# Load the first CSV file
df1 = pd.read_csv('AUGMENTED_entityMask_merged_news_stock(finbert).csv')
# Rename finbert_score to news_score_finbert
df1 = df1.rename(columns={'finbert_score': 'news_score_finbert'})

# Load the second CSV file
df2 = pd.read_csv('reddit-data(finbert).csv')
# Rename sentiment to social_score_finbert
df2 = df2.rename(columns={'sentiment': 'social_score_finbert'})

# Convert date columns to datetime format for proper merging
df1['date'] = pd.to_datetime(df1['date']).dt.date
df2['date'] = pd.to_datetime(df2['date']).dt.date

# Get the date range from the first table
min_date = df1['date'].min()
max_date = df1['date'].max()

# Filter the second table to only include dates within the first table's range
df2 = df2[(df2['date'] >= min_date) & (df2['date'] <= max_date)]


# Merge the dataframes on date
# We'll take all columns from df1 and only the social_score_finbert from df2
merged_df = pd.merge(
    df1,
    df2[['date', 'social_score_finbert']],
    on='date',
    how='left'
)

# Fill NaN values with 0
merged_df = merged_df.fillna(0)

# Save the merged dataframe to a new CSV file
merged_df.to_csv('news_socialmedia_merged_data(finbert).csv', index=False)

print("Files merged and saved as 'news_socialmedia_merged_data(finbert).csv'")

Files merged and saved as 'news_socialmedia_merged_data(finbert).csv'


In [8]:
merged_df.head()

Unnamed: 0,ticker,title,summary,sentiment,score,date,open,high,low,close,volume,news_score_finbert,social_score_finbert
0,AAPL,Mag 7 Earnings Preview: What Can Investors Exp...,We get into the heart of the Q1 earnings seaso...,Neutral,0.075539,2025-04-25,206.365,209.75,206.2,209.28,38222258,-0.851334,0.0
1,AAPL,Should You Invest in Bitcoin Now?,Bitcoin has broken away from the NASDAQ and is...,Somewhat-Bullish,0.21674,2025-04-25,206.365,209.75,206.2,209.28,38222258,0.59174,0.0
2,AAPL,Apple Plans iPhone Shift From China To India A...,Apple plans to import most U.S. iPhones from I...,Neutral,0.124279,2025-04-25,206.365,209.75,206.2,209.28,38222258,0.480554,0.0
3,AAPL,Earnings Data Deluge,Pre-market futures are in the red slightly thi...,Neutral,0.047797,2025-04-25,206.365,209.75,206.2,209.28,38222258,-0.917026,0.0
4,AAPL,Comparing Apple With Industry Competitors In T...,In today's rapidly changing and highly competi...,Somewhat-Bullish,0.260158,2025-04-25,206.365,209.75,206.2,209.28,38222258,-0.916217,0.0
