In [1]:
import json
import csv
from datetime import datetime

# Flatten the scraped Reddit posts (a JSON list of post dicts) into a CSV with
# one row per post, keeping a short sample of comments and a normalised date.

# Load the JSON data
with open('reddit-data.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Prepare CSV file
with open('reddit-data.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)

    # Write header
    headers = [
        'id', 'title', 'description', 'upvotes', 'downvotes', 'subreddit',
        'date', 'num_comments', 'comments_sample', 'date_parsed'
    ]
    writer.writerow(headers)

    # Process each post
    for post in data:
        # `or []` also covers a key that is present but null in the JSON,
        # which `.get(key, default)` alone would pass through as None.
        comments = post.get('comments') or []
        num_comments = len(comments)
        # First 3 comments as a sample; str() keeps a non-string entry from
        # breaking the join.
        comments_sample = " | ".join(str(comment) for comment in comments[:3])

        # Parse date for better analysis
        date_str = post.get('date', '')
        try:
            date_parsed = datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S').strftime('%Y-%m-%d %H:%M:%S')
        except (ValueError, TypeError):
            # ValueError: text does not match the expected format.
            # TypeError: the value is not a string (e.g. null).
            # Anything else (KeyboardInterrupt included) is allowed to propagate.
            date_parsed = ''

        # Write row
        row = [
            post.get('id', ''),
            str(post.get('title') or '')[:200],  # Truncate long titles; null -> ''
            str(post.get('description') or '')[:300],  # Truncate long descriptions; null -> ''
            post.get('upvotes', 0),
            post.get('downvotes', 0),
            post.get('subreddit', ''),
            date_str,
            num_comments,
            comments_sample,
            date_parsed
        ]
        writer.writerow(row)

print("CSV file created successfully!")

CSV file created successfully!


Add sentiment scores to the CSV using FinBERT

In [22]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
from datetime import datetime
import matplotlib.pyplot as plt
# import ast  # For safely evaluating string literals
import torch

# Load the FinBERT-tone checkpoint (downloaded or read from the local cache):
# the tokenizer first, then the sequence classifier configured for 3 labels.
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
finbert = BertForSequenceClassification.from_pretrained(
    'yiyanghkust/finbert-tone',
    num_labels=3,
)


In [37]:
def get_finbert_score(text):
    """Return a single FinBERT tone score for ``text``.

    The score is ``P(positive) - P(negative)`` taken from the softmax over
    the classifier's logits, so it lies in ``[-1, 1]``.

    Args:
        text: The text to score. Non-string values are converted with
            ``str()``. ``None``, float NaN (what pandas produces for an
            empty CSV cell) and blank strings are treated as "no text".

    Returns:
        float: The tone score, or ``0.0`` when there is no text to score.
    """
    # Missing value: None, or NaN (the only float that is not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return 0.0
    if not isinstance(text, str):
        text = str(text)
    if not text.strip():
        return 0.0

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = finbert(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1).squeeze()

    # Look the class positions up in the model config instead of hard-coding
    # them. If the config carries no readable names, fall back to the order
    # given on the yiyanghkust/finbert-tone model card:
    # 0 = neutral, 1 = positive, 2 = negative.
    id2label = getattr(finbert.config, "id2label", None) or {}
    index_by_label = {str(name).lower(): int(idx) for idx, name in id2label.items()}
    positive_idx = index_by_label.get("positive", 1)
    negative_idx = index_by_label.get("negative", 2)

    score = probs[positive_idx] - probs[negative_idx]
    return score.item()


In [38]:
# Read the flattened Reddit posts back from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='reddit-data.csv')


In [42]:

# Score each text field of every post with FinBERT, blend the three scores
# into one weighted `sentiment` column, and write the result to a new CSV.

# Relative weight of each field in the combined score
# (title 40%, description 30%, comments 30%)
TITLE_WEIGHT = 0.4
DESC_WEIGHT = 0.3
COMMENTS_WEIGHT = 0.3

# (label used in the progress message, source column, output column)
sentiment_fields = [
    ('titles', 'title', 'title_sentiment'),
    ('descriptions', 'description', 'desc_sentiment'),
    ('comments', 'comments_sample', 'comments_sentiment'),
]

for label, source_col, score_col in sentiment_fields:
    print(f"Analyzing sentiment for {label}...")
    # Pass the scorer directly; wrapping it in a lambda adds nothing.
    df[score_col] = df[source_col].apply(get_finbert_score)

# Create weighted sentiment score
df['sentiment'] = (
    df['title_sentiment'] * TITLE_WEIGHT +
    df['desc_sentiment'] * DESC_WEIGHT +
    df['comments_sentiment'] * COMMENTS_WEIGHT
)

# Save the new CSV
df.to_csv("reddit-data(finbert).csv", index=False)
print("Saved as reddit-data(finbert).csv")

Analyzing sentiment for titles...
Analyzing sentiment for descriptions...
Analyzing sentiment for comments...
Saved as reddit-data(finbert).csv


In [46]:
# Read the news/stock table and the Reddit sentiment table, then attach a
# daily Reddit sentiment score to every news row.

import pandas as pd

# Load the first CSV file
df1 = pd.read_csv('merged_news_stock(finbert).csv')
# Rename finbert_score to news_score_finbert
df1 = df1.rename(columns={'finbert_score': 'news_score_finbert'})

# Load the second CSV file
df2 = pd.read_csv('reddit-data(finbert).csv')
# Rename sentiment to social_score_finbert
df2 = df2.rename(columns={'sentiment': 'social_score_finbert'})

# Convert date columns to plain calendar dates so both tables share one key
df1['date'] = pd.to_datetime(df1['date']).dt.date
df2['date'] = pd.to_datetime(df2['date']).dt.date

# Get the date range from the first table
min_date = df1['date'].min()
max_date = df1['date'].max()

# Filter the second table to only include dates within the first table's range
df2 = df2[(df2['date'] >= min_date) & (df2['date'] <= max_date)]

# Collapse the Reddit table to ONE row per date (mean score of that day's
# posts). Merging the per-post rows directly would repeat every news row once
# for each Reddit post on the same date.
social_daily = df2.groupby('date', as_index=False)['social_score_finbert'].mean()

# Merge the dataframes on date
# We'll take all columns from df1 and only the daily social_score_finbert
merged_df = pd.merge(
    df1,
    social_daily,
    on='date',
    how='left',
    validate='many_to_one'  # raises if the right side ever has duplicate dates
)

# Days with no Reddit posts get a neutral score of 0. Only this column is
# filled: a blanket fillna(0) would also put zeros into any missing text or
# price values coming from the first table.
merged_df['social_score_finbert'] = merged_df['social_score_finbert'].fillna(0)

# Save the merged dataframe to a new CSV file
merged_df.to_csv('data_finbert.csv', index=False)

print("Files merged and saved as 'data_finbert.csv'")

Files merged and saved as 'data_finbert.csv'


In [47]:
# Show the first five merged rows as the cell output
merged_df.head(n=5)

Unnamed: 0,ticker,title,summary,sentiment,score,date,open,high,low,close,volume,news_score_finbert,social_score_finbert
0,AAPL,"Apple Stock Can Brush Off Tariff Concerns, Ana...",An Apple analyst sees the company beating Q2 r...,Neutral,0.096242,2025-04-23,206.0,208.0,202.799,204.6,52929165,0.942873,0.0
1,AAPL,PayPal is Trading Dirty Cheap at 11.86X P/E: B...,"PYPL's cheap valuation, strong portfolio, and ...",Somewhat-Bullish,0.291644,2025-04-23,206.0,208.0,202.799,204.6,52929165,0.927726,0.0
2,AAPL,Are Apple ETFs Ripe for a Rebound?,Apple (AAPL) shares have outperformed the S&P ...,Somewhat-Bullish,0.17578,2025-04-23,206.0,208.0,202.799,204.6,52929165,-0.976072,0.0
3,AAPL,SOUN vs. BBAI: Which AI Stock Has Bigger Poten...,SoundHound and BigBear.ai? See which AI stock ...,Somewhat-Bullish,0.175883,2025-04-23,206.0,208.0,202.799,204.6,52929165,-0.508791,0.0
4,AAPL,The Real Reason Trump Walked Back From The Bri...,"To gain an edge, this is what you need to know...",Somewhat-Bullish,0.319079,2025-04-23,206.0,208.0,202.799,204.6,52929165,-0.661042,0.0
