### Import Data
* For reddit_wsb.csv, no need to stem the text (The results are pretty similar, 29-Apr)

In [1]:
import pandas as pd

In [9]:
df = pd.read_csv('reddit_sentiment_data.csv')

In [12]:
df.shape

(81835, 8)

In [13]:
# Remove spaces in stock string
df['stock'] = df['stock'].str.replace(' ', '')

In [32]:
# Remove stock ticker longer than 5 (mis-extracted by Gemini)
# It would also removed row with nan
df = df[df['stock'].str.len() <= 5]

In [33]:
# Remove "$" in stock ticker
df['stock'] = df['stock'].str.replace(r'\$', '')

In [35]:
# df.to_csv('reddit_sentiment_data_1.csv', index=False)

### Data preprocessing
* Remove rows when title and body are null
* Change format for timstamp to only date (%Y-%m-%d)

In [36]:
df.shape

(55546, 8)

In [37]:
df = df[df['title'].notnull() | df['body'].notnull()]

In [38]:
df.shape

(55546, 8)

In [42]:
df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')

### Ticker extraction
* Extract stocker ticker from title and body and add a new column to store mentioned ticker
    * Extracts potential tickers (all caps, 1-5 letters)
    * Extracts potential tickers (Some people typed ${ticker} in title or body)

In [44]:
df.shape

(55546, 8)

In [48]:
ticker_list = list()
ticker_list_file = 'ticker_list.txt'
with open(ticker_list_file, 'r') as file:
    for line in file:
        ticker = line.strip()
        if ticker:
            ticker_list.append(ticker)

In [49]:
print("YES") if "GOOGL" in ticker_list else print("NO")

YES


In [71]:
def extract_potential_tickers(text) -> list:
    import re
    """
    Extracts potential stock tickers (all caps, 2-5 letters) from a text,
    skipping titles that are entirely in uppercase.
    """
    if isinstance(text, str):
        # Regex to find sequences of 1 to 5 uppercase letters
        tickers = re.findall(r'\b[A-Z]{2,5}\b', text)
        return tickers
    return []

In [64]:
def find_tickers_in_row(row) -> list:
    """
    Helper function to process each row of the DataFrame.
    """
    title = row['title'] if 'title' in row.index and pd.notnull(row['title']) else ''
    body = row['body'] if 'body' in row.index and pd.notnull(row['body']) else ''

    potential_tickers_title_list = extract_potential_tickers(title)
    potential_tickers_body_list = extract_potential_tickers(body)

    # Combine and filter for real tickers in one step
    real_tickers_in_row = [
        ticker for ticker in (potential_tickers_title_list + potential_tickers_body_list) if ticker in ticker_list
    ]

    # Convert to set for uniqueness, then back to list
    return list(set(real_tickers_in_row))
real_tickers_per_row = list(zip(df.index, df.apply(find_tickers_in_row, axis=1)))

In [73]:
# Create a new DataFrame to store the results with duplicated rows
new_rows = list()
for index, real_tickers in real_tickers_per_row:
    original_row = df.iloc[index]
    if real_tickers:
        for ticker in real_tickers:
            new_row = original_row.copy()
            new_row['stock'] = ticker
            new_rows.append(new_row)
    else:
        new_row = original_row.copy()
        new_row['stock'] = None  # Or some other indicator for no real ticker
        new_rows.append(new_row)

df_tickers_found = pd.DataFrame(new_rows)
df_tickers_found

Unnamed: 0,title,score,comms_num,body,date,stock
0,Thanks for the gains Vlad!,233,41,This will go even higher with SP inclusion.,2025-05-31,
1,YOLO nvda calls,148,155,Could be the best decision of my life or the w...,2025-05-31,
2,What a week 😮‍💨,115,21,"3rd straight week of trading GLD\n+24,846 The ...",2025-05-31,
3,Started scalping once I hit 25k. No looking ba...,425,251,"Hyper scalping SPX, 100+ trades in 2 days",2025-05-31,
4,momey,549,76,,2025-05-31,
...,...,...,...,...,...,...
536,Unh puts are they going to print ?,153,87,,2025-05-14,
537,ETORO IPO 175k YOLO - Fish Chart Spotted,141,23,"Today, I present to you: E-Toro (Electronic Bl...",2025-05-14,E
538,Foot Locker Surges 65% After-Hours on Reported...,329,107,* No paywall: [https://finance.yahoo.com/news/...,2025-05-14,
539,"Part 2: 11 days doubled portfolio (basis $1,00...",9,33,"Original post linked below, wanted to update y...",2025-05-14,


In [84]:
# Ensure ticker_list is a set for efficient lookups
ticker_set = set(ticker_list)

# Get the subset of df_tickers_found where 'stock' is null
df_null_stock = df_tickers_found[df_tickers_found['stock'].isnull()]

all_new_rows_from_dollar_sign = []

# Define a function to process text for $tickers
def extract_and_validate_dollar_tickers(text_content) -> list:
    if not isinstance(text_content, str) or '$' not in text_content:
        return [] # No tickers found or not a string

    potential_tickers = re.findall(r'\$(\w+)', text_content)
    real_tickers = set()
    for potential_ticker in potential_tickers:
        # Check if the potential ticker is not entirely digits and is in our valid ticker_set
        if not any(char.isdigit() for char in potential_ticker) and potential_ticker in ticker_set:
            real_tickers.add(potential_ticker)
    return list(real_tickers)

# Iterate once over the relevant rows from the original df's index
# It's better to get original rows from `df` itself based on the index from `df_null_stock`
for original_df_index, row_data in df_null_stock.iterrows():
    title_content = row_data['title']
    body_content = row_data['body']

    # Extract tickers from title
    real_tickers_from_title = extract_and_validate_dollar_tickers(title_content)
    # Extract tickers from body
    real_tickers_from_body = extract_and_validate_dollar_tickers(body_content)

    # Combine unique tickers from both
    combined_real_tickers = list(set(real_tickers_from_title + real_tickers_from_body))

    if combined_real_tickers:
        # Get the original row data from the main `df` using its index
        # This ensures we get the original columns, not just those in df_tickers_found
        original_source_row = df.loc[original_df_index].to_dict()

        for ticker in combined_real_tickers:
            new_row_dict = original_source_row.copy()
            new_row_dict['stock'] = ticker
            all_new_rows_from_dollar_sign.append(new_row_dict)

# Create the DataFrame once at the end
df_found_from_dollar_sign = pd.DataFrame(all_new_rows_from_dollar_sign)

# Ensure 'stock' column is at the front if the DataFrame is not empty
if not df_found_from_dollar_sign.empty:
    cols = ['stock'] + [col for col in df_found_from_dollar_sign.columns if col != 'stock']
    df_found_from_dollar_sign = df_found_from_dollar_sign[cols]

df_with_tickers = pd.concat([df_tickers_found[df_tickers_found['stock'].notnull()], df_found_from_dollar_sign]).reset_index(drop=True)

Unnamed: 0,title,score,comms_num,body,date,stock
0,UNH YOLO & Thesis,192,107,I'm betting over half my port on UNH. All shar...,2025-05-31,UNH
1,UNH YOLO & Thesis,192,107,I'm betting over half my port on UNH. All shar...,2025-05-31,A
2,150k gain on RKLB,164,30,Finally decided to sell my RKLB position,2025-05-31,RKLB
3,YTD check-in,1213,209,They were mainly puts and some calls for SPY a...,2025-05-31,INTC
4,Unite for UNH,98,122,Gather all my fellow UNH believers we have Hem...,2025-05-31,UNH
...,...,...,...,...,...,...
500,welfare check: still retarded. doubled down on...,124,157,"“if you can’t handle a 50% drop, you shouldn’t...",2025-05-14,UNH
501,welfare check: still retarded. doubled down on...,124,157,"“if you can’t handle a 50% drop, you shouldn’t...",2025-05-14,HTZ
502,UnitedHealth Group Is Under Criminal Investiga...,2555,184,$UNH down 7% after market. Nothing cheers more...,2025-05-14,UNH
503,UnitedHealth Group Is Under Criminal Investiga...,2555,184,$UNH down 7% after market. Nothing cheers more...,2025-05-14,L


In [None]:
# Drop rows with ticker is single character
df_with_tickers = df_with_tickers[df_with_tickers['stock'].str.len() > 1]

### Natural Language Processing
#### Use Stanza library
* Remove stop words
* Stemming words
* Get sentiment scores

In [86]:
import numpy as np
import stanza
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [87]:
# Stanza
stanza.download('en')
nlp = stanza.Pipeline('en', processors='tokenize, sentiment')


2025-06-15 12:07:01 INFO: Downloaded file to /Users/wenshinluo/stanza_resources/resources.json
2025-06-15 12:07:01 INFO: Downloading default packages for language: en (English) ...
2025-06-15 12:07:02 INFO: File exists: /Users/wenshinluo/stanza_resources/en/default.zip
2025-06-15 12:07:06 INFO: Finished downloading models and saved to /Users/wenshinluo/stanza_resources
2025-06-15 12:07:06 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES

2025-06-15 12:07:06 INFO: Downloaded file to /Users/wenshinluo/stanza_resources/resources.json
2025-06-15 12:07:06 INFO: Loading these models for language: en (English):
| Processor | Package        |
------------------------------
| tokenize  | combined       |
| mwt       | combined       |
| sentiment | sstplus_charlm |

2025-06-15 12:07:06 INFO: Using device: cpu
2025-06-15 12:07:06 INFO: Loading: tokeniz

In [88]:
# Stemmer
port_stem = PorterStemmer()

In [89]:
# Sentiment score function
def sentiment_score(text):
    try:
        doc = nlp(text)
        if doc.sentences:
            return doc.sentences[0].sentiment
        else:
            return None  # Handle empty strings or cases with no sentences
    except Exception as e:
        print(f"Error processing text: '{text}' - {e}")
        return None

In [90]:
%time df_with_tickers['title_sentiment'] = df_with_tickers['title'].apply(sentiment_score)

CPU times: user 1min 19s, sys: 2.04 s, total: 1min 21s
Wall time: 43.9 s


In [91]:
df_with_tickers

Unnamed: 0,title,score,comms_num,body,date,stock,title_sentiment
0,UNH YOLO & Thesis,192,107,I'm betting over half my port on UNH. All shar...,2025-05-31,UNH,1
1,UNH YOLO & Thesis,192,107,I'm betting over half my port on UNH. All shar...,2025-05-31,A,1
2,150k gain on RKLB,164,30,Finally decided to sell my RKLB position,2025-05-31,RKLB,1
3,YTD check-in,1213,209,They were mainly puts and some calls for SPY a...,2025-05-31,INTC,1
4,Unite for UNH,98,122,Gather all my fellow UNH believers we have Hem...,2025-05-31,UNH,1
...,...,...,...,...,...,...,...
500,welfare check: still retarded. doubled down on...,124,157,"“if you can’t handle a 50% drop, you shouldn’t...",2025-05-14,UNH,0
501,welfare check: still retarded. doubled down on...,124,157,"“if you can’t handle a 50% drop, you shouldn’t...",2025-05-14,HTZ,0
502,UnitedHealth Group Is Under Criminal Investiga...,2555,184,$UNH down 7% after market. Nothing cheers more...,2025-05-14,UNH,1
503,UnitedHealth Group Is Under Criminal Investiga...,2555,184,$UNH down 7% after market. Nothing cheers more...,2025-05-14,L,1


In [92]:
%time df_with_tickers['body_sentiment'] = df_with_tickers['body'].apply(sentiment_score)

Error processing text: 'nan' - input should be either str, list or Document
Error processing text: 'nan' - input should be either str, list or Document
Error processing text: 'nan' - input should be either str, list or Document
Error processing text: 'nan' - input should be either str, list or Document
Error processing text: 'nan' - input should be either str, list or Document
Error processing text: 'nan' - input should be either str, list or Document
Error processing text: 'nan' - input should be either str, list or Document
Error processing text: 'nan' - input should be either str, list or Document
Error processing text: 'nan' - input should be either str, list or Document
Error processing text: 'nan' - input should be either str, list or Document
Error processing text: 'nan' - input should be either str, list or Document
Error processing text: 'nan' - input should be either str, list or Document
Error processing text: 'nan' - input should be either str, list or Document
Error proces

### Post Data processing
* Adjust mistaken sentiment score for null post body

In [106]:
# Readjust unknown assignment of sentiment score to Null body
df_with_tickers.loc[df_with_tickers['body'].isnull(), 'body_sentiment'] = 0

In [107]:
df_with_tickers.to_csv('reddit_sentiment_data.csv', index=False)