### Load raw data from Reddit

In [1]:
import os

import pandas as pd

In [2]:
from google import genai

In [3]:
# Read the raw Reddit posts (unstemmed text) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='reddit_sentiment_data_unstemmed.csv')
# Show the number of rows that were loaded.
df.shape[0]

92072

### Send title and body to Gemini API
* Extract the stock tickers mentioned in the title or body

In [4]:
# Gemini client shared by the helper functions below.
# The API key is read from the GEMINI_API_KEY environment variable so that it
# never has to be written into (or scrubbed out of) the notebook. When the
# variable is unset an empty key is passed, exactly as before.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY", ""))

def find_ticker(text) -> list:
    """
    Ask Gemini which US stock tickers are mentioned in ``text``.

    Parameters
    ----------
    text : object
        Text to scan (e.g. a post title or body). Anything that is not a
        ``str`` is ignored.

    Returns
    -------
    list
        * ``[]`` when ``text`` is not a string.
        * ``["nan"]`` when ``text`` is blank or the model returns no text.
          ``"nan"`` is the sentinel the prompt tells the model to use for
          "no ticker found", so callers can treat all three cases alike.
        * Otherwise the comma-separated tokens of the model's reply, with
          surrounding whitespace/newlines stripped and empty tokens dropped.
          The reply itself may be the ``"nan"`` sentinel.
    """
    if not isinstance(text, str):
        return []

    # Blank text cannot contain a ticker: skip the API round-trip and answer
    # with the same sentinel the model is instructed to return.
    if not text.strip():
        return ["nan"]

    prompt=f'''
            From given text "{text}", try to find US stock ticker
            Instruction:
            - If found multiple tickers, use comma to separate, do not include "\n", example: TSLA, AAPL
            - return in string format
            - if not found, return just "nan"
            '''
    response = client.models.generate_content(
        model="gemini-2.0-flash-lite",
        contents=prompt
    )

    # NOTE(review): the SDK's `text` is believed to be optional (None when the
    # response carries no text); treat a missing/empty reply as "not found"
    # instead of crashing on `.split`.
    reply = response.text or "nan"

    # "TSLA, AAPL\n" -> ["TSLA", "AAPL"]: strip each token so that " AAPL" and
    # "AAPL" are the same ticker, and drop empties left by stray commas.
    stripped_tokens = (token.strip() for token in reply.split(','))
    return [token for token in stripped_tokens if token]


In [5]:
import time
def find_tickers_in_row(row) -> list:
    """
    Collect the tickers reported by ``find_ticker`` for one DataFrame row.

    The row's ``title`` and ``body`` are each sent to ``find_ticker``; a
    missing column or a null value is replaced by an empty string.

    Parameters
    ----------
    row : pandas.Series
        One row of the posts DataFrame (as passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    list
        The unique values returned for the title followed by those returned
        for the body, in first-seen order. Values are passed through
        unchanged: no validation or filtering of tickers happens here.
    """
    title = row['title'] if 'title' in row.index and pd.notnull(row['title']) else ''
    body = row['body'] if 'body' in row.index and pd.notnull(row['body']) else ''

    # One-second pause after each request (presumably to stay under the API
    # rate limit).
    title_tickers = find_ticker(title)
    time.sleep(1)
    body_tickers = find_ticker(body)
    time.sleep(1)

    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # make the order (and thus the output row order) vary between runs.
    return list(dict.fromkeys(title_tickers + body_tickers))


In [6]:
# Batch run over the posts, one CSV per chunk (~90 min per 2000 rows).
# START_ROW is presumably a resume point: lower it to (re)process earlier rows.
START_ROW = 84000
CHUNK_SIZE = 2000


def _clean_tickers(raw_tickers) -> list:
    """
    Normalise the raw tokens returned for one row.

    Removes newlines and surrounding whitespace, drops blank tokens and the
    "nan" ("no ticker found") sentinel, and de-duplicates in first-seen order.
    """
    normalised = (
        token.replace('\n', '').strip()
        for token in raw_tickers
        if isinstance(token, str)
    )
    # Exact (case-insensitive) match on the sentinel: a substring test would
    # also throw away real tickers that merely contain "nan" (e.g. NANC).
    return list(dict.fromkeys(
        token for token in normalised if token and token.lower() != 'nan'
    ))


def _explode_found_tickers(df_part, tickers_per_row):
    """
    Return ``df_part`` with one row per found ticker in column ``stock_gemini``.

    ``tickers_per_row`` is a Series of lists aligned with ``df_part``. Rows
    whose cleaned list is empty are dropped.
    """
    return (
        df_part
        .assign(stock_gemini=tickers_per_row.map(_clean_tickers))
        .explode('stock_gemini')            # empty list -> NaN row
        .dropna(subset=['stock_gemini'])    # ... which is removed here
    )


for i in range(START_ROW, len(df), CHUNK_SIZE):
    df_part = df.iloc[i:i + CHUNK_SIZE]
    tickers_per_row = df_part.apply(find_tickers_in_row, axis=1)

    df_processed = _explode_found_tickers(df_part, tickers_per_row)
    df_processed.to_csv(f'df_found_ticker_{i}.csv', index=False)
