In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.dirname(os.getcwd())))
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'data_fetch'))

In [75]:
import re
import pandas as pd
from yahoo_fin import stock_info as si
import yfinance as yf
import pytz
from datetime import datetime, timedelta
from collections import Counter
from data_fetch.fetch_reddit_comments import get_existing_spreadsheet, authenticate_google_services
from data_fetch.credentials import TRADING_FOLDER_ID
from data_fetch.utils import bucket_by_time

import plotly.graph_objects as go
from plotly.subplots import make_subplots

CREDS_PATH = os.path.join(os.path.dirname(os.getcwd()), 'Trading_Access.json')
TICKER_PATTERN = r'\b[A-Z]{2,5}\b'
ISRAEL_TZ = pytz.timezone("Asia/Jerusalem")
interval_minutes = 5

In [77]:
# Function to extract tickers along with timestamps
def extract_tickers(row):
    words = set(re.findall(TICKER_PATTERN, row.Body, re.IGNORECASE))  # Remove duplicates within each comment
    return [(word.upper(), row.Timestamp) for word in words]

def extract_tickers_uppercase(row):
    words = set(re.findall(TICKER_PATTERN, row.Body))  # Remove duplicates within each comment
    return [(word.upper(), row.Timestamp) for word in words]

def validate_tickers(ticker_list):
    """Validate extracted tickers using Yahoo Finance"""
    # all_tickers = set(si.tickers_dow() + si.tickers_sp500() + si.tickers_nasdaq() + si.tickers_other())
    all_tickers = set(si.tickers_dow() + si.tickers_sp500() + si.tickers_nasdaq())
    valid_tickers = set()
    invalid_tickers = set()

    for ticker in ticker_list:
        if ticker in all_tickers:
            valid_tickers.add(ticker)
        else:
            invalid_tickers.add(ticker)
    
    return valid_tickers, invalid_tickers

In [None]:
date = '2025-02-20'
sheet_name = f'pennystocks_subreddit_{date}'
spreadsheet_id = get_existing_spreadsheet(sheet_name, TRADING_FOLDER_ID, CREDS_PATH)
gspread_client, _ = authenticate_google_services(CREDS_PATH)
spreadsheet = gspread_client.open_by_key(spreadsheet_id)
spreadsheet = spreadsheet.worksheet(date).get_all_values()

df = pd.DataFrame(spreadsheet[1:], columns=spreadsheet[0])
df["Timestamp"] = pd.to_datetime(df["Timestamp"])
df.sort_values(by='Timestamp', inplace=True)

comments = df[['Body', 'Timestamp']]

tickers_with_time = comments.apply(extract_tickers, axis=1)
tickers_with_time = [item for sublist in tickers_with_time for item in sublist]

ticker_list = list(dict.fromkeys([ticker for ticker, _ in tickers_with_time]))
valid_tickers, invalid_tickers = validate_tickers(ticker_list)

time_buckets = bucket_by_time([(ticker, ts.tz_localize(ISRAEL_TZ)) for ticker, ts in tickers_with_time], ISRAEL_TZ, interval_minutes)

# Keep only keys that are tuples of length 2 (date, bucket)
valid_keys = [key for key in time_buckets.keys() if isinstance(key, tuple) and len(key) == 2]

rows = []
    
# Count mentions in each time bucket
for (date, bucket) in sorted(valid_keys):
    
    start_time = (datetime.strptime(date, "%Y-%m-%d").replace(tzinfo=ISRAEL_TZ) + timedelta(minutes=bucket)).strftime("%H:%M")
    end_time = (datetime.strptime(date, "%Y-%m-%d").replace(tzinfo=ISRAEL_TZ) + timedelta(minutes=bucket + interval_minutes)).strftime("%H:%M")

    ticker_counts = Counter(time_buckets[(date, bucket)])
    filtered_counts = {ticker: count for ticker, count in ticker_counts.items() if ticker in valid_tickers}

    for ticker, count in filtered_counts.items():
        rows.append({"date": date, "start_time": start_time, "end_time": end_time, "ticker": ticker, "mentions": count})

# Convert to DataFrame
df_ticker_mentions = pd.DataFrame(rows, columns=["date", "start_time", "end_time", "ticker", "mentions"])

df_ticker_mentions["start_datetime"] = pd.to_datetime(df_ticker_mentions["date"] + " " + df_ticker_mentions["start_time"])
df_ticker_mentions["end_datetime"] = pd.to_datetime(df_ticker_mentions["date"] + " " + df_ticker_mentions["end_time"])
df_ticker_mentions.sort_values(by=['date', 'start_time', 'end_time', 'ticker'], inplace=True)
df_ticker_mentions = df_ticker_mentions[df_ticker_mentions.date == date].reset_index(drop=True)

df_ticker_mentions_daily = df_ticker_mentions.groupby('ticker')['mentions'].sum().reset_index()
df_ticker_mentions_daily.sort_values(by='mentions', ascending=False, inplace=True)

upper_ticker = comments.apply(extract_tickers_uppercase, axis=1)
upper_tickers_with_time = [item for sublist in upper_ticker for item in sublist]
upper_ticker_list = list(dict.fromkeys([ticker for ticker, _ in upper_tickers_with_time]))
upper_valid_tickers, _ = validate_tickers(upper_ticker_list)

upper_df_ticker_mentions_daily = df_ticker_mentions_daily[df_ticker_mentions_daily.ticker.isin(upper_valid_tickers)]

📄 Spreadsheet 'pennystocks_subreddit_2025-02-20' already exists: https://docs.google.com/spreadsheets/d/1yfNpU59DYp2E9iN_3lPwIYpspF_-r9DOhFmHOem7Mhw


In [102]:
upper_df_ticker_mentions_daily.iloc[:50]

Unnamed: 0,ticker,mentions
176,ON,459
102,GOOD,232
3,ADTX,214
35,CAN,193
6,ANY,183
160,ME,181
17,BACK,145
230,SOBR,106
237,STSS,99
233,SPGC,93


In [104]:
ticker = 'SOBR'
ticker_df = df_ticker_mentions[df_ticker_mentions.ticker == ticker][['end_datetime', 'mentions']]
ticker_df.rename(columns={'end_datetime': 'Datetime'}, inplace=True)

stock = yf.Ticker(ticker)
stock_df = stock.history(start=date, end=(pd.to_datetime(date) + timedelta(days=1)).strftime('%Y-%m-%d'), interval="5m")
stock_df.reset_index(inplace=True)
israel_tz = pytz.timezone('Asia/Jerusalem')
stock_df['Datetime'] = stock_df['Datetime'].dt.tz_convert(israel_tz).dt.tz_localize(None)

stock_df = stock_df.merge(ticker_df, on='Datetime', how='outer')
stock_df["mentions"] = stock_df["mentions"].fillna(0)
stock_df = stock_df[(stock_df["Datetime"].dt.strftime('%Y-%m-%d') <= date)]

# Create subplots with shared x-axis
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, 
                    subplot_titles=('Close Price', 'Mentions'),
                    vertical_spacing=0.2)

# Close price subplot
fig.add_trace(go.Scatter(
    x=stock_df['Datetime'],
    y=stock_df['Close'],
    mode='lines+markers',
    name='Close',
    line=dict(color='blue'),
    marker=dict(size=6)
), row=1, col=1)

# Mentions subplot
fig.add_trace(go.Scatter(
    x=stock_df['Datetime'],
    y=stock_df['mentions'],
    mode='lines+markers',
    name='Mentions',
    line=dict(color='red'),
    marker=dict(size=6)
), row=2, col=1)

# Update layout
fig.update_layout(
    title='Close Price and Mentions Over Time',
    xaxis2_title='Datetime',
    yaxis_title='Close',
    yaxis2_title='Mentions',
    showlegend=False,
    hovermode='x unified'
)

# Display the plot
fig.show()

In [105]:
stock_df

Unnamed: 0,Datetime,Open,High,Low,Close,Volume,Dividends,Stock Splits,mentions
0,2025-02-20 00:10:00,,,,,,,,2.0
1,2025-02-20 00:35:00,,,,,,,,1.0
2,2025-02-20 00:40:00,,,,,,,,1.0
3,2025-02-20 01:10:00,,,,,,,,2.0
4,2025-02-20 03:05:00,,,,,,,,1.0
...,...,...,...,...,...,...,...,...,...
112,2025-02-20 22:40:00,1.0300,1.0399,1.0275,1.0350,33649.0,0.0,0.0,0.0
113,2025-02-20 22:45:00,1.0300,1.0499,1.0250,1.0401,39007.0,0.0,0.0,0.0
114,2025-02-20 22:50:00,1.0451,1.0482,1.0300,1.0318,52855.0,0.0,0.0,0.0
115,2025-02-20 22:55:00,1.0300,1.0400,1.0200,1.0300,142303.0,0.0,0.0,0.0
