In [None]:
pip install praw yfinance

In [17]:
import praw
import pandas as pd
import numpy as np
import re
from datetime import datetime, timedelta
import yfinance as yf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers.legacy import Adam
from transformers import pipeline
import joblib

In [2]:
def initialize_reddit(client_id, client_secret):
    return praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent="stock_sentiment_analyzer/1.0"
    )

In [3]:
def initialize_sentiment_analyzer():
    return pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

In [69]:
def scrape_reddit_content(reddit, company_ticker, subreddits=None, days=365):
    subreddits = subreddits or ['stocks', 'investing', 'wallstreetbets', 'pennystocks', 'stockmarket']

    scraped_content = []

    for subreddit_name in subreddits:
        try:
            subreddit = reddit.subreddit(subreddit_name)
            posts = subreddit.search(
                query=company_ticker,
                sort='new',
                time_filter='all',
                limit=250
            )

            for post in posts:
                post_age = datetime.utcnow() - datetime.fromtimestamp(post.created_utc)
                if post_age.days <= days:
                    content = {
                        'title': post.title,
                        'text': post.selftext or '',
                        'score': post.score,
                        'comments_count': post.num_comments,
                        'created_at': datetime.fromtimestamp(post.created_utc)
                    }
                    scraped_content.append(content)
        except Exception as e:
            print(f"Error scraping {subreddit_name}: {e}")

    return pd.DataFrame(scraped_content)

In [54]:
def analyze_sentiment(sentiment_analyzer, data):
    sentiment_results = []

    for _, row in data.iterrows():
        text = f"{row['title']} {row['text']}"
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        if not text.strip():
            continue
        try:
            sentiment = sentiment_analyzer(text[:512])[0]
            sentiment_results.append({
                'created_at': row['created_at'],
                'sentiment_label': sentiment['label'],
                'sentiment_score': sentiment['score'] if sentiment['label'] == 'POSITIVE' else -sentiment['score'],
                'post_score': row['score'],
                'comments_count': row['comments_count']
            })
        except Exception as e:
            print(f"Sentiment analysis error: {e}")

    return pd.DataFrame(sentiment_results)

In [55]:
def prepare_dataset(stock_data, sentiment_data, lookback=30):
    stock_data['Date'] = pd.to_datetime(stock_data['Date'])

    sentiment_agg = sentiment_data.groupby(
        pd.to_datetime(sentiment_data['created_at']).dt.date
    ).agg({
        'sentiment_score': 'mean',
        'post_score': 'sum',
        'comments_count': 'sum'
    }).reset_index()

    for col in ['sentiment_score', 'post_score', 'comments_count']:
        if col not in stock_data.columns:
            stock_data[col] = 0

    merged_data = stock_data.copy()
    merged_data['sentiment_score'] = merged_data['Date'].map(
        dict(zip(sentiment_agg['created_at'], sentiment_agg['sentiment_score']))
    ).fillna(0)
    merged_data['post_score'] = merged_data['Date'].map(
        dict(zip(sentiment_agg['created_at'], sentiment_agg['post_score']))
    ).fillna(0)
    merged_data['comments_count'] = merged_data['Date'].map(
        dict(zip(sentiment_agg['created_at'], sentiment_agg['comments_count']))
    ).fillna(0)

    features = ['Close', 'Volume', 'sentiment_score', 'post_score', 'comments_count']
    data = merged_data[features].values

    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(data)

    X, y = [], []
    for i in range(len(scaled_data) - lookback):
        X.append(scaled_data[i:i + lookback, :])
        y.append(scaled_data[i + lookback, 0])

    return np.array(X), np.array(y), scaler

In [56]:
def create_model(input_shape):
    model = Sequential([
        LSTM(64, activation='relu', input_shape=input_shape, return_sequences=True),
        Dropout(0.2),
        LSTM(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model

In [19]:
def train_model(model, X_train, X_test, y_train, y_test):
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True
    )

    history = model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=250,
        batch_size=32,
        callbacks=[early_stopping],
        verbose=0
    )
    return history

In [9]:
def evaluate_model(model, X_test, y_test, scaler):
    predictions = model.predict(X_test).flatten()

    y_test_full = np.zeros((y_test.shape[0], scaler.scale_.shape[0]))
    y_test_full[:, 0] = y_test

    predictions_full = np.zeros((predictions.shape[0], scaler.scale_.shape[0]))
    predictions_full[:, 0] = predictions

    y_test_original = scaler.inverse_transform(y_test_full)[:, 0]
    predictions_original = scaler.inverse_transform(predictions_full)[:, 0]

    mse = mean_squared_error(y_test_original, predictions_original)
    mae = mean_absolute_error(y_test_original, predictions_original)

    print(f"Mean Squared Error: {mse}")
    print(f"Mean Absolute Error: {mae}")

    return predictions_original

In [10]:
def predict_future(model, stock_data, scaler, lookback=30):
    features = ['Close', 'Volume', 'sentiment_score', 'post_score', 'comments_count']
    data = stock_data[features].values

    scaled_data = scaler.transform(data)

    last_sequence = scaled_data[-lookback:]
    last_sequence = last_sequence.reshape(1, lookback, -1)

    predicted_scaled = model.predict(last_sequence).flatten()

    prediction_full = np.zeros((1, scaler.scale_.shape[0]))
    prediction_full[0, 0] = predicted_scaled
    predicted_price = scaler.inverse_transform(prediction_full)[0, 0]

    return predicted_price

In [None]:
# def main():
#     TICKER = 'JANX'
#     CLIENT_ID = 'TYrklKt5KWR9ljrcZqyJPQ'
#     CLIENT_SECRET = '_RbOwvRafkLDwQe6KkWTqM_LHLPPvA'

#     reddit = initialize_reddit(CLIENT_ID, CLIENT_SECRET)
#     sentiment_analyzer = initialize_sentiment_analyzer()

#     stock_data = yf.download(TICKER, start=datetime.now() - timedelta(days=1825))
#     stock_data.reset_index(inplace=True)
#     stock_data['Date'] = stock_data['Date'].dt.date

#     reddit_data = scrape_reddit_content(reddit, TICKER)
#     print("Reddit data downloaded.")
#     sentiment_data = analyze_sentiment(sentiment_analyzer, reddit_data)
#     sentiment_data['created_at'] = sentiment_data['created_at'].dt.date

#     X, y, scaler = prepare_dataset(stock_data, sentiment_data)
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#     model = create_model(X_train.shape[1:])
#     train_model(model, X_train, X_test, y_train, y_test)

#     predictions = evaluate_model(model, X_test, y_test, scaler)

#     future_price = predict_future(model, stock_data, scaler)
#     print(f"Predicted Future Price: ${future_price:.2f}")

# if __name__ == "__main__":
#     main()

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
[*********************100%***********************]  1 of 1 completed


Reddit data downloaded.
Mean Squared Error: 4.859606178610601
Mean Absolute Error: 1.5641132972114196
Predicted Future Price: $46.56


In [None]:
TICKER = 'AAPL'         #Change this to the desired stock
CLIENT_ID = ''          #Change the API keys fo reddit
CLIENT_SECRET = ''

reddit = initialize_reddit(CLIENT_ID, CLIENT_SECRET)
sentiment_analyzer = initialize_sentiment_analyzer()

stock_data = yf.download(TICKER, start=datetime.now() - timedelta(days=1825))
stock_data.reset_index(inplace=True)
stock_data['Date'] = stock_data['Date'].dt.date

reddit_data = scrape_reddit_content(reddit, TICKER)
print("Reddit data downloaded.")
sentiment_data = analyze_sentiment(sentiment_analyzer, reddit_data)
sentiment_data['created_at'] = sentiment_data['created_at'].dt.date

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
[*********************100%***********************]  1 of 1 completed


Reddit data downloaded.


In [74]:
sentiment_data

Unnamed: 0,created_at,sentiment_label,sentiment_score,post_score,comments_count
0,2024-11-29,NEGATIVE,-0.995668,8,174
1,2024-11-22,NEGATIVE,-0.995668,13,255
2,2024-11-15,NEGATIVE,-0.995668,22,481
3,2024-11-08,NEGATIVE,-0.995668,16,413
4,2024-11-02,NEGATIVE,-0.975987,28,9
...,...,...,...,...,...
622,2023-12-16,POSITIVE,0.748695,157,213
623,2023-12-14,NEGATIVE,-0.947190,45,16
624,2023-12-14,POSITIVE,0.994136,0,16
625,2023-12-08,POSITIVE,0.985799,1,3


In [75]:
X, y, scaler = prepare_dataset(stock_data, sentiment_data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = create_model(X_train.shape[1:])
train_model(model, X_train, X_test, y_train, y_test)

predictions = evaluate_model(model, X_test, y_test, scaler)

future_price = predict_future(model, stock_data, scaler)
print(f"Predicted Future Price: ${future_price:.2f}")


Mean Squared Error: 43.918962999743044
Mean Absolute Error: 5.347785542429231
Predicted Future Price: $220.40
