# Stock Price Prediction with Machine Learning

This notebook demonstrates how to build machine learning models that predict stock price movements from technical indicators and market data.

## Features:
- Data collection from Yahoo Finance
- Technical indicator calculation
- LSTM and Random Forest models
- Model evaluation and comparison
- Real-time prediction API

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import yfinance as yf
import ta
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation and data-quality warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

# Confirmation that the import cell ran to completion.
print("Libraries imported successfully!")

In [None]:
# Configuration
# Ticker symbols to process; the download, indicator, training and summary
# cells all loop over this list.
SYMBOLS = ['AAPL', 'GOOGL', 'MSFT', 'AMZN', 'TSLA']
# Look-back window handed to the download helper as its `period` argument.
PERIOD = '2y'  # 2 years of data
# NOTE(review): not referenced by any cell visible here; the models below are
# trained with target_days=1, not a 30-day horizon - confirm whether this
# constant is still needed.
PREDICTION_DAYS = 30  # Predict 30 days ahead

# Download stock data
def get_stock_data(symbol, period='2y'):
    """
    Fetch the price history of a single ticker from Yahoo Finance.

    Args:
        symbol: Ticker symbol, e.g. 'AAPL'.
        period: Look-back window forwarded unchanged to ``Ticker.history``.

    Returns:
        The object returned by ``yf.Ticker(symbol).history(period=period)``.
    """
    return yf.Ticker(symbol).history(period=period)

# Get data for all symbols
stock_data = {}
for symbol in SYMBOLS:
    print(f"Downloading data for {symbol}...")
    stock_data[symbol] = get_stock_data(symbol, PERIOD)
    # Fail fast and name the offending ticker: an empty frame would otherwise
    # surface as an IndexError on index[0] below, or as an opaque failure in a
    # later cell that assumes every symbol has rows.
    if stock_data[symbol].empty:
        raise RuntimeError(
            f"No price data returned for {symbol} (period={PERIOD!r}); "
            "check the ticker symbol and the network connection"
        )

print(f"Downloaded data for {len(SYMBOLS)} symbols")
print(f"Date range: {stock_data[SYMBOLS[0]].index[0]} to {stock_data[SYMBOLS[0]].index[-1]}")

In [None]:
# Calculate technical indicators
def add_technical_indicators(df):
    """
    Add technical indicators to stock data

    Reads the 'Close', 'High', 'Low' and 'Volume' columns and writes the
    indicator columns onto ``df`` itself: the frame is modified in place and
    also returned.

    Args:
        df: Price DataFrame containing 'Close', 'High', 'Low' and 'Volume'.

    Returns:
        The same DataFrame object with the indicator columns added. Early
        rows can hold NaN where a rolling window is not yet full.
    """
    close = df['Close']
    volume = df['Volume']

    # Price-based indicators
    df['SMA_10'] = ta.trend.sma_indicator(close, window=10)
    df['SMA_30'] = ta.trend.sma_indicator(close, window=30)
    df['EMA_12'] = ta.trend.ema_indicator(close, window=12)
    df['EMA_26'] = ta.trend.ema_indicator(close, window=26)

    # MACD
    df['MACD'] = ta.trend.macd(close)
    df['MACD_signal'] = ta.trend.macd_signal(close)
    df['MACD_hist'] = ta.trend.macd_diff(close)

    # RSI
    df['RSI'] = ta.momentum.rsi(close, window=14)

    # Bollinger Bands
    df['BB_upper'] = ta.volatility.bollinger_hband(close)
    df['BB_middle'] = ta.volatility.bollinger_mavg(close)
    df['BB_lower'] = ta.volatility.bollinger_lband(close)
    band_range = df['BB_upper'] - df['BB_lower']
    df['BB_width'] = band_range / df['BB_middle']
    # A flat price window collapses the bands to zero width; mark the position
    # as missing there instead of producing +/-inf from a division by zero.
    df['BB_position'] = (close - df['BB_lower']) / band_range.replace(0, np.nan)

    # Volume indicators
    # The `ta` package has no `volume_sma` helper, so the 10-row volume average
    # is computed directly with pandas.
    volume_sma = volume.rolling(window=10).mean()
    df['Volume_SMA'] = volume_sma
    df['OBV'] = ta.volume.on_balance_volume(close, volume)

    # Volatility
    df['ATR'] = ta.volatility.average_true_range(df['High'], df['Low'], close)

    # Price features
    df['Price_change'] = close.pct_change()
    df['High_Low_ratio'] = df['High'] / df['Low']
    # Current volume relative to its own 10-row average.
    df['Volume_ratio'] = volume / volume_sma

    return df

# Enrich every downloaded frame with its indicator columns
for symbol in SYMBOLS:
    print(f"Adding indicators for {symbol}...")
    stock_data[symbol] = add_technical_indicators(stock_data[symbol])

print("Technical indicators added!")
# Column count of the first symbol's frame (price columns plus indicators)
print(f"Features per stock: {stock_data[SYMBOLS[0]].shape[1]}")

In [None]:
# Visualize data for one stock
symbol = 'AAPL'
df = stock_data[symbol].copy()

# Four stacked panels; the price panel is twice as tall as each of the others
fig = make_subplots(
    rows=4,
    cols=1,
    subplot_titles=('Price & Moving Averages', 'MACD', 'RSI', 'Volume'),
    vertical_spacing=0.05,
    row_heights=[0.4, 0.2, 0.2, 0.2],
)

# (column, legend name, line colour, subplot row) for each line trace,
# listed in drawing order
line_specs = [
    ('Close', 'Close Price', 'blue', 1),
    ('SMA_10', 'SMA 10', 'orange', 1),
    ('SMA_30', 'SMA 30', 'red', 1),
    ('MACD', 'MACD', 'blue', 2),
    ('MACD_signal', 'Signal', 'red', 2),
    ('RSI', 'RSI', 'purple', 3),
]
for column, label, color, row in line_specs:
    fig.add_trace(
        go.Scatter(x=df.index, y=df[column], name=label, line=dict(color=color)),
        row=row,
        col=1,
    )

# Dashed reference levels on the RSI panel, added after the RSI trace exists
for level, level_color in ((70, "red"), (30, "green")):
    fig.add_hline(y=level, line_dash="dash", line_color=level_color, row=3, col=1)

# Volume bars on the bottom panel
fig.add_trace(go.Bar(x=df.index, y=df['Volume'], name='Volume'), row=4, col=1)

fig.update_layout(height=800, title=f'{symbol} - Technical Analysis')
fig.show()

In [None]:
# Prepare data for machine learning
def prepare_ml_data(df, target_days=1):
    """
    Prepare data for machine learning

    Builds a feature matrix from the indicator columns and a target holding
    the relative change of 'Close' over the next ``target_days`` rows. The
    input frame is left untouched.

    Args:
        df: DataFrame containing 'Close' and every indicator column listed in
            ``feature_columns`` below.
        target_days: Number of rows to look ahead for the target; must be >= 1.

    Returns:
        Tuple ``(X, y, feature_columns)``: the feature DataFrame, the target
        Series sharing its index, and the list of feature names.

    Raises:
        ValueError: If ``target_days`` is smaller than 1.
    """
    # A horizon of zero or less would make the target a current or past
    # return, i.e. something the features already contain.
    if target_days < 1:
        raise ValueError(f"target_days must be >= 1, got {target_days}")

    # Select features (exclude OHLCV and derived price columns)
    feature_columns = [
        'SMA_10', 'SMA_30', 'EMA_12', 'EMA_26',
        'MACD', 'MACD_signal', 'MACD_hist',
        'RSI', 'BB_width', 'BB_position',
        'Volume_SMA', 'OBV', 'ATR',
        'Price_change', 'High_Low_ratio', 'Volume_ratio'
    ]

    # Build on a separate frame so the caller's DataFrame does not silently
    # gain a 'Target' column.
    frame = df[feature_columns].copy()

    # Create target (future price change)
    frame['Target'] = df['Close'].shift(-target_days) / df['Close'] - 1

    # Ratio features can be +/-inf after a division by zero; treat those like
    # missing values so no non-finite number reaches the estimator.
    df_clean = frame.replace([np.inf, -np.inf], np.nan).dropna()

    X = df_clean[feature_columns]
    y = df_clean['Target']

    return X, y, feature_columns

# Build the feature/target sets for every symbol
ml_data = {}
for symbol in SYMBOLS:
    X, y, features = prepare_ml_data(stock_data[symbol], target_days=1)
    ml_data[symbol] = dict(X=X, y=y, features=features)
    print(f"{symbol}: {X.shape[0]} samples, {len(features)} features")

# The feature list is the same for every symbol, so the last one is reported
print(f"\nFeatures used: {features}")

In [None]:
# Train Random Forest model
def train_random_forest(X, y, test_size=0.2, random_state=42):
    """
    Fit a random forest regressor and score it on an unshuffled hold-out split.

    Args:
        X: Feature table; its ``columns`` label the feature importances.
        y: Target values aligned with ``X``.
        test_size: Share of the rows reserved for testing.
        random_state: Seed handed to both the split and the regressor.

    Returns:
        Dict with the fitted 'model', 'train_mse', 'test_mse', 'train_r2',
        'test_r2' and a 'feature_importance' mapping of column -> importance.
    """
    # shuffle=False keeps the row order, so the test set is the trailing slice
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=False
    )

    model = RandomForestRegressor(
        n_estimators=100, max_depth=10, random_state=random_state, n_jobs=-1
    )
    model.fit(X_train, y_train)

    # Predictions on both partitions, reused for every metric below
    fitted_train = model.predict(X_train)
    fitted_test = model.predict(X_test)

    importance_by_feature = dict(zip(X.columns, model.feature_importances_))

    return {
        'model': model,
        'train_mse': mean_squared_error(y_train, fitted_train),
        'test_mse': mean_squared_error(y_test, fitted_test),
        'train_r2': r2_score(y_train, fitted_train),
        'test_r2': r2_score(y_test, fitted_test),
        'feature_importance': importance_by_feature,
    }

# Fit one model per symbol and report its scores
rf_results = {}
for symbol in SYMBOLS:
    print(f"Training Random Forest for {symbol}...")
    X = ml_data[symbol]['X']
    y = ml_data[symbol]['y']
    results = train_random_forest(X, y)
    rf_results[symbol] = results

    r2_line = f"  Train R²: {results['train_r2']:.4f}, Test R²: {results['test_r2']:.4f}"
    mse_line = f"  Train MSE: {results['train_mse']:.6f}, Test MSE: {results['test_mse']:.6f}"
    print(r2_line)
    print(mse_line)
    print()

In [None]:
# Feature importance analysis
import matplotlib.pyplot as plt

# Plot feature importance for AAPL
symbol = 'AAPL'
importance = rf_results[symbol]['feature_importance']
features = list(importance.keys())
importances = list(importance.values())

# Sort by importance (descending)
sorted_idx = np.argsort(importances)[::-1]
sorted_features = [features[i] for i in sorted_idx]
sorted_importances = [importances[i] for i in sorted_idx]

plt.figure(figsize=(12, 8))
plt.barh(range(len(sorted_features)), sorted_importances)
plt.yticks(range(len(sorted_features)), sorted_features)
# barh draws position 0 at the bottom; flip the axis so the ranking reads
# top-down with the most important feature first.
plt.gca().invert_yaxis()
plt.xlabel('Feature Importance')
plt.title(f'{symbol} - Random Forest Feature Importance')
plt.tight_layout()
plt.show()

# Slice instead of indexing 0..4 so a model with fewer than five features
# does not raise IndexError.
top_n = min(5, len(sorted_features))
print(f"Top {top_n} most important features:")
for rank, (name, score) in enumerate(
    zip(sorted_features[:top_n], sorted_importances[:top_n]), start=1
):
    print(f"{rank}. {name}: {score:.4f}")

In [None]:
# Create prediction API function
def predict_price_movement(symbol, model, current_data, *, threshold=0.02,
                           confidence_scale=10.0, timeframe='1D'):
    """
    Predict price movement for a given symbol

    Args:
        symbol: Ticker symbol, echoed back in the result.
        model: Fitted estimator exposing ``predict``.
        current_data: Feature values of one sample (any array-like); it is
            reshaped into a single row before being passed to the model.
        threshold: Absolute predicted change above which the move is labelled
            'up' or 'down'; anything within +/- threshold is 'sideways'.
        confidence_scale: Multiplier applied to the absolute prediction before
            it is capped at 1.0.
        timeframe: Horizon label reported in the result.

    Returns:
        Dict with 'symbol', 'prediction' (plain float), 'direction',
        'confidence' (at most 1.0) and 'timeframe'.
    """
    # One sample -> one row of a 2-D (1, n_features) array for model.predict
    sample = np.asarray(current_data).reshape(1, -1)

    # Use the trained model to predict; float() strips any NumPy scalar type
    prediction = float(model.predict(sample)[0])

    # Convert to direction and confidence
    if prediction > threshold:
        direction = 'up'
    elif prediction < -threshold:
        direction = 'down'
    else:
        direction = 'sideways'
    confidence = min(abs(prediction) * confidence_scale, 1.0)  # Scale to 0-1

    return {
        'symbol': symbol,
        'prediction': prediction,
        'direction': direction,
        'confidence': confidence,
        'timeframe': timeframe
    }

# Example prediction
symbol = 'AAPL'
model = rf_results[symbol]['model']
# prepare_ml_data drops the final row because its future return is still
# unknown, so ml_data[symbol]['X'].iloc[-1] is an earlier session whose outcome
# is already known. Take the newest row with complete, finite indicators
# straight from the indicator frame instead.
feature_names = ml_data[symbol]['features']
latest_features = (
    stock_data[symbol][feature_names]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
latest_data = latest_features.iloc[-1].values

prediction = predict_price_movement(symbol, model, latest_data)
print(f"Prediction for {symbol}:")
print(f"  Direction: {prediction['direction']}")
print(f"  Confidence: {prediction['confidence']:.2%}")
print(f"  Raw prediction: {prediction['prediction']:.4f}")

In [None]:
# Tabulate the per-symbol scores, followed by their averages
divider = "-" * 45

print("=== Model Performance Summary ===")
print("\nRandom Forest Results:")
print(f"{'Symbol':<8} {'Train R²':<10} {'Test R²':<10} {'Test MSE':<12}")
print(divider)

for symbol in SYMBOLS:
    results = rf_results[symbol]
    row = f"{symbol:<8} {results['train_r2']:<10.4f} {results['test_r2']:<10.4f} {results['test_mse']:<12.6f}"
    print(row)

# Mean of the test-set scores across all symbols
test_r2_scores = [rf_results[ticker]['test_r2'] for ticker in SYMBOLS]
test_mse_scores = [rf_results[ticker]['test_mse'] for ticker in SYMBOLS]
avg_test_r2 = np.mean(test_r2_scores)
avg_test_mse = np.mean(test_mse_scores)

print(divider)
print(f"{'Average':<8} {'':<10} {avg_test_r2:<10.4f} {avg_test_mse:<12.6f}")

print(f"\nModel successfully trained on {len(SYMBOLS)} symbols!")
print(f"Average Test R²: {avg_test_r2:.4f}")
print(f"Average Test MSE: {avg_test_mse:.6f}")