# Stock Market Feature Engineering

This notebook builds the model's input features in three steps:
1. Computing technical indicators from stock price data
2. Extracting sentiment scores from news data
3. Selecting features and reducing dimensionality


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import sys
sys.path.append('..')
from src.features.technical_indicators import calculate_technical_indicators
from src.features.sentiment_features import extract_sentiment_features
from src.features.feature_reduction import select_features

# 1. Load Preprocessed Data


In [None]:
# Tickers to process; each one needs a stock CSV and a news CSV on disk.
symbols = ['AAPL', 'GOOG', 'MSFT']
stock_data, news_data = {}, {}

for symbol in symbols:
    stock_path = f'../data/preprocessed/{symbol}_stock_preprocessed.csv'
    news_path = f'../data/preprocessed/{symbol}_news_preprocessed.csv'
    # The first CSV column becomes the index and is parsed as dates.
    stock_data[symbol] = pd.read_csv(stock_path, index_col=0, parse_dates=True)
    # News keeps the default index; only its 'date' column is parsed.
    news_data[symbol] = pd.read_csv(news_path, parse_dates=['date'])
    print(f"Loaded data for {symbol}")

# 2. Calculate Technical Indicators

In [None]:
def add_technical_indicators(df):
    """Add moving-average, RSI, MACD and Bollinger Band columns to ``df``.

    The new columns are written onto ``df`` itself, which is also returned;
    pass a copy if the caller's frame must stay untouched.

    Args:
        df: DataFrame with a numeric ``Close`` column. Rows are assumed to be
            in chronological order (oldest first) -- the function does not
            sort them.

    Returns:
        ``df`` with these columns added: ``SMA_5``, ``SMA_20``, ``RSI``,
        ``MACD``, ``Signal_Line``, ``BB_middle``, ``BB_upper``, ``BB_lower``.
        Rolling indicators are NaN until their window is full.

    Raises:
        KeyError: if ``df`` has no ``Close`` column.
    """
    close = df['Close']
    # One 20-period window serves both SMA_20 and the Bollinger Bands.
    window_20 = close.rolling(window=20)

    # Moving averages
    df['SMA_5'] = close.rolling(window=5).mean()
    df['SMA_20'] = window_20.mean()

    # Relative Strength Index (simple 14-period average of gains/losses).
    # clip() keeps the NaN produced by the first diff(), so the first RSI
    # value is based on 14 real price changes rather than 13 changes plus a
    # fabricated zero.
    delta = close.diff()
    gain = delta.clip(lower=0).rolling(window=14).mean()
    loss = (-delta).clip(lower=0).rolling(window=14).mean()
    # loss == 0 with gain > 0 gives rs = inf and therefore RSI = 100;
    # a window with no price change at all gives 0/0 and RSI = NaN.
    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))

    # MACD: 12-period EMA minus 26-period EMA, with a 9-period signal line.
    ema_fast = close.ewm(span=12, adjust=False).mean()
    ema_slow = close.ewm(span=26, adjust=False).mean()
    df['MACD'] = ema_fast - ema_slow
    df['Signal_Line'] = df['MACD'].ewm(span=9, adjust=False).mean()

    # Bollinger Bands: 20-period mean +/- two rolling standard deviations.
    band_width = 2 * window_20.std()
    df['BB_middle'] = df['SMA_20']
    df['BB_upper'] = df['BB_middle'] + band_width
    df['BB_lower'] = df['BB_middle'] - band_width

    return df

# Compute indicators on a copy so the loaded price frames stay unmodified.
technical_data = {}
for symbol in symbols:
    price_frame = stock_data[symbol].copy()
    technical_data[symbol] = add_technical_indicators(price_frame)
    print(f"Added technical indicators for {symbol}")

# 3. Extract Sentiment Features


In [None]:
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def calculate_sentiment_scores(df):
    """Score every news item and aggregate the scores per date.

    Two columns, ``vader_compound`` and ``textblob_polarity``, are added to
    ``df`` in place. Rows whose ``text`` is not a string (for example NaN
    from an empty CSV cell) receive NaN scores instead of raising, and the
    per-date mean/std/count aggregations skip those NaN values.

    Args:
        df: DataFrame with a ``text`` column and a ``date`` column.

    Returns:
        DataFrame indexed by ``date`` with two-level columns:
        ``vader_compound`` -> mean, std, count and
        ``textblob_polarity`` -> mean, std. ``std`` is NaN for a date with
        fewer than two scored items.
    """
    analyzer = SentimentIntensityAnalyzer()

    def vader_compound(text):
        # Non-string cells cannot be scored; mark them as missing.
        if not isinstance(text, str):
            return float('nan')
        return analyzer.polarity_scores(text)['compound']

    def textblob_polarity(text):
        if not isinstance(text, str):
            return float('nan')
        return TextBlob(text).sentiment.polarity

    # Calculate sentiment scores
    df['vader_compound'] = df['text'].apply(vader_compound)
    df['textblob_polarity'] = df['text'].apply(textblob_polarity)

    # Group by date and calculate daily sentiment metrics.
    # NOTE(review): grouping uses the exact 'date' values; if they carry a
    # time-of-day component each timestamp becomes its own group -- confirm
    # the preprocessed news dates are normalized to whole days.
    daily_sentiment = df.groupby('date').agg({
        'vader_compound': ['mean', 'std', 'count'],
        'textblob_polarity': ['mean', 'std']
    })

    return daily_sentiment

# Daily sentiment aggregates per ticker, keyed by symbol.
sentiment_data = {}
for symbol in symbols:
    news_frame = news_data[symbol]
    sentiment_data[symbol] = calculate_sentiment_scores(news_frame)
    print(f"Calculated sentiment scores for {symbol}")

# 4. Combine Features and Create Target Variable


In [None]:
def prepare_final_features(technical_df, sentiment_df):
    """Join technical and sentiment features, add the target, and scale.

    Args:
        technical_df: Price/indicator frame with a ``Close`` column, indexed
            by date.
        sentiment_df: Per-date sentiment frame. Two-level columns such as
            ``('vader_compound', 'mean')`` are flattened to
            ``'vader_compound_mean'`` before joining. Neither input frame is
            modified.

    Returns:
        DataFrame of standardized feature columns plus an unscaled integer
        ``target`` column (1 if the next row's close is higher, else 0).
        Rows with any missing value, and rows without a next close to
        compare against, are removed.

    Raises:
        ValueError: if no rows remain after removing incomplete rows.
    """
    # Joining frames whose columns have a different number of levels is
    # rejected by current pandas (older versions produced tuple-named
    # columns), so flatten the sentiment columns first.
    if isinstance(sentiment_df.columns, pd.MultiIndex):
        sentiment_df = sentiment_df.copy()
        sentiment_df.columns = [
            '_'.join(str(part) for part in col) for col in sentiment_df.columns
        ]

    # Left join on the index: dates without news get NaN sentiment values
    # and are removed together with the other incomplete rows below.
    combined_df = technical_df.join(sentiment_df)

    # Create target variable (1 if price goes up, 0 if down)
    next_close = combined_df['Close'].shift(-1)
    combined_df['target'] = (next_close > combined_df['Close']).astype(int)

    # Drop rows with no next close (notably the last row, which would
    # otherwise be labelled 0 by the NaN comparison) and rows with missing
    # feature values.
    has_next_close = next_close.notna().to_numpy()
    combined_df = combined_df.loc[has_next_close].dropna().copy()

    if combined_df.empty:
        raise ValueError(
            "No complete rows left after joining technical and sentiment "
            "features; check that both frames share the same date index."
        )

    # Scale features.
    # NOTE(review): the scaler is fit on every row, so later dates influence
    # the scaling of earlier ones; if a train/test split follows, consider
    # fitting on the training rows only.
    scaler = StandardScaler()
    feature_columns = [col for col in combined_df.columns if col != 'target']
    combined_df[feature_columns] = scaler.fit_transform(combined_df[feature_columns])

    return combined_df

# Merge each ticker's indicator and sentiment frames into one model table.
final_datasets = {}
for symbol in symbols:
    indicators = technical_data[symbol]
    sentiment = sentiment_data[symbol]
    final_datasets[symbol] = prepare_final_features(indicators, sentiment)
    print(f"Prepared final features for {symbol}")

# 5. Feature Selection and Dimensionality Reduction

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

def select_important_features(df, k=15):
    """Keep the ``k`` features with the highest ANOVA F-score for ``target``.

    Args:
        df: DataFrame holding the feature columns and a ``target`` column.
        k: Number of features to keep. An integer larger than the number of
            available features is reduced to that number, so every feature
            is kept; any other value is passed to ``SelectKBest`` unchanged.

    Returns:
        A ``(reduced_df, selected_features)`` tuple: ``reduced_df`` holds the
        selected feature columns followed by ``target``, and
        ``selected_features`` is the list of selected column names in their
        original column order.
    """
    X = df.drop(columns='target')
    y = df['target']

    # Asking for more features than exist raises on older scikit-learn
    # releases and warns on newer ones; cap k to avoid both.
    if isinstance(k, int):
        k = min(k, X.shape[1])

    # Only the support mask is needed, so fit without transforming.
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(X, y)

    # Get selected feature names
    selected_features = X.columns[selector.get_support()].tolist()

    # Create new dataframe with selected features
    return df[selected_features + ['target']], selected_features

# Run feature selection per ticker, keeping both the reduced frame and the
# names of the columns that were kept.
selected_datasets, selected_features = {}, {}
for symbol in symbols:
    reduced_frame, kept_columns = select_important_features(final_datasets[symbol])
    selected_datasets[symbol] = reduced_frame
    selected_features[symbol] = kept_columns
    print(f"\nSelected features for {symbol}:")
    print(kept_columns)

# 6. Save Engineered Features


In [None]:
# Write each ticker's reduced feature table next to the preprocessed inputs.
for symbol in symbols:
    output_path = f'../data/preprocessed/{symbol}_final_features.csv'
    selected_datasets[symbol].to_csv(output_path)
    print(f"Saved final features for {symbol}")

# Persist the per-ticker feature names so model training can reuse them.
import json

features_path = '../data/preprocessed/selected_features.json'
with open(features_path, 'w') as features_file:
    json.dump(selected_features, features_file)
print("Saved selected features list")