In [None]:
# Cell 1: Import required libraries and functions
from data_exploration import get_historical_data, validate_data_structure
import pandas as pd
import numpy as np

# Cell 2: Technical indicator calculation functions
def ensure_series(data, column_name=None):
    """Coerce *data* to a pandas Series.

    Args:
        data: A DataFrame, a Series, or any object ``pd.Series`` accepts.
        column_name: Optional column label to pick when *data* is a
            DataFrame. Any label is honoured, including falsy ones such as
            ``0`` or ``""``.

    Returns:
        For a DataFrame: the requested column when it exists, otherwise the
        first column. For a Series: the same object, unchanged. For anything
        else: ``pd.Series(data)``.
    """
    if isinstance(data, pd.DataFrame):
        # Compare against None rather than relying on truthiness so that
        # labels like 0 or "" are not mistaken for "no column requested".
        if column_name is not None and column_name in data.columns:
            return data[column_name]
        return data.iloc[:, 0]  # Fall back to the first column
    if isinstance(data, pd.Series):
        return data
    return pd.Series(data)

def calculate_rsi(prices, period=14):
    """Return the Relative Strength Index of *prices*.

    Gains and losses are averaged with a plain rolling mean over *period*
    rows (not exponential smoothing).

    Args:
        prices: Price data; coerced with ``ensure_series``.
        period: Rolling window length in rows.

    Returns:
        A Series of RSI values; rows without a full window are NaN.
    """
    series = ensure_series(prices)
    change = series.diff()

    # Split the moves into up and down legs. Entries that do not qualify,
    # including the leading NaN produced by diff(), are replaced with 0.
    upward = change.where(change > 0, 0)
    downward = -change.where(change < 0, 0)

    mean_up = upward.rolling(window=period).mean()
    mean_down = downward.rolling(window=period).mean()

    relative_strength = mean_up / mean_down
    return 100 - (100 / (1 + relative_strength))

def calculate_macd(prices, fast=12, slow=26, signal=9):
    """Return the MACD line, its signal line and the histogram.

    Args:
        prices: Price data; coerced with ``ensure_series``.
        fast: Span of the fast exponential moving average.
        slow: Span of the slow exponential moving average.
        signal: Span of the EMA applied to the MACD line.

    Returns:
        Tuple ``(macd, signal_line, histogram)`` of Series, where
        ``histogram`` is ``macd - signal_line``.
    """
    series = ensure_series(prices)

    def ema(values, span):
        # All three averages use the same non-adjusted EWM settings.
        return values.ewm(span=span, adjust=False).mean()

    macd_line = ema(series, fast) - ema(series, slow)
    trigger = ema(macd_line, signal)
    return macd_line, trigger, macd_line - trigger

def calculate_atr(high, low, close, period=14):
    """Return the Average True Range as a rolling mean of the true range.

    Args:
        high: High prices; coerced with ``ensure_series``.
        low: Low prices; coerced with ``ensure_series``.
        close: Close prices; coerced with ``ensure_series``.
        period: Rolling window length in rows.

    Returns:
        A Series of ATR values; rows without a full window are NaN.
    """
    high_s, low_s, close_s = (ensure_series(item) for item in (high, low, close))
    previous_close = close_s.shift(1)

    # True range is the widest of: the bar's own range, and the distance of
    # its high or low from the previous close.
    candidates = pd.concat(
        [
            high_s - low_s,
            (high_s - previous_close).abs(),
            (low_s - previous_close).abs(),
        ],
        axis=1,
    )
    true_range = candidates.max(axis=1)
    return true_range.rolling(window=period).mean()

def calculate_bollinger_bands(prices, period=20, std_dev=2):
    """Return Bollinger Bands and their relative width.

    Args:
        prices: Price data; coerced with ``ensure_series``.
        period: Rolling window length in rows.
        std_dev: Number of rolling standard deviations between the middle
            band and each outer band.

    Returns:
        Tuple ``(upper, middle, lower, width)`` of Series, where ``width``
        is ``(upper - lower) / middle``.
    """
    series = ensure_series(prices)
    window = series.rolling(window=period)

    centre = window.mean()
    offset = window.std() * std_dev

    top = centre + offset
    bottom = centre - offset
    relative_width = (top - bottom) / centre
    return top, centre, bottom, relative_width

def calculate_obv(close, volume):
    """Return On-Balance Volume as a running total of signed volume.

    Each row adds its volume when the close rose versus the previous row,
    subtracts it when the close fell, and leaves the total unchanged
    otherwise (including when either close is NaN). The first row is 0.0.

    Close and volume are paired by position, not by index label.

    Args:
        close: Close prices; coerced with ``ensure_series``.
        volume: Traded volume; coerced with ``ensure_series``. Must have at
            least as many rows as *close*; extra rows are ignored.

    Returns:
        A float Series indexed like *close*.

    Raises:
        ValueError: If *volume* has fewer rows than *close*.
    """
    close = ensure_series(close)
    volume = ensure_series(volume)

    row_count = len(close)
    if len(volume) < row_count:
        raise ValueError(
            f"volume has {len(volume)} rows but close has {row_count}"
        )

    price_change = close.diff().to_numpy(dtype=float)
    traded = volume.to_numpy(dtype=float)[:row_count]

    # NaN changes compare False on both sides and therefore contribute 0,
    # which also covers the first row (diff() has no predecessor there).
    signed_volume = np.where(
        price_change > 0,
        traded,
        np.where(price_change < 0, -traded, 0.0),
    )
    return pd.Series(np.cumsum(signed_volume), index=close.index)

def calculate_stochastic(high, low, close, k_period=14, d_period=3):
    """Return the stochastic oscillator %K line and its %D moving average.

    Args:
        high: High prices; coerced with ``ensure_series``.
        low: Low prices; coerced with ``ensure_series``.
        close: Close prices; coerced with ``ensure_series``.
        k_period: Look-back window for the highest high / lowest low.
        d_period: Rolling-mean window applied to %K to obtain %D.

    Returns:
        Tuple ``(stoch_k, stoch_d)`` of Series.
    """
    high_s, low_s, close_s = (ensure_series(item) for item in (high, low, close))

    ceiling = high_s.rolling(window=k_period).max()
    floor = low_s.rolling(window=k_period).min()
    channel = ceiling - floor

    # Position of the close inside the recent high/low channel, in percent.
    percent_k = 100 * (close_s - floor) / channel
    percent_d = percent_k.rolling(window=d_period).mean()
    return percent_k, percent_d

def calculate_adx(high, low, close, period=14):
    """Return the Average Directional Index (ADX).

    Directional movement follows Wilder's rule: on each bar only the larger
    of the up-move (high minus previous high) and the down-move (previous
    low minus low) counts, and only if it is positive. The other side is
    zero, so +DM and -DM are never both non-zero on the same bar.

    Smoothing uses plain rolling means over *period* rows, matching the
    other indicators in this file, rather than Wilder's recursive smoothing.

    Args:
        high: High prices; coerced with ``ensure_series``.
        low: Low prices; coerced with ``ensure_series``.
        close: Close prices; coerced with ``ensure_series``.
        period: Rolling window length used for ATR, the directional
            indicators and the final ADX average.

    Returns:
        A Series of ADX values; leading rows without enough history are NaN.
    """
    high = ensure_series(high)
    low = ensure_series(low)
    close = ensure_series(close)

    up_move = high.diff()
    down_move = -low.diff()
    # Rows without a predecessor stay NaN so they are not counted as
    # "no movement" in the rolling averages below.
    defined = up_move.notna() & down_move.notna()

    plus_dm = up_move.where((up_move > down_move) & (up_move > 0), 0.0).where(defined)
    minus_dm = down_move.where((down_move > up_move) & (down_move > 0), 0.0).where(defined)

    previous_close = close.shift(1)
    true_range = pd.concat(
        [
            high - low,
            (high - previous_close).abs(),
            (low - previous_close).abs(),
        ],
        axis=1,
    ).max(axis=1)
    atr = true_range.rolling(window=period).mean()

    plus_di = 100 * (plus_dm.rolling(window=period).mean() / atr)
    minus_di = 100 * (minus_dm.rolling(window=period).mean() / atr)

    # When both indicators are zero the ratio is 0/0 and DX is NaN for
    # that row.
    dx = 100 * ((plus_di - minus_di).abs() / (plus_di + minus_di))
    return dx.rolling(window=period).mean()

# Cell 3: Main feature engineering function
def calculate_technical_features(data):
    """Build technical-indicator features, targets and lags from OHLCV data.

    Indicators are added in tiers gated on the number of input rows (10, 20,
    50, 100 and 220), so shorter histories simply receive fewer columns.
    Progress and diagnostics are printed to stdout.

    Args:
        data: DataFrame with 'Open', 'High', 'Low', 'Close' and 'Volume'
            columns. MultiIndex columns are flattened to their first level.
            The input is copied, not modified.

    Returns:
        A new DataFrame holding the input columns plus indicator columns,
        the targets ('Next_Close', 'Target', 'Target_Return'), lagged
        columns and 'Tech_Sentiment'. Rows with no next-day close (notably
        the final row) are removed because they have no valid target.
        Returns None when the input has fewer than 30 rows, a required
        column is missing, fewer than 10 rows survive cleaning, or any
        exception is raised while computing (the traceback is printed).
    """
    if data is None or len(data) < 30:
        print(f"  Insufficient data: only {len(data) if data is not None else 0} rows")
        return None

    try:
        df = data.copy()
        data_length = len(df)
        print(f"  Starting with {data_length} days of data")

        # Ensure we have the right column structure
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = [col[0] if isinstance(col, tuple) else col for col in df.columns]

        # Verify required columns exist
        required_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            print(f"  Error: Missing required columns: {missing_columns}")
            return None

        # Adaptive indicator calculation based on available data
        print(f"  Calculating indicators for {data_length} days...")

        # Zero closes become NaN wherever Close is a denominator so the
        # ratios are NaN rather than infinite.
        safe_close = df['Close'].replace(0, np.nan)

        # Always calculate these basic indicators
        df['Daily_Return'] = df['Close'].pct_change()
        df['Price_Range'] = (df['High'] - df['Low']) / safe_close

        # Short-term indicators (require less data)
        if data_length >= 10:
            df['SMA5'] = df['Close'].rolling(window=5).mean()
            df['SMA10'] = df['Close'].rolling(window=10).mean()
            df['EMA5'] = df['Close'].ewm(span=5, adjust=False).mean()
            df['EMA10'] = df['Close'].ewm(span=10, adjust=False).mean()
            df['ROC5'] = df['Close'].pct_change(periods=5) * 100
            df['Volatility_5'] = df['Daily_Return'].rolling(window=5).std() * np.sqrt(252)

        # Medium-term indicators
        if data_length >= 20:
            df['SMA20'] = df['Close'].rolling(window=20).mean()
            df['EMA20'] = df['Close'].ewm(span=20, adjust=False).mean()
            df['ROC10'] = df['Close'].pct_change(periods=10) * 100
            df['ROC20'] = df['Close'].pct_change(periods=20) * 100

            # RSI
            df['RSI'] = calculate_rsi(df['Close'], period=14)

            # MACD
            df['MACD'], df['Signal_Line'], df['MACD_Histogram'] = calculate_macd(df['Close'])

            # Bollinger Bands
            (
                df['BB_Upper'],
                df['BB_Middle'],
                df['BB_Lower'],
                df['BB_Width'],
            ) = calculate_bollinger_bands(df['Close'], period=20)

            # ATR
            df['ATR'] = calculate_atr(df['High'], df['Low'], df['Close'], period=14)

            # Volume indicators
            df['Volume_SMA20'] = df['Volume'].rolling(window=20).mean()
            df['Volume_Ratio'] = df['Volume'] / df['Volume_SMA20'].replace(0, np.nan)
            df['OBV'] = calculate_obv(df['Close'], df['Volume'])

            # Stochastic
            df['K_percent'], df['D_percent'] = calculate_stochastic(df['High'], df['Low'], df['Close'])

            # Calculate ratios for short/medium term MAs
            if 'SMA10' in df.columns:
                df['Close_SMA10_Ratio'] = df['Close'] / df['SMA10'].replace(0, np.nan)
            if 'SMA20' in df.columns:
                df['Close_SMA20_Ratio'] = df['Close'] / df['SMA20'].replace(0, np.nan)

        # Longer-term indicators (only if we have enough data)
        if data_length >= 50:
            df['SMA50'] = df['Close'].rolling(window=50).mean()
            df['EMA50'] = df['Close'].ewm(span=50, adjust=False).mean()
            df['Close_SMA50_Ratio'] = df['Close'] / df['SMA50'].replace(0, np.nan)
            df['ROC50'] = df['Close'].pct_change(periods=50) * 100
            df['Volatility_30'] = df['Daily_Return'].rolling(window=30).std() * np.sqrt(252)

            # ADX (needs more data)
            df['ADX'] = calculate_adx(df['High'], df['Low'], df['Close'], period=14)

        # Very long-term indicators (only with lots of data)
        if data_length >= 100:
            df['SMA100'] = df['Close'].rolling(window=100).mean()
            df['EMA100'] = df['Close'].ewm(span=100, adjust=False).mean()
            df['Close_SMA100_Ratio'] = df['Close'] / df['SMA100'].replace(0, np.nan)

        if data_length >= 220:  # 200-row window plus a 20-row buffer
            df['SMA200'] = df['Close'].rolling(window=200).mean()
            df['EMA200'] = df['Close'].ewm(span=200, adjust=False).mean()
            df['Close_SMA200_Ratio'] = df['Close'] / df['SMA200'].replace(0, np.nan)

        # Create target variables
        df['Next_Close'] = df['Close'].shift(-1)
        df['Target'] = (df['Next_Close'] > df['Close']).astype(int)
        df['Target_Return'] = ((df['Next_Close'] - df['Close']) / safe_close) * 100

        # Create lagged variables (only for available features)
        lag_features = ['Close', 'Volume']
        lag_features += [name for name in ('RSI', 'MACD') if name in df.columns]
        for feature in lag_features:
            df[f'{feature}_Lag1'] = df[feature].shift(1)
            df[f'{feature}_Lag2'] = df[feature].shift(2)

        # Calculate technical sentiment based on available indicators
        # (calculate_flexible_technical_sentiment is defined outside this section)
        df['Tech_Sentiment'] = calculate_flexible_technical_sentiment(df)

        # SMART NaN removal - be more aggressive about keeping data
        print(f"  Before NaN removal: {len(df)} rows")

        # Drop rows that have no next-day close (the last row). The filter
        # must use Next_Close: Target is an int built from a comparison, so
        # it is 0 rather than NaN when the next close is missing and would
        # otherwise keep a fabricated label.
        df = df[df['Next_Close'].notna()]

        # For other NaN values, be more selective
        # Only require the most basic indicators to be non-NaN
        essential_features = ['Close', 'Daily_Return']
        essential_features += [name for name in ('RSI', 'SMA20') if name in df.columns]

        # Drop rows where essential features are NaN
        for feature in essential_features:
            rows_before = len(df)
            df = df[df[feature].notna()]
            dropped = rows_before - len(df)
            if dropped > 0:
                print(f"    Dropped {dropped} rows due to NaN in {feature}")

        # Work on an explicit copy so the assignment below never targets a
        # filtered view of the earlier frame.
        df = df.copy()

        # Fill remaining NaN values with forward-fill then back-fill.
        # NOTE: back-fill copies later values into leading gaps (e.g. the
        # warm-up rows of long-window indicators).
        numeric_columns = df.select_dtypes(include=[np.number]).columns
        df[numeric_columns] = df[numeric_columns].ffill().bfill()

        print(f"  After processing: {len(df)} rows with {len(df.columns)} features")

        # Final check - ensure we have enough data for meaningful analysis
        if len(df) < 20:
            print(f"  Warning: Very little data remaining ({len(df)} rows). Consider using more historical data.")
            if len(df) < 10:
                print("  Insufficient data for analysis.")
                return None

        return df

    except Exception as e:
        # Top-level boundary for the pipeline: report the failure and signal
        # it to the caller with None instead of propagating.
        print(f"  Error in calculate_technical_features: {e}")
        import traceback
        traceback.print_exc()
        return None

def get_technical_sentiment(data):
    """Score the most recent row of *data* as a sentiment value in [-1, 1].

    Three groups of indicator votes are averaged and weighted: moving
    averages (weight 0.4), momentum (weight 0.4) and volatility/volume
    (weight 0.2). Indicators that are absent or NaN on the last row are
    skipped.

    Args:
        data: Frame of price and indicator columns; only the last row is
            read.

    Returns:
        The combined score as a float. Returns 0.0 when *data* is None, has
        fewer than 5 rows, or an exception occurs (the error is printed).
    """
    if data is None or len(data) < 5:
        return 0.0

    try:
        row = data.iloc[-1]
        fields = row.index

        def usable(*names):
            # Every named field must exist on the row and hold a non-NaN value.
            return all(name in fields for name in names) and all(
                pd.notna(row[name]) for name in names
            )

        def vote(is_bullish):
            return 1 if is_bullish else -1

        def weighted(votes, weight):
            # Average vote scaled to +/- weight; no votes means no opinion.
            if len(votes) > 0:
                return (sum(votes) / len(votes)) * weight
            return 0.0

        # --- Moving averages (-0.4 to 0.4) ---
        trend_votes = []
        # Price relative to each moving average
        for average in ('SMA20', 'SMA50', 'SMA200'):
            if usable(average):
                trend_votes.append(vote(row['Close'] > row[average]))
        # Faster average relative to the slower one (crossovers)
        for quick, slow in (('SMA20', 'SMA50'), ('SMA50', 'SMA200')):
            if usable(quick, slow):
                trend_votes.append(vote(row[quick] > row[slow]))

        # --- Momentum (-0.4 to 0.4) ---
        momentum_votes = []
        if usable('RSI'):
            strength = row['RSI']
            if strength < 30:
                momentum_votes.append(1)  # Oversold - positive for future
            elif strength > 70:
                momentum_votes.append(-1)  # Overbought - negative for future
            else:
                momentum_votes.append(0)
        if usable('MACD', 'Signal_Line'):
            momentum_votes.append(vote(row['MACD'] > row['Signal_Line']))
        if usable('ROC10'):
            # Rising price is positive momentum; flat or falling is negative.
            momentum_votes.append(vote(row['ROC10'] > 0))

        # --- Volatility and volume (-0.2 to 0.2) ---
        risk_votes = []
        if usable('Close', 'BB_Upper', 'BB_Lower'):
            price = row['Close']
            if price < row['BB_Lower']:  # Oversold
                risk_votes.append(1)
            elif price > row['BB_Upper']:  # Overbought
                risk_votes.append(-1)
            else:
                risk_votes.append(0)
        if usable('ATR', 'Close'):
            atr_percent = row['ATR'] / row['Close'] * 100
            # High volatility (ATR above 3% of price) counts against.
            risk_votes.append(-1 if atr_percent > 3 else 0)
        if usable('Volume_Ratio') and row['Volume_Ratio'] > 1.5:
            # Heavy volume confirms the direction of the day's move; without
            # a usable return it still counts as a neutral vote.
            if usable('Daily_Return'):
                risk_votes.append(vote(row['Daily_Return'] > 0))
            else:
                risk_votes.append(0)

        combined = (
            weighted(trend_votes, 0.4)
            + weighted(momentum_votes, 0.4)
            + weighted(risk_votes, 0.2)
        )

        # Ensure result is between -1 and 1
        return max(-1.0, min(1.0, combined))

    except Exception as e:
        print(f"Error calculating technical sentiment: {e}")
        return 0.0