#**Imports and Installs**

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from keras.models import Sequential, save_model
from keras.layers import Dense, Bidirectional, LSTM, Conv1D, MaxPooling1D, Flatten, Dropout
from keras.regularizers import L1L2
from keras.optimizers import Adam
import joblib
import warnings
warnings.filterwarnings('ignore')

#**Load data**

In [3]:
def load_data(filepath):
    data = pd.read_csv(filepath)
       # Clean column names
    data.columns = data.columns.str.strip()
    data.columns = data.columns.str.replace(' ', '_')
    data["date"] = pd.to_datetime(data["Date"])
    data.set_index("date", inplace=True)
    data = data.drop("Date", axis=1)

    # Validate required columns
    required_cols = ['Open', 'High', 'Low', 'Close']
    missing = [col for col in required_cols if col not in data.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")

    # Robust numeric conversion
    for col in required_cols:
        # Remove all non-digit characters except dots and negatives
        data[col] = (
            data[col].astype(str)
            .str.replace(r'[^\d.-]', '', regex=True)  # Improved regex
            .replace(r'^\.$', np.nan, regex=True)  # Handle lone dots
            .replace('', np.nan)
        )
        data[col] = pd.to_numeric(data[col], errors='coerce')

    # Post-cleaning validation
    print("\nData types after cleaning:")
    print(data.dtypes)

    print("\nNull values after cleaning:")
    print(data.isna().sum())

    data.dropna(inplace=True)
    return data

data = load_data("/content/sample_data/Data/merged (2).csv")


Data types after cleaning:
Open      float64
High      float64
Low       float64
Close     float64
Sector     object
dtype: object

Null values after cleaning:
Open      0
High      0
Low       0
Close     0
Sector    0
dtype: int64


##**Display sample**

In [4]:
print("\nData Sample:")
print(data.head())
print("Columns in DataFrame:", data.columns.tolist())


Data Sample:
            Open  High   Low  Close          Sector
date                                               
1979-01-02  14.7  14.7  14.7   14.7  Gold price INR
1979-01-03  14.7  14.7  14.7   14.7  Gold price INR
1979-01-04  14.8  14.8  14.8   14.8  Gold price INR
1979-01-05  15.1  15.1  15.1   15.1  Gold price INR
1979-01-08  15.1  15.1  15.1   15.1  Gold price INR
Columns in DataFrame: ['Open', 'High', 'Low', 'Close', 'Sector']


#**Technical Indicators**

In [5]:

def add_technical_indicators(df):
    df = df.select_dtypes(include=np.number)

    # Moving Averages
    df['SMA_10'] = df['Close'].rolling(10).mean()
    df['EMA_10'] = df['Close'].ewm(span=10, adjust=False).mean()

    # RSI Calculation
    delta = df['Close'].diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.rolling(14).mean()
    avg_loss = loss.rolling(14).mean()
    rs = avg_gain / (avg_loss + 1e-10)  # Avoid division by zero
    df['RSI'] = 100 - (100 / (1 + rs))

    # ATR Calculation
    high_low = df['High'] - df['Low']
    high_close = np.abs(df['High'] - df['Close'].shift())
    low_close = np.abs(df['Low'] - df['Close'].shift())
    true_range = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    df['ATR'] = true_range.rolling(14).mean()

    return df.dropna()

data = add_technical_indicators(data)
# Prepare data
target = data[['Close']]
features = data.drop(['Close', 'Date'], axis=1, errors='ignore')
# Final type check
print("\nFinal feature types:")
print(features.dtypes)


Final feature types:
Open      float64
High      float64
Low       float64
SMA_10    float64
EMA_10    float64
RSI       float64
ATR       float64
dtype: object


#**Scaling**

In [6]:
scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()
scaled_features = scaler_x.fit_transform(features)
scaled_target = scaler_y.fit_transform(target)

#**Sequence creation**

In [7]:
def create_sequences(features, target, seq_length=30):
    X, y = [], []
    for i in range(seq_length, len(features)):
        X.append(features[i-seq_length:i])
        y.append(target[i])
    return np.array(X), np.array(y)

X_seq, y_seq = create_sequences(scaled_features, scaled_target)
train_size = int(0.8 * len(X_seq))
X_train_seq, X_test_seq = X_seq[:train_size], X_seq[train_size:]
y_train_seq, y_test_seq = y_seq[:train_size], y_seq[train_size:]


#**Model Definitions**

##**BiLstm**

In [13]:
def build_bilstm(input_shape):
    model = Sequential([
        Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=L1L2(0.01, 0.01)), input_shape=input_shape),
        Dropout(0.4),
        Bidirectional(LSTM(64)),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer=Adam(0.001), loss='mse')
    return model

##**CNN**

In [14]:
def build_cnn(input_shape):
    model = Sequential([
        Conv1D(64, 3, activation='relu', input_shape=input_shape),
        MaxPooling1D(2),
        Flatten(),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(1)
    ])
    model.compile(optimizer=Adam(0.001), loss='mse')
    return model

##**Random Forest**

In [15]:
def train_rf(X_train, y_train):
    model = RandomForestRegressor(n_estimators=100, random_state=42,verbose=1,          # show progress
        n_jobs=-1)
    model.fit(X_tree, y_tree.values.ravel())
    return model

##**XGBoost**

In [16]:
def train_xgboost(X_train, y_train):
    model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42, verbosity=1, eval_metric='logloss',
        use_label_encoder=False)

    model.fit(X_train, y_train)
    return model

#**Model Training**

In [17]:
# Prepare data for tree models
X_tree = features.iloc[30:]
y_tree = target.iloc[30:]

# Train models
print("Training BiLSTM...")
bilstm_model = build_bilstm((X_train_seq.shape[1], X_train_seq.shape[2]))
bilstm_history = bilstm_model.fit(X_train_seq, y_train_seq, epochs=100, batch_size=32,
                                  validation_data=(X_test_seq, y_test_seq), verbose=1)

print("Training CNN...")
cnn_model = build_cnn((X_train_seq.shape[1], X_train_seq.shape[2]))
cnn_history = cnn_model.fit(X_train_seq, y_train_seq, epochs=100, batch_size=32,
                            validation_data=(X_test_seq, y_test_seq), verbose=1)

print("Training Random Forest...")
rf_model = train_rf(X_tree, y_tree.values.ravel())


print("Training XGBoost...")
xgb_model = train_xgboost(X_tree, y_tree.values.ravel())

Training BiLSTM...
Epoch 1/100
[1m2210/2210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m313s[0m 139ms/step - loss: 0.3393 - val_loss: 0.0259
Epoch 2/100
[1m2210/2210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m280s[0m 127ms/step - loss: 0.0121 - val_loss: 0.0259
Epoch 3/100
[1m2210/2210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m340s[0m 135ms/step - loss: 0.0122 - val_loss: 0.0260
Epoch 4/100
[1m2210/2210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 132ms/step - loss: 0.0123 - val_loss: 0.0255
Epoch 5/100
[1m2210/2210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m290s[0m 131ms/step - loss: 0.0123 - val_loss: 0.0258
Epoch 6/100
[1m2210/2210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m289s[0m 131ms/step - loss: 0.0121 - val_loss: 0.0257
Epoch 7/100
[1m2210/2210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 130ms/step - loss: 0.0122 - val_loss: 0.0254
Epoch 8/100
[1m2210/2210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m329s[0m 133ms/step

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   36.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished


Training XGBoost...


#**Model Evaluation**

In [18]:

# Evaluation functions
def evaluate_model(name, model, X_test, y_test, is_dl=False):
    if is_dl:
        pred = model.predict(X_test)
        if len(pred.shape) > 1: pred = pred.ravel()
    else:
        pred = model.predict(X_test)

    return {
        'Model': name,
        'MSE': mean_squared_error(y_test, pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, pred)),
        'MAE': mean_absolute_error(y_test, pred),
        'R2': r2_score(y_test, pred)
    }

# Evaluate all models
results = []

# DL Models
test_dates = data.index[30+train_size:]
dl_pred = bilstm_model.predict(X_test_seq)
results.append(evaluate_model('BiLSTM', bilstm_model, X_test_seq, y_test_seq, True))
results.append(evaluate_model('CNN', cnn_model, X_test_seq, y_test_seq, True))

# Tree Models
X_test_tree = X_tree.iloc[train_size:]
y_test_tree = y_tree.iloc[train_size:]
results.append(evaluate_model('Random Forest', rf_model, X_test_tree, y_test_tree))
results.append(evaluate_model('XGBoost', xgb_model, X_test_tree, y_test_tree))

# Results comparison
results_df = pd.DataFrame(results).set_index('Model')
print("\nModel Comparison:")
print(results_df.sort_values('RMSE'))

[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 44ms/step
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 42ms/step
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.2s



Model Comparison:
                        MSE        RMSE        MAE        R2
Model                                                       
CNN                0.004239    0.065106   0.042166  0.647482
BiLSTM             0.016983    0.130319   0.087225 -0.412366
Random Forest    904.151494   30.069112  15.636436  0.999992
XGBoost        31664.455078  177.945090  88.563812  0.999711


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.5s finished


#**Visualization**

In [None]:

plt.figure(figsize=(15, 8))
plt.subplot(2, 2, 1)
plt.plot(bilstm_history.history['loss'], label='BiLSTM Train')
plt.plot(bilstm_history.history['val_loss'], label='BiLSTM Val')
plt.plot(cnn_history.history['loss'], label='CNN Train')
plt.plot(cnn_history.history['val_loss'], label='CNN Val')
plt.title('Training History')
plt.legend()

plt.subplot(2, 2, 2)
results_df['RMSE'].sort_values().plot(kind='bar')
plt.title('Model RMSE Comparison')

plt.subplot(2, 2, 3)
actual = scaler_y.inverse_transform(y_test_seq.reshape(-1, 1))
plt.plot(test_dates, actual, label='Actual')
plt.plot(test_dates, scaler_y.inverse_transform(bilstm_model.predict(X_test_seq)), label='BiLSTM')
plt.plot(test_dates, scaler_y.inverse_transform(cnn_model.predict(X_test_seq)), label='CNN')
plt.title('DL Models Predictions')
plt.legend()

plt.subplot(2, 2, 4)
plt.plot(test_dates, actual, label='Actual')
plt.plot(test_dates, rf_model.predict(X_test_tree), label='RF')
plt.plot(test_dates, xgb_model.predict(X_test_tree), label='XGBoost')
plt.title('Tree Models Predictions')
plt.legend()

plt.tight_layout()
plt.show()

#**Prediction**

In [20]:
# Trading signals
def generate_signals(predictions, window=5):
    signals = []
    sma = pd.Series(predictions).rolling(window).mean()
    for i in range(len(predictions)):
        if i < window:
            signals.append('Hold')
        elif predictions[i] > sma[i] * 1.02:
            signals.append('Buy')
        elif predictions[i] < sma[i] * 0.98:
            signals.append('Sell')
        else:
            signals.append('Hold')
    return signals

# Generate predictions and signals
final_preds = {
    'Date': test_dates,
    'Actual': scaler_y.inverse_transform(y_test_seq.reshape(-1, 1)).ravel(),
    'BiLSTM': scaler_y.inverse_transform(bilstm_model.predict(X_test_seq).reshape(-1, 1)).ravel(),
    'CNN': scaler_y.inverse_transform(cnn_model.predict(X_test_seq).reshape(-1, 1)).ravel(),
    'RandomForest': rf_model.predict(X_test_tree),
    'XGBoost': xgb_model.predict(X_test_tree)
}
for model in ['BiLSTM', 'CNN', 'RandomForest', 'XGBoost']:
    final_preds[f'{model}_Signal'] = generate_signals(final_preds[model])

results_df = pd.DataFrame(final_preds)

[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 42ms/step
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.5s finished


#**Save results and models**

In [21]:
results_df.to_csv('stock_predictions_signals.csv', index=False)
joblib.dump(rf_model, 'random_forest_model.pkl')
joblib.dump(xgb_model, 'xgboost_model.pkl')
save_model(bilstm_model, 'bilstm_model.h5')
save_model(cnn_model, 'cnn_model.h5')

print("\nPredictions and signals saved to stock_predictions_signals.csv")
print("All models saved to disk")




Predictions and signals saved to stock_predictions_signals.csv
All models saved to disk


#**Second Comparison**

##**Split Indian and US Data on same model**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from keras.models import Model, Sequential, load_model
from keras.layers import Dense, LSTM, Input, Concatenate, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
import joblib
import shap
import warnings
warnings.filterwarnings('ignore')

# Global configuration
plt.style.use('ggplot')
MARKET = 'INDIAN'  # Change to 'INDIAN' for Indian market and 'US' for American market

# Load and preprocess data
def load_data(market):
    if market == 'INDIAN':
        df = pd.read_csv("/content/sample_data/Data/merged.csv")
        # df = pd.read_csv("/content/sample_data/Data/NIFTY_IT_Historical_Data.csv")
        # Preprocessing steps for Indian market
        df.columns = df.columns.str.strip().str.replace(' ', '_')
        df["date"] = pd.to_datetime(df["Date"])
        df.set_index("date", inplace=True)
        df = df.drop("Date", axis=1)
         # Validate and clean columns
        required_cols = ['source_file','Open', 'High', 'Low', 'Close']
        for col in required_cols:
            df[col] = (df[col].astype(str)
                      .str.replace(r'[^\d.-]', '', regex=True)
                      .replace(r'^\.$', np.nan, regex=True)
                      .replace('', np.nan))
            # df[col] = pd.to_numeric(df[col], errors='coerce')

        # Add synthetic volume for demonstration
        df['Volume'] = np.random.randint(10000, 50000, size=len(df))
        df.dropna(inplace=True)
        return df
if __name__ == "__main__":
    # Load and preprocess data
    data = load_data(MARKET)
    # data = add_technical_indicators(data)

In [None]:
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor

# Load merged data
merged_df = df

# Add a fallback mapping of features to market sectors
sector_mapping = {
    'Open': 'Price',
    'High': 'Price',
    'Low': 'Price',
    'Close': 'Price',
    'Volume': 'Volume',
    'SMA_10': 'Technical',
    'EMA_10': 'Technical',
    'RSI': 'Technical',
    'ATR': 'Technical',
    'Twitter_Sentiment': 'Social Media',
    'Reddit_Sentiment': 'Social Media',
    'Social_Volume': 'Social Media',
    # Add others as needed
}

# Identify feature columns (exclude target and identifiers)
feature_cols = merged_df.columns.difference(['source_file', 'Date', 'source_file_Date'])

# Group data by source_file
grouped = merged_df.groupby('source_file')

In [None]:
df['source_file'].unique()

In [None]:
import shap
import matplotlib.pyplot as plt

# Initialize SHAP explainer for tree models
explainer = shap.TreeExplainer(rf)

# Use a sample from test set to keep visualization manageable
X_sample = rf_features_test[:200]  # limit size for speed

# Calculate SHAP values
shap_values = explainer.shap_values(X_sample)

# Summary plot
shap.summary_plot(shap_values, X_sample, feature_names=feature_names,max_display=10)


# shap.summary_plot(shap_values, X_sample, feature_names=feature_names)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Concatenate, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
import os
import joblib
import warnings
warnings.filterwarnings('ignore')

# Global configuration
plt.style.use('ggplot')
MARKET = 'US'  # Set to 'US' or 'INDIAN'
US_STOCK_FILE = "/content/sample_data/Data/stock_yfinance_data.csv"
US_SOCIAL_FILE = "/content/sample_data/Data/stock_tweets.csv"
INDIAN_FILE = "/content/sample_data/Data/NIFTY_IT_Historical_Data.csv"

# Load and preprocess data
def load_data(market):
    if market == 'INDIAN':
        # Load Indian market data
        df = pd.read_csv(INDIAN_FILE)

        # Preprocessing steps for Indian market
        df.columns = df.columns.str.strip().str.replace(' ', '_')
        df["date"] = pd.to_datetime(df["Date"])
        df.set_index("date", inplace=True)
        df = df.drop("Date", axis=1, errors='ignore')

        # Validate and clean columns
        required_cols = ['Open', 'High', 'Low', 'Close']
        for col in required_cols:
            df[col] = (df[col].astype(str)
                      .str.replace(r'[^\d.-]', '', regex=True)
                      .replace(r'^\.$', np.nan, regex=True)
                      .replace('', np.nan))
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # Add synthetic volume if missing
        if 'Volume' not in df.columns:
            df['Volume'] = np.random.randint(10000, 50000, size=len(df))
        df.dropna(inplace=True)
        return df

    elif market == 'US':
        # Load US stock data
        #stock_df = pd.read_csv(US_STOCK_FILE)
        # Handle mixed date formats
        # 1. Verify file exists
        if not os.path.exists(US_STOCK_FILE):
          raise FileNotFoundError(f"Data file not found at: {US_STOCK_FILE}")

        # 2. Read CSV with error handling
        try:
           stock_df = pd.read_csv(US_STOCK_FILE)
        except Exception as e:
            raise IOError(f"Error reading {US_STOCK_FILE}: {str(e)}")

        # 3. Check if DataFrame is empty
        if stock_df.empty:
            raise ValueError(f"Loaded empty DataFrame from {US_STOCK_FILE}")

        # 4. Handle date parsing with better validation
        if 'Date' not in stock_df.columns:
            raise KeyError("CSV file is missing 'Date' column")

        stock_df['Date'] = pd.to_datetime(
            stock_df['Date'],
            errors='coerce',
            format='mixed',
            dayfirst=True
        )
        # 5. Handle failed date parsing
        date_failures = stock_df['Date'].isna().sum()
        if date_failures > 0:
            print(f"Warning: Failed to parse {date_failures} date values")
            stock_df = stock_df.dropna(subset=['Date'])

        # 6. Final validation before return
        if stock_df.empty:
            raise ValueError("All rows were dropped during date parsing")

        # 7. Sort by date for time-series operations
        stock_df = stock_df.sort_values('Date').reset_index(drop=True)
        # Remove rows with unparseable dates
        initial_count = len(stock_df)
        stock_df = stock_df.dropna(subset=['Date'])
        final_count = len(stock_df)

        if initial_count != final_count:
          print(f"Warning: Dropped {initial_count - final_count} rows with unparseable dates")
        stock_df['Date'] = pd.to_datetime(stock_df['Date'], format='%Y-%m-%d')
        stock_df.columns = stock_df.columns.str.strip().str.replace(' ', '_')
        stock_df["date"] = pd.to_datetime(stock_df["Date"], format='%d-%m-%Y')
        stock_df.set_index("date", inplace=True)
        stock_df = stock_df.drop(["Date", "Adj_Close", "Stock_Name"], axis=1, errors='ignore')

        # Load social media data
        social_df = pd.read_csv(US_SOCIAL_FILE)
        social_df['Date'] = social_df['Date'].str.split(pat = " ").str[0]
        social_df['Date'] = pd.to_datetime(social_df['Date'], format='%Y-%m-%d')
        social_df.columns = social_df.columns.str.strip().str.replace(' ', '_')
        social_df["date"] = pd.to_datetime(social_df["Date"])
        social_df.set_index("date", inplace=True)
        social_df = social_df.drop("Date", axis=1, errors='ignore')

        # Merge stock and social data
        df = pd.merge(stock_df, social_df, left_index=True, right_index=True, how='left')

        # Fill missing social data with zeros
        social_cols = ['Twitter_Sentiment', 'Reddit_Sentiment', 'Social_Volume']
        for col in social_cols:
            if col not in df.columns:
                df[col] = 0.0
            else:
                df[col].fillna(0, inplace=True)

        # Clean numerical columns
        for col in ['Open', 'High', 'Low', 'Close', 'Volume'] + social_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        df.dropna(inplace=True)
        return df

# Technical indicators
def add_technical_indicators(df):
    if df is None or df.empty:
        raise ValueError("Received invalid DataFrame in add_technical_indicators")
    # Moving Averages
    df['SMA_10'] = df['Close'].rolling(10).mean()
    df['EMA_10'] = df['Close'].ewm(span=10, adjust=False).mean()

    # RSI
    delta = df['Close'].diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.rolling(14).mean()
    avg_loss = loss.rolling(14).mean()
    rs = avg_gain / (avg_loss + 1e-10)
    df['RSI'] = 100 - (100 / (1 + rs))

    # ATR
    high_low = df['High'] - df['Low']
    high_close = np.abs(df['High'] - df['Close'].shift())
    low_close = np.abs(df['Low'] - df['Close'].shift())
    true_range = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    df['ATR'] = true_range.rolling(14).mean()

    return df.dropna()

# FINESS Hybrid Model Architecture
def create_hybrid_model(tech_input_shape, social_input_shape=None):
    # Technical Feature Processor (Tech+)
    tech_input = Input(shape=tech_input_shape, name='tech_input')
    x = LSTM(128, return_sequences=True)(tech_input)
    x = LSTM(64, return_sequences=False)(x)
    tech_output = Dense(32, activation='relu')(x)

    # Social Feature Processor (FINESS) - Only for US market
    if social_input_shape is not None:
        social_input = Input(shape=social_input_shape, name='social_input')
        y = LSTM(64, return_sequences=False)(social_input)
        social_output = Dense(16, activation='relu')(y)

        # Combine technical and social features
        combined = Concatenate()([tech_output, social_output])
        z = Dense(64, activation='relu')(combined)
    else:
        z = tech_output

    # Final prediction layers
    z = Dense(32, activation='relu')(z)
    z = Dropout(0.3)(z)
    output = Dense(1)(z)

    # Create model
    if social_input_shape is not None:
        model = Model(inputs=[tech_input, social_input], outputs=output)
    else:
        model = Model(inputs=tech_input, outputs=output)

    model.compile(optimizer=Adam(0.001), loss='mse')
    return model

# Create sequences for LSTM
def create_sequences(data, target_col, seq_length=30):
    X, y = [], []
    for i in range(seq_length, len(data)):
        X.append(data.iloc[i-seq_length:i].values)
        y.append(data.iloc[i][target_col])
    return np.array(X), np.array(y)

# Inverse scaling function
def inverse_scale(scaler, values, feature_index):
    dummy = np.zeros((len(values), scaler.n_features_in_))
    dummy[:, feature_index] = values
    return scaler.inverse_transform(dummy)[:, feature_index]

# Main execution
if __name__ == "__main__":
    # Load and preprocess data
    data = load_data(MARKET)
    data = add_technical_indicators(data)
    print(f"Data shape after preprocessing: {data.shape}")
    print(f"Columns: {data.columns.tolist()}")

    # Prepare features based on market
    if MARKET == 'US':
        tech_features = ['Open', 'High', 'Low', 'Close', 'Volume', 'SMA_10', 'EMA_10', 'RSI', 'ATR']
        social_features = ['Twitter_Sentiment', 'Reddit_Sentiment', 'Social_Volume']
        target = 'Close'
    else:  # Indian market
        tech_features = ['Open', 'High', 'Low', 'Close', 'Volume', 'SMA_10', 'EMA_10', 'RSI', 'ATR']
        social_features = None
        target = 'Close'

    # Train-test split (time-based)
    train_size = int(0.8 * len(data))
    train_data = data.iloc[:train_size]
    test_data = data.iloc[train_size:]

    # Scale data
    tech_scaler = MinMaxScaler()
    train_tech = tech_scaler.fit_transform(train_data[tech_features])
    test_tech = tech_scaler.transform(test_data[tech_features])

    if MARKET == 'US':
        social_scaler = MinMaxScaler()
        train_social = social_scaler.fit_transform(train_data[social_features])
        test_social = social_scaler.transform(test_data[social_features])

    # Create sequences
    seq_length = 30
    close_idx = tech_features.index('Close')  # Index of Close for inverse scaling

    X_train_tech, y_train = create_sequences(
        pd.DataFrame(train_tech, columns=tech_features),
        target_col=close_idx,
        seq_length=seq_length
    )

    X_test_tech, y_test = create_sequences(
        pd.DataFrame(test_tech, columns=tech_features),
        target_col=close_idx,
        seq_length=seq_length
    )

    if MARKET == 'US':
        X_train_social, _ = create_sequences(
            pd.DataFrame(train_social, columns=social_features),
            target_col=0,
            seq_length=seq_length
        )
        X_test_social, _ = create_sequences(
            pd.DataFrame(test_social, columns=social_features),
            target_col=0,
            seq_length=seq_length
        )

    # Build and train hybrid model
    if MARKET == 'US':
        model = create_hybrid_model(
            tech_input_shape=(seq_length, len(tech_features)),
            social_input_shape=(seq_length, len(social_features))
        )
        callbacks = [
            EarlyStopping(patience=15, restore_best_weights=True),
            ReduceLROnPlateau(factor=0.1, patience=5)
        ]
        print("\nTraining hybrid model with social features...")
        history = model.fit(
            [X_train_tech, X_train_social], y_train,
            validation_data=([X_test_tech, X_test_social], y_test),
            epochs=100,
            batch_size=32,
            verbose=1,
            callbacks=callbacks
        )
    else:
        model = create_hybrid_model(
            tech_input_shape=(seq_length, len(tech_features))
        )
        print("\nTraining hybrid model without social features...")
        history = model.fit(
            X_train_tech, y_train,
            validation_data=(X_test_tech, y_test),
            epochs=100,
            batch_size=32,
            verbose=1
        )

    # Train Random Forest for comparison
    print("\nTraining Random Forest model...")
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    # Prepare 2D features for RF
    rf_features_train = X_train_tech.reshape(X_train_tech.shape[0], -1)
    rf_features_test = X_test_tech.reshape(X_test_tech.shape[0], -1)

    rf.fit(rf_features_train, y_train)
    rf_pred = rf.predict(rf_features_test)

    # Generate predictions
    if MARKET == 'US':
        hybrid_pred = model.predict([X_test_tech, X_test_social]).flatten()
    else:
        hybrid_pred = model.predict(X_test_tech).flatten()

    # Inverse scale predictions to original price values
    y_test_actual = inverse_scale(tech_scaler, y_test, close_idx)
    hybrid_pred_actual = inverse_scale(tech_scaler, hybrid_pred, close_idx)
    rf_pred_actual = inverse_scale(tech_scaler, rf_pred, close_idx)

    # Evaluate models
    def evaluate_model(name, actual, predicted):
        return {
            'Model': name,
            'MSE': mean_squared_error(actual, predicted),
            'RMSE': np.sqrt(mean_squared_error(actual, predicted)),
            'MAE': mean_absolute_error(actual, predicted),
            'R2': r2_score(actual, predicted)
        }

    results = [
        evaluate_model('Hybrid Model', y_test_actual, hybrid_pred_actual),
        evaluate_model('Random Forest', y_test_actual, rf_pred_actual)
    ]

    results_df = pd.DataFrame(results).set_index('Model')
    print("\nModel Evaluation Results:")
    print(results_df)

    # Plot results
    plt.figure(figsize=(15, 10))

    # Plot training progress
    plt.subplot(2, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    if 'val_loss' in history.history:
        plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Training Progress')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend()

    # Plot predictions vs actual
    plt.subplot(2, 2, 2)
    test_dates = test_data.index[seq_length:seq_length+len(y_test_actual)]
    plt.plot(test_dates, y_test_actual, label='Actual', linewidth=2)
    plt.plot(test_dates, hybrid_pred_actual, label='Hybrid Prediction', linestyle='--')
    plt.plot(test_dates, rf_pred_actual, label='RF Prediction', linestyle='--')
    plt.title(f'{MARKET} Market: Actual vs Predicted Prices')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.legend()
    plt.xticks(rotation=45)

    # Plot feature importance for Random Forest
    plt.subplot(2, 2, 3)
    # Create names for features with time steps
    feature_names = []
    for i in range(seq_length-1, -1, -1):
        for feat in tech_features:
            feature_names.append(f"t-{i}_{feat}")

    importances = rf.feature_importances_
    n_top = min(20, len(importances))
    indices = np.argsort(importances)[-n_top:]

    plt.title(f'Top {n_top} Important Features (Random Forest)')
    plt.barh(range(n_top), importances[indices], align='center')
    plt.yticks(range(n_top), [feature_names[i] for i in indices])
    plt.xlabel('Relative Importance')

    # Plot actual vs predicted
    plt.subplot(2, 2, 4)
    plt.scatter(y_test_actual, hybrid_pred_actual, alpha=0.6, label='Hybrid Model')
    plt.scatter(y_test_actual, rf_pred_actual, alpha=0.6, label='Random Forest')
    plt.plot([min(y_test_actual), max(y_test_actual)],
             [min(y_test_actual), max(y_test_actual)],
             'k--', lw=2)
    plt.xlabel('Actual Prices')
    plt.ylabel('Predicted Prices')
    plt.title('Actual vs Predicted Prices')
    plt.legend()

    plt.tight_layout()
    plt.savefig(f'finess_results_{MARKET}.png', dpi=300)
    plt.show()

    # Save models
    model.save(f'finess_hybrid_model_{MARKET}.h5')
    joblib.dump(rf, f'random_forest_model_{MARKET}.pkl')
    joblib.dump(tech_scaler, f'tech_scaler_{MARKET}.pkl')
    if MARKET == 'US':
        joblib.dump(social_scaler, f'social_scaler_{MARKET}.pkl')
    print("\nModels and scalers saved successfully!")

In [None]:
# Re-import necessary packages after kernel reset
import pandas as pd
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load cleaned data again
file_path = "/content/sample_data/Data/merged.csv"
df_real = pd.read_csv(file_path)

# Convert 'Open', 'High', 'Low' to numeric
for col in ['Open', 'High', 'Low']:
    df_real[col] = pd.to_numeric(df_real[col], errors='coerce')

# Drop rows with any missing values in critical columns
df_real_clean = df_real.dropna(subset=['Open', 'High', 'Low', 'Close', 'Volume'])

# Define target
median_close = df_real_clean['Close'].median()
df_real_clean['target'] = (df_real_clean['Close'] > median_close).astype(int)

# Downsample to 10,000 rows
df_sampled = df_real_clean.sample(n=10000, random_state=42)

# Features and target
X_sampled = df_sampled[['Open', 'High', 'Low', 'Volume', 'source_file']].copy()
y_sampled = df_sampled['target']

# Encode 'source_file' as numeric
X_sampled['source_file'] = pd.factorize(X_sampled['source_file'])[0]

# Train/test split
X_train_sampled, X_test_sampled, y_train_sampled, y_test_sampled = train_test_split(
    X_sampled, y_sampled, test_size=0.25, random_state=42
)

# Train RandomForest model
model_sampled = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)
model_sampled.fit(X_train_sampled, y_train_sampled)

# SHAP TreeExplainer
explainer_sampled = shap.TreeExplainer(model_sampled)
shap_values_sampled = explainer_sampled.shap_values(X_test_sampled)

# Plot SHAP summary
shap.summary_plot(shap_values_sampled, X_test_sampled, show=False)
plt.tight_layout()
output_path_sampled = "sampled_shap_plot.png"
plt.savefig(output_path_sampled)
plt.close()

output_path_sampled


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from keras.models import Model, Sequential, load_model
from keras.layers import Dense, LSTM, Input, Concatenate, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
import joblib
import os
import shap
import warnings
warnings.filterwarnings('ignore')

# Global configuration
plt.style.use('ggplot')
MARKET = 'INDIAN'  # Change to 'INDIAN' for Indian market and 'US' for American market
US_STOCK_FILE = "/content/sample_data/Data/stock_yfinance_data.csv"
US_SOCIAL_FILE = "/content/sample_data/Data/stock_tweets.csv"
INDIAN_FILE = "/content/sample_data/Data/merged.csv"

# Fixed load_data function for Indian market
def load_data(market):
    if market == 'INDIAN':
        try:
            print(f"Loading Indian market data from: {INDIAN_FILE}")
            df = pd.read_csv(INDIAN_FILE)
            print(f"Initial data shape: {df.shape}")

            # 1. Clean and process columns
            price_cols = ['Open', 'High', 'Low', 'Close']
            for col in price_cols:
                if col in df.columns:
                    # Clean the column
                    df[col] = df[col].astype(str)
                    df[col] = df[col].str.replace(r'[^\d.-]', '', regex=True)
                    df[col] = df[col].replace(r'^\.$', np.nan, regex=True)
                    df[col] = df[col].replace('', np.nan)
                    df[col] = pd.to_numeric(df[col], errors='coerce')

            # 2. Create Source_File column if missing
            if 'Source_File' not in df.columns:
                # Use filename as source file
                df['Source_File'] = os.path.basename(INDIAN_FILE)

            # 3. Sector mapping logic
            print("Mapping sectors...")
            sector_mapping = {
                'NIFTY NEXT 50': 'Large Cap',
                'NIFTY 50': 'Large Cap',
                'NIFTY PHARMA': 'Pharmaceuticals',
                'NIFTY FMCG': 'FMCG',
                'NIFTY 100': 'Large Cap',
                'NIFTY IT': 'Information Technology',
                'NIFTY HOUSING': 'Housing',
                'NIFTY INDIA MANUFACTURING': 'Manufacturing',
                'NIFTY BANK': 'Banking',
                'NIFTY MIDCAP 100': 'Mid Cap',
                'NIFTY INFRASTRUCTURE': 'Infrastructure',
                'NIFTY PSU BANK': 'Public Sector Banking',
                'NIFTY ENERGY': 'Energy',
                'NIFTY AUTO': 'Automobile',
                'NIFTY MEDIA': 'Media',
                'NIFTY METAL': 'Metals',
                'NIFTY COMMODITIES': 'Commodities',
                'NIFTY PRIVATE BANK': 'Private Banking',
                'NIFTY OIL  GAS': 'Oil & Gas',
                'VIX': 'Volatility',
                'DEFAULT': 'Other'
            }

            def map_sector(source_file):
                source_file = str(source_file).upper()
                for key, sector in sector_mapping.items():
                    if key in source_file:
                        return sector
                return sector_mapping['DEFAULT']

            df['Sector'] = df['Source_File'].apply(map_sector)
            print(f"Sectors mapped: {df['Sector'].nunique()} unique sectors")
            print("Sample sectors:", df['Sector'].unique()[:5])

            # 4. Convert and sort dates
            df['Date'] = pd.to_datetime(df['Date'])
            df.sort_values('Date', inplace=True)
            df.set_index('Date', inplace=True)

            print(f"Final data shape: {df.shape}")
            print("Sample columns:", df.columns.tolist()[:5])
            return df

        except Exception as e:
            print(f"Error loading data: {e}")
            import traceback
            traceback.print_exc()
            return pd.DataFrame()

    elif market == 'US':
        # (US market code remains the same as in your original)
        # ... [unchanged US market code] ...
        return df  # Make sure to return df for US market

# Fixed technical indicators function
def add_technical_indicators(df):
    # Skip calculation if no numerical columns exist
    if len(df.select_dtypes(include=[np.number]).columns) == 0:
        print("Warning: No numerical columns for technical indicators")
        return df
    # Calculate indicators for each sector
    sectors = df['Sector'].unique() if 'Sector' in df.columns else ['DEFAULT']

    for sector in sectors:
        # Skip if sector columns don't exist
        if f'{sector}_Close' not in df.columns:
            continue

        # Simple Moving Average
        df[f'{sector}_SMA_10'] = df[f'{sector}_Close'].rolling(10).mean()

        # Exponential Moving Average
        df[f'{sector}_EMA_10'] = df[f'{sector}_Close'].ewm(span=10, adjust=False).mean()

        # RSI
        delta = df[f'{sector}_Close'].diff()
        gain = delta.clip(lower=0)
        loss = -delta.clip(upper=0)
        avg_gain = gain.rolling(14).mean()
        avg_loss = loss.rolling(14).mean()
        rs = avg_gain / (avg_loss + 1e-10)
        df[f'{sector}_RSI'] = 100 - (100 / (1 + rs))

        # ATR
        high = df[f'{sector}_High'] if f'{sector}_High' in df.columns else df[f'{sector}_Close']
        low = df[f'{sector}_Low'] if f'{sector}_Low' in df.columns else df[f'{sector}_Close']
        close = df[f'{sector}_Close']
        tr1 = high - low
        tr2 = (high - close.shift()).abs()
        tr3 = (low - close.shift()).abs()
        tr = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)
        df[f'{sector}_ATR'] = tr.rolling(14).mean()

    # Fill any remaining NaNs
    df = df.ffill().bfill().dropna()
    return df

# FINESS Hybrid Model Architecture
def create_hybrid_model(tech_input_shape, social_input_shape=None):
    # Technical Feature Processor (Tech+)
    tech_input = Input(shape=tech_input_shape, name='tech_input')
    x = LSTM(128, return_sequences=True)(tech_input)
    x = LSTM(64, return_sequences=False)(x)
    tech_output = Dense(32, activation='relu')(x)

    # Social Feature Processor (FINESS) - Only for US market
    if social_input_shape and MARKET == 'US':
        social_input = Input(shape=social_input_shape, name='social_input')
        y = LSTM(64, return_sequences=False)(social_input)
        social_output = Dense(16, activation='relu')(y)

        # Combine technical and social features
        combined = Concatenate()([tech_output, social_output])
        z = Dense(64, activation='relu')(combined)
    else:
        z = tech_output

    # Final prediction layers
    z = Dense(32, activation='relu')(z)
    z = Dropout(0.3)(z)
    output = Dense(1)(z)

    # Create model
    if MARKET == 'US':
        model = Model(inputs=[tech_input, social_input], outputs=output)
    else:
        model = Model(inputs=tech_input, outputs=output)

    model.compile(optimizer=Adam(0.001), loss='mse')
    return model

# Create sequences for LSTM (fixed to handle different input types)
def create_sequences(data, target_col, seq_length=30):
    X, y = [], []

    # Handle both DataFrame and array inputs
    if isinstance(data, pd.DataFrame):
        values = data.values
    else:
        values = data
    if MARKET == 'INDIAN':
        try:
            # Try to find target column index in features
            target_idx = tech_features.index(target_col)
        except ValueError:
            # If not found, use last column as fallback
            target_idx = -1
            print(f"Warning: Target column {target_col} not in features, using last column")

    # Check if target_col is index or column name
    if isinstance(target_col, str) and isinstance(data, pd.DataFrame):
        target_idx = data.columns.get_loc(target_col)
    else:
        target_idx = target_col

    for i in range(seq_length, len(values)):
        X.append(values[i-seq_length:i])
        y.append(values[i][target_idx])

    return np.array(X), np.array(y)

# Main execution
if __name__ == "__main__":
    # Load and preprocess data
    data = load_data(MARKET)

    # Diagnostic output
    print("\n" + "="*50)
    print("DATA LOADING DIAGNOSTICS:")
    print(f"Market: {MARKET}")
    print(f"Data type: {type(data)}")
    if isinstance(data, pd.DataFrame):
        print(f"Data shape: {data.shape}")
        print(f"Columns: {data.columns.tolist()[:5]}...")
        print(f"Index type: {type(data.index)}")
        if not data.empty:
            print(f"Date range: {data.index.min()} to {data.index.max()}")
        else:
            print("DataFrame is empty")
    else:
        print(f"Data is not a DataFrame: {type(data)}")
    print("="*50 + "\n")

    # Validate before proceeding
    if data is None or data.empty:
        raise ValueError("Data loading failed. Check file path and format.")

    # Add technical indicators
    data = add_technical_indicators(data)

    # Prepare features based on market
    if MARKET == 'US':
        tech_features = ['Open', 'High', 'Low', 'Volume', 'SMA_10', 'EMA_10', 'RSI', 'ATR']
        social_features = ['Twitter_Sentiment', 'Reddit_Sentiment', 'Social_Volume']
        target = 'Close'
    else:
        # Use only numerical columns and exclude categorical columns
        numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()

        # Define target - try to find a Close column
        close_cols = [col for col in numerical_cols if 'Close' in col]
        target_col = close_cols[0] if close_cols else 'Close'

        # If no Close column exists, create one
        if target_col not in data.columns:
            data['Close'] = data['Open']  # Fallback to Open price
            numerical_cols.append('Close')
            target_col = 'Close'

        tech_features = numerical_cols

        # Ensure target column is not in features
        if target_col in tech_features:
            tech_features.remove(target_col)

        print(f"Using target column: {target_col}")
        print(f"Technical features: {tech_features}")

    # Train-test split (time-based)
    train_size = int(0.8 * len(data))
    train_data = data.iloc[:train_size]
    test_data = data.iloc[train_size:]

    # Scale data
    tech_scaler = MinMaxScaler()
    train_tech = tech_scaler.fit_transform(train_data[tech_features])
    test_tech = tech_scaler.transform(test_data[tech_features])

    if MARKET == 'US':
        social_scaler = StandardScaler()
        train_social = social_scaler.fit_transform(train_data[social_features])
        test_social = social_scaler.transform(test_data[social_features])

    # Create sequences
    seq_length = 30

    # For Indian market, use column index of target
    if MARKET == 'INDIAN':
        target_idx = tech_features.index(target_col) if target_col in tech_features else 0
    else:
        target_idx = tech_features.index(target) if target in tech_features else 0

    X_train_tech, y_train = create_sequences(
        pd.DataFrame(train_tech, columns=tech_features),
        target_col=target_idx,
        seq_length=seq_length
    )

    X_test_tech, y_test = create_sequences(
        pd.DataFrame(test_tech, columns=tech_features),
        target_col=target_idx,
        seq_length=seq_length
    )

    if MARKET == 'US':
        X_train_social, _ = create_sequences(
            pd.DataFrame(train_social, columns=social_features),
            target_col=0,  # First column as placeholder
            seq_length=seq_length
        )
        X_test_social, _ = create_sequences(
            pd.DataFrame(test_social, columns=social_features),
            target_col=0,
            seq_length=seq_length
        )

    # Build and train hybrid model
    if MARKET == 'US':
        model = create_hybrid_model(
            tech_input_shape=(seq_length, len(tech_features)),
            social_input_shape=(seq_length, len(social_features))
        )
        history = model.fit(
            [X_train_tech, X_train_social], y_train,
            validation_data=([X_test_tech, X_test_social], y_test),
            epochs=100,
            batch_size=32,
            verbose=1,
            callbacks=[]#[EarlyStopping(patience=10)]
        )
    else:
        model = create_hybrid_model(
            tech_input_shape=(seq_length, len(tech_features))
        )
        history = model.fit(
            X_train_tech, y_train,
            validation_data=(X_test_tech, y_test),
            epochs=100,
            batch_size=32,
            verbose=1,
            callbacks=[]#[EarlyStopping(patience=10)]
        )

    # Train Random Forest for comparison
    rf = RandomForestRegressor(n_estimators=50, random_state=42)  # Reduced for faster testing

    # Prepare 2D features for RF
    rf_features_train = X_train_tech.reshape(X_train_tech.shape[0], -1)
    rf_features_test = X_test_tech.reshape(X_test_tech.shape[0], -1)

    rf.fit(rf_features_train, y_train)
    rf_pred = rf.predict(rf_features_test)

    # Generate predictions
    if MARKET == 'US':
        hybrid_pred = model.predict([X_test_tech, X_test_social]).flatten()
    else:
        hybrid_pred = model.predict(X_test_tech).flatten()

    # Evaluate models
    def evaluate_model(name, actual, predicted):
        return {
            'Model': name,
            'MSE': mean_squared_error(actual, predicted),
            'RMSE': np.sqrt(mean_squared_error(actual, predicted)),
            'MAE': mean_absolute_error(actual, predicted),
            'R2': r2_score(actual, predicted)
        }

    results = [
        evaluate_model('Hybrid Model', y_test, hybrid_pred),
        evaluate_model('Random Forest', y_test, rf_pred)
    ]

    results_df = pd.DataFrame(results).set_index('Model')
    print("\nModel Evaluation Results:")
    print(results_df)

    # Plot training progress
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Training Progress')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend()

    # Plot predictions vs actual
    plt.subplot(1, 2, 2)
    test_dates = test_data.index[seq_length:seq_length+len(y_test)]
    plt.plot(test_dates, y_test, label='Actual', alpha=0.7)
    plt.plot(test_dates, hybrid_pred, label='Hybrid Prediction', alpha=0.7)
    plt.plot(test_dates, rf_pred, label='RF Prediction', alpha=0.7)
    plt.title(f'{MARKET} Market Predictions')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # SHAP Analysis (simplified)
    try:
        # Prepare SHAP explainer
        explainer = shap.TreeExplainer(rf)
        shap_values = explainer.shap_values(rf_features_test)

        # Plot SHAP summary
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, rf_features_test, feature_names=tech_features)
        plt.title(f'SHAP Feature Importance for {MARKET} Market')
        plt.tight_layout()
        plt.savefig(f'shap_summary_{MARKET}.png', dpi=300)
        plt.show()
    except Exception as e:
        print(f"SHAP analysis failed: {str(e)}")

    # Feature importance for Random Forest
    plt.figure(figsize=(12, 8))
    importances = rf.feature_importances_

    # Get top features
    n_top = min(15, len(importances))
    indices = np.argsort(importances)[-n_top:]

    # Create meaningful feature names
    feature_names = []
    for i in range(seq_length):
        for j, feature in enumerate(tech_features):
            feature_names.append(f"t-{seq_length-i-1}_{feature}")

    plt.title(f'Top {n_top} Important Features (Random Forest)')
    plt.barh(range(n_top), importances[indices], align='center')
    plt.yticks(range(n_top), [feature_names[i] for i in indices])
    plt.xlabel('Relative Importance')
    plt.tight_layout()
    plt.savefig(f'{MARKET}_market_feature_importance.png', dpi=300)
    plt.show()

    # Save models
    model.save(f'finess_hybrid_model_{MARKET}.h5')
    joblib.dump(rf, f'random_forest_model_{MARKET}.pkl')
    print("Models saved successfully!")