In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from joblib import dump, load
import yfinance as yf
import time
import os

# Step 1: Specify the directory containing your CSV files
directory = r"C:\Users\tanis\Downloads\archive"

# List all files in the directory
file_names = [f for f in os.listdir(directory) if f.endswith('.csv')]

for file_name in file_names:
    print(f"Processing file: {file_name}")
    
    # Step 2: Load data and strip any leading/trailing spaces in column names
    data = pd.read_csv(os.path.join(directory, file_name))
    data.columns = data.columns.str.strip()  # Clean column names

    # Step 3: Basic preprocessing
    data.dropna(inplace=True)  # Handle missing values
    
    # Example feature engineering
    data['price_change'] = data['Close'].pct_change()  # Percentage change in closing price
    data['volume_change'] = data['Volume'].pct_change()  # Percentage change in volume
    data['price_moving_avg'] = data['Close'].rolling(window=5).mean()  # 5-period moving average for price
    data['volume_moving_avg'] = data['Volume'].rolling(window=5).mean()  # 5-period moving average for volume
    
    # Create labels (dummy example, adjust as needed)
    data['label'] = (data['price_change'].abs() > 0.05) & (data['volume_change'].abs() > 0.1)
    
    # Drop columns that are not needed for modeling, ignoring missing columns
    columns_to_drop = ['Date', 'Symbol', 'Series', 'Prev Close', 'Open', 'High', 'Low', 'Last', 'VWAP', 'Turnover', 'Trades', 'Deliverable Volume', '%Deliverable']
    data.drop(columns=columns_to_drop, axis=1, inplace=True, errors='ignore')
    
    # Drop rows with NaN values after feature engineering
    data.dropna(inplace=True)

    # Step 4: Split data into features and labels
    X = data[['price_change', 'volume_change', 'price_moving_avg', 'volume_moving_avg']]
    y = data['label']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Step 5: Apply SMOTE to handle class imbalance
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    
    # Step 6: Normalize the data
    scaler = StandardScaler()
    X_resampled_scaled = scaler.fit_transform(X_resampled)
    X_test_scaled = scaler.transform(X_test)
    
    # Step 7: Hyperparameter tuning using GridSearchCV
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5]
    }
    
    grid_search = GridSearchCV(RandomForestClassifier(random_state=42, class_weight='balanced'), param_grid, cv=3)
    grid_search.fit(X_resampled_scaled, y_resampled)
    
    # Best model
    best_model = grid_search.best_estimator_
    
    # Step 8: Evaluate the model
    y_pred = best_model.predict(X_test_scaled)
    print(f"Results for {file_name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("\n" + "="*50 + "\n")
    
    # Save the trained model to a file
    model_filename = 'random_forest_model.joblib'
    dump(best_model, model_filename)
    print(f"Model saved to {model_filename}")

# Load the trained model
model = load('random_forest_model.joblib')
print("Model loaded successfully")

# Step 9: Function to fetch real-time data
def fetch_real_time_data(symbol):
    # Fetch historical data
    data = yf.download(symbol, period="1d", interval="1m")
    return data

# Step 10: Preprocess the real-time data
def preprocess_data(data):
    # Feature engineering and preprocessing
    data['price_change'] = data['Close'].pct_change()
    data['volume_change'] = data['Volume'].pct_change()
    data['price_moving_avg'] = data['Close'].rolling(window=5).mean()
    data['volume_moving_avg'] = data['Volume'].rolling(window=5).mean()
    
    # Drop rows with NaN values
    data.dropna(inplace=True)
    
    # Select features
    X = data[['price_change', 'volume_change', 'price_moving_avg', 'volume_moving_avg']]
    return X

# Step 11: Function to track and predict real-time data for symbols
def track_symbols(symbols):
    # Real-time tracking loop
    while True:
        for symbol in symbols:
            print(f"Processing symbol: {symbol}")
            
            # Fetch real-time data
            data = fetch_real_time_data(symbol)
            
            if not data.empty:
                # Preprocess the data
                X = preprocess_data(data)
                X = X.astype('float64')
                
                if not X.empty:
                    # Make prediction
                    prediction = model.predict(X.tail(1))  # Predict based on the latest available data
                    print(f"Real-time prediction for {symbol}: {'Positive' if prediction[0] else 'Negative'}")
            
            print("-" * 50)
        
        # Wait for a while before fetching new data (e.g., 1 hour)
        time.sleep(3600)

# List of symbols to track
symbols = [
    # Stock Exchanges (example tickers, adjust as needed)
    "AAPL",    # Apple
    "GOOGL",   # Alphabet
    "MSFT",    # Microsoft
    
    # Forex Pairs
    "EURUSD=X", # EUR/USD
    "GBPUSD=X", # GBP/USD
    "JPY=X",    # JPY/USD
    
    # Cryptocurrencies
    "BTC-USD",  # Bitcoin
    "ETH-USD",  # Ethereum
]

# Start tracking the symbols
track_symbols(symbols)


Processing file: ADANIPORTS.csv
Results for ADANIPORTS.csv:
Accuracy: 0.9293478260869565
Classification Report:
               precision    recall  f1-score   support

       False       1.00      0.93      0.96       704
        True       0.38      1.00      0.55        32

    accuracy                           0.93       736
   macro avg       0.69      0.96      0.76       736
weighted avg       0.97      0.93      0.94       736



Model saved to random_forest_model.joblib
Processing file: ASIANPAINT.csv
Results for ASIANPAINT.csv:
Accuracy: 0.9619565217391305
Classification Report:
               precision    recall  f1-score   support

       False       0.99      0.97      0.98       724
        True       0.21      0.50      0.30        12

    accuracy                           0.96       736
   macro avg       0.60      0.73      0.64       736
weighted avg       0.98      0.96      0.97       736



Model saved to random_forest_model.joblib
Processing file: AXISBANK.csv
Re

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Real-time prediction for AAPL: Positive
--------------------------------------------------
Processing symbol: GOOGL
Real-time prediction for GOOGL: Positive
--------------------------------------------------
Processing symbol: MSFT



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Real-time prediction for MSFT: Positive
--------------------------------------------------
Processing symbol: EURUSD=X



[*********************100%***********************]  1 of 1 completed

--------------------------------------------------
Processing symbol: GBPUSD=X





--------------------------------------------------
Processing symbol: JPY=X


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

--------------------------------------------------
Processing symbol: BTC-USD





Real-time prediction for BTC-USD: Negative
--------------------------------------------------
Processing symbol: ETH-USD


[*********************100%***********************]  1 of 1 completed


Real-time prediction for ETH-USD: Negative
--------------------------------------------------
