# Yes Bank Stock Price Prediction Project

This project involves analyzing historical stock price data for Yes Bank and building predictive models using various regression techniques. The goal is to predict the closing price of the stock based on historical trends and data features.


## 1. Project Overview
- **Objective**: Predict Yes Bank's stock closing prices.
- **Dataset**: `data_YesBank_StockPrices.csv`
- **Features**: `Open`, `High`, `Low`, plus the engineered features `Month`, `Year`, and `Prev_Close` (previous closing price)
- **Target**: `Close`
- **Tools Used**: pandas, numpy, matplotlib, seaborn, scikit-learn, scipy, joblib

In [None]:
#EDA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

# Read the data
file_path = 'data_YesBank_StockPrices.csv'
df = pd.read_csv(file_path)

# Parse 'Date' as abbreviated month name + two-digit year ('%b-%y'), then put
# the rows in chronological order with a fresh 0..n-1 index.
df['Date'] = pd.to_datetime(df['Date'], format='%b-%y')
df = df.sort_values('Date').reset_index(drop=True)

# Basic info
print('--- Data Head ---')
print(df.head())
print('\n--- Data Info ---')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print('\n--- Data Description ---')
print(df.describe())

# Check for missing values
print('\n--- Missing Values ---')
print(df.isnull().sum())

# Handle missing values (if any): rows with a missing value in any column are
# dropped so the z-score computation further down never sees NaN.
df = df.dropna()

# Outlier detection and removal (Z-score method)
def remove_outliers_zscore(data, columns, threshold=3):
    """Return the rows of *data* whose z-scores are all below *threshold*.

    Args:
        data: DataFrame to filter. It is not modified.
        columns: Column labels whose z-scores are checked.
        threshold: A row is kept only if the absolute z-score of every
            checked column is strictly less than this value.

    Returns:
        A new, independent DataFrame holding the surviving rows with their
        original index labels.

    Note:
        Comparisons against NaN are False, so a row whose z-score is NaN in
        any checked column is dropped as well.
    """
    z_scores = np.abs(stats.zscore(data[columns]))
    mask = (z_scores < threshold).all(axis=1)
    # Hand back an explicit copy: callers add columns to the result, and doing
    # that on a boolean-filtered selection raises SettingWithCopyWarning on
    # pandas versions without copy-on-write.
    return data[mask].copy()

num_cols = ['Open', 'High', 'Low', 'Close']
df = remove_outliers_zscore(df, num_cols)

# 1. Line plot of Closing Price over Time
plt.figure(figsize=(12, 6))
plt.plot(df['Date'], df['Close'], marker='o')
plt.title('Monthly Closing Price of Yes Bank Over Time')
plt.xlabel('Date')
plt.ylabel('Closing Price')
plt.grid(True)
plt.tight_layout()
plt.show()

# 2. OHLC (Open, High, Low, Close) Plot
# One semi-transparent line per price column, labelled with the column name.
plt.figure(figsize=(12, 6))
for price_column in num_cols:
    plt.plot(df['Date'], df[price_column], label=price_column, alpha=0.7)
plt.title('OHLC Prices Over Time')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.tight_layout()
plt.show()

# 3. Distribution of Closing Prices
plt.figure(figsize=(8, 5))
sns.histplot(df['Close'], bins=30, kde=True)
plt.title('Distribution of Monthly Closing Prices')
plt.xlabel('Closing Price')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# 4. Monthly Returns (Percentage Change)
# NOTE(review): pct_change() compares each row with the previous *remaining*
# row, so wherever the outlier filter dropped a row the value spans more than
# one step of the original series.
df['Monthly_Return'] = df['Close'].pct_change().mul(100)
plt.figure(figsize=(12, 6))
plt.plot(df['Date'], df['Monthly_Return'], marker='o', color='purple')
plt.title('Monthly Returns (%) Over Time')
plt.xlabel('Date')
plt.ylabel('Monthly Return (%)')
plt.grid(True)
plt.tight_layout()
plt.show()

# 5. Correlation Heatmap
price_correlations = df[num_cols].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(price_correlations, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix of Price Features')
plt.tight_layout()
plt.show()

# 6. Boxplot of Closing Price by Year
df['Year'] = df['Date'].dt.year
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='Year', y='Close')
plt.title('Yearly Distribution of Closing Prices')
plt.xlabel('Year')
plt.ylabel('Closing Price')
plt.tight_layout()
plt.show()





## 3. Feature Engineering
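Features selected based on correlation and domain relevance:
- `Open`
- `High`
- `Low`

Engineered features added in the modeling code:
- `Month` and `Year` (extracted from `Date`)
- `Prev_Close` (the previous closing price, used as a lag feature)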

Target variable:
- `Close`

## 4. Machine Learning Modeling
The following models are implemented:
- Linear Regression
- Ridge Regression
- Random Forest Regressor

Model evaluation uses:
- Root Mean Squared Error (RMSE)
- Mean Absolute Error (MAE)
- R² Score

In [None]:
#ML MODEL
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy import stats
import joblib

# 1. Load Data
# Read the CSV, parse 'Date' ('%b-%y': abbreviated month + two-digit year) and
# order the rows chronologically with a fresh 0..n-1 index.
file_path = 'data_YesBank_StockPrices.csv'
df = (
    pd.read_csv(file_path)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%b-%y'))
    .sort_values('Date')
    .reset_index(drop=True)
)

# 2. Handle Missing Values
missing_per_column = df.isnull().sum()
print('--- Missing Values Before ---')
print(missing_per_column)
# The per-column counts are non-negative, so "any non-zero count" is the same
# condition as "total count > 0".
if not missing_per_column.any():
    print('No missing values found.')
else:
    # Drop any rows with missing values
    df = df.dropna()
    print('Missing values found and dropped.')
print('--- Missing Values After ---')
print(df.isnull().sum())

# 3. Outlier Detection and Handling (Z-score method for numeric columns)
def remove_outliers_zscore(data, columns, threshold=3):
    """Return the rows of *data* whose z-scores are all below *threshold*.

    Args:
        data: DataFrame to filter. It is not modified.
        columns: Column labels whose z-scores are checked.
        threshold: A row is kept only if the absolute z-score of every
            checked column is strictly less than this value.

    Returns:
        A new, independent DataFrame holding the surviving rows with their
        original index labels.

    Note:
        Comparisons against NaN are False, so a row whose z-score is NaN in
        any checked column is dropped as well.
    """
    z_scores = np.abs(stats.zscore(data[columns]))
    mask = (z_scores < threshold).all(axis=1)
    # Hand back an explicit copy: callers add columns to the result, and doing
    # that on a boolean-filtered selection raises SettingWithCopyWarning on
    # pandas versions without copy-on-write.
    return data[mask].copy()

num_cols = ['Open', 'High', 'Low', 'Close']

# The lag feature is computed on the full chronological series BEFORE outlier
# rows are removed. Shifting after the filter would pair the row following a
# removed outlier with a close from two rows earlier instead of the
# immediately preceding one.
df_with_lag = df.assign(Prev_Close=df['Close'].shift(1))

# .copy() makes df_clean an independent frame, so the column assignments below
# do not raise SettingWithCopyWarning on a filtered selection.
df_clean = remove_outliers_zscore(df_with_lag, num_cols).copy()
print(f"Rows before outlier removal: {len(df)}")
print(f"Rows after outlier removal: {len(df_clean)}")

# 4. Correlation Analysis
plt.figure(figsize=(8, 6))
sns.heatmap(df_clean[num_cols].corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix of Price Features')
plt.tight_layout()
plt.show()

# 5. Feature Engineering (add lag features, month, year)
df_clean['Month'] = df_clean['Date'].dt.month
df_clean['Year'] = df_clean['Date'].dt.year
# The first row of the series has no previous close (NaN) and is dropped here.
df_clean = df_clean.dropna().reset_index(drop=True)

# 6. Feature Selection
features = ['Open', 'High', 'Low', 'Month', 'Year', 'Prev_Close']
target = 'Close'

# 7. Train-Test Split
X = df_clean[features]
y = df_clean[target]
# shuffle=False keeps chronological order: the last 20% of rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# 8. Model Training & Evaluation Functions
def evaluate_model(model, X_train, y_train, X_test, y_test, name):
    """Evaluate an already-fitted regression model, print metrics and plot results.

    The model is NOT trained here; it must have been fitted by the caller.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Training features, used only to report the train RMSE.
        y_train: Training targets.
        X_test: Test features.
        y_test: Test targets (pandas Series or any array-like).
        name: Label used in the printed header and the plot title.

    Returns:
        tuple: ``(test_rmse, test_r2)``.
    """
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Compute each metric once and reuse it for both printing and returning.
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_r2 = r2_score(y_test, y_pred_test)

    print(f'\n--- {name} ---')
    print('Train RMSE:', train_rmse)
    print('Test RMSE:', test_rmse)
    print('Test MAE:', test_mae)
    print('Test R2:', test_r2)

    plt.figure(figsize=(8, 4))
    # np.asarray plots by position (ignoring any pandas index) and also accepts
    # plain array-likes, not only Series.
    plt.plot(np.asarray(y_test), label='Actual')
    plt.plot(y_pred_test, label='Predicted')
    plt.title(f'{name} - Actual vs Predicted Closing Price')
    plt.xlabel('Test Sample Index')
    plt.ylabel('Closing Price')
    plt.legend()
    plt.tight_layout()
    plt.show()
    return test_rmse, test_r2

# 9. Model 1: Linear Regression
lr = LinearRegression().fit(X_train, y_train)
lr_rmse, lr_r2 = evaluate_model(lr, X_train, y_train, X_test, y_test, 'Linear Regression')

# Settings shared by both grid searches below.
# NOTE(review): presumably cv=3 resolves to plain, unshuffled K-fold here, in
# which case some folds validate on rows that precede their training rows in
# time - confirm whether a time-ordered splitter is wanted.
grid_search_options = {'cv': 3, 'scoring': 'neg_root_mean_squared_error', 'n_jobs': -1}

# 10. Model 2: Random Forest Regressor (with hyperparameter tuning)
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7, None]}
rf = RandomForestRegressor(random_state=42)
gs_rf = GridSearchCV(rf, rf_params, **grid_search_options).fit(X_train, y_train)
print('Best RF Params:', gs_rf.best_params_)
rf_best = gs_rf.best_estimator_
rf_rmse, rf_r2 = evaluate_model(rf_best, X_train, y_train, X_test, y_test, 'Random Forest Regressor')

# 11. Model 3: Ridge Regression (with hyperparameter tuning)
ridge_params = {'alpha': [0.01, 0.1, 1, 10, 100]}
ridge = Ridge()
gs_ridge = GridSearchCV(ridge, ridge_params, **grid_search_options).fit(X_train, y_train)
print('Best Ridge Params:', gs_ridge.best_params_)
ridge_best = gs_ridge.best_estimator_
ridge_rmse, ridge_r2 = evaluate_model(ridge_best, X_train, y_train, X_test, y_test, 'Ridge Regression')

# 12. Final Model Selection
# One (model name, test RMSE, test R2) row per fitted model.
comparison_rows = [
    ('Linear Regression', lr_rmse, lr_r2),
    ('Random Forest', rf_rmse, rf_r2),
    ('Ridge Regression', ridge_rmse, ridge_r2),
]
results = pd.DataFrame(comparison_rows, columns=['Model', 'Test RMSE', 'Test R2'])
print('\n--- Model Comparison ---')
print(results)

# The winner is the row with the smallest test RMSE.
best_model_idx = results['Test RMSE'].idxmin()
print(f"\nBest Model: {results.loc[best_model_idx, 'Model']} (Lowest Test RMSE)")

# 13. Final Summary
print("""
Summary:
- Data was cleaned for missing values and outliers.
- Feature engineering included lag features and date parts.
- Three models were tested: Linear Regression, Random Forest, and Ridge Regression (with hyperparameter tuning).
- The best model was selected based on lowest Test RMSE.
- All code is modular, commented, and outputs are clearly formatted for interpretation.
""")

def save_best_model(model, filename='best_model.pkl'):
    """Persist a fitted model with joblib and report where it was written.

    Args:
        model: The object to serialize.
        filename: Destination path handed to ``joblib.dump``.
    """
    joblib.dump(model, filename)
    print(f'Best model saved to {filename}')

# Save the best model
# Map the names used in the comparison table to the fitted estimators; any
# other name falls back to the plain linear regression model.
tuned_models_by_name = {
    'Random Forest': rf_best,
    'Ridge Regression': ridge_best,
}
save_best_model(tuned_models_by_name.get(results.loc[best_model_idx, 'Model'], lr))

# --- Prediction Utility ---
def predict_new_closing_price(open_price, high_price, low_price, month, year, prev_close, model_path='best_model.pkl'):
    """Load a saved model and predict the closing price for one set of inputs.

    Args:
        open_price (float): Opening price for the month.
        high_price (float): Highest price for the month.
        low_price (float): Lowest price for the month.
        month (int): Month as integer (1-12).
        year (int): Year as integer (e.g., 2021).
        prev_close (float): Previous month's closing price.
        model_path (str): Path to the saved model file.

    Returns:
        float: Predicted closing price (first element of the model's output).
    """
    # NOTE: joblib files are pickle-based, so only load model files from a
    # trusted source.
    estimator = joblib.load(model_path)
    # Single-row frame carrying the named feature columns the model receives.
    feature_row = pd.DataFrame({
        'Open': [open_price],
        'High': [high_price],
        'Low': [low_price],
        'Month': [month],
        'Year': [year],
        'Prev_Close': [prev_close],
    })
    return estimator.predict(feature_row)[0]

# --- Optional: Command-line interface ---
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Predict Yes Bank monthly closing price.')
    parser.add_argument('--open', type=float, required=False, help='Opening price for the month')
    parser.add_argument('--high', type=float, required=False, help='Highest price for the month')
    parser.add_argument('--low', type=float, required=False, help='Lowest price for the month')
    parser.add_argument('--month', type=int, required=False, help='Month as integer (1-12)')
    parser.add_argument('--year', type=int, required=False, help='Year as integer (e.g., 2021)')
    parser.add_argument('--prev_close', type=float, required=False, help="Previous month's closing price")
    # parse_known_args() tolerates arguments this parser does not define. When
    # the code runs as a notebook cell the kernel launcher puts its own
    # options (e.g. "-f <connection file>") in sys.argv, which would make
    # parse_args() abort with "unrecognized arguments".
    args, _unrecognized = parser.parse_known_args()
    supplied = (args.open, args.high, args.low, args.month, args.year, args.prev_close)
    # Test for "not given" explicitly: a plain truthiness check would treat a
    # legitimate 0 / 0.0 value as if the argument were missing.
    if all(value is not None for value in supplied):
        pred = predict_new_closing_price(
            open_price=args.open,
            high_price=args.high,
            low_price=args.low,
            month=args.month,
            year=args.year,
            prev_close=args.prev_close
        )
        print(f'Predicted Closing Price: {pred:.2f}')
    else:
        print('To predict, provide all arguments: --open --high --low --month --year --prev_close')
        print('Or use the function predict_new_closing_price() in your own script.')

## 5. Results and Evaluation
- Evaluate all three models using RMSE, MAE, and R².
- Select the best-performing model (typically Random Forest).
- Visualize predictions vs actual closing prices.

## 6. Model Deployment Preparation
Save the best model using `joblib` for later use in deployment.

```python
joblib.dump(best_model, 'best_model.pkl')
```

---




## 7. Conclusion
- The Random Forest model provided the best performance.
- Data preprocessing, outlier handling, and proper evaluation are crucial.
- Future improvements could include time series modeling (ARIMA, LSTM), using macroeconomic features, or deploying via a web interface.