# Feature Engineering

In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os # Import the os library to handle file paths

# --- Cell 1: Load the Cleaned Data ---
# Absolute path to the cleaned CSV inside the project's 'processed' folder.
# NOTE(review): this path is machine-specific; consider deriving it from a
# configurable data directory so the notebook runs on other machines.
cleaned_file_path = r"C:\Users\DELL\bootcamp_Dhriti_Trivedi\Project\Data\processed\yahoo_data_cleaned.csv"

try:
    # Use the 'Date' column as the index and ask pandas to parse it as dates.
    df = pd.read_csv(cleaned_file_path, index_col='Date', parse_dates=True)
except FileNotFoundError:
    print(f"Error: The file was not found at the path: {cleaned_file_path}")
    # Re-raise instead of calling exit(): exit() would terminate the
    # interpreter/kernel and hide the traceback, whereas re-raising stops
    # this cell with the original error and keeps the session alive.
    raise
else:
    # Only reached when the read succeeded.
    print("Cleaned dataset loaded successfully!")


Cleaned dataset loaded successfully!


In [10]:
# --- Cell 2: Calculate Daily Returns ---
# Day-over-day change of the adjusted close ('Adj Close'), scaled from a
# fraction to a percentage. The first row has no prior row to compare
# against, so its value is NaN.
df['Daily_Return'] = 100 * df['Adj Close'].pct_change()
print("\n--- Calculated Daily Returns (%) ---")
# Preview only the source column and the derived column side by side.
print(df.loc[:, ['Adj Close', 'Daily_Return']].head())


--- Calculated Daily Returns (%) ---
            Adj Close  Daily_Return
Date                               
2018-05-01   24099.05           NaN
2018-05-02   23924.98     -0.722311
2018-05-03   23930.15      0.021609
2018-05-04   24262.51      1.388876
2018-05-07   24357.32      0.390767


In [12]:
# --- Cell 3: Calculate Volume Change ---
# Row-over-row change in 'Volume', scaled from a fraction to a percentage.
# The first row has no prior row to compare against, so its value is NaN.
df['Volume_Change'] = 100 * df['Volume'].pct_change()
print("\n--- Calculated Volume Change (%) ---")
# Preview the raw volume next to its percentage change.
print(df.loc[:, ['Volume', 'Volume_Change']].head())


--- Calculated Volume Change (%) ---
               Volume  Volume_Change
Date                                
2018-05-01  380070000            NaN
2018-05-02  385350000       1.389218
2018-05-03  389240000       1.009472
2018-05-04  329480000     -15.352996
2018-05-07  307670000      -6.619522


In [14]:
# --- Cell 4: Create Lagged Features ---
# Shift 'Volume_Change' down by one row so each row carries the previous
# row's volume change; this lets a model relate yesterday's volume change
# to today's return. The shift introduces one more leading NaN.
df['Volume_Change_Lag1'] = df['Volume_Change'].shift(periods=1)
print("\n--- Created Lagged Volume Change ---")
# Show the original and lagged columns together to verify the offset.
print(df.loc[:, ['Volume_Change', 'Volume_Change_Lag1']].head())


--- Created Lagged Volume Change ---
            Volume_Change  Volume_Change_Lag1
Date                                         
2018-05-01            NaN                 NaN
2018-05-02       1.389218                 NaN
2018-05-03       1.009472            1.389218
2018-05-04     -15.352996            1.009472
2018-05-07      -6.619522          -15.352996


In [16]:
# --- Cell 5: Finalize the Feature Set ---
# Build the modeling frame by discarding every row that has a NaN in any
# column. The pct_change and shift steps leave NaNs at the start of the
# series, so the leading rows are removed here; `df` itself is untouched
# because dropna returns a new frame.
df_model = df.dropna(axis=0, how='any')
print("\n--- Final Dataset for Modeling ---")
print(df_model.head(5))



--- Final Dataset for Modeling ---
                Open      High       Low     Close  Adj Close     Volume  \
Date                                                                       
2018-05-03  23836.23  23996.15  23531.31  23930.15   23930.15  389240000   
2018-05-04  23865.22  24333.35  23778.87  24262.51   24262.51  329480000   
2018-05-07  24317.66  24479.45  24263.42  24357.32   24357.32  307670000   
2018-05-08  24341.35  24412.34  24198.34  24360.21   24360.21  344940000   
2018-05-09  24399.18  24586.48  24323.87  24542.54   24542.54  361580000   

            Daily_Return  Volume_Change  Volume_Change_Lag1  
Date                                                         
2018-05-03      0.021609       1.009472            1.389218  
2018-05-04      1.388876     -15.352996            1.009472  
2018-05-07      0.390767      -6.619522          -15.352996  
2018-05-08      0.011865      12.113628           -6.619522  
2018-05-09      0.748475       4.824027           12.113628

In [24]:
# --- Save the Final Engineered Features ---
# Destination directory for the engineered-feature CSV.
# NOTE(review): machine-specific absolute path; confirm before running elsewhere.
output_folder = r"C:\Users\DELL\bootcamp_Dhriti_Trivedi\Project\Data\processed"
# Full path of the output file inside that directory.
output_file_path = os.path.join(output_folder, 'yahoo_data_features.csv')

# Write the modeling frame (index included) to the CSV at that path.
df_model.to_csv(path_or_buf=output_file_path)
print(f"\n Dataset with engineered features saved to: {output_file_path}")


 Dataset with engineered features saved to: C:\Users\DELL\bootcamp_Dhriti_Trivedi\Project\Data\processed\yahoo_data_features.csv
