Feature Engineering and Dataload

In [33]:
import pandas as pd

# # Load the datasets
# bbca_df = pd.read_csv('BBCA_clean.csv')
# bbni_df = pd.read_csv('BBNI_clean.csv')
# bbri_df = pd.read_csv('BBRI_clean.csv')
# bmri_df = pd.read_csv('BMRI_clean.csv')

# # Feature engineering functions
# def feature_engineering(df):
#     df['H-L'] = df['High'] - df['Low']
#     df['O-C'] = df['Close'] - df['Open']
#     df['7 DAYS MA'] = df['Close'].rolling(window=7).mean()
#     df['14 DAYS MA'] = df['Close'].rolling(window=14).mean()
#     df['21 DAYS MA'] = df['Close'].rolling(window=21).mean()
#     df['7 DAYS STD DEV'] = df['Close'].rolling(window=7).std()
#     return df

# # Apply feature engineering to each dataframe
# bbca_df = feature_engineering(bbca_df)
# bbni_df = feature_engineering(bbni_df)
# bbri_df = feature_engineering(bbri_df)
# bmri_df = feature_engineering(bmri_df)

# # Save the modified datasets
# bbca_df.to_csv('BBCA_clean_modified.csv', index=False)
# bbni_df.to_csv('BBNI_clean_modified.csv', index=False)
# bbri_df.to_csv('BBRI_clean_modified.csv', index=False)
# bmri_df.to_csv('BMRI_clean_modified.csv', index=False)

In [34]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.graph_objects as go
import pandas as pd

In [35]:
bbca = pd.read_csv('BBCA_clean_modified.csv')
bbni = pd.read_csv('BBNI_clean_modified.csv')
bbri = pd.read_csv('BBRI_clean_modified.csv')
bmri = pd.read_csv('BMRI_clean_modified.csv')

In [36]:
bbca.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'H-L',
       'O-C', '7 DAYS MA', '14 DAYS MA', '21 DAYS MA', '7 DAYS STD DEV'],
      dtype='object')

In [37]:
bbca.head(21)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,H-L,O-C,7 DAYS MA,14 DAYS MA,21 DAYS MA,7 DAYS STD DEV
0,2019-01-02,3700.0,3700.0,3625.0,3662.5,2965.774658,45319200,75.0,-37.5,,,,
1,2019-01-03,3637.5,3675.0,3550.0,3587.5,2905.042236,74174000,125.0,-50.0,,,,
2,2019-01-04,3587.5,3725.0,3575.0,3725.0,3016.38501,78514000,150.0,137.5,,,,
3,2019-01-07,3750.0,3787.5,3712.5,3712.5,3006.263184,53624800,75.0,-37.5,,,,
4,2019-01-08,3712.5,3737.5,3675.0,3675.0,2975.896729,80073200,62.5,-37.5,,,,
5,2019-01-09,3750.0,3787.5,3700.0,3787.5,3066.995605,104903400,87.5,37.5,,,,
6,2019-01-10,3812.5,3850.0,3775.0,3850.0,3117.606201,130603200,75.0,37.5,3714.285714,,,85.82582
7,2019-01-11,3875.0,3900.0,3837.5,3850.0,3117.606201,88899400,62.5,-25.0,3741.071429,,,95.664966
8,2019-01-14,3875.0,3875.0,3825.0,3850.0,3117.606201,71320000,50.0,-25.0,3778.571429,,,74.552235
9,2019-01-15,3875.0,3900.0,3825.0,3900.0,3158.094727,86697200,75.0,25.0,3803.571429,,,82.510822


Perlu Start dari hari ke 21 karena limitasi dari feature engineering

In [38]:
features=['Open', 'High', 'Low', 'Volume', 'H-L', 'O-C', '7 DAYS MA', '14 DAYS MA', '21 DAYS MA', '7 DAYS STD DEV']
len(features)

10

In [39]:
def preprocess_data(df):
    df = df.dropna().reset_index(drop=True)
    features = df[['Open', 'High', 'Low', 'Volume', 'H-L', 'O-C', '7 DAYS MA', '14 DAYS MA', '21 DAYS MA', '7 DAYS STD DEV']]
    target = df['Close']
    scaler = StandardScaler()
    features = scaler.fit_transform(features)
    return train_test_split(features, target, test_size=0.2, random_state=42), df['Date']

In [40]:
def visualize_model_performance(dates, y_train, y_test, y_pred, title):
    # Prepare the date ranges
    train_dates = dates[:len(y_train)] if y_train is not None else []
    test_dates = dates[len(train_dates):len(train_dates) + len(y_test)]

    # Create the figure
    fig = go.Figure()

    # Add traces for actual values and predictions
    fig.add_trace(go.Scatter(x=train_dates, y=y_train, mode='markers', name='Train Actual', marker=dict(color='gray')))
    fig.add_trace(go.Scatter(x=test_dates, y=y_test, mode='markers', name='Validation Actual', marker=dict(color='red')))
    fig.add_trace(go.Scatter(x=dates, y=y_pred, mode='lines', name='Predictions', line=dict(color='orange')))

    # Update layout
    fig.update_layout(
        title=title,
        xaxis=dict(
            rangeslider=dict(
                visible=True
            ),
            type='date'
        ),
        yaxis=dict(title='Close Price IDR')
    )

    # Show the plot
    fig.show()

In [41]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)
    
    train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
    test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
    
    train_mae = mean_absolute_error(y_train, train_predictions)
    test_mae = mean_absolute_error(y_test, test_predictions)
    
    all_predictions = np.concatenate([train_predictions, test_predictions])
    
    return train_rmse, test_rmse, train_mae, test_mae, all_predictions

In [42]:
def process_dataset(df, dataset_name):
    (X_train, X_test, y_train, y_test), dates = preprocess_data(df)

    # Determine number of components for PCA to retain 95% of variance
    pca = PCA(n_components=0.95)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    
    # Train and evaluate Random Forest Regressor for PCA
    rf_model_pca = RandomForestRegressor(random_state=42)
    rf_train_rmse_pca, rf_test_rmse_pca, rf_train_mae_pca, rf_test_mae_pca, rf_predictions_pca = evaluate_model(rf_model_pca, X_train_pca, X_test_pca, y_train, y_test)
    
    # Train and evaluate Linear Regression for PCA
    lr_model_pca = LinearRegression()
    lr_train_rmse_pca, lr_test_rmse_pca, lr_train_mae_pca, lr_test_mae_pca, lr_predictions_pca = evaluate_model(lr_model_pca, X_train_pca, X_test_pca, y_train, y_test)
    
    # Concatenate train and test sets for actual values
    y_true = np.concatenate([y_train, y_test])
    
    # Visualize model performance for PCA
    visualize_model_performance(dates, y_train, y_test, rf_predictions_pca, f'Random Forest Model with PCA for {dataset_name}')
    visualize_model_performance(dates, y_train, y_test, lr_predictions_pca, f'Linear Regression Model with PCA for {dataset_name}')
    
    return {
        'Dataset': dataset_name,
        'RF_Train_RMSE_PCA': rf_train_rmse_pca,
        'RF_Test_RMSE_PCA': rf_test_rmse_pca,
        'RF_Train_MAE_PCA': rf_train_mae_pca,
        'RF_Test_MAE_PCA': rf_test_mae_pca,
        'LR_Train_RMSE_PCA': lr_train_rmse_pca,
        'LR_Test_RMSE_PCA': lr_test_rmse_pca,
        'LR_Train_MAE_PCA': lr_train_mae_pca,
        'LR_Test_MAE_PCA': lr_test_mae_pca,
    }

In [43]:
# Result all dataset
results = []

Results Both LR and RF

BBCA

In [44]:
BBCARES=process_dataset(bbca, 'BBCA')

In [45]:
results.append(BBCARES)
pd.DataFrame(BBCARES, index=[0])

Unnamed: 0,Dataset,RF_Train_RMSE_PCA,RF_Test_RMSE_PCA,RF_Train_MAE_PCA,RF_Test_MAE_PCA,LR_Train_RMSE_PCA,LR_Test_RMSE_PCA,LR_Train_MAE_PCA,LR_Test_MAE_PCA
0,BBCA,24.368147,69.989954,17.075382,48.567683,61.825945,65.438642,45.863238,48.803874


BBNI

In [46]:
BBNIRES=process_dataset(bbni, 'BBNI')

In [47]:
results.append(BBNIRES)
pd.DataFrame(BBNIRES, index=[0])

Unnamed: 0,Dataset,RF_Train_RMSE_PCA,RF_Test_RMSE_PCA,RF_Train_MAE_PCA,RF_Test_MAE_PCA,LR_Train_RMSE_PCA,LR_Test_RMSE_PCA,LR_Train_MAE_PCA,LR_Test_MAE_PCA
0,BBNI,29.04154,80.903199,20.263635,56.573205,66.072413,65.239914,48.328434,48.781701


BBRI

In [48]:
BBRIRES=process_dataset(bbri, 'BBRI')

In [49]:
results.append(BBRIRES)
pd.DataFrame(BBRIRES, index=[0])

Unnamed: 0,Dataset,RF_Train_RMSE_PCA,RF_Test_RMSE_PCA,RF_Train_MAE_PCA,RF_Test_MAE_PCA,LR_Train_RMSE_PCA,LR_Test_RMSE_PCA,LR_Train_MAE_PCA,LR_Test_MAE_PCA
0,BBRI,32.756793,91.824529,22.368578,62.486992,79.069157,80.152601,57.919219,60.04627


BMRI

In [50]:
BMRIRES=process_dataset(bmri, 'BMRI')

In [51]:
results.append(BMRIRES)
pd.DataFrame(BMRIRES, index=[0])

Unnamed: 0,Dataset,RF_Train_RMSE_PCA,RF_Test_RMSE_PCA,RF_Train_MAE_PCA,RF_Test_MAE_PCA,LR_Train_RMSE_PCA,LR_Test_RMSE_PCA,LR_Train_MAE_PCA,LR_Test_MAE_PCA
0,BMRI,33.284662,90.45639,23.355714,65.855488,81.848405,84.69792,59.744225,64.344935


Overall Evaluation Metrics

In [52]:
resultsall=pd.DataFrame(results)
resultsall

Unnamed: 0,Dataset,RF_Train_RMSE_PCA,RF_Test_RMSE_PCA,RF_Train_MAE_PCA,RF_Test_MAE_PCA,LR_Train_RMSE_PCA,LR_Test_RMSE_PCA,LR_Train_MAE_PCA,LR_Test_MAE_PCA
0,BBCA,24.368147,69.989954,17.075382,48.567683,61.825945,65.438642,45.863238,48.803874
1,BBNI,29.04154,80.903199,20.263635,56.573205,66.072413,65.239914,48.328434,48.781701
2,BBRI,32.756793,91.824529,22.368578,62.486992,79.069157,80.152601,57.919219,60.04627
3,BMRI,33.284662,90.45639,23.355714,65.855488,81.848405,84.69792,59.744225,64.344935
