In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('../../datasets/processed_data/combined_features/ELGIEQUIP.BO.csv')

In [3]:
df.columns

Index(['date', 'open', 'close', 'adj close', 'volume', 'low', 'high',
       'volume_adi', 'volume_obv', 'volume_cmf',
       ...
       'inr=x_percent_change', 'cl=f_percent_change',
       'treasury_yeild_10_years_percent_change', 'usdx-index_percent_change',
       '^nsei_percent_change', '^bsesn_percent_change', '^gspc_percent_change',
       'hsi_percent_change', 'sha_percent_change', '^sti_percent_change'],
      dtype='object', length=635)

In [4]:
def create_custom_target(df: pd.DataFrame) -> pd.DataFrame:
    """
    This function creates the custom target price, which is computed as ln(high/yesterday_close)
    """    
    # make a copy of the dataframe so as not to change the original dataframe
    data_df = df.copy()
    
    # create yesterday_close feature as
    data_df['yesterday_close'] = data_df['close'].shift(1)
    
    # create custom target price to predict, computing  ln(high/yesterday_close)
    data_df['ln_target'] = np.log(data_df['high'] / data_df['yesterday_close'])
    
    # as yesterday close would not be available for first day, 
    # we would not have custom target price for that day, which needs to be excluded 
    return(data_df.iloc[1:, ])

In [5]:
df = create_custom_target(df)

In [6]:
df['yesterday_close']

1        13.562500
2        13.225000
3        13.175000
4        12.812500
5        12.787500
           ...    
3745    571.549988
3746    571.549988
3747    556.349976
3748    543.400024
3749    534.200012
Name: yesterday_close, Length: 3749, dtype: float64

In [7]:
df['ln_target'] 

1       0.021879
2       0.016870
3       0.022515
4       0.014528
5      -0.004900
          ...   
3745    0.019835
3746    0.020350
3747    0.011971
3748    0.012073
3749    0.027328
Name: ln_target, Length: 3749, dtype: float64

In [8]:
# Drop cols with NaN values
df = df.dropna(axis=1, how='all')
df.fillna(method='ffill', inplace=True)

In [9]:
combined_date_df = df['date']
train_date, test_date = train_test_split(combined_date_df, train_size=0.8, shuffle=False)

In [10]:
df_without_date = df.drop(columns=["date"])

In [11]:
df_without_date.dropna(inplace=True)

In [12]:
df_without_date

Unnamed: 0,open,close,adj close,volume,low,high,volume_adi,volume_obv,volume_cmf,volume_fi,...,treasury_yeild_10_years_percent_change,usdx-index_percent_change,^nsei_percent_change,^bsesn_percent_change,^gspc_percent_change,hsi_percent_change,sha_percent_change,^sti_percent_change,yesterday_close,ln_target
1,13.162500,13.225000,9.042980,47944,12.925000,13.862500,-3.054382e+04,-34660,-0.292259,-4.452554e+03,...,-0.018383,0.004251,-0.026796,-0.021467,-0.010504,0.015385,-0.011495,0.000000,13.562500,0.021879
2,13.325000,13.175000,9.008792,8928,13.125000,13.450000,-3.672473e+04,-43588,-0.292259,-4.452554e+03,...,-0.018383,0.004251,-0.005000,-0.006264,-0.005788,0.015385,-0.011495,0.000000,13.225000,0.016870
3,13.000000,12.812500,8.760922,61976,12.500000,13.475000,-5.897254e+04,-105564,-0.292259,-4.452554e+03,...,0.010775,0.002458,-0.027630,-0.028051,-0.000327,0.015385,-0.011495,0.000000,13.175000,0.022515
4,12.887500,12.787500,8.743828,24064,12.750000,13.000000,-7.581727e+04,-129628,-0.292259,-4.452554e+03,...,0.021574,-0.005176,0.019921,0.016431,0.019496,0.015385,-0.011495,0.000000,12.812500,0.014528
5,12.500000,12.650000,8.649808,36556,12.187500,12.725000,-4.946304e+04,-166184,-0.292259,-4.452554e+03,...,0.021574,-0.005176,-0.010509,-0.012527,-0.030889,0.015385,-0.011495,0.000000,12.787500,-0.004900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3745,535.000000,571.549988,571.549988,136540,531.400024,583.000000,1.677814e+06,75230258,0.297408,3.101130e+06,...,-0.005647,0.002810,0.001835,0.000292,-0.011222,0.181818,0.000000,-0.002427,571.549988,0.019835
3746,567.950012,556.349976,556.349976,70707,552.049988,583.299988,1.626566e+06,75159551,0.255305,2.504576e+06,...,0.005679,0.003865,-0.003412,-0.003356,-0.007319,0.181818,0.000000,-0.003102,571.549988,0.020350
3747,558.849976,543.400024,543.400024,22764,540.349976,563.049988,1.609919e+06,75136787,0.240868,2.104666e+06,...,0.025545,0.003080,0.001955,0.001600,0.008758,0.181818,0.000000,-0.000415,556.349976,0.011971
3748,550.000000,534.200012,534.200012,100755,531.000000,550.000000,1.543103e+06,75036032,0.156200,1.671578e+06,...,0.025545,0.003080,0.005368,0.005515,0.013049,0.181818,0.000000,-0.000279,543.400024,0.012073


In [13]:
def timeseries_to_supervise(df, window_size, target):    
    X = []
    y = []
    indx = []
    no_records = len(df)
    
    for i in range(window_size, no_records):
        X.append(df.iloc[i-window_size:i].drop(target, axis=1).values.flatten())  # Collect past records as a sequence
        y.append(df.iloc[i][target])  # Next record as target variable
        indx.append(np.arange(i-window_size, i))

    X = pd.DataFrame(X)
    y = pd.Series(y)
    return(X, y, indx)

In [14]:
window_size = 10

In [15]:
# do train/test split the data with shuffle = False
train_data, test_data = train_test_split(df_without_date, train_size=0.8, shuffle=False)

# Identify the indices of the specified sentiment columns in the original dataset
sentiment_columns = ['agg_polarity', 'agg_compound', 'topic_polarity', 'topic_compound', 'ticker_polarity', 'ticker_compound']
column_indices = [train_data.columns.get_loc(col) for col in sentiment_columns if col in train_data.columns]

column_indices

[616, 618, 619, 621, 623, 622]

In [16]:
# convert timeseries to be used in supervise learning model
X_train, y_train, indx_train = timeseries_to_supervise(train_data, window_size, 'ln_target')  

# convert timeseries to be used in supervise learning model    
X_test, y_test, indx_test = timeseries_to_supervise(test_data, window_size, 'ln_target')  

In [17]:
# Use SelectKBest to select the top 44 features
selector = SelectKBest(score_func=f_regression, k=44)
X_train_44 = selector.fit_transform(X_train, y_train)
X_test_44 = selector.transform(X_test)

# Determine if the specified columns are among the top 44 features
selected_indices = selector.get_support(indices=True)
missing_indices = [idx for idx in column_indices if idx not in selected_indices]

# Add the specified columns if they are not among the top 44 features
for idx in missing_indices:
    extracted_column_train = X_train.iloc[:, idx].values.reshape(-1, 1)
    extracted_column_test = X_test.iloc[:, idx].values.reshape(-1, 1)
    X_train_44 = np.hstack([X_train_44, extracted_column_train])
    X_test_44 = np.hstack([X_test_44, extracted_column_test])

# If some of the specified columns were already in the top 44, 
# select additional top features to make the total count 50
remaining_indices = [i for i in range(X_train.shape[1]) if i not in selected_indices and i not in missing_indices]
num_additional_features_needed = 50 - X_train_44.shape[1]

if num_additional_features_needed > 0:
    additional_selector = SelectKBest(score_func=f_regression, k=num_additional_features_needed)
    additional_selector.fit(X_train.iloc[:, remaining_indices], y_train)
    X_train_additional = additional_selector.transform(X_train.iloc[:, remaining_indices])
    X_test_additional = additional_selector.transform(X_test.iloc[:, remaining_indices])

    X_train_44 = np.hstack([X_train_44, X_train_additional])
    X_test_44 = np.hstack([X_test_44, X_test_additional])

X_train_50 = X_train_44
X_test_50 = X_test_44

In [19]:
linear_reg = LinearRegression()

param_grid = {
    'fit_intercept': [True, False],         
}

grid_search = GridSearchCV(linear_reg, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train_50, y_train)

print("Best parameters: ", grid_search.best_params_)
print("Best cross-validation score: ", -grid_search.best_score_)

model_50 = grid_search.best_estimator_
model_50.fit(X_train_50, y_train)

Best parameters:  {'fit_intercept': True}
Best cross-validation score:  0.48590803164611557


In [21]:
def convert_custom_target_to_actual_for_supervise(df: pd.DataFrame, window: int, y: "pd.Series[int]") -> "pd.Series[int]":
    """
    this module converts custom target - ln(high/yesterday_close) to actual high price again for timeseries converted data using rolling         window of size 10
    """
    data_df = df.copy()
    
    # exclude first 10 rows of train/test data, as while us
    
    y = np.exp(y) * data_df.loc[data_df.index[window:], 'yesterday_close'].reset_index(drop=True)
    return(y)    

In [22]:
def evaluate_model(model, window, test_data, test_date, X_test, y_test):
    
    # do target prediction using the provide model
    y_pred = model.predict(X_test)

    # convert back to original value, before computing mape            
    y_test = convert_custom_target_to_actual_for_supervise(test_data, window, y_test)
    y_pred = convert_custom_target_to_actual_for_supervise(test_data, window, y_pred)

    test_dates = test_date[window:].reset_index(drop=True)
    predictions_df = pd.DataFrame({'date': test_dates, 'y_test': y_test, 'y_pred': y_pred})

    # compute regression metric - mape 
    mape = mean_absolute_percentage_error(y_test, y_pred)

    # compute rmse metric
    rmse = mean_squared_error(y_test, y_pred, squared=False)        
    return(predictions_df, mape, rmse)

In [23]:
predictions_df, mape, rmse = evaluate_model(model_50, window_size, test_data, test_date, X_test_50, y_test)

In [24]:
predictions_df, mape, rmse

(           date      y_test      y_pred
 0    2020-08-25  114.199997  118.476197
 1    2020-08-26  113.900002  116.425709
 2    2020-08-27  115.000000  117.288799
 3    2020-08-28  115.675003  117.563348
 4    2020-08-31  115.000000  116.637782
 ..          ...         ...         ...
 735  2023-05-24  583.000000  590.532377
 736  2023-05-25  583.299988  585.627757
 737  2023-05-26  563.049988  570.592115
 738  2023-05-30  550.000000  555.494707
 739  2023-05-31  549.000000  546.712044
 
 [740 rows x 3 columns],
 0.01776715649954395,
 9.127031651982774)

In [25]:
print(f"MAPE for model with top 50 features: {mape*100:.2f}%")

MAPE for model with top 50 features: 1.78%


In [26]:
predictions_df.to_csv("lr_ELGIEQUIP.csv", index=False)

In [30]:
feature_names = df.columns.tolist()

# Get coefficients from the model
coefficients = model_50.coef_

# Map coefficients to corresponding feature names
feature_coefficient_mapping = dict(zip(feature_names, coefficients))

# Sort by absolute coefficient values (magnitude)
sorted_features = sorted(feature_coefficient_mapping.items(), key=lambda x: abs(x[1]), reverse=True)

# Display top 10
for feature, coef in sorted_features[:10]:
    print(f"{feature}: {coef}")



trend_ema_slow: 0.06368064807631246
trend_vortex_ind_neg: -0.018076149464476485
trend_trix: 0.010008167654171273
trend_macd: 0.009873975082841803
trend_vortex_ind_diff: 0.007892039305151042
trend_mass_index: -0.005963893231875655
volatility_dcm: -0.005526617657026831
trend_ema_fast: -0.004669985961057201
trend_vortex_ind_pos: -0.004014878324984067
volatility_dcp: -0.0029955864735173826


In [31]:
import altair as alt

alt.themes.enable('fivethirtyeight')

predictions_df['date'] = pd.to_datetime(predictions_df['date'])

predictions_df['label'] = 'Actual'
predictions_df['predicted_label'] = 'Predicted'

# Actual high price line
line1 = alt.Chart(predictions_df).mark_line(strokeWidth=2).encode(
    x='date:T',
    y=alt.Y('y_test:Q', title='Price', scale=alt.Scale(zero=False)),
    color=alt.Color('label:N', legend=alt.Legend(title="Line Type")),
    tooltip=['date', 'y_test', 'y_pred']
)

# Predicted high price line
line2 = alt.Chart(predictions_df).mark_line(strokeWidth=1, strokeDash=[3, 3]).encode(
    x='date:T',
    y=alt.Y('y_pred:Q', title='', scale=alt.Scale(zero=False)),
    color=alt.Color('predicted_label:N', legend=alt.Legend(title="Line Type")),
    tooltip=['date', 'y_test', 'y_pred']
)

# Combine the two lines
chart = alt.layer(line1, line2).properties(
    title='Actual vs Predicted High Prices',
    width=650,
    height=400
).interactive()

chart

Repeat the process for the top 50 features without sentiment of ELGIEQUIP.BO

In [32]:
df = pd.read_csv('../../datasets/processed_data/combined_features/ELGIEQUIP.BO.csv')

In [33]:
df_without_sentiment = df.drop(columns=['agg_polarity', 'agg_compound', 'topic_polarity', 'topic_compound', 'ticker_polarity', 'ticker_compound'])

In [34]:
df_without_sentiment = create_custom_target(df_without_sentiment)

In [35]:
# Drop cols with NaN values
df_without_sentiment = df_without_sentiment.dropna(axis=1, how='all')
df_without_sentiment.fillna(method='ffill', inplace=True)

In [36]:
combined_date_df = df_without_sentiment['date']
train_date, test_date = train_test_split(combined_date_df, train_size=0.8, shuffle=False)

In [37]:
df_without_date = df_without_sentiment.drop(columns=["date"])

In [38]:
df_without_date.dropna(inplace=True)

In [39]:
# do train/test split the data with shuffle = False
train_data, test_data = train_test_split(df_without_date, train_size=0.8, shuffle=False)

# convert timeseries to be used in supervise learning model
X_train, y_train, indx_train = timeseries_to_supervise(train_data, window_size, 'ln_target')

# convert timeseries to be used in supervise learning model    
X_test, y_test, indx_test = timeseries_to_supervise(test_data, window_size, 'ln_target')  

In [40]:
selector = SelectKBest(score_func=f_regression, k=50)
X_train_50 = selector.fit_transform(X_train, y_train)
X_test_50 = selector.transform(X_test)

In [41]:
linear_reg = LinearRegression()

param_grid = {
    'fit_intercept': [True, False],         
}

grid_search = GridSearchCV(linear_reg, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train_50, y_train)

print("Best parameters: ", grid_search.best_params_)
print("Best cross-validation score: ", -grid_search.best_score_)

model_50 = grid_search.best_estimator_
model_50.fit(X_train_50, y_train)

Best parameters:  {'fit_intercept': False}
Best cross-validation score:  9.325578841168264


In [42]:
predictions_df, mape, rmse = evaluate_model(model_50, window_size, test_data, test_date, X_test_50, y_test)

In [43]:
predictions_df, mape, rmse

(           date      y_test      y_pred
 0    2020-08-25  114.199997  118.301923
 1    2020-08-26  113.900002  116.425265
 2    2020-08-27  115.000000  117.306102
 3    2020-08-28  115.675003  117.465682
 4    2020-08-31  115.000000  116.466288
 ..          ...         ...         ...
 735  2023-05-24  583.000000  587.986767
 736  2023-05-25  583.299988  583.828207
 737  2023-05-26  563.049988  568.827425
 738  2023-05-30  550.000000  552.306251
 739  2023-05-31  549.000000  544.132625
 
 [740 rows x 3 columns],
 0.017562207052635306,
 9.13656705535197)

In [44]:
df.to_csv('lr_ELGIEQUIP_without_sentiment.csv', index=False)

In [45]:
print(f"MAPE for model with top 50 features without sentiment features: {mape*100:.2f}%")

MAPE for model with top 50 features without sentiment features: 1.76%


In [46]:
feature_names = df.columns.tolist()

# Get coefficients from the model
coefficients = model_50.coef_

# Map coefficients to corresponding feature names
feature_coefficient_mapping = dict(zip(feature_names, coefficients))

# Sort by absolute coefficient values (magnitude)
sorted_features = sorted(feature_coefficient_mapping.items(), key=lambda x: abs(x[1]), reverse=True)

# Display top 10
for feature, coef in sorted_features[:10]:
    print(f"{feature}: {coef}")

trend_ema_slow: 0.007290970905299798
trend_macd_signal: -0.005011286821535551
trend_mass_index: -0.003271710309079676
trend_vortex_ind_diff: 0.003203415969423394
trend_sma_fast: -0.00288273802255256
trend_sma_slow: -0.0024702843962163337
volatility_kchi: 0.0016549118991988556
trend_vortex_ind_neg: 0.0015257915055808555
volatility_atr: 0.0014692584328491758
volatility_bbm: -0.0011322548529605486


In [47]:
import altair as alt

alt.themes.enable('fivethirtyeight')

predictions_df['date'] = pd.to_datetime(predictions_df['date'])

predictions_df['label'] = 'Actual'
predictions_df['predicted_label'] = 'Predicted'

# Actual high price line
line1 = alt.Chart(predictions_df).mark_line(strokeWidth=2).encode(
    x='date:T',
    y=alt.Y('y_test:Q', title='Price', scale=alt.Scale(zero=False)),
    color=alt.Color('label:N', legend=alt.Legend(title="Line Type")),
    tooltip=['date', 'y_test', 'y_pred']
)

# Predicted high price line
line2 = alt.Chart(predictions_df).mark_line(strokeWidth=1, strokeDash=[3, 3]).encode(
    x='date:T',
    y=alt.Y('y_pred:Q', title='', scale=alt.Scale(zero=False)),
    color=alt.Color('predicted_label:N', legend=alt.Legend(title="Line Type")),
    tooltip=['date', 'y_test', 'y_pred']
)

# Combine the two lines
chart = alt.layer(line1, line2).properties(
    title='Actual vs Predicted High Prices',
    width=650,
    height=400
).interactive()

chart