In [398]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.subplots as sp
import yfinance as yf
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

In [399]:
# Yahoo Finance symbol analysed throughout this notebook.
ticker_symbol = "DX-Y.NYB"
ticker = yf.Ticker(ticker_symbol)

# Fetch the "5y" history, turn the index into a regular column and discard the
# columns that are not used in the rest of the notebook.
unused_columns = ['Volume', 'Dividends', 'Stock Splits']
history = ticker.history(period="5y")
df = history.reset_index(drop=False).drop(columns=unused_columns)
df

Unnamed: 0,Date,Open,High,Low,Close
0,2020-11-05 00:00:00-05:00,93.339996,93.550003,92.489998,92.529999
1,2020-11-06 00:00:00-05:00,92.570000,92.820000,92.180000,92.230003
2,2020-11-09 00:00:00-05:00,92.239998,92.959999,92.129997,92.730003
3,2020-11-10 00:00:00-05:00,92.760002,92.970001,92.599998,92.750000
4,2020-11-11 00:00:00-05:00,92.699997,93.209999,92.610001,93.040001
...,...,...,...,...,...
1252,2025-10-29 00:00:00-04:00,98.739998,99.360001,98.620003,99.220001
1253,2025-10-30 00:00:00-04:00,99.139999,99.720001,98.919998,99.529999
1254,2025-10-31 00:00:00-04:00,99.489998,99.839996,99.419998,99.800003
1255,2025-11-03 00:00:00-05:00,99.750000,99.989998,99.709999,99.870003


# Question 1

In [400]:

# Plot the four daily price series against the date on a single chart.
# The "variable" key titles the legend, which lists the series names
# (Close / High / Low / Open), so it is labelled "Series" rather than "Date".
fig = px.line(
    df,
    x="Date",
    y=["Close", "High", "Low", "Open"],
    labels={"value": "USD Price", "variable": "Series"},
    title="Daily Open / High / Low / Close",
)
fig.show()

# Question 2

The time series are non-stationary because the price level does not fluctuate around a constant central value. It moves from ~90 in 2021 to ~110 in 2022, drops to ~105 in 2023, and then fluctuates between ~100 and ~110 in 2024-2025. There is also an upward trend in 2022, and the variance changes over time: it is relatively stable in 2023 but highly volatile in 2022 and 2025.

# Question 3

Training a regressor on these non-stationary time series would cause problems because the model assumes data stay relatively constant over time. Our price moves from ~90 to ~115 to ~100, with high volatility in 2022 and low volatility in 2023. A model trained on 2021-2022 upward trend would wrongly predict prices keep rising and fail completely when they drop in late 2022. The model would also treat temporary events like the spike to 115 as recurring patterns when they might be one-time occurrences. If we train on volatile 2022, the model would probably expect large swings in calm 2023 that don't happen, while training on stable 2023 would leave it unprepared for 2025 volatility. The issue is that regression cannot handle the changing mean and variance in non-stationary data.

# Question 4

We notice that the price of the dollar dropped to a very low level during the same period in which Trump imposed tariffs, which caused financial worry about the economy.

# Question 5

No, we can't predict tomorrow's close price using today's open, high, low, and close. Today's open, high, low and close might be irrelevant tomorrow (e.g. due to tariffs) because the USD price is non-stationary and is affected by information that is not present in our dataset.

# Question 6

No, random train-test split doesn't make sense because time series data is about events that happen in sequence over time. If we randomly split the data, we would be using future data to predict the past, for example.

# Question 7

In [401]:
# Chronological split: rows dated before 2024-01-01 are used for training and
# the remaining rows for testing. 'Date' is dropped after sorting so that only
# the price columns remain.
train_df = (
    df.loc[df['Date'] < '2024-01-01']
    .sort_values(by='Date', ascending=True)
    .drop(columns=['Date'])
)
test_df = (
    df.loc[df['Date'] >= '2024-01-01']
    .sort_values(by='Date', ascending=True)
    .drop(columns=['Date'])
)

# Question 8

In [402]:
# Boolean mask marking which 'Close' values are missing.
df['Close'].isnull()

0       False
1       False
2       False
3       False
4       False
        ...  
1252    False
1253    False
1254    False
1255    False
1256    False
Name: Close, Length: 1257, dtype: bool

In [403]:
def map_to_inputs_targets_np_arrays(dataframe: pd.DataFrame, N: int = 1, target_col_name: str = 'Close') -> tuple[
    np.ndarray, np.ndarray]:
    """Turn a time-ordered frame into sliding-window inputs and next-row targets.

    Sample ``i`` consists of the ``N`` consecutive rows ``i .. i + N - 1`` (all
    columns of ``dataframe``); its target is the value of ``target_col_name`` on
    row ``i + N``, i.e. the row right after the window.

    Args:
        dataframe: Rows in chronological order. Every column is used as a feature.
        N: Window length in rows. Must be at least 1.
        target_col_name: Column whose value on the row following each window is
            the prediction target.

    Returns:
        A tuple ``(inputs, targets)`` where ``inputs`` has shape
        ``(len(dataframe) - N, N, n_columns)`` and ``targets`` has shape
        ``(len(dataframe) - N,)``. When the frame has ``N`` rows or fewer, both
        arrays are empty but ``inputs`` keeps its ``(0, N, n_columns)`` shape.

    Raises:
        ValueError: If ``N`` is smaller than 1.
    """
    if N < 1:
        raise ValueError(f"N must be a positive integer, got {N}")

    values = dataframe.to_numpy()
    # The last N rows have no following row to serve as a target, so they cannot
    # start a window. Clamp at zero for frames shorter than the window.
    num_samples = max(len(dataframe) - N, 0)

    # Row i of the index matrix is [i, i + 1, ..., i + N - 1]; indexing with it
    # gathers every window at once and yields shape (num_samples, N, n_columns).
    window_index = np.arange(num_samples)[:, None] + np.arange(N)[None, :]
    inputs = values[window_index]

    # Shifting up by N aligns position i with the value found on row i + N.
    targets = dataframe[target_col_name].shift(-N).to_numpy()[:num_samples]

    return inputs, targets

# Question 9

If we want to predict the next day we should choose N which is not too big (~20-~50), so the model discovers patterns for the near future based on a trend for example that happens in the current time period.
For predicting over a month we should choose a larger N (~60-~120) so patterns happening over months are taken into account.

# Question 10

In [404]:
# Window length: every sample is built from N consecutive rows.
N = 5

x_train_raw, y_train = map_to_inputs_targets_np_arrays(dataframe=train_df, N=N)
x_test_raw, y_test = map_to_inputs_targets_np_arrays(dataframe=test_df, N=N)

In [405]:
# Dimensions of the windowed test inputs.
np.shape(x_test_raw)

(460, 5, 4)

In [406]:
# Flatten each window into a single feature row per sample; the -1 lets numpy
# work out the second dimension automatically.
x_train = x_train_raw.reshape(len(x_train_raw), -1)
x_test = x_test_raw.reshape(len(x_test_raw), -1)

In [407]:
# Dimensions of the flattened training inputs.
np.shape(x_train)

(787, 20)

In [408]:
# Every training input row must have exactly one target.
assert x_train.shape[0] == y_train.shape[0]

In [409]:
# Every test input row must have exactly one target.
assert x_test.shape[0] == y_test.shape[0]

In [410]:
# First flattened training sample.
x_train[0, :]

array([93.33999634, 93.55000305, 92.48999786, 92.52999878, 92.56999969,
       92.81999969, 92.18000031, 92.23000336, 92.23999786, 92.95999908,
       92.12999725, 92.73000336, 92.76000214, 92.97000122, 92.59999847,
       92.75      , 92.69999695, 93.20999908, 92.61000061, 93.04000092])

In [411]:
# The same first sample before flattening: one row per day in the window.
x_train_raw[0, :, :]

array([[93.33999634, 93.55000305, 92.48999786, 92.52999878],
       [92.56999969, 92.81999969, 92.18000031, 92.23000336],
       [92.23999786, 92.95999908, 92.12999725, 92.73000336],
       [92.76000214, 92.97000122, 92.59999847, 92.75      ],
       [92.69999695, 93.20999908, 92.61000061, 93.04000092]])

In [412]:
# Check the whole first sample, not just its first value, so that a wrong
# flattening order would be detected as well.
assert np.array_equal(x_train[0], x_train_raw[0].ravel())

# Question 11

In [413]:
def q11(x_train: np.ndarray, y_train: np.ndarray, x_test: np.ndarray,
        y_test: np.ndarray) -> LinearRegression | RandomForestRegressor | BaggingRegressor | BayesianRidge | StackingRegressor:
    """Fit five regressors, plot their train/test MAE and return the best one.

    Args:
        x_train: Training inputs passed to each model's ``fit``.
        y_train: Training targets.
        x_test: Test inputs used to score each fitted model.
        y_test: Test targets.

    Returns:
        The fitted model with the lowest mean absolute error on the test data.
        NOTE(review): the winner is chosen on the test set, so its test MAE is
        an optimistic estimate of performance on unseen data.

    Side effects:
        Displays a grouped bar chart of the train and test MAE of every model.
    """
    seed = 0
    models_dict = {
        'a: Linear Reg': LinearRegression(),
        'b: Random Forest': RandomForestRegressor(n_estimators=100, random_state=seed),
        'c: Bagging (LR)': BaggingRegressor(estimator=LinearRegression(), n_estimators=10, random_state=seed),
        'd: BLR': BayesianRidge(),
        'e: Stacking (LR+DT)': StackingRegressor(
            # The tree is seeded like the other randomized models so repeated
            # runs give the same scores.
            estimators=[('lr', LinearRegression()), ('dt', DecisionTreeRegressor(random_state=seed))],
            final_estimator=LinearRegression()
        )
    }

    results = []

    best_model = None
    best_test_mae = float('inf')
    for name, model in models_dict.items():
        model.fit(x_train, y_train)

        mae_train = mean_absolute_error(y_train, model.predict(x_train))
        mae_test = mean_absolute_error(y_test, model.predict(x_test))
        # The first model is always accepted; later ones must strictly improve.
        if best_model is None or mae_test < best_test_mae:
            best_model = model
            best_test_mae = mae_test

        results.append({'Model': name, 'MAE': mae_train, 'Dataset': 'Train'})
        results.append({'Model': name, 'MAE': mae_test, 'Dataset': 'Test'})

    df_results = pd.DataFrame(results)

    fig = px.bar(
        df_results,
        x='Model',
        y='MAE',
        color='Dataset',
        barmode='group',
        color_discrete_map={'Train': 'blue', 'Test': 'green'},
        title='MAE by Model (Blue: Train, Green: Test)',
        labels={'MAE': 'Mean Absolute Error', 'Model': 'Regressor'},
        height=600
    )

    fig.update_layout(
        xaxis_title="Regressor",
        yaxis_title="MAE",
        legend_title="Dataset",
    )

    fig.show()

    return best_model


# Compare the models on the raw-price windows; the cell output is the returned model.
q11(x_train, y_train, x_test, y_test)

0,1,2
,max_iter,300
,tol,0.001
,alpha_1,1e-06
,alpha_2,1e-06
,lambda_1,1e-06
,lambda_2,1e-06
,alpha_init,
,lambda_init,
,compute_score,False
,fit_intercept,True


Linear Regression: Fits a straight line to predict the target from features.

Random Forest: Builds many decision trees on random data subsets (built with sampling) and averages their predictions.

Bagging (LR): Trains multiple Linear Regression models on random samples and averages results.

BLR: Estimates a probability distribution over the weights instead of single fixed values.

Stacking: Combines predictions from Linear Regression and Decision Tree, then trains a final model on them.

# Question 12

## a)

In [414]:
# Work on a separate frame so `df` stays untouched, and add the calendar month
# and day of month of each row as extra columns.
df12 = df.assign(
    Month=df['Date'].dt.month,
    Day=df['Date'].dt.day,
)


The month can improve predictions because many things repeat every year (like holidays or weather), and the model can learn these seasonal patterns. It helps the model know when to expect higher or lower values based on the time of year.

## b)

In [415]:
# Forward-looking log return: the value stored on row t is
# log(price[t + 1] / price[t]), so the final row has no successor and is NaN.
price_columns = ['Open', 'High', 'Low', 'Close']
for price_col in price_columns:
    next_price = df12[price_col].shift(-1)
    df12[f'{price_col}_log_return'] = np.log(next_price / df12[price_col])
df12

Unnamed: 0,Date,Open,High,Low,Close,Month,Day,Open_log_return,High_log_return,Low_log_return,Close_log_return
0,2020-11-05 00:00:00-05:00,93.339996,93.550003,92.489998,92.529999,11,5,-0.008284,-0.007834,-0.003357,-0.003247
1,2020-11-06 00:00:00-05:00,92.570000,92.820000,92.180000,92.230003,11,6,-0.003571,0.001507,-0.000543,0.005407
2,2020-11-09 00:00:00-05:00,92.239998,92.959999,92.129997,92.730003,11,9,0.005622,0.000108,0.005089,0.000216
3,2020-11-10 00:00:00-05:00,92.760002,92.970001,92.599998,92.750000,11,10,-0.000647,0.002578,0.000108,0.003122
4,2020-11-11 00:00:00-05:00,92.699997,93.209999,92.610001,93.040001,11,11,0.003339,-0.000751,0.001834,-0.000860
...,...,...,...,...,...,...,...,...,...,...,...
1252,2025-10-29 00:00:00-04:00,98.739998,99.360001,98.620003,99.220001,10,29,0.004043,0.003617,0.003037,0.003119
1253,2025-10-30 00:00:00-04:00,99.139999,99.720001,98.919998,99.529999,10,30,0.003524,0.001203,0.005042,0.002709
1254,2025-10-31 00:00:00-04:00,99.489998,99.839996,99.419998,99.800003,10,31,0.002610,0.001501,0.002913,0.000701
1255,2025-11-03 00:00:00-05:00,99.750000,99.989998,99.709999,99.870003,11,3,0.004302,0.003694,0.003494,0.002930


In [416]:
# Preview of the frame without rows containing NaN. The result is only
# displayed, not assigned, so df12 itself is left unchanged.
df12.dropna(axis=0, how='any')

Unnamed: 0,Date,Open,High,Low,Close,Month,Day,Open_log_return,High_log_return,Low_log_return,Close_log_return
0,2020-11-05 00:00:00-05:00,93.339996,93.550003,92.489998,92.529999,11,5,-0.008284,-0.007834,-0.003357,-0.003247
1,2020-11-06 00:00:00-05:00,92.570000,92.820000,92.180000,92.230003,11,6,-0.003571,0.001507,-0.000543,0.005407
2,2020-11-09 00:00:00-05:00,92.239998,92.959999,92.129997,92.730003,11,9,0.005622,0.000108,0.005089,0.000216
3,2020-11-10 00:00:00-05:00,92.760002,92.970001,92.599998,92.750000,11,10,-0.000647,0.002578,0.000108,0.003122
4,2020-11-11 00:00:00-05:00,92.699997,93.209999,92.610001,93.040001,11,11,0.003339,-0.000751,0.001834,-0.000860
...,...,...,...,...,...,...,...,...,...,...,...
1251,2025-10-28 00:00:00-04:00,98.750000,98.949997,98.570000,98.690002,10,28,-0.000101,0.004135,0.000507,0.005356
1252,2025-10-29 00:00:00-04:00,98.739998,99.360001,98.620003,99.220001,10,29,0.004043,0.003617,0.003037,0.003119
1253,2025-10-30 00:00:00-04:00,99.139999,99.720001,98.919998,99.529999,10,30,0.003524,0.001203,0.005042,0.002709
1254,2025-10-31 00:00:00-04:00,99.489998,99.839996,99.419998,99.800003,10,31,0.002610,0.001501,0.002913,0.000701


## c)

In [417]:
columns = df12.columns.tolist()
# The titles set on the temporary px figures are lost when only their traces
# are copied, so the column names are attached to the subplots themselves.
fig = sp.make_subplots(rows=len(columns), cols=1, subplot_titles=columns)
for i, col in enumerate(columns):
    subfig = px.histogram(df12[col], nbins=20, title=col)
    # Copy every trace of the single-column histogram into its own subplot row.
    for trace in subfig.data:
        fig.add_trace(trace, row=i + 1, col=1)
fig.update_layout(
    height=300 * len(columns),  # 300px per subplot
    title_text="Histogram of each column",
)
fig.show()

Log returns will probably improve model accuracy because their statistical behavior is consistent over time (stationarity), unlike raw prices which tend to drift randomly. Their histogram usually looks like a predictable bell curve (distribution), and their spread (variance stability) doesn't change wildly. This stable, bell-shaped distribution makes them much easier and safer to use for building forecasting models.

## d)

If a model predicts 0.01, it means the logarithmic return of the close is 0.01. To get the actual close price, we invert the log-return definition: $$C_{t+1} = C_t \times e^{0.01}$$

In [418]:
# Worked example: a current close of 0.95 and a predicted log return of 0.01.
Close_t = 0.95
Close_t_plus_1 = np.exp(0.01) * Close_t
print("Close prediction for C_t 0.95: ", Close_t_plus_1)

Close prediction for C_t 0.95:  0.9595476587299595


## e)

In [419]:
# Only the calendar features and the log returns are used as model inputs.
drop_columns = ['Date', 'Open', 'High', 'Low', 'Close']
original_train_log_df_full = df12[df12['Date'] < '2024-01-01'].copy().sort_values(by='Date', ascending=True)
original_test_log_df_full = df12[df12['Date'] >= '2024-01-01'].copy().sort_values(by='Date', ascending=True)
# The log returns are forward-looking (row t uses the price of row t + 1), so the
# last training row is computed from the first test-period price. It is removed
# to keep test information out of the training data.
train_log_df = original_train_log_df_full.drop(columns=drop_columns).iloc[:-1].dropna()
# The last test row has no following price, so its returns are NaN and it is dropped.
test_log_df = original_test_log_df_full.drop(columns=drop_columns).dropna()

In [420]:
# Build the sliding windows on the log-return frames, targeting the close log return.
x_log_train_raw, y_log_train = map_to_inputs_targets_np_arrays(train_log_df, N, 'Close_log_return')
x_log_test_raw, y_log_test = map_to_inputs_targets_np_arrays(test_log_df, N, 'Close_log_return')

# One flat feature row per sample.
x_log_train = x_log_train_raw.reshape(len(x_log_train_raw), -1)
x_log_test = x_log_test_raw.reshape(len(x_log_test_raw), -1)

# Keep the model with the lowest test MAE for the price reconstruction below.
least_mae_model = q11(x_log_train, y_log_train, x_log_test, y_log_test)

In [421]:
# Inputs and targets must line up one-to-one in both splits.
assert x_log_train.shape[0] == y_log_train.shape[0]
assert x_log_test.shape[0] == y_log_test.shape[0]

In [422]:
# First flattened log-return training sample.
x_log_train[0, :]

array([ 1.10000000e+01,  5.00000000e+00, -8.28358951e-03, -7.83395482e-03,
       -3.35731687e-03, -3.24740942e-03,  1.10000000e+01,  6.00000000e+00,
       -3.57125898e-03,  1.50715273e-03, -5.42597294e-04,  5.40658737e-03,
        1.10000000e+01,  9.00000000e+00,  5.62168262e-03,  1.07590343e-04,
        5.08853187e-03,  2.15620474e-04,  1.10000000e+01,  1.00000000e+01,
       -6.47095766e-04,  2.57812865e-03,  1.08008599e-04,  3.12181656e-03,
        1.10000000e+01,  1.10000000e+01,  3.33859757e-03, -7.51271250e-04,
        1.83395239e-03, -8.60234796e-04])

In [423]:
# The same sample before flattening: one row per day in the window.
x_log_train_raw[0, :, :]

array([[ 1.10000000e+01,  5.00000000e+00, -8.28358951e-03,
        -7.83395482e-03, -3.35731687e-03, -3.24740942e-03],
       [ 1.10000000e+01,  6.00000000e+00, -3.57125898e-03,
         1.50715273e-03, -5.42597294e-04,  5.40658737e-03],
       [ 1.10000000e+01,  9.00000000e+00,  5.62168262e-03,
         1.07590343e-04,  5.08853187e-03,  2.15620474e-04],
       [ 1.10000000e+01,  1.00000000e+01, -6.47095766e-04,
         2.57812865e-03,  1.08008599e-04,  3.12181656e-03],
       [ 1.10000000e+01,  1.10000000e+01,  3.33859757e-03,
        -7.51271250e-04,  1.83395239e-03, -8.60234796e-04]])

In [424]:
# Check the whole first sample, not just its first value, so that a wrong
# flattening order would be detected as well.
assert np.array_equal(x_log_train[0], x_log_train_raw[0].ravel())

To find the actual close price: $$C_{t+1} = C_t \times e^{\text{log return value}}$$

In [425]:
# Predicted close log returns for every test window.
y_log_pred = least_mae_model.predict(x_log_test)

# Sample i is built from rows i .. i + N - 1 and its target is the log return
# stored on row i + N, i.e. log(Close[i + N + 1] / Close[i + N]). The base price
# is therefore Close[i + N] and the resulting price belongs to row i + N + 1.
base_prices_C_t = original_test_log_df_full['Close'].iloc[N: N + len(y_log_test)].to_numpy()
dates_C_t_plus_1 = original_test_log_df_full['Date'].iloc[N + 1: N + 1 + len(y_log_test)].to_numpy()

# Invert the log return: C_{t+1} = C_t * exp(r_t).
predicted_prices_C_t_plus_1 = base_prices_C_t * np.exp(y_log_pred)
actual_prices_C_t_plus_1 = base_prices_C_t * np.exp(y_log_test)

fig_df = pd.DataFrame({
    'Date': dates_C_t_plus_1,
    'Predicted Close': predicted_prices_C_t_plus_1,
    'Actual Close': actual_prices_C_t_plus_1
})

# The "variable" key titles the legend, which lists the two series, so it is
# labelled "Series" rather than "Date".
fig = px.line(
    fig_df,
    x='Date',
    y=['Predicted Close', 'Actual Close'],
    labels={'value': 'USD Price', 'variable': 'Series'},
    title='Predicted vs Actual Close Price'
)
fig.show()