In [86]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge


from xgboost import XGBRegressor

In [87]:
df = pd.read_csv('twitter_stock.csv', parse_dates=['Date'])
df = df.sort_values('Date').set_index('Date')

price_col = 'Adj Close'

In [88]:
# rows where any numeric column equals 0
zero_row = df[(df.select_dtypes(include=[np.number]) == 0).any(axis=1)]
print(zero_row)

                 Open   High        Low      Close  Adj Close  Volume
Date                                                                 
2015-09-22  27.049999  27.48  26.620001  26.830000  26.830000     0.0
2017-09-01  16.969999  17.00  16.580000  16.860001  16.860001     0.0


In [89]:
df = df.replace(0, np.nan)
df = df.ffill().bfill()

In [90]:
df['log_ret'] = np.log(df[price_col] / df[price_col].shift(1))

In [91]:
def make_features(df):
    df = df.copy()


    df['ret_1'] = df['log_ret']
    df['ret_2'] = df['log_ret'].shift(1)
    df['ret_5'] = df[price_col].pct_change(5)


    df['vol_5'] = df['log_ret'].rolling(5).std()
    df['vol_10'] = df['log_ret'].rolling(10).std()


    df['momentum_5'] = df[price_col] / df[price_col].shift(5) - 1
    df['momentum_10'] = df[price_col] / df[price_col].shift(10) - 1


    df['volume_change'] = df['Volume'].pct_change()


    return df

In [92]:
train_size = int(len(df) * 0.9)


train_raw = df.iloc[:train_size]
test_raw = df.iloc[train_size:]


train = make_features(train_raw).dropna()
test = make_features(test_raw).dropna()

In [93]:
X_train = train.drop(columns=[price_col, 'log_ret'])
y_train = train['log_ret'].shift(-1).dropna()


X_train = X_train.iloc[:-1]


X_test = test.drop(columns=[price_col, 'log_ret'])
y_test = test['log_ret'].shift(-1).dropna()


X_test = X_test.iloc[:-1]

In [94]:
def evaluate(y_true, y_pred, name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    direction = np.mean(np.sign(y_true) == np.sign(y_pred))
    print(f"{name} | RMSE(ret): {rmse:.6f} | Direction Acc: {direction:.3f}")

def evaluate_price(y_true, y_pred, name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"{name} | RMSE(price): {rmse:.6f}")

In [95]:
def reconstruct_price(last_price, returns):
    prices = []
    p = last_price
    for r in returns:
        p = p * np.exp(r)
        prices.append(p)
        return prices

last_price = train_raw[price_col].iloc[-1]

In [96]:
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

ridge = Ridge(alpha=10)
ridge.fit(X_train_s, y_train)
y_pred_ridge = ridge.predict(X_test_s)

evaluate(y_test, y_pred_ridge, "Ridge Regression")

Ridge Regression | RMSE(ret): 0.036739 | Direction Acc: 0.500


In [97]:
rf = RandomForestRegressor(
n_estimators=600,
max_depth=6,
min_samples_leaf=20,
max_features=0.5,
random_state=42,
n_jobs=-1
)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

evaluate(y_test, y_pred_rf, "Random Forest")

Random Forest | RMSE(ret): 0.035900 | Direction Acc: 0.523


In [98]:
xgb = XGBRegressor(
n_estimators=800,
max_depth=4,
learning_rate=0.02,
subsample=0.8,
colsample_bytree=0.8,
objective='reg:squarederror',
random_state=42,
early_stopping_rounds=50
)


xgb.fit(
X_train, y_train,
eval_set=[(X_test, y_test)],
verbose=False
)


y_pred_xgb = xgb.predict(X_test)

evaluate(y_test, y_pred_xgb, 'XGBoost')

XGBoost | RMSE(ret): 0.035841 | Direction Acc: 0.509


In [99]:
from statsmodels.tsa.arima.model import ARIMA


# Prepare return series
y_full = np.log(df[price_col]).diff().dropna()
y_train_arima = y_full.iloc[:train_size-1]
y_test_arima = y_full.iloc[train_size-1:train_size-1+len(y_test)]


history = y_train_arima.tolist()
y_pred_arima = []


for t in range(len(y_test_arima)):
    model = ARIMA(history, order=(1,0,1))
    fit = model.fit()
    yhat = fit.forecast()[0]
    y_pred_arima.append(yhat)
    history.append(y_test_arima.iloc[t])


# Evaluation
evaluate(y_test_arima.values, np.array(y_pred_arima), 'ARIMA')



ARIMA | RMSE(ret): 0.036052 | Direction Acc: 0.417


In [100]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout


window = 30
features_lstm = X_train.columns.tolist()


# Build sequences
def make_sequences(X, y, window):
    Xs, ys = [], []
    for i in range(len(X) - window):
        Xs.append(X.iloc[i:i+window].values)
        ys.append(y.iloc[i+window])
    return np.array(Xs), np.array(ys)

Xtr_seq, ytr_seq = make_sequences(X_train, y_train, window)


X_test_ext = pd.concat([X_train.iloc[-window:], X_test])
y_test_ext = pd.concat([y_train.iloc[-window:], y_test])

Xte_seq, yte_seq = make_sequences(X_test_ext, y_test_ext, window)
1

scaler_lstm = StandardScaler()

Xtr_seq = scaler_lstm.fit_transform(
    Xtr_seq.reshape(-1, Xtr_seq.shape[-1])
).reshape(Xtr_seq.shape)

Xte_seq = scaler_lstm.transform(
    Xte_seq.reshape(-1, Xte_seq.shape[-1])
).reshape(Xte_seq.shape)


model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(window, Xtr_seq.shape[-1])),
    Dropout(0.3),
    LSTM(32),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse')
model.fit(Xtr_seq, ytr_seq, epochs=20, batch_size=32, verbose=0)

y_pred_lstm = model.predict(Xte_seq).flatten()
evaluate(yte_seq, y_pred_lstm, 'LSTM')

  super().__init__(**kwargs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
LSTM | RMSE(ret): 0.037019 | Direction Acc: 0.495


In [101]:
from prophet import Prophet

prophet_train = train[[price_col]].copy()
# prophet_train['Date'] = prophet_train.index
prophet_train = prophet_train.reset_index().rename(columns={'Date': 'ds', price_col: 'y'})

prophet_test = test[[price_col]].copy()
# prophet_test['Date'] = prophet_test.index
prophet_test = prophet_test.reset_index().rename(columns={'Date': 'ds', price_col: 'y'})

model = Prophet(
    daily_seasonality=False,
    weekly_seasonality=True,
    yearly_seasonality=True,
    seasonality_mode='multiplicative',   # important for prices
    changepoint_prior_scale=0.05          # controls trend flexibility
)

model.fit(prophet_train)

future = model.make_future_dataframe(periods=len(prophet_test), freq='B')
forecast = model.predict(future)
forecast_test = forecast.iloc[-len(prophet_test):].copy()
forecast_test.index = prophet_test.index

y_true = prophet_test['y']
y_pred = forecast_test['yhat']

# evaluate_price(y_true, y_pred, 'Prophet')

16:24:52 - cmdstanpy - INFO - Chain [1] start processing
16:24:56 - cmdstanpy - INFO - Chain [1] done processing


In [102]:
def make_price_path(start_price, returns):
    return start_price * np.exp(np.cumsum(returns))

price_results = []

price_true = make_price_path(last_price, y_test.values)
price_results.append({
    'Model': 'Ridge',
    'RMSE (price)': np.sqrt(mean_squared_error(price_true, make_price_path(last_price, y_pred_ridge)))
})
price_results.append({
    'Model': 'Random Forest',
    'RMSE (price)': np.sqrt(mean_squared_error(price_true, make_price_path(last_price, y_pred_rf)))
})
price_results.append({
    'Model': 'XGBoost',
    'RMSE (price)': np.sqrt(mean_squared_error(price_true, make_price_path(last_price, y_pred_xgb)))
})

start_price_arima = df[price_col].iloc[train_size-1]
price_true_arima = make_price_path(start_price_arima, y_test_arima.values)
price_results.append({
    'Model': 'ARIMA',
    'RMSE (price)': np.sqrt(mean_squared_error(price_true_arima, make_price_path(start_price_arima, y_pred_arima)))
})

start_price_lstm = make_price_path(last_price, y_test.values[:window])[-1]
price_true_lstm = make_price_path(start_price_lstm, y_test.values[window:])
price_results.append({
    'Model': 'LSTM',
    'RMSE (price)': np.sqrt(mean_squared_error(price_true_lstm, make_price_path(start_price_lstm, y_pred_lstm)))
})

price_results.append({
    'Model': 'Prophet',
    'RMSE (price)': np.sqrt(mean_squared_error(y_true, y_pred))
})

price_results_df = pd.DataFrame(price_results).sort_values('RMSE (price)')

plt.figure(figsize=(10, 4))
plt.bar(price_results_df['Model'], price_results_df['RMSE (price)'])
plt.title('Model Comparison – RMSE on Price')
plt.ylabel('RMSE')
plt.xticks(rotation=30)
plt.show()

ValueError: Found input variables with inconsistent numbers of samples: [186, 216]

In [None]:
# Compare predicted price paths vs actual
price_pred_ridge = pd.Series(make_price_path(last_price, y_pred_ridge), index=y_test.index)
price_pred_rf = pd.Series(make_price_path(last_price, y_pred_rf), index=y_test.index)
price_pred_xgb = pd.Series(make_price_path(last_price, y_pred_xgb), index=y_test.index)
price_pred_arima = pd.Series(make_price_path(start_price_arima, y_pred_arima), index=y_test_arima.index)
price_pred_lstm = pd.Series(make_price_path(start_price_lstm, y_pred_lstm), index=y_true.index)
price_pred_prophet = pd.Series(y_pred, index=y_true.index)

plt.figure(figsize=(12, 5))
plt.plot(y_test.index, price_true, label='Actual Price', linewidth=2)
plt.plot(price_pred_ridge.index, price_pred_ridge, label='Ridge')
plt.plot(price_pred_rf.index, price_pred_rf, label='Random Forest')
plt.plot(price_pred_xgb.index, price_pred_xgb, label='XGBoost')
plt.title('Actual vs Predicted Stock Price')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.tight_layout()
plt.show()

ValueError: Length of values (186) does not match length of index (217)