In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb

df = pd.read_csv(r"C:\Users\Yashwi\OneDrive\Desktop\IITK\Summer Project 2024\Application of Probability Theory\Stock Market\archive\all_stocks_5yr.csv")
df = df.sort_values(by='date')
prices = df['close'].dropna()

# features
returns = 100 * np.diff(np.log(prices.values))
returns_df = pd.DataFrame()
returns_df['return'] = returns

returns_df['lag_1'] = returns_df['return'].shift(1)
returns_df['lag_2'] = returns_df['return'].shift(2)
returns_df['ma_3'] = returns_df['return'].rolling(window=3).mean()
returns_df['ma_5'] = returns_df['return'].rolling(window=5).mean()
returns_df['std_3'] = returns_df['return'].rolling(window=3).std()
returns_df['std_5'] = returns_df['return'].rolling(window=5).std()
returns_df['momentum'] = returns_df['return'] - returns_df['lag_1']

# 1 if next return is positive, else 0
returns_df['target'] = (returns_df['return'].shift(-1) > 0).astype(int)
returns_df.dropna(inplace=True)

# train-test split
X = returns_df.drop(columns=['target'])
y = returns_df['target']
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# train
# model = LogisticRegression(class_weight='balanced', max_iter=1000)
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, zero_division=0))
print("Recall:", recall_score(y_test, y_pred, zero_division=0))
print("F1 Score:", f1_score(y_test, y_pred, zero_division=0))



Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.7142649446315636
Precision: 0.7176985799957514
Recall: 0.7081633640218635
F1 Score: 0.7128990894187537


In [15]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb

df = pd.read_csv(r"C:\Users\Yashwi\OneDrive\Desktop\IITK\Summer Project 2024\Application of Probability Theory\Stock Market\archive\all_stocks_5yr.csv")
df = df.sort_values('date')

# features
df['return'] = df['close'].pct_change()  # daily returns
df['target'] = (df['return'].shift(-1) > 0).astype(int)  # 1 if next day up, else 0

df['lag_1'] = df['return'].shift(1)
df['lag_2'] = df['return'].shift(2)
df['lag_3'] = df['return'].shift(3)

df['ma_5'] = df['close'].rolling(window=5).mean()
df['ma_10'] = df['close'].rolling(window=10).mean()
df['volatility_5'] = df['return'].rolling(window=5).std()

df = df.dropna()

X = df[['lag_1', 'lag_2', 'lag_3', 'ma_5', 'ma_10', 'volatility_5']]
y = df['target']

# time-series-aware cross-validation
tscv = TimeSeriesSplit(n_splits=5)

accuracies, precisions, recalls, f1s = [], [], [], []

for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # model = LogisticRegression(class_weight='balanced', max_iter=1000)
    model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))

print(f"Average Accuracy: {np.mean(accuracies):.3f}")
print(f"Average Precision: {np.mean(precisions):.3f}")
print(f"Average Recall: {np.mean(recalls):.3f}")
print(f"Average F1 Score: {np.mean(f1s):.3f}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Average Accuracy: 0.614
Average Precision: 0.659
Average Recall: 0.475
Average F1 Score: 0.550
