# Predykcja cen akcji na rynku

In [3]:
import yfinance as yf
import pandas as pd
import datetime
import numpy as np

In [93]:
from pytz import UTC


## 1. Pobranie danych

In [4]:
df_btc1 = pd.read_csv('/content/BTC-2020min.csv')
df_btc2 = pd.read_csv('/content/BTC-2019min.csv')
df_btc = pd.concat([df_btc1, df_btc2])

In [5]:
df_eth = pd.read_csv('/content/ETH_1min.csv')

In [6]:
#change date
df_btc = df_btc.drop(columns=['date'])
df_btc['date'] = pd.to_datetime(df_btc['unix'], unit='s', utc=True)
df_btc = df_btc.drop(columns=['unix','symbol'])

In [7]:
#change date
df_eth = df_eth.drop(columns=['Date'])
df_eth['date'] = pd.to_datetime(df_eth['Unix Timestamp'] / 1000, unit='s', utc=True)
df_eth = df_eth.drop(columns=['Unix Timestamp','Symbol'])

In [8]:
# Define the stock symbol and date range
end_date = datetime.date(2020, 12, 31)
start_date = end_date - datetime.timedelta(days=365 * 2)

# Download the stock data
#top 500 comp
df_snp = yf.download('^GSPC', start=start_date, end=end_date)
#eur to usd ratio
df_usd = yf.download('EURUSD=X', start=start_date, end=end_date)
#goldman sachs
df_gs = yf.download('GS', start=start_date, end=end_date)
#standard charted (the most crypto investment bank)
df_st = yf.download('STAN.L', start=start_date, end=end_date)
#silver price
df_sv = yf.download('SI=F', start=start_date, end=end_date)
#gold price
df_gd = yf.download('GC=F', start=start_date, end=end_date)


df_snp_shift = df_snp.reset_index()
df_usd_shift = df_usd.reset_index()
df_st_shift = df_st.reset_index()
df_gs_shift = df_gs.reset_index()
df_gd_shift = df_gd.reset_index()
df_sv_shift = df_sv.reset_index()

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [9]:
df_stocks = pd.merge(df_usd_shift, df_snp_shift, on='Date', how='left', suffixes=('', '_snp'))
df_stocks = pd.merge(df_stocks, df_st_shift, on='Date', how='left', suffixes=('', '_st'))
df_stocks = pd.merge(df_stocks, df_gs_shift, on='Date', how='left', suffixes=('', '_gs'))
df_stocks = pd.merge(df_stocks, df_gd_shift, on='Date', how='left', suffixes=('', '_gd'))
df_stocks = pd.merge(df_stocks, df_sv_shift, on='Date', how='left', suffixes=('', '_sv'))


In [10]:
df_btc = df_btc.loc[df_btc['date']<='2020-04-16 00:00:00+00:00']
df_eth = df_eth[df_eth['date']>='2019-01-01 00:00:00+00:00']

In [11]:
column_mapping = {
    'Open': 'open',
    'High': 'high',
    'Low': 'low',
    'Close': 'close',
    'Volume': 'Volume ETH'
}

df_eth.rename(columns=column_mapping, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eth.rename(columns=column_mapping, inplace=True)


In [12]:
df_crypto = df_btc.merge(df_eth, on='date', how='left', suffixes = ['_btc','_eth'])


In [13]:
df_stocks['Date'] = pd.to_datetime(df_stocks['Date'], utc=True)

In [14]:
# Merge the DataFrames based on the date
df_merge = pd.merge(df_crypto,df_stocks, left_on='date',right_on='Date', how='left', suffixes=('', '_crypto'))

In [15]:
# Specify the columns to forward-fill (modify this list according to your columns)
columns_to_ffill = ['open_eth', 'high_eth', 'low_eth', 'close_eth',
       'Volume ETH','Open', 'High', 'Low', 'Close', 'Adj Close',
       'Volume', 'Open_snp', 'High_snp', 'Low_snp', 'Close_snp',
       'Adj Close_snp', 'Volume_snp', 'Open_st', 'High_st', 'Low_st',
       'Close_st', 'Adj Close_st', 'Volume_st', 'Open_gs', 'High_gs', 'Low_gs',
       'Close_gs', 'Adj Close_gs', 'Volume_gs', 'Open_gd', 'High_gd', 'Low_gd',
       'Close_gd', 'Adj Close_gd', 'Volume_gd', 'Open_sv', 'High_sv', 'Low_sv',
       'Close_sv', 'Adj Close_sv', 'Volume_sv']

# Forward-fill the specified columns from df_usd to match every day in df_btc
df_merge[columns_to_ffill] = df_merge[columns_to_ffill].transform('ffill')


In [16]:
df_merge = df_merge.drop(columns=['Date'])
df_merge = df_merge.reset_index()
df_merge = df_merge[df_merge.index % 10 == 0]


In [17]:
df_merge = df_merge.drop(columns=['index'])

In [18]:
#transform series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
  n_vars = 1 if type(data) is list else data.shape[1]
  df = pd.DataFrame(data)
  cols, names = list(), list()
  #input sequence (t-n, ... t-1)
  for i in range(n_in, 0, -1):
    cols.append(df.shift(i))
    names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
  #forecast sequence (t, t+1, ... t+n)
  for i in range(0, n_out):
    cols.append(df.shift(-i))
  if i == 0:
    names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
  else:
    names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
  #put it all together
  agg = pd.concat(cols, axis=1)
  agg.columns = names
  #drop rows with NaN values
  if dropnan:
    agg.dropna(inplace=True)
  return agg

In [19]:
df_merge.columns

Index(['open_btc', 'high_btc', 'low_btc', 'close_btc', 'Volume BTC',
       'Volume USD', 'date', 'open_eth', 'high_eth', 'low_eth', 'close_eth',
       'Volume ETH', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Open_snp', 'High_snp', 'Low_snp', 'Close_snp', 'Adj Close_snp',
       'Volume_snp', 'Open_st', 'High_st', 'Low_st', 'Close_st',
       'Adj Close_st', 'Volume_st', 'Open_gs', 'High_gs', 'Low_gs', 'Close_gs',
       'Adj Close_gs', 'Volume_gs', 'Open_gd', 'High_gd', 'Low_gd', 'Close_gd',
       'Adj Close_gd', 'Volume_gd', 'Open_sv', 'High_sv', 'Low_sv', 'Close_sv',
       'Adj Close_sv', 'Volume_sv'],
      dtype='object')

In [20]:
df = series_to_supervised(df_merge.drop(columns=['date']), n_in=20, n_out=1, dropnan=True)

In [21]:
df_merge['new_date'] = df_merge['date'] + pd.Timedelta(minutes=60)
df_btc = pd.concat([df_btc1, df_btc2])
df_btc = df_btc.drop(columns=['date'])
df_btc['date'] = pd.to_datetime(df_btc['unix'], unit='s', utc=True)
df_btc = df_btc.drop(columns=['unix','symbol'])
# Merge the DataFrames to get the 'close' value corresponding to the new date
df_result = pd.merge(df_merge[['new_date','close_btc']],df_btc[['date', 'close']], left_on='new_date', right_on='date',how='left')

# Drop the duplicate 'date' column if needed
df_result.drop('date', axis=1, inplace=True)

In [22]:
df_result['diff'] = df_result['close'] - df_result['close_btc']

In [23]:
df_result['diff'].quantile([0.05,0.1,0.2,0.5,0.75])

0.05   -79.9855
0.10   -47.4500
0.20   -23.5740
0.50     0.4700
0.75    18.0600
Name: diff, dtype: float64

In [24]:
df_result['target'] = 0
df_result.loc[df_result['diff']<-50,'target'] = 1

In [25]:
df_result['target'].value_counts()

0    61489
1     6335
Name: target, dtype: int64

In [26]:
df.shape

(67804, 987)

In [106]:
y = df_result[:-20]['target']
X_train = df[14000:]
X_test = df[:7000]
y_train = y[14000:]
y_test = y[:7000]
y_val = y[7000:14000]
X_val = df[7000:14000]

In [85]:
print(len(y_train))
print(sum(y_train))

53804
4598


## 3. Podejście modeli tradycyjnych - Catboost, Lightgbm

In [53]:
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

In [54]:
class_weights

{0: 0.5467219444783157, 1: 5.85080469769465}

## Catboost

In [83]:
from catboost import CatBoostClassifier  # For classification tasks
from catboost import Pool

# Initialize the CatBoost model
# Adjust the weights as needed
train_data = Pool(X_train, label=y_train)
valid_data = Pool(X_val, label=y_val)
# Create the CatBoostClassifier with custom loss function and class weights
model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    eval_metric='Precision',  # Use Precision as the evaluation metric
    early_stopping_rounds=30,  # Stop training if the evaluation metric doesn't improve for 20 rounds
    class_weights=class_weights
)
# Train the model
# Train the model with the training and validation data
model.fit(train_data, eval_set=valid_data)

# After training, you can access the best iteration using:
best_iteration = model.best_iteration_


0:	learn: 0.6690600	test: 0.5030814	best: 0.5030814 (0)	total: 538ms	remaining: 53.3s
1:	learn: 0.6879424	test: 0.7602406	best: 0.7602406 (1)	total: 948ms	remaining: 46.4s
2:	learn: 0.6958042	test: 0.4820195	best: 0.7602406 (1)	total: 1.4s	remaining: 45.2s
3:	learn: 0.6940673	test: 0.6439663	best: 0.7602406 (1)	total: 1.85s	remaining: 44.4s
4:	learn: 0.6915542	test: 0.6429530	best: 0.7602406 (1)	total: 2.33s	remaining: 44.2s
5:	learn: 0.6959462	test: 0.6059821	best: 0.7602406 (1)	total: 3.33s	remaining: 52.2s
6:	learn: 0.6958711	test: 0.5932588	best: 0.7602406 (1)	total: 4.04s	remaining: 53.7s
7:	learn: 0.6971008	test: 0.5797206	best: 0.7602406 (1)	total: 4.49s	remaining: 51.7s
8:	learn: 0.6977507	test: 0.5628364	best: 0.7602406 (1)	total: 4.91s	remaining: 49.6s
9:	learn: 0.6983028	test: 0.5584886	best: 0.7602406 (1)	total: 5.34s	remaining: 48.1s
10:	learn: 0.6982715	test: 0.5409124	best: 0.7602406 (1)	total: 5.77s	remaining: 46.7s
11:	learn: 0.7007757	test: 0.5510156	best: 0.7602406 (

In [None]:
model.fit(X_train, y_train)

In [81]:
# Make predictions
y_pred_prob = model.predict_proba(X_test)[:, 1]

In [82]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
import numpy as np


y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
confusion = confusion_matrix(y_test, y_pred)

# Display results
print("Accuracy:", accuracy)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", confusion)


Accuracy: 0.3904444444444444
ROC AUC Score: 0.44081783558570753
Confusion Matrix:
 [[2809 4864]
 [ 622  705]]


## Lgbm

In [100]:
import lightgbm as lgb
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)
val_data = lgb.Dataset(X_val, label=y_val)
params = {
    'objective': 'binary',  # For binary classification
    'boosting_type': 'gbdt',  # Gradient boosting decision tree
    'metric': ['binary_logloss', 'auc'],  # Evaluation metrics
    'num_leaves': 31,  # Maximum number of leaves in one tree
    'learning_rate': 0.02,  # Learning rate
    'is_unbalance': True
}


# Train the model
model = lgb.train(params, train_data, num_boost_round=500,  valid_sets=val_data,
    callbacks=(
            [lgb.early_stopping(stopping_rounds=100)]))  # Optional: Print results every 10 rounds)

[LightGBM] [Info] Number of positive: 4598, number of negative: 49206
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.746899 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 235294
[LightGBM] [Info] Number of data points in the train set: 53804, number of used features: 966
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.085458 -> initscore=-2.370394
[LightGBM] [Info] Start training from score -2.370394
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.312388	valid_0's auc: 0.580647


In [101]:
print(model.best_iteration)
print(model.best_score)

1
defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict([('binary_logloss', 0.3123877387624195), ('auc', 0.5806466105273996)])})


In [108]:
y_pred = model.predict(X_test)

In [109]:
y_pred_binary = (y_pred > 0.5).astype(int)

# Evaluate the model using accuracy, ROC AUC, and confusion matrix
accuracy = accuracy_score(y_test, y_pred_binary)
roc_auc = roc_auc_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred_binary)

print(f'Best Iteration: {best_iteration}')
print(f'Best Validation Score: {best_val_score}')
print(f'Accuracy: {accuracy}')
print(f'ROC AUC: {roc_auc}')
print('Confusion Matrix:')
print(conf_matrix)

Best Iteration: 1
Best Validation Score: defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict([('binary_logloss', 0.31496745456994873), ('auc', 0.46034099695025166)])})
Accuracy: 0.8464285714285714
ROC AUC: 0.4577300755568639
Confusion Matrix:
[[5925    0]
 [1075    0]]
