In [None]:
import os
from datetime import datetime, timedelta
import numpy as np
import yfinance as yf
import pandas as pd
from predictions.pipeline.enum.time_interval import TimeInterval


class DataCollector:
    def __init__(self, coin_id: int, coin_symbol: str, currency: str = "USD",
                 interval: TimeInterval = TimeInterval.DAY):
        self.ticker = None
        self.coin_id = coin_id
        self.coin_symbol = coin_symbol
        self.currency = currency
        self.interval = interval

    def _get_ticker(self):
        ticker_symbol = f"{self.coin_symbol}-{self.currency}"
        self.ticker = yf.Ticker(ticker_symbol)

    def _collect_data_other_intervals(self, interval) -> dict:
        period_mapping = {
            "1m": "7d",
            "1h": "730d",
            "1d": "7y",
            "5d": "7y",
            "1w": "7y",
            "1mo": "7y",
        }

        period = period_mapping.get(interval.value, "7d")
        data = self.ticker.history(period=period, interval=interval.value)

        return data.to_dict()

    def _process_and_save_data(self, raw_data_from_api: dict):
        processed_data = pd.DataFrame(raw_data_from_api).drop(["Dividends", "Stock Splits"], axis=1)

        processed_data = processed_data.reset_index().rename(columns={'index': 'datetime'})

        directory_path = f"../data/processed_data/{self.coin_symbol}/{self.interval.name}"

        if not os.path.exists(directory_path):
            os.makedirs(directory_path)

        timestamp_str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

        csv_file_path = os.path.join(directory_path, f"{self.coin_symbol}_data_{timestamp_str}.csv")

        processed_data.to_csv(
            path_or_buf=csv_file_path,
            date_format="%Y-%m-%d %H:%M:%S",
            index=False
        )

    def process_pipeline(self):
        try:
            self._get_ticker()

            raw_data_from_api = self._collect_data_other_intervals(self.interval)

            self._process_and_save_data(raw_data_from_api)

        except Exception as e:
            print(f"Error collecting data: {e}")
            return None

In [None]:
collector = DataCollector(1, "ETH", currency="USD", interval=TimeInterval.DAY)
data = collector.process_pipeline()
data

In [197]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

dataframe = pd.read_csv("../data/processed_data/ETH/DAY/ETH_data_2023-11-21_12-14-48.csv")

# prediction_dataframe = dataframe.shift(-1).dropna()
# prediction_dataframe = prediction_dataframe.rename(columns={
#     "Open": "Next_Open",
#     "High": "Next_High", 
#     "Low": "Next_Low", 
#     "Close": "Next_Close", 
#     "Volume": "Next_Volume"
# })
# 
# dataframe = dataframe.merge(prediction_dataframe)

dataframe = dataframe.set_index("datetime")

dataframe = dataframe[:-20]

scaler = MinMaxScaler(feature_range=(0, 1))
dataframe_normalized = scaler.fit_transform(dataframe)

dataframe_normalized = pd.DataFrame(dataframe_normalized, columns=dataframe.columns)

sequence_length = 10
sequences = []

for i in range(len(dataframe_normalized) - sequence_length):
    sequence = dataframe_normalized.iloc[i:i + sequence_length, :]
    target = dataframe_normalized.iloc[i + sequence_length, :]
    sequences.append((sequence.values, target.values))

X = np.array([seq[0] for seq in sequences])
y = np.array([seq[1] for seq in sequences])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# X_train = X_train.reshape((X_train.shape[0], sequence_length, dataframe.shape[1]))
# X_test = X_test.reshape((X_test.shape[0], sequence_length, dataframe.shape[1]))

# 
# dataframe_normalized

In [221]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv("../data/processed_data/ETH/DAY/ETH_data_2023-11-21_12-14-48.csv")

df = df.set_index("datetime")

df = df[:-20]

scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df)

sequence_length = 10 
data_sequences = []
target_values = []

for i in range(len(scaled_data) - sequence_length):
    data_sequences.append(scaled_data[i:i + sequence_length])
    target_values.append(scaled_data[i + sequence_length])

data_sequences = np.array(data_sequences)
target_values = np.array(target_values)

train_size = int(len(data_sequences) * 0.8)
train_data, test_data = data_sequences[:train_size], data_sequences[train_size:]
train_targets, test_targets = target_values[:train_size], target_values[train_size:]

model = LinearRegression()
model.fit(train_data.reshape(-1, sequence_length * df.shape[1]), train_targets)

predictions = model.predict(test_data.reshape(-1, sequence_length * df.shape[1]))

predictions = scaler.inverse_transform(predictions)

mean_squared_error(test_targets, predictions)

2.130989195847784e+19

In [210]:
y_pred_original_scale = scaler.inverse_transform(y_pred)

columns = dataframe.columns
y_pred_df = pd.DataFrame(predictions, columns=columns)

last_date = dataframe.index[-1]

future_dates = pd.date_range(start=last_date, periods=len(y_pred_df) + 1, freq='D')[1:]

predictions_with_dates = pd.DataFrame(y_pred_original_scale, columns=columns, index=future_dates)

concatenated_df = pd.concat([dataframe, predictions_with_dates])

concatenated_df

Unnamed: 0,Open,High,Low,Close,Volume
2017-11-09 00:00:00,308.644989,329.451996,307.056000,320.884003,8.932500e+08
2017-11-10 00:00:00,320.670990,324.717987,294.541992,299.252991,8.859860e+08
2017-11-11 00:00:00,298.585999,319.453003,298.191986,314.681000,8.423010e+08
2017-11-12 00:00:00,314.690002,319.153015,298.513000,307.907990,1.613480e+09
2017-11-13 00:00:00,307.024994,328.415009,307.024994,316.716003,1.041890e+09
...,...,...,...,...,...
2025-01-05 00:00:00,1759.352295,1824.956299,1709.441406,1756.085693,1.136955e+10
2025-01-06 00:00:00,1755.309692,1821.239380,1697.009644,1737.948730,9.937840e+09
2025-01-07 00:00:00,1763.142334,1828.231812,1700.270020,1737.885742,9.520869e+09
2025-01-08 00:00:00,1781.604248,1845.217529,1721.257446,1759.425293,1.039972e+10


In [213]:
df_plot = pd.read_csv("../data/processed_data/ETH/DAY/ETH_data_2023-11-21_12-14-48.csv")

In [None]:
dataframe_normalized.corr()