# Кластерный анализ финансовых рынков


## Установка ta-lib

### Для Windows

- Инструкции: https://github.com/afnhsn/TA-Lib_x64
- Скачать "ta-lib x64.zip" https://github.com/afnhsn/TA-Lib_x64. Распаковать zip файл в C:\ta-lib
- Скачать Visual C++ build tools 2022: https://aka.ms/vs/17/release/vs_buildtools.exe
- Установить Visual C++ build tools: https://stackoverflow.com/a/54136652/10997732
- Установить ta-lib: pip install ta-lib

### На Linux блок ниже


In [None]:
# Устанавливаем ta-lib и другие модули
# Этот блок необходим только для Unix систем
""" 
!wget http://prdownloads.sourceforge.net/ta-lib/ta-lib-0.4.0-src.tar.gz
!tar xvzf ta-lib-0.4.0-src.tar.gz
%cd ta-lib/
!./configure --prefix=/usr
!make
!make install
!pip install TA-Lib 
"""

In [None]:
!pip install pandas
!pip install numpy
!pip install hmmlearn
!pip install matplotlib
!pip install scikit-learn
!pip install plotly

In [2]:
# Импортируем библиотеки
import numpy as np
import talib
from hmmlearn import hmm
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd

# Для доступа к данным на Google drive
# from google.colab import drive

In [3]:
# Load results
symbol = "FTMUSDT"
fraction = 0
period = "1m"
candles = pd.read_csv(rf"./data/candles_{symbol}_{period}.csv").iloc[fraction:]
candles

Unnamed: 0.1,Unnamed: 0,open_time,open,high,low,close,volume,close_time,qav,num_trades,taker_base_vol,taker_quote_vol,ignore
0,2023-07-24 03:00:59.999,1690156800000,0.2560,0.2562,0.2560,0.2562,45201.0,1690156859999,11574.3940,34,33211.0,8504.5550,0
1,2023-07-24 03:01:59.999,1690156860000,0.2560,0.2562,0.2560,0.2561,29447.0,1690156919999,7539.8941,23,6009.0,1539.0282,0
2,2023-07-24 03:02:59.999,1690156920000,0.2560,0.2560,0.2554,0.2554,57397.0,1690156979999,14676.4600,52,2987.0,763.6364,0
3,2023-07-24 03:03:59.999,1690156980000,0.2553,0.2555,0.2550,0.2555,158081.0,1690157039999,40364.8764,98,31709.0,8096.3927,0
4,2023-07-24 03:04:59.999,1690157040000,0.2554,0.2558,0.2554,0.2558,44771.0,1690157099999,11447.6813,41,35744.0,9139.6879,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14989,2023-08-03 12:49:59.999,1691056140000,0.2353,0.2354,0.2353,0.2354,19352.0,1691056199999,4554.1626,16,18915.0,4451.3365,0
14990,2023-08-03 12:50:59.999,1691056200000,0.2354,0.2355,0.2354,0.2355,26781.0,1691056259999,6304.4085,10,26781.0,6304.4085,0
14991,2023-08-03 12:51:59.999,1691056260000,0.2355,0.2355,0.2354,0.2355,11983.0,1691056319999,2821.9765,15,8967.0,2111.7285,0
14992,2023-08-03 12:52:59.999,1691056320000,0.2356,0.2356,0.2355,0.2356,6388.0,1691056379999,1504.4956,11,1216.0,286.4896,0


In [27]:
import plotly.graph_objects as go

# Generate synthetic data
# Assuming 'candles' is a DataFrame containing your OHLCV data
data = candles[["Unnamed: 0", "close", "high", "low"]].rename(columns={"Unnamed: 0": "timestamp", "close": "price"})
data["timestamp"] = data["timestamp"].str[:19]
data.set_index("timestamp", inplace=True)
data.index = pd.to_datetime(data.index)
data.resample("30m").apply({"price": "ohlc", "high": "max", "low": "min"})
data.columns = ["".join(col).strip() for col in data.columns.values]
data = data * 1000

# Calculate Parabolic SAR

data["sar"] = talib.SAR(data["price"].values, data["price"].values, acceleration=0.2, maximum=0.2)

# Additional Features
data["price_diff"] = data["price"].diff()
data["sar_diff"] = data["sar"].diff()
data["momentum"] = data["price"].diff(3)
data["volatility"] = data["price"].rolling(window=5).std()
data["distance_to_sar"] = data["price"] - data["sar"]
data["rolling_max"] = data["price"].rolling(window=5).max()
data["rolling_min"] = data["price"].rolling(window=5).min()
data["price_over_sar_ratio"] = data["price"] / data["sar"]

# Drop NaN rows created by feature engineering
data.dropna(inplace=True)

# Prepare features for HMM
features = data[
    [
        "price_diff",
        "sar_diff",
        "momentum",
        "volatility",
        "distance_to_sar",
        "rolling_max",
        "rolling_min",
        "price_over_sar_ratio",
    ]
].values

# Create and train HMM
model = hmm.GaussianHMM(n_components=3, covariance_type="full", n_iter=100)
model.fit(features)

# Predict hidden states
data["hidden_state"] = model.predict(features)

# Interpret hidden states as reversal types
state_means = np.array([np.mean(features[data["hidden_state"] == i], axis=0) for i in range(3)])
sorted_states = np.argsort(state_means[:, 0])

data["reversal_type"] = "Neutral"
data.loc[data["hidden_state"] == sorted_states[0], "reversal_type"] = "Down"
data.loc[data["hidden_state"] == sorted_states[2], "reversal_type"] = "Up"

# Trading simulation
initial_balance = 100000
balance = initial_balance
stock_quantity = 0
trade_log = []

# Create a list to track balance over time and a list to track buy dates
data["balance"] = initial_balance
buy_dates = []

for i, row in data.iterrows():
    if row["reversal_type"] == "Up" and stock_quantity == 0:
        stock_quantity = balance // row["price"]
        balance -= stock_quantity * row["price"]
        data.at[i, "balance"] = balance
        buy_dates.append(i)
        trade_log.append(f"Buy at {row['price']} on {i}, Balance: {balance}")
    elif row["reversal_type"] == "Down" and stock_quantity > 0:
        balance += stock_quantity * row["price"]
        stock_quantity = 0
        data.at[i, "balance"] = balance
        trade_log.append(f"Sell at {row['price']} on {i}, Balance: {balance}")

# Remaining stocks are sold at the last price
if stock_quantity > 0:
    balance += stock_quantity * data["price"].iloc[-1]
    data.at[data.index[-1], "balance"] = balance
    trade_log.append(f"Sell at {data['price'].iloc[-1]} on {data.index[-1]}, Balance: {balance}")

# Calculate final portfolio value
final_portfolio_value = balance

print(f"Final Portfolio Value: {final_portfolio_value}")
""" print("\nTrading log:")
for log in trade_log:
    print(log) """

# ... [Plotting section] ...

fig = make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    subplot_titles=(
        "Price and Parabolic SAR",
        "Balance Over Time (after Buy operations)",
    ),
    row_heights=[0.7, 0.3],
)

# Add Price, SAR and Reversal Type data to the first subplot
fig.add_trace(go.Scatter(x=data.index, y=data["price"], mode="lines", name="Price"), row=1, col=1)
fig.add_trace(
    go.Scatter(
        x=data.index,
        y=data["sar"],
        mode="lines",
        name="Parabolic SAR",
        line=dict(dash="dash"),
    ),
    row=1,
    col=1,
)
colors = {"Up": "#00FF00", "Down": "#FF0000", "Neutral": "#0000FF"}
fig.add_trace(
    go.Scatter(
        x=data.index,
        y=data["price"],
        mode="markers",
        name="Reversal Type",
        marker=dict(color=data["reversal_type"].apply(lambda x: colors[x]), size=5),
    ),
    row=1,
    col=1,
)

# Filter balance data for only buy_dates and subtract initial_balance
buy_balance = data.loc[buy_dates, "balance"] - initial_balance

# Add filtered balance data to the second subplot
fig.add_trace(
    go.Scatter(
        x=buy_balance.index,
        y=buy_balance,
        mode="lines+markers",
        name="Balance After Buy",
    ),
    row=2,
    col=1,
)

# Update and show figure
fig.update_layout(title_text="Price, Parabolic SAR, and Balance Over Time", height=800)
fig.show()

Final Portfolio Value: 95138.00000000006


## Пробуем комбинацию SAR

In [30]:
import pandas as pd
import numpy as np
import talib
from hmmlearn import hmm
from plotly.subplots import make_subplots
import plotly.graph_objects as go

data = candles[["Unnamed: 0", "close", "high", "low"]].rename(columns={"Unnamed: 0": "timestamp", "close": "price"})
data["timestamp"] = data["timestamp"].str[:19]
data.set_index("timestamp", inplace=True)
data.index = pd.to_datetime(data.index)

# Calculate two Parabolic SARs: one for a shorter trend, one for a longer trend
data["sar_short"] = talib.SAR(data["high"].values, data["low"].values, acceleration=0.02, maximum=0.2)
data["sar_long"] = talib.SAR(data["high"].values, data["low"].values, acceleration=0.005, maximum=0.2)

# Use clustering or another method to determine the trend based on the longer SAR
# For simplicity, let's define a rising trend when the price is above the long SAR, and a falling trend otherwise
# This is a placeholder for your actual clustering method
# data['long_trend'] = np.where(data['price'] > data['sar_long'], 'rising', 'falling')
data["sar_long_diff"] = data["sar_long"].diff()
data["long_trend"] = np.sign(data["sar_long_diff"]).replace({1: "rising", -1: "falling", 0: "stable"})

# Trading simulation
initial_balance = 100000
balance = initial_balance
stock_quantity = 0
trade_log = []

data["balance"] = initial_balance
buy_dates = []
sell_dates = []

for i, row in data.iterrows():
    if row["long_trend"] == "falling" and row["sar_short"] < row["price"]:
        if stock_quantity == 0:  # Only buy if not currently holding stock
            stock_quantity = balance // row["price"]
            balance -= stock_quantity * row["price"]
            buy_dates.append(i)
            trade_log.append(f"Buy: {stock_quantity} shares at {row['price']} on {i}, Balance: {balance}")
        else:  # Extend position if already holding
            additional_quantity = (balance // row["price"]) * 0.1
            balance -= additional_quantity * row["price"]
            stock_quantity += additional_quantity
            trade_log.append(f"Extend: Buy {additional_quantity} shares at {row['price']} on {i}, Balance: {balance}")
        data.at[i, "balance"] = balance  # Update balance after buy

    elif row["long_trend"] == "rising" and row["sar_long"] > row["price"] and stock_quantity > 0:
        balance += stock_quantity * row["price"]
        sell_dates.append(i)
        trade_log.append(f"Sell: {stock_quantity} shares at {row['price']} on {i}, Balance: {balance}")
        stock_quantity = 0  # Reset stock quantity after sell
        data.at[i, "balance"] = balance  # Update balance after sell

# Selling any remaining stock quantity at the last price
if stock_quantity > 0:
    balance += stock_quantity * data["price"].iloc[-1]
    trade_log.append(f"Final Sell: {stock_quantity} shares at {data['price'].iloc[-1]} on {data.index[-1]}, Balance: {balance}")
    stock_quantity = 0  # Reset stock quantity after final sell
    data.at[data.index[-1], "balance"] = balance  # Update balance after final sell


# Create a new DataFrame to hold only the buy and sell dates and associated balance
balance_changes = pd.DataFrame(index=data.index)
balance_changes["balance"] = data["balance"]  # The balance column from the trading simulation
balance_changes["balance_after_buys"] = None
balance_changes["balance_after_sells"] = None

# Populate the new DataFrame with balance data for buys and sells
for i, row in data.iterrows():
    if i in buy_dates:  # from the trading logic where buys are recorded
        balance_changes.at[i, "balance_after_buys"] = row["balance"]
    if "Sell" in trade_log:  # this should be adjusted to your actual sell logic
        balance_changes.at[i, "balance_after_sells"] = row["balance"]

# Create a new DataFrame to hold only the buy and sell dates and associated balance
balance_changes = pd.DataFrame(index=data.index)
balance_changes["balance_after_buys"] = None
balance_changes["balance_after_sells"] = None

# Populate the DataFrame with the balance after buys and sells
for date in buy_dates:
    balance_changes.at[date, "balance_after_buys"] = data.at[date, "balance"]
for date in sell_dates:
    balance_changes.at[date, "balance_after_sells"] = data.at[date, "balance"]


# Calculate final portfolio value
final_portfolio_value = balance
print(f"Final Portfolio Value: {final_portfolio_value}")
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create subplots: one for price and SARs, another for balance over time
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.1, subplot_titles=("Price, Short and Long Parabolic SAR", "Balance Over Time"))

# Add Price, Short SAR and Long SAR data to the first subplot
fig.add_trace(go.Scatter(x=data.index, y=data["price"], mode="lines", name="Price"), row=1, col=1)
fig.add_trace(go.Scatter(x=data.index, y=data["sar_short"], mode="lines", name="Parabolic SAR (Short)", line=dict(color="orange")), row=1, col=1)
fig.add_trace(go.Scatter(x=data.index, y=data["sar_long"], mode="lines", name="Parabolic SAR (Long)", line=dict(color="purple")), row=1, col=1)

# Add markers for Buy and Sell points
fig.add_trace(
    go.Scatter(
        x=data.index,
        y=data["price"],
        mode="markers",
        name="Buy Signal",
        marker=dict(color="green", size=10),
        text=["Buy" if val else "" for val in (data["long_trend"] == "falling") & (data["sar_short"] < data["price"])],
        showlegend=False,
    ),
    row=1,
    col=1,
)

fig.add_trace(
    go.Scatter(
        x=data.index,
        y=data["price"],
        mode="markers",
        name="Sell Signal",
        marker=dict(color="red", size=10),
        text=["Sell" if val else "" for val in (data["long_trend"] == "rising") & (data["sar_long"] > data["price"])],
        showlegend=False,
    ),
    row=1,
    col=1,
)

# Calculate and plot Balance Over Time for buys
balance_time_series = data["balance"].diff()
balance_time_series.iloc[0] = data["balance"].iloc[0]  # Set the first value to the initial balance
cumulative_balance = balance_time_series.cumsum()

# Add balance changes to the plot
fig.add_trace(
    go.Scatter(x=balance_changes.index, y=balance_changes["balance_after_buys"], mode="markers", name="Balance After Buy", marker=dict(color="green", size=10)), row=2, col=1
)

fig.add_trace(
    go.Scatter(x=balance_changes.index, y=balance_changes["balance_after_sells"], mode="markers", name="Balance After Sell", marker=dict(color="red", size=10)), row=2, col=1
)
# Update layout for clear visualization
fig.update_layout(title_text="Trading Strategy Performance", showlegend=True, height=800)

# Set y-axis labels
fig.update_yaxes(title_text="Price", row=1, col=1)
fig.update_yaxes(title_text="Balance", row=2, col=1)

# Set x-axis labels
fig.update_xaxes(title_text="Date", row=2, col=1)

# Show the figure
fig.show()

Final Portfolio Value: 94985.01449999996


In [25]:
trade_log

['Buy: 390777.0 shares at 0.2559 on 2023-07-24 03:08:59, Balance: 0.16569999999774154',
 'Extend: Buy 0.0 shares at 0.256 on 2023-07-24 03:09:59, Balance: 0.16569999999774154',
 'Extend: Buy 0.0 shares at 0.2564 on 2023-07-24 03:10:59, Balance: 0.16569999999774154',
 'Sell: 390777.0 shares at 0.2567 on 2023-07-24 03:58:59, Balance: 100312.6216',
 'Buy: 392613.0 shares at 0.2555 on 2023-07-24 04:12:59, Balance: 9.99999901978299e-05',
 'Extend: Buy 0.0 shares at 0.2555 on 2023-07-24 04:13:59, Balance: 9.99999901978299e-05',
 'Extend: Buy 0.0 shares at 0.2555 on 2023-07-24 04:14:59, Balance: 9.99999901978299e-05',
 'Extend: Buy 0.0 shares at 0.2555 on 2023-07-24 04:15:59, Balance: 9.99999901978299e-05',
 'Extend: Buy 0.0 shares at 0.2554 on 2023-07-24 04:16:59, Balance: 9.99999901978299e-05',
 'Extend: Buy 0.0 shares at 0.2554 on 2023-07-24 04:17:59, Balance: 9.99999901978299e-05',
 'Extend: Buy 0.0 shares at 0.2552 on 2023-07-24 04:18:59, Balance: 9.99999901978299e-05',
 'Extend: Buy 0.0

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import talib
from hmmlearn import hmm
import matplotlib.pyplot as plt
from itertools import combinations
import plotly.graph_objects as go
from sklearn.metrics import silhouette_score

pd.options.plotting.backend = "plotly"
import sys


def zigzag(s, threshold=1.0):
    zz = pd.Series(index=s.index)
    last_zz = s.iloc[0]
    zz.iloc[0] = last_zz
    trend = None

    for i in range(1, len(s)):
        price = s.iloc[i]
        prev = last_zz
        move = price - prev
        pct_move = abs(move) / prev

        if pct_move >= threshold:
            if move > 0 and trend != 1:
                last_zz = price
                zz.iloc[i] = price
                trend = 1
            elif move < 0 and trend != -1:
                last_zz = price
                zz.iloc[i] = price
                trend = -1

    return zz


# Generate synthetic data
data = candles[["Unnamed: 0", "close", "high", "low"]].rename(columns={"Unnamed: 0": "timestamp", "close": "price"})
data["timestamp"] = data["timestamp"].str[:19]
data.set_index("timestamp", inplace=True)
data = data * 1000

# Calculate Parabolic SAR
data["sar"] = talib.SAR(data["price"].values, data["price"].values, acceleration=0.02, maximum=0.2)

# Add zigzag indicator (simplified for example)
threshold = 1.0  # Define your own threshold
data["zigzag"] = zigzag(data["price"], threshold=0.0001)


# Create features
data["sar_position"] = np.where(data["sar"] > data["price"], 1, -1)
data["sar_position_change"] = data["sar_position"].diff()
feature_list = ["sar_position_change"]  # Add more features if needed

# Drop NaN
data.dropna(inplace=True)

# Evaluate clustering with different features
best_score = -1
best_features = None

for feature_combo in combinations(feature_list, 1):  # Single feature for illustration
    features = data[list(feature_combo)].values.reshape(-1, 1)
    model = hmm.GaussianHMM(n_components=2, covariance_type="diag", n_iter=100)
    model.fit(features)
    hidden_states = model.predict(features)

    # Check if there are at least two unique hidden states
    if len(set(hidden_states)) < 2:
        print(f"Skipping feature set {feature_combo} due to insufficient unique hidden states.")
        continue

    try:
        score = silhouette_score(features, hidden_states)
        if score > best_score:
            best_score = score
            best_features = feature_combo
    except ValueError:
        print(f"Cannot calculate silhouette score for feature set {feature_combo}. Skipping.")

# Check if we found any suitable feature set for clustering
if best_features is None:
    print("No suitable feature set found for clustering. Exiting.")
    sys.exit()
print(best_features)
# Train the final model with best features
features = data[list(best_features)].values.reshape(-1, 1)
model = hmm.GaussianHMM(n_components=2, covariance_type="diag", n_iter=100)
model.fit(features)
data["hidden_state"] = model.predict(features)

# Create the figure
fig = go.Figure()

# Add Price Line
fig.add_trace(go.Scatter(x=data.index, y=data["price"], mode="lines", name="Price"))

# Add Zigzag points
fig.add_trace(go.Scatter(x=data.index, y=data["zigzag"], mode="markers", name="Zigzag Extremes"))

# Add Candlestick
fig.add_trace(
    go.Candlestick(
        x=data.index,
        open=data["price"],
        high=data["high"],
        low=data["low"],
        close=data["price"],
        name="Candlestick",
    )
)

# Add cluster points
cluster_0 = data[data["hidden_state"] == 0]
cluster_1 = data[data["hidden_state"] == 1]

fig.add_trace(
    go.Scatter(
        x=cluster_0.index,
        y=cluster_0["price"],
        mode="markers",
        marker=dict(color="green"),
        name="Cluster 0",
    )
)
fig.add_trace(
    go.Scatter(
        x=cluster_1.index,
        y=cluster_1["price"],
        mode="markers",
        marker=dict(color="blue"),
        name="Cluster 1",
    )
)

# Customize layout
fig.update_layout(
    title="Price, Zigzag, and Clusters",
    xaxis_title="Timestamp",
    yaxis_title="Price",
    xaxis_rangeslider_visible=False,
)

fig.show()

print(f"Best feature set: {best_features}")
print(f"Best silhouette score: {best_score}")
# Initialize variables for trading simulation
initial_balance = 100000
balance = initial_balance
stock_quantity = 0
buy_price = 0
trade_log = []

# Simulate trading based on clustering results
for i in range(1, len(data)):
    if data["hidden_state"][i] != data["hidden_state"][i - 1]:
        if stock_quantity == 0:  # Buy
            stock_quantity = balance // data["price"][i]
            balance -= stock_quantity * data["price"][i]
            buy_price = data["price"][i]
            trade_log.append(f"Buy at {buy_price}, Balance: {balance}")

        else:  # Sell
            balance += stock_quantity * data["price"][i]
            stock_quantity = 0
            trade_log.append(f"Sell at {data['price'][i]}, Balance: {balance}")

# Calculate final portfolio value
final_portfolio_value = balance + stock_quantity * data["price"].iloc[-1]
print(f"Final Portfolio Value: {final_portfolio_value}, Total Trades: {len(trade_log)//2}")

# Trading log
""" for log in trade_log:
    print(log) """

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from itertools import combinations
import plotly.express as px

# Assuming 'candles' is a DataFrame containing your OHLCV data
data = candles[["Unnamed: 0", "close", "high", "low"]].rename(columns={"Unnamed: 0": "timestamp", "close": "price"})
data["timestamp"] = data["timestamp"].str[:19]
data.set_index("timestamp", inplace=True)
data.index = pd.to_datetime(data.index)
""" data = data.resample("1h").apply({"price": "ohlc", "high": "max", "low": "min"}).rename({"close": "price"}, axis=1)["price"]
data.columns = ["".join(col).strip() for col in data.columns.values] """

# Calculate SAR
data["sar"] = talib.SAR(data["price"].values, data["price"].values, acceleration=0.02, maximum=0.2)

# Generate features
data["sar_position"] = np.where(data["sar"] > data["price"], 1, -1)
data["sar_position_change"] = data["sar_position"].diff()
data["sar_price_diff"] = data["sar"] - data["price"]
# Drop NaN rows
data.dropna(inplace=True)

# Feature list and combinations
feature_list = ["sar", "sar_position", "sar_position_change", "sar_price_diff"]
feature_combinations = [combo for i in range(1, len(feature_list) + 1) for combo in combinations(feature_list, i)]

best_score = -1
best_features = None

# Evaluate each combination with KMeans and silhouette score
steps = 0
n_clusters = 2
n_init = "auto"
for feature_combo in feature_combinations:
    # print(f"Step: {steps} from {len(feature_combinations)}")
    steps += 1
    kmeans = KMeans(n_clusters=n_clusters, n_init=n_init, random_state=0)
    features = data[list(feature_combo)]
    kmeans.fit(features)

    clusters = kmeans.predict(features)
    score = silhouette_score(features, clusters)
    if score > best_score:
        best_score = score
        best_features = feature_combo

# Re-cluster with best features
kmeans = KMeans(n_clusters=n_clusters, n_init=n_init, random_state=0)
features = data[list(best_features)]
kmeans.fit(features)
data["cluster"] = kmeans.predict(features)

# Plotly Visualization
fig = px.scatter(
    data.reset_index(),
    x="timestamp",
    y="price",
    color="cluster",
    title="Cluster Identification",
    labels={"cluster": "Cluster ID"},
    color_continuous_scale=["green", "red", "blue"],
)

fig.update_layout(xaxis_title="Timestamp", yaxis_title="Price", coloraxis_showscale=False)

fig.show()

print(f"Best feature set: {best_features}")
print(f"Best silhouette score: {best_score}")