In [None]:
import FinanceDataReader as fdr
import pandas as pd
import numpy as np
import ta
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

# utils.py
from utils import *

#  차트 설정
%matplotlib inline
import matplotlib.pyplot as plt

plt.rcParams["font.family"] = 'Malgun Gothic'
plt.rcParams["figure.figsize"] = (14,4)
plt.rcParams['lines.linewidth'] = 2
plt.rcParams["axes.grid"] = True

In [None]:
# Date range (YYYYMMDD strings) handed to the dataset loader.
start_date, end_date = '20231101', '20231130'
# NOTE(review): this two-argument get_dataset presumably comes from `utils`;
# a four-argument get_dataset is defined later in this notebook — confirm
# which one is meant to be in scope here.
all_stocks = get_dataset(start_date, end_date)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" line.
all_stocks.info()
print(all_stocks)


In [None]:
# Work on a copy so that later changes to `df` do not modify `all_stocks`.
df = all_stocks.copy()

In [None]:
# Data transformation: summarise each numeric column of `df` by its mean
# percentage change and the standard deviation of that change, both scaled
# to a `days`-long horizon, then standardise the two summary features.
days = 22  # NOTE(review): presumably the number of trading days in a month — confirm

# Percentage changes are only defined for numeric data; other cells of this
# notebook read 'Name' and 'Date' columns from this frame, and those would
# make pct_change() raise. The changes are computed once and reused.
# NOTE(review): if `df` stacks several stocks on top of each other, the
# changes also span the boundary between stocks — verify the layout of `df`.
pct_changes = df.select_dtypes(include='number').pct_change()

returns = pd.DataFrame({
    'Returns': pct_changes.mean() * days,
    'Volatility': pct_changes.std() * np.sqrt(days),
})
data = returns

# fit_transform() already fits the scaler, so no separate fit() is needed.
scaler = StandardScaler()
rescaledDataset = pd.DataFrame(
    scaler.fit_transform(data), columns=data.columns, index=data.index
)

# Preview the standardised features (last expression of the cell).
X = rescaledDataset
X.head(2)

In [None]:
# Cosine similarity + K-Means clustering
def cosine_kmeans_clustering(data, n_clusters):
    """Cluster samples by running K-Means on their cosine-similarity rows.

    Args:
        data: Samples passed to ``cosine_similarity``; each sample is then
            represented by its row of similarities to every sample.
        n_clusters: Number of clusters requested from K-Means.

    Returns:
        The labels produced by ``KMeans.fit_predict`` (``random_state=42``).
    """
    pairwise_similarity = cosine_similarity(data)
    model = KMeans(n_clusters=n_clusters, random_state=42)
    return model.fit_predict(pairwise_similarity)

# PCA + K-Means clustering
def pca_kmeans_clustering(data, n_clusters):
    """Standardise ``data``, project it onto two principal components and cluster it.

    Args:
        data: Samples passed to ``StandardScaler.fit_transform``.
        n_clusters: Number of clusters requested from K-Means.

    Returns:
        The labels produced by ``KMeans.fit_predict`` (``random_state=42``)
        on the two-component projection.
    """
    standardized = StandardScaler().fit_transform(data)
    projected = PCA(n_components=2).fit_transform(standardized)
    return KMeans(n_clusters=n_clusters, random_state=42).fit_predict(projected)


# Visualise a clustering result
def visualize_clusters(data, labels):
    """Scatter-plot the first two columns of ``data`` coloured by ``labels``.

    Args:
        data: Array indexed as ``data[:, 0]`` and ``data[:, 1]`` for the
            x and y coordinates.
        labels: Per-point values used for the colours ('viridis' colormap).

    The axis titles are fixed to 'Principal Component 1' / 'Principal
    Component 2' regardless of what ``data`` contains.
    """
    x_values, y_values = data[:, 0], data[:, 1]
    plt.scatter(x_values, y_values, c=labels, cmap='viridis')
    plt.title('Cluster Visualization')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()

# Attach cluster labels to a frame
def add_cluster_labels(df, labels):
    """Store ``labels`` in the 'Cluster' column of ``df`` and return ``df``.

    The frame is modified in place (an existing 'Cluster' column is
    overwritten); pass a copy to keep the caller's frame untouched.
    """
    cluster_column = 'Cluster'
    df[cluster_column] = labels
    return df

# Run both clustering pipelines and visualise the results.
# Every column except 'Date' and 'Name' is used as a clustering feature.
feature_columns = df.columns.difference(['Date', 'Name'])
data_for_clustering = df[feature_columns].values

# visualize_clusters() labels its axes as principal components, so it is given
# a 2-D PCA projection of the standardised features (the same
# StandardScaler -> PCA(2) steps that pca_kmeans_clustering applies) instead
# of the first two raw feature columns.
projection_2d = PCA(n_components=2).fit_transform(
    StandardScaler().fit_transform(data_for_clustering)
)

# Cosine similarity + K-Means clustering
cosine_labels = cosine_kmeans_clustering(data_for_clustering, n_clusters=3)
df_cosine_clusters = add_cluster_labels(df.copy(), cosine_labels)
visualize_clusters(projection_2d, cosine_labels)

# PCA + K-Means clustering
pca_labels = pca_kmeans_clustering(data_for_clustering, n_clusters=3)
df_pca_clusters = add_cluster_labels(df.copy(), pca_labels)
visualize_clusters(projection_2d, pca_labels)


In [None]:
# Build a new frame without incomplete rows. Binding `df` directly to
# `all_stocks` and then dropping rows / adding a column in place would also
# change `all_stocks` and make the cell behave differently when re-run.
df = all_stocks.dropna(axis=0).copy()

# Standardise the numeric feature columns only. A positional slice such as
# iloc[:, 1:] would also pass the 'Name' column (and a 'Cluster' column left
# over from an earlier run) to the scaler.
feature_frame = df.drop(columns=['Cluster'], errors='ignore').select_dtypes(include='number')
scaler = StandardScaler()
scaled_data = scaler.fit_transform(feature_frame)

# Reduce to two dimensions with PCA
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(scaled_data)

# K-Means with one cluster per distinct company name
kmeans = KMeans(n_clusters=df['Name'].nunique(), random_state=42)
labels = kmeans.fit_predict(reduced_data)

# Visualise the clustering result
df['Cluster'] = labels
colors = plt.cm.Spectral(df['Cluster'].astype(float) / df['Cluster'].nunique())
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=colors, edgecolor='k', s=40)
plt.title('Cluster Visualization')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')

# Annotate every point with its company name
for company, x, y in zip(df['Name'], reduced_data[:, 0], reduced_data[:, 1]):
    plt.text(x, y, company)

plt.show()
print(set(labels))

In [None]:
# Sample: load one stock for a short date range and print it
start_date, end_date = '20231201', '20231205'
code, name = '005930', '삼성전자'
df = get_dataset(code, name, start_date, end_date)
print(df)

In [None]:
# Closing-price data

start_date, end_date = '20231201', '20231205'

# Code / name pairs of the stocks of interest
data_list = pd.DataFrame(
    [
        ('005930', '삼성전자'),
        ('373220', 'LG에너지솔루션'),
        ('000660', 'SK하이닉스'),
    ],
    columns=['Code', 'Name'],
)


print(data_list.shape)
# data_list.columns = ['Code', 'Name']
# Alternative (disabled): query every stock listed on the Korea Exchange
# kospi = fdr.StockListing('KOSPI')
# kosdaq = fdr.StockListing('KOSDAQ')

# kospi_list = kospi[['Code', 'Name']]
# kosdaq_list = kosdaq[['Code', 'Name']]
# data_list = pd.concat([kospi_list, kosdaq_list], axis=0)

# Read the data for code '005930' over the date range and show its columns
df = fdr.DataReader('005930', start_date, end_date)

print(df.columns)

In [None]:
start_date, end_date = '20231101', '20231130'
kospi = fdr.StockListing('KOSPI')
kospi_list = kospi[['Code', 'Name']][:10]

# Fetch each listed stock and add its technical indicators.
# `code` / `name` are bound to the matching columns and passed in the
# (stock_code, stock_name, ...) order of the get_dataset defined in this
# notebook; the values that reach get_dataset are the same as before, only
# the variable names are no longer swapped.
# NOTE(review): get_dataset / add_technical_indicators are defined in a later
# cell; on a fresh kernel the names presumably resolve to `utils` — confirm.
stock_frames = []
for code, name in zip(kospi_list['Code'], kospi_list['Name']):
    stock = get_dataset(code, name, start_date, end_date)
    stock_frames.append(add_technical_indicators(stock))

# Concatenate once instead of growing the frame inside the loop; an empty
# listing still yields an empty DataFrame.
all_stocks = pd.concat(stock_frames, ignore_index=True) if stock_frames else pd.DataFrame()

print(all_stocks)

In [None]:
def get_dataset(stock_code, stock_name, start_date, end_date):
    """Read the data for one stock and tag every row with the stock's name.

    Args:
        stock_code: Code passed to ``fdr.DataReader``.
        stock_name: Value written to the 'Name' column of every row.
        start_date: Start of the range passed to ``fdr.DataReader``.
        end_date: End of the range passed to ``fdr.DataReader``.

    Returns:
        The reader's result with its index reset into a regular column and
        a 'Name' column holding ``stock_name``.
    """
    prices = fdr.DataReader(stock_code, start_date, end_date)
    return prices.reset_index().assign(Name=stock_name)
def add_technical_indicators(stock_df):
    """Add moving-average, RSI, MACD and vortex-indicator columns to ``stock_df``.

    The frame is modified in place and also returned. It is read through its
    'High', 'Low', 'Close' and 'Volume' columns.

    Columns written: 'ma_5', 'ma_20', 'ma_60', 'ma_120', 'RSI', 'MACD',
    '-VI' and '+VI'. The ``ta`` indicators are requested with ``fillna=True``.
    """
    # Rolling means of the closing price over each window length.
    for window in (5, 20, 60, 120):
        stock_df[f'ma_{window}'] = stock_df['Close'].rolling(window=window).mean()

    high, low = stock_df['High'], stock_df['Low']
    close = stock_df['Close']
    volume = stock_df['Volume']  # looked up like the other columns, but not used by any indicator below

    stock_df['RSI'] = ta.momentum.rsi(close=close, fillna=True)
    stock_df['MACD'] = ta.trend.macd(close=close, fillna=True)
    stock_df['-VI'] = ta.trend.vortex_indicator_neg(high=high, low=low, close=close, fillna=True)
    stock_df['+VI'] = ta.trend.vortex_indicator_pos(high=high, low=low, close=close, fillna=True)
    return stock_df

start_date, end_date = '20231101', '20231130'
kospi = fdr.StockListing('KOSPI')
kospi_list = kospi[['Code', 'Name']][:20]

# Collect the data and add technical indicators.
# `code` / `name` are bound to the matching columns and passed in
# get_dataset's (stock_code, stock_name, ...) order; the frames are
# concatenated once instead of growing `all_stocks` inside the loop.
stock_frames = []
for code, name in zip(kospi_list['Code'], kospi_list['Name']):
    stock = get_dataset(code, name, start_date, end_date)
    stock_frames.append(add_technical_indicators(stock))
all_stocks = pd.concat(stock_frames, ignore_index=True) if stock_frames else pd.DataFrame()

# Pivot: one row per date, one column per (feature, stock name) pair
features = all_stocks.columns.drop(['Name', 'Date'])
pivot_stocks = all_stocks.pivot(index='Date', columns='Name', values=features)
pivot_stocks = pivot_stocks.interpolate()

# Replace the remaining NaN values with 0
pivot_stocks = pivot_stocks.fillna(0)

# Standardise the data
scaler = StandardScaler()
pivot_stocks_scaled = scaler.fit_transform(pivot_stocks)

# Reduce the dimensionality with PCA
pca = PCA(n_components=2)
pivot_stocks_pca = pca.fit_transform(pivot_stocks_scaled)

# Cluster with DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=2)
labels = dbscan.fit_predict(pivot_stocks_pca)

# NOTE(review): the rows of `pivot_stocks` are dates, so `labels` holds one
# entry per date — not one per stock and not one per row of `all_stocks`.
# Pivot with index='Name' instead if stocks are what should be clustered.
print(labels)


In [None]:
start_date, end_date = '20231201', '20231205'

# Load the two stocks and add their technical indicators
stock = add_technical_indicators(get_dataset('005930', '삼성전자', start_date, end_date))
stock1 = add_technical_indicators(get_dataset('373220', 'LG에너지솔루션', start_date, end_date))
all_stocks = pd.concat([stock, stock1], ignore_index=True)

# 'Date' becomes the pivot index and 'Name' the pivot columns, so neither may
# also appear among the pivoted values (same selection as the KOSPI cell).
features = all_stocks.columns.drop(['Name', 'Date'])

# One row per date, one column per (feature, stock name) pair
pivot_stocks = all_stocks.pivot(index='Date', columns='Name', values=features)
# interpolate() cannot fill columns that are entirely NaN (e.g. rolling means
# whose window is longer than the date range); replace what is left with 0 so
# the scaler and PCA receive finite values.
pivot_stocks = pivot_stocks.interpolate().fillna(0)
print(pivot_stocks)

# Standardise the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(pivot_stocks)

# Reduce the dimensionality with PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Cluster with DBSCAN: eps is the neighbourhood radius, min_samples the
# minimum number of points needed to form a cluster
dbscan = DBSCAN(eps=0.5, min_samples=2)
labels = dbscan.fit_predict(X_pca)

# Visualise the clustering result, one scatter series per label
plt.figure(figsize=(10, 6))

for label in set(labels):
    plt.scatter(X_pca[labels == label, 0], X_pca[labels == label, 1], label=f'Cluster {label}')

plt.title('DBSCAN Clustering of Stock Data with Technical Indicators')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()



In [None]:
import pandas as pd
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Select the feature columns.
# NOTE(review): `df` must already contain the indicator columns written by
# add_technical_indicators — confirm which earlier cell is meant to provide it.
features = df[['Open', 'High', 'Low', 'Close', 'Volume', 'Change', 'ma_5', 'ma_20', 'ma_60', 'ma_120', 'RSI', 'MACD', '-VI', '+VI']]

# Standardisation. Rolling means leave NaN where the window is longer than the
# available history; they are replaced with 0 first so that the distances and
# K-Means below receive finite values.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features.fillna(0))

# DTW distance function
def dtw_distance(series1, series2):
    """Return the fastdtw distance between two sequences.

    One-dimensional inputs are reshaped to column vectors so that every point
    handed to ``euclidean`` is a 1-D array; recent SciPy versions reject
    scalar (0-d) inputs.
    """
    first = np.asarray(series1, dtype=float)
    second = np.asarray(series2, dtype=float)
    if first.ndim == 1:
        first = first.reshape(-1, 1)
    if second.ndim == 1:
        second = second.reshape(-1, 1)
    distance, _ = fastdtw(first, second, dist=euclidean)
    return distance

# Build the DTW distance matrix. Rows of the scaled array are addressed by
# position because `features.index` labels are not guaranteed to be 0..n-1.
n_rows = features_scaled.shape[0]
dtw_values = np.zeros((n_rows, n_rows))
for i in range(n_rows):
    for j in range(n_rows):
        dtw_values[i, j] = dtw_distance(features_scaled[i], features_scaled[j])
dtw_matrix = pd.DataFrame(dtw_values, index=features.index, columns=features.index)

# K-Means clustering on the rows of the distance matrix
kmeans = KMeans(n_clusters=3, random_state=42)
df['Cluster'] = kmeans.fit_predict(dtw_matrix.to_numpy())

# Visualise the clustering result. 'Date' is not one of the selected feature
# columns, so the dates are taken from `df` (its column if present, otherwise
# its index).
dates = df['Date'] if 'Date' in df.columns else df.index
plt.scatter(dates, features['Close'], c=df['Cluster'], cmap='viridis')
plt.title('Stock Clustering based on DTW Distance')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.show()

In [None]:
# get_dataset(stock_code, stock_name, start_date, end_date) as defined in this
# notebook needs the stock name as its second argument; without it the call
# fails with a TypeError.
df = get_dataset('005930', '삼성전자', start_date, end_date)
print(df)
df = add_technical_indicators(df)
print(df)

In [None]:
# Print the column names of `df` as a plain list
features = df.columns
print(list(features))


In [None]:
# 13 companies, 50 days of data each
num_companies = 13
num_days = 50
num_features = 6

# Seeded generator so that the synthetic data (and everything derived from
# it) is identical on every run.
SEED = 42
rng = np.random.default_rng(SEED)

data = rng.random((num_companies, num_days, num_features))
columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Change']
companies = [f'Company_{i}' for i in range(num_companies)]

# Add an artificial pattern: company i gets a linear ramp from 0 to i * 0.1
# over the period, added to all of its features.
for i in range(num_companies):
    data[i, :, :] += np.linspace(0, i * 0.1, num_days).reshape(-1, 1)

# Long format: one row per (company, day), companies stacked in order
df = pd.DataFrame(data.reshape(-1, num_features), columns=columns)
df['Company'] = [companies[i] for i in range(num_companies) for _ in range(num_days)]
df['Date'] = np.tile(pd.date_range(start='2022-01-01', periods=num_days), num_companies)

# Wide format: one row per company, one column per (feature, date) pair
df_pivot = df.pivot(index='Company', columns='Date', values=columns)

print(df_pivot.shape)

In [None]:
# Per-company mean return and volatility, scaled to the length of the
# observed window, then standardised.
#
# `df` is in long format (one row per company and date) and contains the
# string 'Company' and datetime 'Date' columns, so pct_change() cannot be
# applied to it directly. The closing prices are first pivoted to a wide
# Date x Company table so that changes are taken per company over time.
# NOTE(review): assumes 'Close' is the price series the returns should be
# based on — confirm.
close_prices = df.pivot(index='Date', columns='Company', values='Close')
days = close_prices.shape[0]

# Computed once and reused for both summary statistics.
daily_changes = close_prices.pct_change()
returns = pd.DataFrame({
    'Returns': daily_changes.mean() * days,
    'Volatility': daily_changes.std() * np.sqrt(days),
})
data = returns

# fit_transform() already fits the scaler, so no separate fit() is needed.
scaler = StandardScaler()
rescaledDataset = pd.DataFrame(
    scaler.fit_transform(data), columns=data.columns, index=data.index
)

# Preview the standardised features (last expression of the cell).
X = rescaledDataset
X.head(2)

In [None]:
# Standardise the numeric feature columns. Non-numeric columns cannot be
# scaled, and the label column written at the end of this cell is excluded
# so that re-running the cell does not feed earlier labels back in.
feature_frame = df.select_dtypes(include='number').drop(columns=['DBSCAN_Labels'], errors='ignore')
scaler = StandardScaler()
df_scaled = scaler.fit_transform(feature_frame)


# DBSCAN on the standardised rows; one label per row of `df`
dbscan = DBSCAN(metric='euclidean', eps=0.5, min_samples=5)
df['DBSCAN_Labels'] = dbscan.fit_predict(df_scaled)
