# Use k-means clustering on the data to obtain cluster labels

## Import libraries, then read and preprocess the data

In [9]:
import global_resources as gr
import numpy as np
import torch
import kmc_torch.kmc as kmc
import os
import pandas as pd

In [10]:
# Build the CSV path from separate components so os.path.join picks the
# separator for the current OS (the previous raw string hard-coded '\').
data_dir = os.path.join(gr.default_dir, 'Data', 'Cluster DATA', 'processed_data_final.csv')
raw_df = gr.read_and_return_pd_df(data_dir)

# Non-feature columns: '证券简称' (security short name), '行业代码1' (industry code).
to_drop = ['证券简称', '行业代码1']

# '证券代码' (security code) is zero-padded to 6 characters and kept aside as the
# row identifier; it is not a clustering feature.
Stock_ID_df = raw_df['证券代码'].astype(str).str.zfill(6)

# Feature frame: everything except the identifier and the descriptive columns.
# Built as a new object so the loaded frame is not mutated in place.
df = raw_df.drop(columns = to_drop + ['证券代码'])

display(Stock_ID_df)
display(df)

Reading files from: D:\ImportanFiles\Coding Related\Repositories\Quantitative-Investment-Algorithms\Data\Cluster DATA\processed_data_final.csv


0       000001
1       000002
2       000004
3       000006
4       000007
         ...  
5316    920002
5317    920008
5318    920016
5319    920099
5320    920118
Name: 证券代码, Length: 5321, dtype: object

Unnamed: 0,营业总收入,现金资产比率,应收类资产比率,固定资产比率,无形资产比率,金融负债比率,少数股东权益占比,市盈率（PE）1,资产负债率,市值A,托宾Q值A,账面市值比A,资产报酬率A,净资产收益率（ROE）A
0,1.294663,-0.777553,-1.270996,-1.297656,-0.692415,-1.477725,-0.185208,-0.330143,2.323068,4.379345,-0.733083,1.421790,-0.375351,0.132565
1,3.324778,-0.706152,-1.208915,-1.205924,-0.608636,-0.188667,1.208350,-0.326349,1.446345,0.858528,-0.829444,1.958758,-0.737222,-0.103471
2,-0.151864,0.133231,2.573513,-1.277036,-0.136708,-1.085169,-0.257693,-0.370844,0.840408,-0.068671,3.617179,-2.012428,-2.865983,-0.471754
3,-0.076711,-0.165453,-1.251412,-1.287878,-0.710005,0.400996,0.012098,-0.370844,1.215474,-0.053009,-0.664129,1.108182,-1.008012,-0.196261
4,-0.150626,1.671150,-0.842090,-0.384225,-0.707361,0.594283,0.100501,0.064576,0.780991,-0.068667,4.447951,-2.103008,-0.356346,0.001572
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5316,-0.148916,0.255536,0.050418,0.063653,-0.494292,-1.182638,-0.185208,-0.140737,-1.544378,-0.069825,-0.522023,0.592871,0.969871,0.106275
5317,-0.149857,0.215799,3.219931,-0.949145,-0.670456,0.700225,-0.185208,-0.148138,-0.198913,-0.069806,0.143840,-0.678933,1.255378,0.184321
5318,-0.150593,0.936079,-0.717003,0.952690,0.067164,1.518683,-0.185208,-0.019689,0.223056,-0.069697,-0.208876,-0.160995,0.756657,0.153632
5319,-0.148657,1.784303,-0.655317,-1.041496,0.039194,-0.601230,-0.185208,-0.230032,0.294809,-0.069054,-0.508312,0.550499,0.700999,0.165490


## Move data to the GPU

In [11]:
# Ask the project helper which device to use and report it.
device = gr.set_device()
print(f"Current device: {device.capitalize()}.")

# Dense float64 copy of the feature frame, then the same values as a tensor
# placed on the selected device.
X = df.to_numpy(dtype = np.float64, copy = True)
X_gpu = torch.tensor(X, dtype = torch.float64, device = device)

Current device: Cuda.


## Set k = 10 (chosen with the elbow method) and run `WCSS_for_single_k` to get the cluster labels

In [12]:
# Cluster the device tensor with k = 10 and 50 restarts.
# NOTE: this rebinds X (previously the NumPy array) to the first value returned
# by the clustering routine.
# NOTE(review): the run log prints a different starting seed per run, so labels
# are presumably not reproducible across kernel restarts — TODO confirm whether
# WCSS_for_single_k accepts a seed.
X, y, centroids, var = kmc.WCSS_for_single_k(X = X_gpu, k = 10, n_restarts = 50)

# Quick sanity check of what came back.
for summary in (X.shape, y.shape, centroids.shape, var):
    print(summary)

The seed starts at 2634854452
Clustering with: k = 10.
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with

In [13]:
# Bring the k-means labels back to host memory and pair them with the stock IDs.
cols = ['ID', 'K_Results']
k_labels = pd.DataFrame(y.detach().cpu().numpy())

# Column-wise concat aligns on the row index shared with Stock_ID_df.
k_results = pd.concat([Stock_ID_df, k_labels], axis = 1)
k_results.columns = cols
display(k_results)

Unnamed: 0,ID,K_Results
0,000001,0
1,000002,0
2,000004,1
3,000006,0
4,000007,1
...,...,...
5316,920002,2
5317,920008,7
5318,920016,4
5319,920099,3


# SVC/OVO

In [14]:
import SVM.SVC as svc

## Set parameters

In [15]:
# Starting learning rate as a plain Python float ...
lr_cpu = 0.01

# ... and as a scalar tensor that lives on the same device and has the same
# dtype as X.
tensor_opts = dict(device = X.device, dtype = X.dtype)
lr = torch.tensor(lr_cpu, **tensor_opts)

In [16]:
%time
wb_dic = svc.ovo_train(
    X,
    y,
    num_epochs = int(1e4),
    start_learning_rate = lr,
    l2_penalty = True,
    print_every = int(1e3),
    delta_loss_breaker = 1e-7,
    patience = int(10),
    relative = True,
    relative_breaker = 1e-6
)

CPU times: total: 0 ns
Wall time: 0 ns
-------------------------------------------------------------------------------------------------------
Training on label a: 0 and label b: 1
Creating random weights and bias with dtype: torch.float64
Training with loss function: hinge loss with l2 penalty on weights.
Training with relative breaker.
Epoch 0 | Loss: 1.928530465917824 | Relative Ratio: None
Epoch 1 | Loss: 1.8910412649393582 | Relative Ratio: 1.019824634011711
Epoch 548 | Loss: 0.002895949551004414 | Relative Ratio: 1.0000006825047967
Exited with relative_ratio consecutively being smaller than 1e-06 from epoch 538 to epoch 548.
-------------------------------------------------------------------------------------------------------
Training on label a: 0 and label b: 2
Creating random weights and bias with dtype: torch.float64
Training with loss function: hinge loss with l2 penalty on weights.
Training with relative breaker.
Epoch 0 | Loss: 0.6519421398172861 | Relative Ratio: None
Ep

In [17]:
# Predict a label for every row of X with the trained classifiers in wb_dic.
# NOTE(review): the third argument is presumably the dtype of the returned
# labels — TODO confirm against svc.ovo_predict.
votes = svc.ovo_predict(wb_dic, X, y.dtype)

In [18]:
# Score the classifiers on X against the k-means labels y. These are the same
# X and y that were passed to svc.ovo_train, so the figure reported here is
# agreement on the training data, not held-out accuracy.
svc.ovo_score(X, y, wb_dic)

Accuracy: 5252/5321 = 98.70%


0.9870325126855855

In [19]:
# Convert the predicted labels into a pandas object for display and joining.
# NOTE(review): gr.detach_to_pd is a project helper; presumably it detaches the
# tensor and moves it to the CPU first — TODO confirm.
votes_df = gr.detach_to_pd(votes)
display(votes_df)

Unnamed: 0,0
0,0
1,0
2,1
3,0
4,1
...,...
5316,2
5317,7
5318,4
5319,3


In [20]:
# Pair each stock ID with its SVC prediction.
# NOTE: this rebinds df, which until now held the feature matrix.
COLS = ['ID', 'SVC_votes']
df = pd.concat([Stock_ID_df, votes_df], axis = 1).set_axis(COLS, axis = 1)
display(df)

Unnamed: 0,ID,SVC_votes
0,000001,0
1,000002,0
2,000004,1
3,000006,0
4,000007,1
...,...,...
5316,920002,2
5317,920008,7
5318,920016,4
5319,920099,3


# Classified Data Analysis

In [21]:
# Put the SVC votes and the k-means labels side by side, one row per stock.
# 'ID' is the only column the two frames share, so naming it explicitly gives
# the same inner join as the default key inference.
compare_df = df.merge(k_results, on = 'ID', how = 'inner')
display(compare_df)

Unnamed: 0,ID,SVC_votes,K_Results
0,000001,0,0
1,000002,0,0
2,000004,1,1
3,000006,0,0
4,000007,1,1
...,...,...,...
5316,920002,2,2
5317,920008,7,7
5318,920016,4,4
5319,920099,3,3


In [22]:
# Rows where the SVC prediction and the k-means label disagree.
disagree_mask = compare_df['SVC_votes'].ne(compare_df['K_Results'])
different = compare_df[disagree_mask]

In [23]:
# Show the disagreeing rows, then the distinct labels involved on each side
# (k-means first, SVC second).
display(different)
for label_col in ('K_Results', 'SVC_votes'):
    print(different[label_col].unique())

Unnamed: 0,ID,SVC_votes,K_Results
140,000564,4,0
168,000607,0,2
219,000695,6,4
238,000719,0,2
394,000967,6,0
...,...,...,...
4894,688500,3,7
4915,688525,4,1
5003,688646,0,2
5008,688655,4,2


[0 2 4 6 7 1 3 5]
[4 0 6 7 2 1 3 5]


In [24]:
# Re-read the complete table (including the name and industry columns that
# were dropped from the feature frame). Reuse data_dir from the initial load
# instead of a cwd-relative path with different directory casing, so both
# reads resolve to the same file regardless of working directory or
# filesystem case sensitivity.
data_df = gr.read_and_return_pd_df(data_dir)

# Same ID normalisation as in the initial load: 6-character zero-padded strings.
data_df['证券代码'] = data_df['证券代码'].astype(str).str.zfill(6)

# Attach both label columns to the full table; 'ID' duplicates '证券代码' after
# the join, so it is removed.
merged = pd.merge(data_df, compare_df, left_on = '证券代码', right_on = 'ID').drop(columns = 'ID')
display(merged)

Reading files from: Data\Cluster Data\processed_data_final.csv


Unnamed: 0,证券代码,证券简称,营业总收入,行业代码1,现金资产比率,应收类资产比率,固定资产比率,无形资产比率,金融负债比率,少数股东权益占比,市盈率（PE）1,资产负债率,市值A,托宾Q值A,账面市值比A,资产报酬率A,净资产收益率（ROE）A,SVC_votes,K_Results
0,000001,平安银行,1.294663,J66,-0.777553,-1.270996,-1.297656,-0.692415,-1.477725,-0.185208,-0.330143,2.323068,4.379345,-0.733083,1.421790,-0.375351,0.132565,0,0
1,000002,万科A,3.324778,K70,-0.706152,-1.208915,-1.205924,-0.608636,-0.188667,1.208350,-0.326349,1.446345,0.858528,-0.829444,1.958758,-0.737222,-0.103471,0,0
2,000004,国华网安,-0.151864,I65,0.133231,2.573513,-1.277036,-0.136708,-1.085169,-0.257693,-0.370844,0.840408,-0.068671,3.617179,-2.012428,-2.865983,-0.471754,1,1
3,000006,深振业A,-0.076711,K70,-0.165453,-1.251412,-1.287878,-0.710005,0.400996,0.012098,-0.370844,1.215474,-0.053009,-0.664129,1.108182,-1.008012,-0.196261,0,0
4,000007,全新好,-0.150626,K70,1.671150,-0.842090,-0.384225,-0.707361,0.594283,0.100501,0.064576,0.780991,-0.068667,4.447951,-2.103008,-0.356346,0.001572,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5316,920002,万达轴承,-0.148916,C34,0.255536,0.050418,0.063653,-0.494292,-1.182638,-0.185208,-0.140737,-1.544378,-0.069825,-0.522023,0.592871,0.969871,0.106275,2,2
5317,920008,成电光信,-0.149857,C39,0.215799,3.219931,-0.949145,-0.670456,0.700225,-0.185208,-0.148138,-0.198913,-0.069806,0.143840,-0.678933,1.255378,0.184321,7,7
5318,920016,中草香料,-0.150593,C26,0.936079,-0.717003,0.952690,0.067164,1.518683,-0.185208,-0.019689,0.223056,-0.069697,-0.208876,-0.160995,0.756657,0.153632,4,4
5319,920099,瑞华技术,-0.148657,M74,1.784303,-0.655317,-1.041496,0.039194,-0.601230,-0.185208,-0.230032,0.294809,-0.069054,-0.508312,0.550499,0.700999,0.165490,3,3


In [25]:
# Use the English column name 'ID' for the security code from here on.
merged = merged.rename(columns = {'证券代码': 'ID'})

## 丢掉k和svc判断不一样的股票，然后将两个结果合并

In [26]:
# Keep only the stocks on which SVC and k-means agree.
# The filter uses merged's own label columns rather than the index of
# `different`: that index comes from a separately built frame and only matches
# merged's rows if both merges happened to produce identical row order. A
# boolean filter is also safe to re-run (an index-based drop raises KeyError
# the second time).
agree_mask = merged['SVC_votes'] == merged['K_Results']
# .copy() so later in-place edits act on an independent frame, not a slice.
merged = merged.loc[agree_mask].copy()

In [27]:
# Inspect the table after the disagreeing stocks have been removed.
display(merged)

Unnamed: 0,ID,证券简称,营业总收入,行业代码1,现金资产比率,应收类资产比率,固定资产比率,无形资产比率,金融负债比率,少数股东权益占比,市盈率（PE）1,资产负债率,市值A,托宾Q值A,账面市值比A,资产报酬率A,净资产收益率（ROE）A,SVC_votes,K_Results
0,000001,平安银行,1.294663,J66,-0.777553,-1.270996,-1.297656,-0.692415,-1.477725,-0.185208,-0.330143,2.323068,4.379345,-0.733083,1.421790,-0.375351,0.132565,0,0
1,000002,万科A,3.324778,K70,-0.706152,-1.208915,-1.205924,-0.608636,-0.188667,1.208350,-0.326349,1.446345,0.858528,-0.829444,1.958758,-0.737222,-0.103471,0,0
2,000004,国华网安,-0.151864,I65,0.133231,2.573513,-1.277036,-0.136708,-1.085169,-0.257693,-0.370844,0.840408,-0.068671,3.617179,-2.012428,-2.865983,-0.471754,1,1
3,000006,深振业A,-0.076711,K70,-0.165453,-1.251412,-1.287878,-0.710005,0.400996,0.012098,-0.370844,1.215474,-0.053009,-0.664129,1.108182,-1.008012,-0.196261,0,0
4,000007,全新好,-0.150626,K70,1.671150,-0.842090,-0.384225,-0.707361,0.594283,0.100501,0.064576,0.780991,-0.068667,4.447951,-2.103008,-0.356346,0.001572,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5316,920002,万达轴承,-0.148916,C34,0.255536,0.050418,0.063653,-0.494292,-1.182638,-0.185208,-0.140737,-1.544378,-0.069825,-0.522023,0.592871,0.969871,0.106275,2,2
5317,920008,成电光信,-0.149857,C39,0.215799,3.219931,-0.949145,-0.670456,0.700225,-0.185208,-0.148138,-0.198913,-0.069806,0.143840,-0.678933,1.255378,0.184321,7,7
5318,920016,中草香料,-0.150593,C26,0.936079,-0.717003,0.952690,0.067164,1.518683,-0.185208,-0.019689,0.223056,-0.069697,-0.208876,-0.160995,0.756657,0.153632,4,4
5319,920099,瑞华技术,-0.148657,M74,1.784303,-0.655317,-1.041496,0.039194,-0.601230,-0.185208,-0.230032,0.294809,-0.069054,-0.508312,0.550499,0.700999,0.165490,3,3


## 将列名改为Labels

In [28]:
# The two label columns are identical at this point, so keep one of them under
# the name 'Labels' and discard the other.
merged = (
    merged
    .rename(columns = {'SVC_votes': 'Labels'})
    .drop(columns = 'K_Results')
)
display(merged)

Unnamed: 0,ID,证券简称,营业总收入,行业代码1,现金资产比率,应收类资产比率,固定资产比率,无形资产比率,金融负债比率,少数股东权益占比,市盈率（PE）1,资产负债率,市值A,托宾Q值A,账面市值比A,资产报酬率A,净资产收益率（ROE）A,Labels
0,000001,平安银行,1.294663,J66,-0.777553,-1.270996,-1.297656,-0.692415,-1.477725,-0.185208,-0.330143,2.323068,4.379345,-0.733083,1.421790,-0.375351,0.132565,0
1,000002,万科A,3.324778,K70,-0.706152,-1.208915,-1.205924,-0.608636,-0.188667,1.208350,-0.326349,1.446345,0.858528,-0.829444,1.958758,-0.737222,-0.103471,0
2,000004,国华网安,-0.151864,I65,0.133231,2.573513,-1.277036,-0.136708,-1.085169,-0.257693,-0.370844,0.840408,-0.068671,3.617179,-2.012428,-2.865983,-0.471754,1
3,000006,深振业A,-0.076711,K70,-0.165453,-1.251412,-1.287878,-0.710005,0.400996,0.012098,-0.370844,1.215474,-0.053009,-0.664129,1.108182,-1.008012,-0.196261,0
4,000007,全新好,-0.150626,K70,1.671150,-0.842090,-0.384225,-0.707361,0.594283,0.100501,0.064576,0.780991,-0.068667,4.447951,-2.103008,-0.356346,0.001572,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5316,920002,万达轴承,-0.148916,C34,0.255536,0.050418,0.063653,-0.494292,-1.182638,-0.185208,-0.140737,-1.544378,-0.069825,-0.522023,0.592871,0.969871,0.106275,2
5317,920008,成电光信,-0.149857,C39,0.215799,3.219931,-0.949145,-0.670456,0.700225,-0.185208,-0.148138,-0.198913,-0.069806,0.143840,-0.678933,1.255378,0.184321,7
5318,920016,中草香料,-0.150593,C26,0.936079,-0.717003,0.952690,0.067164,1.518683,-0.185208,-0.019689,0.223056,-0.069697,-0.208876,-0.160995,0.756657,0.153632,4
5319,920099,瑞华技术,-0.148657,M74,1.784303,-0.655317,-1.041496,0.039194,-0.601230,-0.185208,-0.230032,0.294809,-0.069054,-0.508312,0.550499,0.700999,0.165490,3


## 将ID，证券名字，label保存下来

In [29]:
# Persist the ID, the security short name ('证券简称') and the final label.
# NOTE: IDs are zero-padded strings; read this file back with dtype=str for
# 'ID', otherwise the leading zeros are lost.
output_columns = ['ID', '证券简称', 'Labels']
output_path = os.path.join(gr.global_paths['Output'], 'k=10_processed_data.csv')

informations = merged[output_columns]
informations.to_csv(output_path, index = False, encoding='utf-8')

# CAPM on Each Cluster