# Use k-means clustering on the data to obtain class labels

## Import libraries, then read and preprocess the data

In [1]:
import global_resources as gr
import numpy as np
import torch
import kmc_torch.kmc as kmc
import os
import pandas as pd

In [2]:
# Build the CSV path one component at a time so the separator comes from
# os.path instead of a hard-coded Windows backslash (same result on Windows).
data_dir = os.path.join(gr.default_dir, 'Data', 'Cluster DATA', 'processed_data_final.csv')
df = gr.read_and_return_pd_df(data_dir)

# Keep the security code aside as the row identifier, zero-padded to six
# characters so codes read as numbers regain their leading zeros.
Stock_ID_df = df['证券代码'].astype(str).str.zfill(6)

# Remove the name, industry-code and security-code columns in one step.
# drop() returns a new frame here, so the object handed back by the reader is
# never mutated and the cell can be re-run safely.
to_drop = ['证券简称', '行业代码1']
df = df.drop(columns = to_drop + ['证券代码'])

display(Stock_ID_df)
display(df)

Reading files from: D:\ImportanFiles\Coding Related\Repositories\Quantitative-Investment-Algorithms\Data\Cluster DATA\processed_data_final.csv


0       000001
1       000002
2       000004
3       000006
4       000007
         ...  
5316    920002
5317    920008
5318    920016
5319    920099
5320    920118
Name: 证券代码, Length: 5321, dtype: object

Unnamed: 0,营业总收入,现金资产比率,应收类资产比率,固定资产比率,无形资产比率,金融负债比率,少数股东权益占比,市盈率（PE）1,资产负债率,市值A,托宾Q值A,账面市值比A,资产报酬率A,净资产收益率（ROE）A
0,1.294663,-0.777553,-1.270996,-1.297656,-0.692415,-1.477725,-0.185208,-0.330143,2.323068,4.379345,-0.733083,1.421790,-0.375351,0.132565
1,3.324778,-0.706152,-1.208915,-1.205924,-0.608636,-0.188667,1.208350,-0.326349,1.446345,0.858528,-0.829444,1.958758,-0.737222,-0.103471
2,-0.151864,0.133231,2.573513,-1.277036,-0.136708,-1.085169,-0.257693,-0.370844,0.840408,-0.068671,3.617179,-2.012428,-2.865983,-0.471754
3,-0.076711,-0.165453,-1.251412,-1.287878,-0.710005,0.400996,0.012098,-0.370844,1.215474,-0.053009,-0.664129,1.108182,-1.008012,-0.196261
4,-0.150626,1.671150,-0.842090,-0.384225,-0.707361,0.594283,0.100501,0.064576,0.780991,-0.068667,4.447951,-2.103008,-0.356346,0.001572
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5316,-0.148916,0.255536,0.050418,0.063653,-0.494292,-1.182638,-0.185208,-0.140737,-1.544378,-0.069825,-0.522023,0.592871,0.969871,0.106275
5317,-0.149857,0.215799,3.219931,-0.949145,-0.670456,0.700225,-0.185208,-0.148138,-0.198913,-0.069806,0.143840,-0.678933,1.255378,0.184321
5318,-0.150593,0.936079,-0.717003,0.952690,0.067164,1.518683,-0.185208,-0.019689,0.223056,-0.069697,-0.208876,-0.160995,0.756657,0.153632
5319,-0.148657,1.784303,-0.655317,-1.041496,0.039194,-0.601230,-0.185208,-0.230032,0.294809,-0.069054,-0.508312,0.550499,0.700999,0.165490


## Move data to the GPU

In [3]:
# Ask the project helper which device to use and report the choice.
device = gr.set_device()
print("Current device: {}.".format(device.capitalize()))

# Host-side float64 copy of the frame, then a float64 tensor built from it
# on the selected device.
X = np.array(df, dtype = np.float64)
X_gpu = torch.tensor(X, dtype = torch.float64, device = device)

Current device: Cuda.


# Set k = 10 (chosen with the elbow method) and run `WCSS_for_single_k` to get the cluster labels

In [4]:
# Clustering settings, named so they can be tuned in one place.
N_CLUSTERS = 10   # k for the clustering call below
N_RESTARTS = 50   # passed as n_restarts
SEED = 0

# Fix the global RNG state before the randomly restarted clustering so the
# labels (and everything trained on them) are reproducible between runs.
# NOTE(review): assumes kmc draws from torch's or NumPy's global generator --
# confirm in kmc_torch.kmc.
torch.manual_seed(SEED)
np.random.seed(SEED)

# Rebinds X to the first value returned by the clustering call; y holds the
# labels used as training targets further down.
X, y, centroids, var = kmc.WCSS_for_single_k(X = X_gpu, k = N_CLUSTERS, n_restarts = N_RESTARTS)
print(X.shape)
print(y.shape)
print(centroids.shape)
print(var)

Clustering with: k = 10.
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating centroids with k being 10...
Initiating cent

# SVC / OVO: one-vs-one support vector classification

In [5]:
import SVM.SVC as svc

Current training device: Cuda.


## Set parameters

In [6]:
# Starting learning rate as a plain Python float.
lr_cpu = 0.01
# The same value as a tensor that inherits X's dtype and device.
lr = X.new_tensor(lr_cpu)

In [7]:
%time
wb_dic = svc.ovo_train(
    X,
    y,
    num_epochs = int(1e4),
    start_learning_rate = lr,
    l2_penalty = True,
    print_every = int(1e3),
    delta_loss_breaker = 1e-7,
    patience = int(10),
    relative = True,
    relative_breaker = 1e-6
)

CPU times: total: 0 ns
Wall time: 0 ns
-------------------------------------------------------------------------------------------------------
Training on label a: 0 and label b: 1
Creating random weights and bias with dtype: torch.float64
Training with loss function: hinge loss with l2 penalty on weights.
Training with relative breaker.
Epoch 0 | Loss: 1.1088631818215493 | Relative Ratio: None
Epoch 1 | Loss: 1.0670159189092645 | Relative Ratio: 1.0392189677498553
Epoch 1000 | Loss: 0.004995962393397344 | Relative Ratio: 1.0000023184658555
Epoch 2000 | Loss: 0.004979718140941919 | Relative Ratio: 1.0000042524981985
Epoch 3000 | Loss: 0.004956536898295101 | Relative Ratio: 1.0000035932291758
Epoch 4000 | Loss: 0.0049366645059091335 | Relative Ratio: 1.0000017699376975
Epoch 5000 | Loss: 0.004920395971426018 | Relative Ratio: 1.0000059040217
Epoch 6000 | Loss: 0.004897733819565722 | Relative Ratio: 1.0000035049351461
Epoch 7000 | Loss: 0.0048825029947621185 | Relative Ratio: 1.000006264

In [8]:
# Predict a label for every row of X using the object returned by
# svc.ovo_train; y's dtype is forwarded as the third argument
# (presumably so the votes share the label dtype -- confirm in SVM.SVC).
votes = svc.ovo_predict(wb_dic, X, y.dtype)

In [9]:
# Score on the same X and y that were used for training, so the reported
# accuracy is in-sample. Left as the last expression so the returned value is
# echoed below the printed summary.
svc.ovo_score(X, y, wb_dic)

Accuracy: 5252/5321 = 98.70%


0.9870325126855855

In [11]:
# Quick visual check of the feature tensor.
# NOTE(review): this cell carries execution count 11 but sits before the cell
# numbered 10 -- restart the kernel and run all cells before sharing.
print(X)

tensor([[ 1.2947, -0.7776, -1.2710,  ...,  1.4218, -0.3754,  0.1326],
        [ 3.3248, -0.7062, -1.2089,  ...,  1.9588, -0.7372, -0.1035],
        [-0.1519,  0.1332,  2.5735,  ..., -2.0124, -2.8660, -0.4718],
        ...,
        [-0.1506,  0.9361, -0.7170,  ..., -0.1610,  0.7567,  0.1536],
        [-0.1487,  1.7843, -0.6553,  ...,  0.5505,  0.7010,  0.1655],
        [-0.1341, -0.5186,  3.3960,  ...,  0.8371,  0.4430,  0.1336]],
       device='cuda:0', dtype=torch.float64)


In [10]:
# Convert the predicted votes to a pandas object with the project helper and
# render it.
votes_df = gr.detach_to_pd(votes)
display(votes_df)

Unnamed: 0,0
0,8
1,8
2,1
3,8
4,1
...,...
5316,3
5317,7
5318,9
5319,2


In [12]:
# Place each security code next to its predicted label, one row per stock.
# Note: this rebinds df, which until now held the feature table.
df = pd.concat((Stock_ID_df, votes_df), axis = "columns")
display(df)

Unnamed: 0,证券代码,0
0,000001,8
1,000002,8
2,000004,1
3,000006,8
4,000007,1
...,...,...
5316,920002,3
5317,920008,7
5318,920016,9
5319,920099,2
