In [39]:
import torch
import torch.nn as nn
import pandas as pd
from graphs.graphs import *

In [41]:
def prepare_adj(df, method = 'gaussian', sig = 1, delta = 20):

    """
    Build a row-normalized similarity graph from a DataFrame.

    Input:
        df:     DataFrame whose last column holds the labels; all other
                columns are used as an adjacency matrix or a feature matrix.
        method: 'gaussian' -> normalize the full Gaussian kernel similarity.
                'MSTKNN'   -> multiply the similarity element-wise by the
                              graph returned by MSTKNN before normalizing.
        sig:    sigma forwarded to kerGauss.
        delta:  delta forwarded to MSTKNN (only used when method == 'MSTKNN').
    Output: Row normalized gaussian kernel similarity matrix
            (the result of RandomWalkNormalize).
    Raises: ValueError if `method` is not one of the supported names.
    """
    # Fail fast: previously an unknown method ran the whole distance
    # computation and then crashed with UnboundLocalError on `return graph`.
    if method not in ('gaussian', 'MSTKNN'):
        raise ValueError(
            "Unknown method %r; expected 'gaussian' or 'MSTKNN'" % (method,)
        )

    # Copy before editing: df.values can be a view of the caller's data, and
    # zeroing the diagonal in place would then modify (or fail on) the input.
    X = df.values[:, :-1].copy() #consider X a graph or a feature matrix, both fine
    # NOTE(review): for a non-square feature matrix this also zeroes X[i, i]
    # of the features -- confirm this is intended and not only for graphs.
    np.fill_diagonal(X, 0) #set diagonal to zero / remove self loops
    Q_index = range(X.shape[0]) # for now always use this

    dis = distanceEuclidean(X, Q_index, n_jobs=-1)
    similarity = kerGauss(dis, sigma = sig) #try different sigma

    # original similarity matrix, using gaussian kernel, row normalize
    if method == 'gaussian':
        return RandomWalkNormalize(similarity)

    # method == 'MSTKNN': keep only the similarities selected by MSTKNN
    A_KNN = MSTKNN(dis, Q_index, delta=delta, n_jobs=-1, spanning=False)
    A_KNN_ker = A_KNN*similarity
    return RandomWalkNormalize(A_KNN_ker)

In [150]:
# Load the S&P 500 matrix stored in affMat.csv; the first CSV column becomes
# the row index. The trailing label column 'y' is kept (the drop is disabled).
adj_mat = pd.read_csv(
    "./data/sp500/affMat.csv",
    index_col=0,
)  # .drop('y', axis = 1)
adj_mat

Unnamed: 0,REGN US EQUITY,UNM US EQUITY,UAL US EQUITY,OKE US EQUITY,FB US EQUITY,KMI US EQUITY,SEE US EQUITY,TSN US EQUITY,ADI US EQUITY,VAR US EQUITY,...,KMB US EQUITY,CAT US EQUITY,COO US EQUITY,PAYX US EQUITY,VZ US EQUITY,UNP US EQUITY,XL US EQUITY,CBS US EQUITY,MU US EQUITY,y
Regeneron,1.000000,0.151142,0.105208,0.069935,0.174052,0.230379,0.120237,0.092692,0.203464,0.184139,...,0.116291,0.205736,0.210796,0.170059,0.137301,0.157936,0.084560,0.136909,0.220560,6
Unum_Group,0.151142,1.000000,0.283075,0.149528,0.059304,0.251403,0.279716,0.142772,0.287477,0.137700,...,0.082835,0.435696,0.138003,0.273242,0.219420,0.421148,0.177677,0.177134,0.165679,1
United_Continental,0.105208,0.283075,1.000000,0.143929,0.119060,0.189908,0.189188,0.073910,0.255691,0.139299,...,0.017188,0.257313,0.121235,0.257718,0.094678,0.291897,0.094481,0.162548,0.213872,4
ONEOK,0.069935,0.149528,0.143929,1.000000,0.230242,0.642033,0.179999,0.116641,0.193932,0.207374,...,0.106359,0.397402,0.098413,0.178883,0.229011,0.251607,0.041569,0.235502,0.162912,3
Facebook,0.174052,0.059304,0.119060,0.230242,1.000000,0.247885,0.172003,0.040786,0.317664,0.224253,...,0.043405,0.256198,0.283525,0.270607,0.099680,0.155670,0.038570,0.089609,0.319637,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Verizon,0.137301,0.219420,0.094678,0.229011,0.099680,0.201607,0.228568,0.222397,0.145348,0.174770,...,0.330930,0.177649,0.162602,0.334160,1.000000,0.197834,0.050339,0.228174,0.124103,2
Union_Pacific,0.157936,0.421148,0.291897,0.251607,0.155670,0.283853,0.252752,0.140541,0.303513,0.132389,...,0.140498,0.519631,0.175340,0.323554,0.197834,1.000000,0.147378,0.184699,0.215806,4
XL_Capital,0.084560,0.177677,0.094481,0.041569,0.038570,0.081097,0.139263,0.030380,0.051647,0.094268,...,0.022977,0.132128,0.097885,0.144393,0.050339,0.147378,1.000000,0.014233,0.047311,1
CBS,0.136909,0.177134,0.162548,0.235502,0.089609,0.214689,0.168751,0.188760,0.234295,0.153285,...,0.220684,0.201727,0.088809,0.218312,0.228174,0.184699,0.014233,1.000000,0.146638,10


In [151]:
# Load fullMat.csv (first CSV column is the row index), discard column '201',
# and move column '200' to the end of the frame under the name 'y'.
df = (
    pd.read_csv("./data/sp500/fullMat.csv", index_col=0)
    .drop(columns='201')
    .assign(y=lambda frame: frame['200'])
    .drop(columns='200')
)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,y
Regeneron,0.841414,1.512317,0.675493,2.357999,0.529899,1.164875,0.515220,0.644450,-1.810016,1.458016,...,0.295060,1.786929,0.325655,0.770299,-0.452478,-1.058615,-0.191484,0.923177,-0.342697,6
Unum_Group,-0.208115,-0.077768,1.278303,0.020541,-0.477928,1.466457,1.397962,0.291432,-1.627205,0.698964,...,-0.258379,0.157945,0.342220,2.248742,2.426607,1.039774,-1.430837,0.819338,-2.096831,1
United_Continental,0.084431,0.236514,0.554012,-0.389547,-0.649251,1.950099,0.366168,-0.267554,1.249782,-0.661467,...,0.209797,0.046424,-2.268419,0.949810,-0.813032,0.499300,0.340172,-0.562814,0.286384,4
ONEOK,-1.027363,-0.455291,1.718671,0.162770,0.018373,0.936929,-0.253952,0.980365,-0.363804,0.068560,...,-0.548340,-0.415700,-0.322723,0.293735,-1.695609,0.249339,0.143881,-2.337736,-0.454947,3
Facebook,0.038943,0.157847,0.177917,-0.115327,-0.410939,0.250816,-0.007706,0.245558,-0.261293,-0.001941,...,0.280860,-0.142390,-0.033943,0.151629,-0.068005,0.187972,-0.116353,-0.445524,-0.183300,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Verizon,-0.398314,-0.230092,0.620022,-0.094381,-0.440382,-0.840239,0.351482,-0.453634,-0.388228,0.302094,...,0.518477,-0.603971,-0.139281,0.207346,0.357481,-0.322063,-0.407819,-0.622522,-0.544109,2
Union_Pacific,0.150045,-0.230192,0.813774,-0.054360,0.136712,-0.503516,-0.966860,-0.327242,0.731094,0.069843,...,0.260512,0.147799,-0.191914,0.183549,0.382704,-0.894925,-0.427640,-1.112158,-0.492906,4
XL_Capital,-0.631991,0.116216,-0.098082,-0.146328,-1.157523,-0.469466,-0.278321,0.189413,-0.594333,-1.217302,...,-0.546657,0.531234,-1.204304,1.339710,0.648093,1.000391,-1.779307,0.027047,0.423247,1
CBS,-0.032082,0.009271,0.140478,-0.042760,-0.292019,0.274143,0.036753,0.322229,-0.048576,-0.019123,...,0.274258,-0.102400,-0.002767,0.109035,-0.079797,-0.096514,-0.089003,-0.185086,-0.104750,10


In [47]:
# Sanity check: sum each row of the MSTKNN graph returned by prepare_adj;
# after row normalization every entry is expected to be 1.
(
    prepare_adj(df, method='MSTKNN')
    .sum(axis=1)
)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [146]:
# Toy example: scale every row by its own maximum so the largest entry in
# each row becomes 1.
df = pd.DataFrame(
    {
        'x': [20, 4, 10, 2],
        'y': [1, 5, 1, 3],
        'z': [1, 5, 1, 6],
    }
)
df.divide(df.max(axis='columns'), axis='index')

Unnamed: 0,x,y,z
0,1.0,0.05,0.05
1,0.8,1.0,1.0
2,1.0,0.1,0.1
3,0.333333,0.5,1.0


In [None]:
# Standardize each row of corr_mat: subtract the row mean, then divide by the
# row standard deviation. The std is taken first, from the uncentered matrix.
std = corr_mat.std(axis='columns')
corr_mat = (
    corr_mat
    .sub(corr_mat.mean(axis='columns'), axis='index')
    .div(std, axis='index')
)