In [39]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from graphs.graphs import *

In [41]:
def prepare_adj(df, method = 'gaussian', sig = 1):

    """
    Build a normalized similarity graph from a DataFrame.

    Input: Adjacency matrix or feature matrix with the last column including the labels
    Output: Row normalized gaussian kernel similarity matrix

    Parameters
    ----------
    df : pandas.DataFrame
        Every column except the last is used as the matrix X; the last
        column (the labels) is ignored here.
    method : str
        'gaussian' normalizes the full Gaussian-kernel similarity matrix.
        'MSTKNN' first multiplies the similarity element-wise by the matrix
        returned by MSTKNN, then normalizes.
    sig : number
        Passed to kerGauss as ``sigma``.

    Returns
    -------
    The value returned by RandomWalkNormalize (presumably an array whose rows
    sum to 1 -- the notebook's sanity check on the 'MSTKNN' output shows that).

    Raises
    ------
    ValueError
        If ``method`` is neither 'gaussian' nor 'MSTKNN'.
    """
    # Reject unknown methods before doing any (potentially expensive) work;
    # previously this fell through and failed with UnboundLocalError at return.
    if method not in ('gaussian', 'MSTKNN'):
        raise ValueError(
            "method must be 'gaussian' or 'MSTKNN', got {!r}".format(method)
        )

    # Copy before touching the diagonal: the slice of df.values can be a view
    # of the caller's DataFrame (or read-only under pandas Copy-on-Write), and
    # np.fill_diagonal writes in place.
    X = df.values[:, :-1].copy()  # consider X a graph or a feature matrix, both fine
    # NOTE(review): for a feature matrix this zeroes feature i of sample i --
    # confirm that is intended and not only meant for adjacency input.
    np.fill_diagonal(X, 0)  # set diagonal to zero / remove self loops
    Q_index = range(X.shape[0])  # for now always use this

    dis = distanceEuclidean(X, Q_index, n_jobs=-1)
    similarity = kerGauss(dis, sigma=sig)  # try different sigma

    # original similarity matrix, using gaussian kernel, row normalize
    if method == 'gaussian':
        return RandomWalkNormalize(similarity)

    # method == 'MSTKNN': mask the kernel similarity with the MSTKNN matrix
    A_KNN = MSTKNN(dis, Q_index, delta=20, n_jobs=-1, spanning=False)
    A_KNN_ker = A_KNN * similarity
    return RandomWalkNormalize(A_KNN_ker)

In [48]:
# Affinity matrix from disk (first CSV column as index), with the 'y' column removed.
adj_mat = (
    pd.read_csv("./data/sp500/affMat.csv", index_col=0)
    .drop(columns='y')
)

In [54]:
# Load the full matrix (first CSV column as index), discard column '201', and
# move column '200' to the end under the name 'y' -- prepare_adj treats the
# last column as the labels.
df = (
    pd.read_csv("./data/sp500/fullMat.csv", index_col=0)
    .drop(columns='201')
    .assign(y=lambda frame: frame['200'])
    .drop(columns='200')
)

# Display only: every row divided by its row sum (the 'y' column is part of
# that sum), NaNs replaced by 0. The result is NOT stored back into df.
df.div(df.sum(axis=1), axis=0).fillna(0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,y
Regeneron,0.029709,0.053398,0.023851,0.083258,0.018710,0.041130,0.018192,0.022755,-0.063909,0.051481,...,0.010418,0.063094,0.011498,0.027198,-0.015976,-0.037378,-0.006761,0.032596,-0.012100,0.211853
Unum_Group,0.052983,0.019798,-0.325435,-0.005230,0.121672,-0.373335,-0.355898,-0.074194,0.414259,-0.177945,...,0.065779,-0.040210,-0.087123,-0.572492,-0.617773,-0.264709,0.364267,-0.208590,0.533818,-0.254583
United_Continental,0.008134,0.022785,0.053371,-0.037527,-0.062546,0.187863,0.035275,-0.025775,0.120398,-0.063723,...,0.020211,0.004472,-0.218528,0.091500,-0.078323,0.048100,0.032771,-0.054219,0.027589,0.385340
ONEOK,-0.240188,-0.106443,0.401809,0.038054,0.004296,0.219045,-0.059372,0.229200,-0.085054,0.016029,...,-0.128197,-0.097187,-0.075449,0.068672,-0.396417,0.058293,0.033638,-0.546540,-0.106362,0.701371
Facebook,0.004967,0.020134,0.022695,-0.014711,-0.052418,0.031993,-0.000983,0.031323,-0.033330,-0.000248,...,0.035826,-0.018163,-0.004330,0.019341,-0.008674,0.023977,-0.014842,-0.056830,-0.023381,1.403127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Verizon,0.046537,0.026883,-0.072441,0.011027,0.051452,0.098170,-0.041066,0.053001,0.045359,-0.035295,...,-0.060577,0.070565,0.016273,-0.024225,-0.041767,0.037629,0.047648,0.072733,0.063571,-0.233672
Union_Pacific,-0.015544,0.023848,-0.084306,0.005632,-0.014163,0.052163,0.100165,0.033902,-0.075740,-0.007236,...,-0.026989,-0.015312,0.019882,-0.019015,-0.039648,0.092713,0.044303,0.115218,0.051064,-0.414394
XL_Capital,0.107173,-0.019708,0.016633,0.024814,0.196293,0.079612,0.047198,-0.032121,0.100787,0.206430,...,0.092702,-0.090087,0.204226,-0.227188,-0.109904,-0.169646,0.301735,-0.004587,-0.071774,-0.169580
CBS,-0.004021,0.001162,0.017606,-0.005359,-0.036599,0.034359,0.004606,0.040386,-0.006088,-0.002397,...,0.034373,-0.012834,-0.000347,0.013666,-0.010001,-0.012096,-0.011155,-0.023197,-0.013129,1.253326


In [47]:
# Sanity check: row sums of the MSTKNN graph (expected to be all ones).
prepare_adj(df, method="MSTKNN").sum(axis=1)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [132]:
# Small toy frame (4 rows, columns x/y/z) for trying out row-wise standardization.
# Note: this rebinds the name `df` used for the S&P 500 data in earlier cells.
df = pd.DataFrame(
    [
        [20, 1, 1],
        [4, 5, 5],
        [10, 1, 1],
        [2, 3, 6],
    ],
    columns=['x', 'y', 'z'],
)
df

Unnamed: 0,x,y,z
0,20,1,1
1,4,5,5
2,10,1,1
3,2,3,6


In [130]:
# Row-wise standardization: subtract each row's mean, then divide by the row's
# standard deviation. `std` is taken from the frame before it is overwritten.
std = df.std(axis='columns')
df = df.sub(df.mean(axis='columns'), axis='index').div(std, axis='index')

Unnamed: 0,x,y,z
0,12.666667,-6.333333,-6.333333
1,-0.666667,0.333333,0.333333
2,6.0,-3.0,-3.0


In [131]:
# Display df with each row divided by the matching entry of `std`.
# NOTE(review): the standardization cell above already divides by `std`, so on
# a top-to-bottom run this shows the rows scaled a second time -- confirm intent.
df.div(std, axis='index')

Unnamed: 0,x,y,z
0,1.154701,-0.57735,-0.57735
1,-1.154701,0.57735,0.57735
2,1.154701,-0.57735,-0.57735


In [90]:
# Per-row mean of the current df.
df.mean(axis='columns')

0    2.5
1    2.5
2    1.5
dtype: float64

In [92]:
# Per-row standard deviation of the current df.
df.std(axis='columns')

0    2.121320
1    2.121320
2    0.707107
dtype: float64