In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [12]:
def convert_data(data):
    ''' flatten an array into rows of (index..., value), skipping NaN entries

    data: float array of any rank (the notebook passes 3-d arrays)
    returns: DataFrame with one row per non-NaN entry; the leading columns hold
        the entry's position along each axis and the last column its value
    '''
    # the shape is not recoverable from the sparse rows, so show it for whoever
    # reloads the data later
    print(data.shape)
    present = np.logical_not(np.isnan(data))
    # argwhere and boolean indexing both walk the array in the same row-major
    # order, so coordinate row k belongs to observed value k
    coords = np.argwhere(present)
    observed = data[present]
    # append the values as the last column next to their coordinates
    table = np.column_stack((coords, observed))
    return pd.DataFrame(table)

In [13]:
def standardize(data):
    ''' min-max scale data with shape (T, N, D) across time for each firm

    Despite the name this uses sklearn's MinMaxScaler, not z-scoring: for every
    firm, each feature column is rescaled over the time axis with that column's
    own min and max (MinMaxScaler's default target range is [0, 1]).

    data: numeric array indexed as (time, firm, feature); may contain NaN
    returns: a new floating-point array of the same (T, N, D) shape; the input
        array is left untouched
    '''
    data = np.asarray(data)
    # Scale a floating-point copy. transpose() only returns a view, so writing
    # the results through it would overwrite the caller's array, and with an
    # integer input the scaled fractions would be truncated on assignment.
    out_dtype = data.dtype if np.issubdtype(data.dtype, np.floating) else np.float64
    scaled = data.astype(out_dtype, copy=True).transpose((1, 0, 2)) # (N, T, D)
    # scale each firm individually, firm_matrix (T, D), each col is a feature
    for firm_matrix in scaled:
        sc = MinMaxScaler()
        sc.fit(firm_matrix)
        # firm_matrix is a view into `scaled`, so write the result in place
        firm_matrix[...] = sc.transform(firm_matrix)

    return scaled.transpose((1, 0, 2)) # (N, T, D) -> (T, N, D)

In [49]:
# Fundamentals pipeline: load -> scale per firm -> flatten to rows -> save.
# Feature columns that are entirely NaN stay NaN after scaling, and
# convert_data then leaves them out of the flattened table.
data_fundamentals = convert_data(standardize(np.load('data/data_F.npy')))
# index=None keeps the row index out of the file (same effect as index=False)
data_fundamentals.to_csv('data/fundamentals.csv', index=None)

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


(32, 180, 19)


In [50]:
# Read the saved file back to inspect what was written: one row per non-missing
# entry, three array-index columns followed by the scaled value.
pd.read_csv('data/fundamentals.csv')

Unnamed: 0,0,1,2,3
0,0.0,0.0,0.0,0.000000
1,0.0,0.0,1.0,0.037931
2,0.0,0.0,2.0,0.000000
3,0.0,0.0,3.0,0.000000
4,0.0,0.0,4.0,0.000000
...,...,...,...,...
108486,31.0,179.0,14.0,0.426540
108487,31.0,179.0,15.0,0.344150
108488,31.0,179.0,16.0,0.116312
108489,31.0,179.0,17.0,0.623545


In [51]:
# Summary statistics of the flattened fundamentals table; columns 0-2 are the
# array indices and column 3 is the scaled value.
data_fundamentals.describe()

Unnamed: 0,0,1,2,3
count,108491.0,108491.0,108491.0,108491.0
mean,15.503977,89.783798,8.999594,0.433694
std,9.239604,51.910569,5.483383,0.319107
min,0.0,0.0,0.0,0.0
25%,7.0,45.0,4.0,0.145292
50%,15.0,90.0,9.0,0.41728
75%,24.0,135.0,14.0,0.696325
max,31.0,179.0,18.0,1.0


In [53]:
# EPS pipeline: load -> scale per firm -> flatten to rows -> save.
data_eps = convert_data(standardize(np.load('data/data_EPS.npy')))
# index=None keeps the row index out of the file (same effect as index=False)
data_eps.to_csv('data/eps.csv', index=None)

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = n

(32, 180, 173)


In [55]:
# Summary statistics of the flattened EPS table; columns 0-2 are the array
# indices and column 3 is the scaled value.
data_eps.describe()

Unnamed: 0,0,1,2,3
count,42867.0,42867.0,42867.0,42867.0
mean,16.170737,90.636504,84.354818,0.52779
std,9.085589,51.492061,48.995282,0.300409
min,0.0,0.0,0.0,0.0
25%,9.0,45.0,43.0,0.277778
50%,16.0,89.0,83.0,0.540541
75%,24.0,136.0,127.0,0.791176
max,31.0,179.0,172.0,1.0


In [14]:
# Characteristics pipeline: load a slice -> scale per firm -> flatten -> save.
# The slice keeps the last 60 time steps, the first 100 firms and every
# characteristic (202 of them, per the printed shape).
# NOTE(review): the previous comment read "last 5 years of data, 2012-2021",
# but 2012-2021 spans ten years; confirm the sampling frequency and date range.
# Feature columns that are entirely NaN stay NaN after scaling, and
# convert_data then leaves them out of the flattened table.
data_chars = convert_data(
    standardize(np.load('data/chars.npy')[-60:, :100, :])
)
# index=None keeps the row index out of the file (same effect as index=False)
data_chars.to_csv('data/chars.csv', index=None)

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = n

(60, 100, 202)


In [18]:
# Fraction of cells in the (60, 100, 202) characteristics array that are
# non-missing: data_chars holds one row per non-NaN entry, so its length
# divided by the total cell count is the fill rate.
len(data_chars)/(60*100*202)

0.6260709570957096