In [1]:
import pandas as pd
import numpy as np
import os
import math
import matplotlib.pyplot as plt
from scipy import stats
from tqdm import tqdm
import pyarrow.feather as pf
import warnings
warnings.simplefilter("ignore")

In [None]:
# Create the folders that later cells save their histogram figures into.
# os.makedirs(..., exist_ok=True) is safe to re-run and does not need to list
# the whole working directory just to test for a name; it also raises
# immediately if a non-directory with the same name is in the way.
for out_dir in ("test_chars", "test_rank_no_impute"):
    os.makedirs(out_dir, exist_ok=True)

In [3]:
# Load the four characteristic panels stored under "chars/"
# (rank / raw, each with and without imputation) ...
char = pf.read_feather("chars/chars_rank_imputed.feather")
char_raw_no_impute = pf.read_feather("chars/chars_raw_no_impute.feather")
char_raw_imputed = pf.read_feather("chars/chars_raw_imputed.feather")
char_rank_no_impute = pf.read_feather("chars/chars_rank_no_impute.feather")

# ... then report the (rows, columns) shape of each, in load order.
for loaded_frame in (char, char_raw_no_impute, char_raw_imputed, char_rank_no_impute):
    print(loaded_frame.shape)

(2983327, 115)
(2983327, 113)
(2983327, 114)
(2983327, 115)


In [6]:
# Display every column name of `char` (the rank + imputed panel) as an array.
char.columns.values

array(['gvkey', 'permno', 'ticker', 'conm', 'comnam', 'sic', 'ret',
       'exchcd', 'shrcd', 'prc', 'shrout', 'date', 'lag_me',
       'rank_mom36m', 'rank_maxret', 'rank_rdm', 'rank_pm',
       'rank_secured', 'rank_std_turn', 'rank_agr', 'rank_roavol',
       'rank_hire', 'rank_pchgm_pchsale', 'rank_bm', 'rank_age',
       'rank_chempia', 'rank_mom6m', 'rank_rvar_mean', 'rank_baspread',
       'rank_pchsaleinv', 'rank_indmom', 'rank_chmom', 'rank_saleinv',
       'rank_invest', 'rank_rvar_ff3', 'rank_cfp', 'rank_cashdebt',
       'rank_sue', 'rank_sgr', 'rank_chtx', 'rank_pchsale_pchrect',
       'rank_bm_ia', 'rank_salerec', 'rank_absacc', 'rank_ato', 'rank_tb',
       'rank_adm', 'rank_beta', 'rank_chatoia', 'rank_cinvest',
       'rank_turn', 'rank_pchcapx_ia', 'rank_convind', 'rank_cashpr',
       'rank_dy', 'rank_lgr', 'rank_securedind', 'rank_chinv',
       'rank_salecash', 'rank_herf', 'rank_gma', 'rank_roe', 'rank_ep',
       'rank_rna', 'rank_stdacc', 'rank_pchquick', 'rank

In [9]:
# Disabled alternative: a longer list of characteristic names, kept commented
# out for reference only.
# char_list = ['maxret', 'grltnoa', 'pchcurrat', 'abr', 'pm', 'pchsale_pchinvt',
#        'salecash', 'herf', 'cashdebt', 'sp', 'std_turn', 'rdm', 'roic',
#        'chinv', 'adm', 'acc', 'me', 'tang', 'pchgm_pchsale', 'absacc',
#        'pchsale_pchrect', 'rvar_ff3', 'tb', 'chempia', 'mom1m', 'invest',
#        'roe', 'convind', 'std_dolvol', 'op', 'cfp_ia', 'seas1a',
#        'secured', 'chpmia', 'pscore', 'quick', 'baspread', 'mom6m',
#        'pctacc', 'rd', 'dy', 'realestate', 'chatoia', 're', 'agr', 'divi',
#        'ni', 'rna', 'nincr', 'chcsho', 'indmom', 'egr', 'salerec', 'roa',
#        'cinvest', 'pchdepr', 'gma', 'securedind', 'currat', 'divo', 'cfp',
#        'ato', 'cashpr', 'ill', 'beta', 'roavol', 'turn', 'grcapx',
#        'pchquick', 'ep', 'me_ia', 'zerotrade', 'rvar_mean', 'depr',
#        'pchsaleinv', 'hire', 'stdcf', 'chtx', 'rd_sale', 'saleinv',
#        'rsup', 'rvar_capm', 'bm_ia', 'mom36m', 'pchsale_pchxsga', 'alm',
#        'mom12m', 'pchcapx_ia', 'sgr', 'sue', 'lev', 'age', 'mom60m', 'bm',
#        'sin', 'cash', 'dolvol', 'lgr', 'noa', 'stdacc', 'chmom']

# Characteristics actually used by the rest of the notebook (order matters:
# it fixes the column order of the exported files and the panel order of the
# histogram grids).
char_list = [
    "cinvest", "rvar_capm", "mom36m", "zerotrade", "rna", "chtx",
    "pm", "depr", "cash", "me_ia", "adm", "bm_ia",
    "cashdebt", "mom12m", "baspread", "sp", "mom1m", "rvar_ff3",
    "ep", "dy", "re", "nincr", "rdm", "rsup",
    "lgr", "chpmia", "std_dolvol", "rd_sale", "beta", "rvar_mean",
    "mom60m", "chcsho", "roa", "acc", "ato", "sue",
    "op", "alm", "noa", "mom6m", "me", "cfp",
    "pscore", "seas1a", "roe", "maxret", "std_turn", "sgr",
    "grltnoa", "gma", "ni", "dolvol", "bm", "pctacc",
    "herf", "lev", "agr", "ill", "abr", "hire",
    "turn",
]

# Column names of the rank-transformed version of each characteristic.
rank_columns = ["rank_" + name for name in char_list]

# Number of rows needed for a 5-column grid with one panel per characteristic
# (integer ceiling of len(char_list) / 5).
full_rows, leftover = divmod(len(char_list), 5)
subplot_x = full_rows + (1 if leftover else 0)

In [10]:
# Export the raw, non-imputed panel restricted to the fixed leading columns,
# the characteristics in `char_list`, and the date.
raw_no_impute_cols = [
    'gvkey', 'permno', 'sic', 'ret', 'exchcd', 'shrcd',
    'ticker', 'conm', 'comnam', 'prc', 'shrout',
    *char_list,
    'date',
]
df_raw_no_impute = char_raw_no_impute[raw_no_impute_cols]
df_raw_no_impute.to_feather("chars60_raw_no_impute.feather")

In [11]:
# Export the raw, imputed panel restricted to the fixed leading columns, the
# characteristics in `char_list`, the date and the 'ffi49' column.
raw_imputed_cols = [
    'gvkey', 'permno', 'sic', 'ret', 'exchcd', 'shrcd',
    'ticker', 'conm', 'comnam', 'prc', 'shrout',
    *char_list,
    'date', 'ffi49',
]
df_raw_imputed = char_raw_imputed[raw_imputed_cols]
df_raw_imputed.to_feather("chars60_raw_imputed.feather")

In [12]:
# Export the rank-transformed, non-imputed panel restricted to the fixed
# leading columns, the `rank_*` columns and 'log_me'.
rank_no_impute_cols = [
    'gvkey', 'permno', 'sic', 'ret', 'exchcd', 'shrcd', 'date', 'lag_me',
    'ticker', 'conm', 'comnam', 'prc', 'shrout',
    *rank_columns,
    'log_me',
]
df_rank_no_impute = char_rank_no_impute[rank_no_impute_cols]
df_rank_no_impute.to_feather("chars60_rank_no_impute.feather")

In [13]:
# Export the rank-transformed, imputed panel (`char`) restricted to the fixed
# leading columns, the `rank_*` columns and 'log_me'.
rank_imputed_cols = [
    'gvkey', 'permno', 'sic', 'ret', 'exchcd', 'shrcd', 'date', 'lag_me',
    'ticker', 'conm', 'comnam', 'prc', 'shrout',
    *rank_columns,
    'log_me',
]
df_rank_imputed = char[rank_imputed_cols]
df_rank_imputed.to_feather("chars60_rank_imputed.feather")

In [40]:
# Display the column names of the exported rank + imputed subset as an array,
# to check which columns were kept and in what order.
df_rank_imputed.columns.values

array(['gvkey', 'permno', 'sic', 'ret', 'exchcd', 'shrcd', 'date',
       'lag_me', 'ticker', 'conm', 'comnam', 'prc', 'shrout',
       'rank_cinvest', 'rank_rvar_capm', 'rank_mom36m', 'rank_zerotrade',
       'rank_rna', 'rank_chtx', 'rank_pm', 'rank_depr', 'rank_cash',
       'rank_me_ia', 'rank_adm', 'rank_bm_ia', 'rank_cashdebt',
       'rank_mom12m', 'rank_baspread', 'rank_sp', 'rank_mom1m',
       'rank_rvar_ff3', 'rank_ep', 'rank_dy', 'rank_re', 'rank_nincr',
       'rank_rdm', 'rank_rsup', 'rank_lgr', 'rank_chpmia',
       'rank_std_dolvol', 'rank_rd_sale', 'rank_beta', 'rank_rvar_mean',
       'rank_mom60m', 'rank_chcsho', 'rank_roa', 'rank_acc', 'rank_ato',
       'rank_sue', 'rank_op', 'rank_alm', 'rank_noa', 'rank_mom6m',
       'rank_me', 'rank_cfp', 'rank_pscore', 'rank_seas1a', 'rank_roe',
       'rank_maxret', 'rank_std_turn', 'rank_sgr', 'rank_grltnoa',
       'rank_gma', 'rank_ni', 'rank_dolvol', 'rank_bm', 'rank_pctacc',
       'rank_herf', 'rank_lev', 'rank_agr', 'r

### Rank + Imputed

In [33]:
# Save (and display) one histogram grid per date for the rank-transformed,
# imputed characteristics in `char`.
# The dates are taken from `char` itself, so this cell no longer depends on
# `all_dates`, which is only created in a later cell of this notebook and is
# therefore undefined when the notebook is run top to bottom.
for this_date in char['date'].dropna().unique():
    select_char = char[char['date'] == this_date]

    # plot all the char
    # squeeze=False keeps `axs` two-dimensional even for a single row of
    # panels, so the [row, col] indexing below always works.
    fig, axs = plt.subplots(subplot_x, 5, figsize=(20, int(3*subplot_x)), squeeze=False)
    fig.subplots_adjust(hspace=0.5, wspace=0.5)

    for i, column in enumerate(rank_columns):
        ax = axs[i // 5, i % 5]
        # Fixed bins and range so every panel and every date share one x-axis.
        ax.hist(select_char[column], bins=20, range=(-1, 1), edgecolor='black')
        ax.set_title(f'char: {column}')
        ax.set_xlabel('Values')
        ax.set_ylabel('Frequency')

    # Hide the grid cells that have no characteristic to show.
    for unused_ax in axs.ravel()[len(rank_columns):]:
        unused_ax.set_visible(False)

    # File name is "<chars 0-3>_<chars 5-6>" of the date's string form, i.e.
    # year and month when the string starts with "YYYY-MM".
    fig.savefig(f"test_chars/char_plot_{str(this_date)[0:4]}_{str(this_date)[5:7]}.png")
    plt.show()
    # Close explicitly so figures do not accumulate in memory across dates.
    plt.close(fig)

### Raw + No impute

In [None]:
# Histogram grid of the raw, non-imputed characteristics on a single date.
# NOTE: `random_date` is not defined in this cell; it is assigned in the later
# cell that loads `chars60`, so that cell has to be run first.
select_char = char_raw_no_impute[char_raw_no_impute['date'] == random_date]

# plot all the char

# squeeze=False keeps `axs` two-dimensional even for a single row of panels,
# so the [row, col] indexing below always works.
fig, axs = plt.subplots(subplot_x, 5, figsize=(20, int(3*subplot_x)), squeeze=False)
fig.subplots_adjust(hspace=0.5, wspace=0.5)

for i, column in enumerate(char_list):
    ax = axs[i // 5, i % 5]
    # No fixed range here: the bin range is left to matplotlib per panel.
    ax.hist(select_char[column], bins=100, edgecolor='black') #, range=(-1, 1)
    ax.set_title(f'char: {column}')
    ax.set_xlabel('Values')
    ax.set_ylabel('Frequency')

# Hide the grid cells that have no characteristic to show.
for unused_ax in axs.ravel()[len(char_list):]:
    unused_ax.set_visible(False)
plt.show()

### Rank + No impute

In [None]:
# Histogram grid of the rank-transformed, non-imputed characteristics on a
# single date.
# NOTE: `random_date` is not defined in this cell; it is assigned in the later
# cell that loads `chars60`, so that cell has to be run first.
select_char = char_rank_no_impute[char_rank_no_impute['date'] == random_date]

# plot all the char
# squeeze=False keeps `axs` two-dimensional even for a single row of panels,
# so the [row, col] indexing below always works.
fig, axs = plt.subplots(subplot_x, 5, figsize=(20, int(3*subplot_x)), squeeze=False)
fig.subplots_adjust(hspace=0.5, wspace=0.5)

for i, column in enumerate(rank_columns):
    ax = axs[i // 5, i % 5]
    # No fixed range here: the bin range is left to matplotlib per panel.
    ax.hist(select_char[column], bins=20, edgecolor='black') # , range=(-1, 1)
    ax.set_title(f'char: {column}')
    ax.set_xlabel('Values')
    ax.set_ylabel('Frequency')

# Hide the grid cells that have no characteristic to show.
for unused_ax in axs.ravel()[len(rank_columns):]:
    unused_ax.set_visible(False)
plt.show()

In [None]:
# Save (and display) one histogram grid per date for the rank-transformed,
# non-imputed characteristics in `char_rank_no_impute`.
# The dates are taken from the frame itself, so this cell no longer depends on
# `all_dates`, which is only created in a later cell of this notebook and is
# therefore undefined when the notebook is run top to bottom.
for this_date in char_rank_no_impute['date'].dropna().unique():
    select_char = char_rank_no_impute[char_rank_no_impute['date'] == this_date]

    # plot all the char
    # squeeze=False keeps `axs` two-dimensional even for a single row of
    # panels, so the [row, col] indexing below always works.
    fig, axs = plt.subplots(subplot_x, 5, figsize=(20, int(3*subplot_x)), squeeze=False)
    fig.subplots_adjust(hspace=0.5, wspace=0.5)

    for i, column in enumerate(rank_columns):
        ax = axs[i // 5, i % 5]
        # Fixed bins and range so every panel and every date share one x-axis.
        ax.hist(select_char[column], bins=20, range=(-1, 1), edgecolor='black')
        ax.set_title(f'char: {column}')
        ax.set_xlabel('Values')
        ax.set_ylabel('Frequency')

    # Hide the grid cells that have no characteristic to show.
    for unused_ax in axs.ravel()[len(rank_columns):]:
        unused_ax.set_visible(False)

    # File name is "<chars 0-3>_<chars 5-6>" of the date's string form, i.e.
    # year and month when the string starts with "YYYY-MM".
    fig.savefig(f"test_rank_no_impute/char_plot_{str(this_date)[0:4]}_{str(this_date)[5:7]}.png")
    plt.show()
    # Close explicitly so figures do not accumulate in memory across dates.
    plt.close(fig)

### Raw + Imputed

In [None]:
# Histogram grid of the raw, imputed characteristics on a single date.
# NOTE: `random_date` is not defined in this cell; it is assigned in the later
# cell that loads `chars60`, so that cell has to be run first.
select_char = char_raw_imputed[char_raw_imputed['date'] == random_date]

# plot all the char

# squeeze=False keeps `axs` two-dimensional even for a single row of panels,
# so the [row, col] indexing below always works.
fig, axs = plt.subplots(subplot_x, 5, figsize=(20, int(3*subplot_x)), squeeze=False)
fig.subplots_adjust(hspace=0.5, wspace=0.5)

for i, column in enumerate(char_list):
    ax = axs[i // 5, i % 5]
    # No fixed range here: the bin range is left to matplotlib per panel.
    ax.hist(select_char[column], bins=20, edgecolor='black') # , range=(-1, 1)
    ax.set_title(f'char: {column}')
    ax.set_xlabel('Values')
    ax.set_ylabel('Frequency')

# Hide the grid cells that have no characteristic to show.
for unused_ax in axs.ravel()[len(char_list):]:
    unused_ax.set_visible(False)
plt.show()

In [None]:
import random

# Load the exported rank + imputed file and plot every 'rank' column on one
# randomly chosen date. This cell also defines `all_dates` and `random_date`,
# which other plotting cells in this notebook read.
# NOTE(review): absolute, machine-specific path; consider pointing it at a
# configurable data directory so the notebook runs on other machines.
chars60 = pd.read_feather("/home/yuanzhi/Tree_Prediction/data/update_rawdata/chars_data/chars60_rank_imputed.feather")

chars60['date'] = pd.to_datetime(chars60['date'])
all_dates = chars60['date'].unique()

# randomly find a date (unseeded, so a different date on each run)
random_date = random.choice(all_dates)
print(random_date)

select_char = chars60[chars60['date'] == random_date]
# Keep `rank_columns` a plain list, as where it is first built from
# `char_list`: rebinding it to a pandas Index would break the cells that
# concatenate it with other column lists when they are re-run after this one.
rank_columns = list(select_char.filter(like='rank').columns)

# plot all the char

# One panel per rank column in a 5-wide grid; squeeze=False keeps `axs`
# two-dimensional even for a single row, so [row, col] indexing always works.
n_rows = math.ceil(len(rank_columns) / 5)
fig, axs = plt.subplots(n_rows, 5, figsize=(20, int(3 * n_rows)), squeeze=False)
fig.subplots_adjust(hspace=0.5, wspace=0.5)

for i, column in enumerate(rank_columns):
    ax = axs[i // 5, i % 5]
    # No fixed range here: the bin range is left to matplotlib per panel.
    ax.hist(select_char[column], bins=20, edgecolor='black') #, range=(-1, 1)
    ax.set_title(f'char: {column}')
    ax.set_xlabel('Values')
    ax.set_ylabel('Frequency')

# Hide the grid cells that have no characteristic to show.
for unused_ax in axs.ravel()[len(rank_columns):]:
    unused_ax.set_visible(False)
plt.show()

In [None]:
# Disabled cell: everything below is commented out and does not run.
# It sketches a histogram of one randomly chosen 'rank' column of
# `select_char`.
# # Select columns that start with 'rank'
# rank_columns = select_char.filter(like='rank')

# # Randomly select a column
# selected_column = random.choice(rank_columns.columns)

# # Plot histogram for the selected column
# plt.hist(select_char[selected_column], bins=20, range=(-1, 1), edgecolor='black')
# plt.title(f'Histogram for column: {selected_column}')
# plt.xlabel('Values')
# plt.ylabel('Frequency')
# plt.show()

In [None]:
# Disabled cell: everything below is commented out and does not run.
# It sketches one separate histogram figure per 'rank' column of
# `select_char`.
# rank_columns = select_char.filter(like='rank')

# # Plot histograms for each 'rank' column
# for column in rank_columns:
#     plt.figure(figsize=(6, 4))
#     plt.hist(select_char[column], bins=20, range=(-1, 1), edgecolor='black')
#     plt.title(f'Histogram for column: {column}')
#     plt.xlabel('Values')
#     plt.ylabel('Frequency')
#     plt.show()