In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import time
import glob
import warnings
warnings.filterwarnings('ignore')


# --- Load data ---------------------------------------------------------------
# Every CSV under `path` is read into its own frame. The file's base name with
# its last four characters (the ".csv" suffix) removed is stored in a `date`
# column so that rows remain attributable to their source file after stacking.
path = "../data/data_daily"
all_files = glob.glob(path + "/*.csv")

li = [
    pd.read_csv(csv_path, index_col=None, header=0).assign(
        date=os.path.basename(csv_path)[:-4]
    )
    for csv_path in all_files
]

# Stack all files, order the rows by (code, date) and renumber them 0..n-1.
frame = pd.concat(li, axis=0, ignore_index=True)
result_df = (
    frame
    .sort_values(by=['code', 'date'], ascending=True)
    .reset_index(drop=True)
)

# `df` is an independent working copy; `result_df` is left untouched.
df = result_df.copy()
# Optional date-window filter (currently disabled):
# result_df = result_df[(result_df['date'] >= '2021-01-01') & (result_df['date'] <= '2021-12-31')]

In [2]:
# Rolling turnover features, each computed separately within every `code`.
# Windows are counted in rows, so a value is NaN until the window is full.
turnover_by_code = df.groupby(['code'])['turnover_ratio']

# Turn20: mean of turnover_ratio over the latest 20 rows.
df['Turn20'] = turnover_by_code.transform(lambda s: s.rolling(20).mean())

# Turn_Bench: (60-row sum - 20-row sum) / 40, i.e. the mean over the 40 rows
# that precede the latest 20.
df['Turn_Bench'] = turnover_by_code.transform(
    lambda s: (s.rolling(60).sum() - s.rolling(20).sum()) / 40
)

# tmp: relative deviation of the current turnover from Turn_Bench.
df['tmp'] = df['turnover_ratio'] / df['Turn_Bench'] - 1

# PctTurn20: 20-row rolling mean of that relative deviation.
df['PctTurn20'] = df.groupby(['code'])['tmp'].transform(
    lambda s: s.rolling(20).mean()
)

# STR: 20-row rolling standard deviation of turnover_ratio.
df['STR'] = turnover_by_code.transform(lambda s: s.rolling(20).std())


In [7]:
# Rank-based UTR score, built from per-date rankings of STR.
#   score1 : ascending rank of STR among all rows that share the same date
#   tag1   : the same ranking expressed as a percentile
# GroupBy.rank is the built-in equivalent of transform(lambda x: x.rank()).
str_by_date = df.groupby(['date'])['STR']
df['score1'] = str_by_date.rank()
df['tag1'] = str_by_date.rank(pct=True)

# Split every date's rows at the 0.5 percentile. Rows whose tag1 is NaN
# (STR missing) satisfy neither comparison and therefore do not appear in
# `final`. `.copy()` makes each half an independent frame, so adding `score2`
# is an ordinary column assignment instead of a write to a slice of `df`.
tmp1 = df[df['tag1'] < 0.5].copy()
# Lower half: rank 1 goes to the largest STR within the half.
tmp1['score2'] = tmp1.groupby(['date'])['STR'].rank(ascending=False)

tmp2 = df[df['tag1'] >= 0.5].copy()
# Upper half: rank 1 goes to the smallest STR within the half.
tmp2['score2'] = tmp2.groupby(['date'])['STR'].rank()

# Recombine and order by (date, code). The row labels inherited from `df` are
# kept on purpose: resetting the index here would break any later
# index-aligned assignment from `df` into `final`. (The former bare
# `final.reset_index(drop=True)` discarded its result and had no effect.)
final = pd.concat([tmp1, tmp2]).sort_values(['date', 'code'])
final['UTR'] = final['score1'] + final['score2']

In [8]:

# UTR2 = STR + STR / (1 + |STR|) * Turn20, evaluated row by row.
# The inputs are read from `final` itself rather than from `df`, so the result
# does not depend on `final` and `df` sharing index labels (the two frames have
# different row counts and row order).
final['UTR2'] = final['STR'] + (final['STR'] / (1 + np.abs(final['STR']))) * final['Turn20']


In [9]:
import statsmodels.api as sm

# Regress each composite score on STR and Turn20 using all rows of `final` in
# a single fit, and keep the residuals as new columns.
# NOTE(review): sm.OLS uses the exog columns exactly as given and no constant
# column is added here, so both fits have no intercept - confirm this is
# intended.
regressors = final[['STR', 'Turn20']]

results = sm.OLS(endog=final['UTR'], exog=regressors)
final['UTR_resid'] = results.fit().resid

results2 = sm.OLS(endog=final['UTR2'], exog=regressors)
final['UTR2_resid'] = results2.fit().resid

In [12]:
# Export the factor columns as one CSV per date:
#   ../data/factors/<factor_name>/unneutralized/<date>.csv
df = final
factor_name = "UTR"
output_folder = "../data/factors" + "/" + factor_name + "/unneutralized"
# exist_ok=True creates the folder when missing and is a no-op when it already
# exists, without a separate check-then-create step.
os.makedirs(output_folder, exist_ok=True)

selected_col = ['code', 'Turn20', 'PctTurn20', 'STR', 'UTR', 'UTR2', 'UTR_resid', 'UTR2_resid']
trade_date = df['date'].unique()

# A single groupby pass hands each date its own rows; filtering the whole
# frame with a boolean mask once per date rescans every row on every iteration.
for date, group in tqdm(df.groupby('date'), total=len(trade_date)):
    file_name = os.path.join(output_folder, date+'.csv')
    # Non-inplace sort: returns a new frame instead of mutating a slice of `df`.
    group = group[selected_col].sort_values(by='code', ascending=True)
    group.to_csv(file_name, index=False)

100%|██████████| 930/930 [04:10<00:00,  3.71it/s]


In [13]:
from Barra_Neutralization import NeutralizationProcessor

# Input and output locations handed to NeutralizationProcessor.
barra_path = "../data/data_barra"
factor_path = "../data/factors/UTR/unneutralized"
output_folder =  "../data/factors/UTR/neutralized"
# exist_ok=True creates the folder when missing and is a no-op when it already
# exists, without a separate check-then-create step.
os.makedirs(output_folder, exist_ok=True)

In [14]:
import ray

# Arguments forwarded to NeutralizationProcessor.
mode = 3
barra_col = ['size']

# Tear down any Ray instance left over from an earlier run of this cell, then
# start a fresh local one.
ray.shutdown()
ray.init(num_cpus=4, num_gpus=0)

try:
    process = NeutralizationProcessor.remote(barra_path, factor_path, barra_col, output_folder, mode)
    s = time.time()
    barrl_files = sorted(os.listdir(barra_path))

    # Submit one process_file call per entry in barra_path.
    # NOTE(review): every call goes to the same single actor handle. Unless the
    # actor class is declared with concurrency enabled, Ray executes an actor's
    # method calls one at a time, so num_cpus=4 may not give any parallelism -
    # confirm against the NeutralizationProcessor definition.
    results = [process.process_file.remote(f) for f in barrl_files]

    # Block until every submitted call has finished; a failure inside a task
    # is re-raised here.
    ray.get(results)
    e = time.time()
    print("Time taken: ", e-s)
finally:
    # Always release the local Ray instance, even when a task raised.
    ray.shutdown()


2024-12-28 14:17:33,935	INFO worker.py:1821 -- Started a local Ray instance.
[36m(NeutralizationProcessor pid=62280)[0m   factor_df[col] = factor_df[col] / 10**np.ceil(np.log10(factor_df[col].abs().max()))
[36m(NeutralizationProcessor pid=62280)[0m   factor_df[col] = factor_df[col] / 10**np.ceil(np.log10(factor_df[col].abs().max()))
[36m(NeutralizationProcessor pid=62280)[0m   factor_df[col] = factor_df[col] / 10**np.ceil(np.log10(factor_df[col].abs().max()))
[36m(NeutralizationProcessor pid=62280)[0m   factor_df[col] = factor_df[col] / 10**np.ceil(np.log10(factor_df[col].abs().max()))
[36m(NeutralizationProcessor pid=62280)[0m   factor_df[col] = factor_df[col] / 10**np.ceil(np.log10(factor_df[col].abs().max()))
[36m(NeutralizationProcessor pid=62280)[0m   factor_df[col] = factor_df[col] / 10**np.ceil(np.log10(factor_df[col].abs().max()))
[36m(NeutralizationProcessor pid=62280)[0m   factor_df[col] = factor_df[col] / 10**np.ceil(np.log10(factor_df[col].abs().max()))
[36m(

Time taken:  71.08023166656494
