In [1]:
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from model.model import Model
from check_submission import check_submission

## 1. Data Analysis and Preprocessing

In [2]:
# Path to the raw CSV export used throughout this notebook.
file23 = './data/230619.csv'
# Keep the raw frame under its own name; the preprocessing cell works from it.
df23 = pd.read_csv(file23, encoding='utf-8')
# Preview the first rows to check which columns were loaded.
df23.head()

Unnamed: 0,the_datetime,symbol,track,sharpe_ratio_modified_3m,sharpe_ratio_modified_6m,sharpe_ratio_modified_1y,sharpe_ratio_modified_2y,annual_returns_3m,annual_returns_6m,annual_returns_1y,...,sec_ind_ex_avg_cximp_6m,sec_ind_ex_avg_cximp_1y,sec_ind_ex_avg_cximp_2y,comp_winning_yield_3m_1y,comp_winning_yield_3m_2y,ind_ex_avg_cximp_3m,ind_ex_avg_cximp_6m,ind_ex_avg_cximp_1y,ind_ex_avg_cximp_2y,daily_return
0,2012-12-31,1,价值,0.483971,-0.266978,0.314474,0.339701,0.233459,0.353288,0.351001,...,0.138174,-0.042008,,1.565427,0.27898,-0.195429,-0.085693,-0.257223,,0.012645
1,2012-12-31,11,价值,-0.375688,-0.538772,-0.239646,0.514503,-0.133633,-0.186934,0.096457,...,0.334712,0.581397,,0.388653,0.364187,-0.464664,-0.055407,0.405757,,0.009002
2,2012-12-31,21,均衡,0.288329,-0.551492,-0.791903,-1.201647,0.431506,-0.45729,-0.651966,...,-0.755362,-0.785504,,-0.557381,-1.108675,-0.88062,-0.953606,-0.746467,,0.011981
3,2012-12-31,31,均衡,-0.170698,-0.555539,-0.803354,-1.198081,-0.059994,-0.493962,-0.883196,...,0.036506,-0.224371,,-2.172562,-1.863365,-0.530784,-0.569187,-0.615701,,0.012488
4,2012-12-31,61,价值,0.864647,-0.115932,0.860236,-0.527318,1.320411,0.485371,0.98155,...,0.490441,0.5369,,0.388653,-0.646123,0.033425,0.08626,0.079238,,0.014706


In [3]:
# Turn the categorical `track` column into indicator (dummy) columns.
dummy_df = pd.get_dummies(df23['track'])
# Build the working frame in one chain so `df23` itself is never modified.
df = (
    pd.concat([df23, dummy_df], axis=1)  # append the indicator columns
    .drop('track', axis=1)               # the original label column is no longer needed
    .fillna(0)                           # missing values are filled with 0
    .replace({True: 1, False: 0})        # boolean indicators become integers 1/0
)

In [4]:
# Preview the encoded frame: `track` is gone and the indicator columns are appended.
df.head()

Unnamed: 0,the_datetime,symbol,sharpe_ratio_modified_3m,sharpe_ratio_modified_6m,sharpe_ratio_modified_1y,sharpe_ratio_modified_2y,annual_returns_3m,annual_returns_6m,annual_returns_1y,annual_returns_2y,...,价值,其他,制造,医药,周期,均衡,成长,消费,科技,金融
0,2012-12-31,1,0.483971,-0.266978,0.314474,0.339701,0.233459,0.353288,0.351001,0.108795,...,1,0,0,0,0,0,0,0,0,0
1,2012-12-31,11,-0.375688,-0.538772,-0.239646,0.514503,-0.133633,-0.186934,0.096457,0.841372,...,1,0,0,0,0,0,0,0,0,0
2,2012-12-31,21,0.288329,-0.551492,-0.791903,-1.201647,0.431506,-0.45729,-0.651966,-0.946475,...,0,0,0,0,0,1,0,0,0,0
3,2012-12-31,31,-0.170698,-0.555539,-0.803354,-1.198081,-0.059994,-0.493962,-0.883196,-1.089881,...,0,0,0,0,0,1,0,0,0,0
4,2012-12-31,61,0.864647,-0.115932,0.860236,-0.527318,1.320411,0.485371,0.98155,-0.258823,...,1,0,0,0,0,0,0,0,0,0


In [5]:

# Build the training set. For every symbol, each row that carries a weekly
# return becomes one sample: its features are the T rows that precede it
# (zero-padded at the front when fewer than T rows exist) and its target is
# that weekly return.
#
# Note: this cell rebinds `df` at the end, so re-running it requires
# re-running the encoding cell first.
T = 50  # look-back window length, in rows

dfs = []           # per-symbol frames with the `weekly_return` column added
weeks_data = []    # one (T, n_features) window per sample
weeks_return = []  # target of each sample, in the same order as weeks_data

grouped_dfs = df.groupby('symbol')
for group_name, group_df in tqdm(grouped_dfs):
    # Derive a new frame instead of mutating the slice handed out by groupby;
    # writing into that slice triggers SettingWithCopyWarning.
    group_df = group_df.assign(
        the_datetime=pd.to_datetime(group_df['the_datetime'])
    ).set_index('the_datetime')

    # Sum daily returns over weekly bins that are closed and labelled on their
    # left edge. The column assignment aligns on the date index, so only rows
    # whose date equals a bin label receive a value; every other row is NaN.
    weekly_returns = group_df['daily_return'].resample('W-MON', label='left', closed='left').sum()
    group_df['weekly_return'] = weekly_returns

    ys = group_df['weekly_return'].values
    # NOTE(review): every remaining column is used as a feature, including
    # 'Unnamed: 0' and 'symbol' — confirm this is intended.
    xs = group_df.drop("weekly_return", axis=1).values
    n_features = xs.shape[1]  # derived from the data rather than hard-coded

    # Row positions that carry a weekly return; these define the samples.
    weeks_return_idx = np.flatnonzero(pd.notna(ys))
    weeks_return.extend(ys[weeks_return_idx])

    for idx in weeks_return_idx:
        # Rows strictly before `idx`, at most T of them.
        week_data = xs[max(idx - T, 0):idx]
        missing = T - week_data.shape[0]
        if missing > 0:
            # Left-pad short histories with zero rows so every window has T rows.
            week_data = np.concatenate((np.zeros((missing, n_features)), week_data), axis=0)
        weeks_data.append(week_data)

    dfs.append(group_df)

# Combine the groups back into a single DataFrame.
df = pd.concat(dfs, axis=0)

# Discard the date index (drop=True), so the combined frame has no
# `the_datetime` column from here on.
df.reset_index(drop=True, inplace=True)

100%|██████████| 3791/3791 [00:38<00:00, 99.16it/s] 


In [6]:
# Persist the training arrays: X stacks the per-sample windows, y holds the
# matching weekly returns.
np.save('./data/X.npy', np.array(weeks_data))
np.save('./data/y.npy', np.array(weeks_return))
# Save the combined frame (including `weekly_return`) without the row index.
df.to_csv('./data/230619_preprocessed.csv', index=False)

## 2. Train Model

通过下面的命令训练模型
```
python train.py
```

## 3. Result Generation

In [7]:
# Restore the trained weights. Use the GPU when one is available and fall back
# to CPU otherwise; `map_location` lets a checkpoint saved on a GPU load on a
# CPU-only machine.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Model()
model.load_state_dict(torch.load('./model/final_model.pth', map_location=device))
model.to(device)
# This notebook only runs inference, so switch the module to evaluation mode.
# eval() returns the module, so the cell still displays the model summary.
model.eval()

Model(
  (lstm): LSTM(119, 128, batch_first=True)
  (fc1): Linear(in_features=128, out_features=1, bias=True)
)

In [8]:
# Reload the preprocessed frame from disk.
file = './data/230619_preprocessed.csv'
df = pd.read_csv(file, encoding='utf-8')
# Map any boolean values to the integers 1/0.
df = df.replace({True: 1, False: 0})


In [9]:
# Load the test-set fund pool and extract the symbols to predict.
test_symbols = pd.read_csv('./data/fund_pool.csv')
symbols = test_symbols['symbol'].values
# Show how many symbols the pool contains, then preview the table.
print(symbols.shape)
test_symbols.head()

(3443,)


Unnamed: 0,symbol
0,1
1,6
2,11
3,17
4,20


In [10]:
# Keep only the rows whose symbol belongs to the fund pool.
df_pred = df[df['symbol'].isin(symbols)]
# Number of distinct symbols left after filtering; compare with the pool size
# to confirm every pooled symbol has data.
df_pred['symbol'].unique().shape

(3443,)

In [11]:
# Build one (T, n_features) inference window per symbol from its most recent
# rows, zero-padding at the front when a symbol has fewer than T rows.
T = 50  # look-back window length, in rows (same value as in preprocessing)

grouped_dfs = df_pred.groupby('symbol')
symbol_dict = {}  # symbol -> position of that symbol's window in `features`
features = []
for i, (group_name, group_df) in enumerate(tqdm(grouped_dfs)):
    symbol_dict[group_name] = i

    xs = group_df.drop("weekly_return", axis=1).values
    X = np.array(xs[-T:])  # last T rows (fewer if the history is short)

    # Explicit length check instead of assert/except: asserts are stripped
    # under `python -O`, which would silently skip the padding.
    missing = T - X.shape[0]
    if missing > 0:
        X = np.concatenate((np.zeros((missing, X.shape[1])), X), axis=0)
    features.append(X)

features = np.array(features)
print(features.shape)

100%|██████████| 3443/3443 [00:04<00:00, 705.22it/s] 

(3443, 50, 119)





In [12]:
# Convert the feature windows to a float32 tensor and place it on whatever
# device the model's parameters live on, rather than assuming CUDA.
test_tensor = torch.from_numpy(features).float().to(next(model.parameters()).device)

In [13]:
# Predict returns with the model. Gradients are not needed for inference, so
# disable autograd to avoid building a graph over the whole batch.
with torch.no_grad():
    predictions = model(test_tensor)

In [14]:
# Detach from autograd, move to host memory, and convert to a NumPy array.
predictions = predictions.detach().cpu().numpy()

In [15]:
# Distinct predicted values in order of first appearance (flattened to 1-D).
pd.unique(predictions.reshape(-1))

array([0.0023409 , 0.00381846, 0.00278918, ..., 0.00207868, 0.00207868,
       0.00207868], dtype=float32)

In [16]:
# Attach each symbol's prediction. `symbol_dict` maps a symbol to the position
# of its window in the batch, which is also its position in `predictions`.
row_idx = test_symbols['symbol'].map(symbol_dict)
if row_idx.isna().any():
    # Same failure mode as a plain dict lookup, but listing every missing symbol.
    missing = test_symbols.loc[row_idx.isna(), 'symbol'].tolist()
    raise KeyError(f"no prediction available for symbols: {missing}")

# Flatten to 1-D so each entry is a scalar, and store as float64 to match the
# dtype the column had when it was initialised with 0.0.
test_symbols['prediction'] = (
    predictions.reshape(-1)[row_idx.to_numpy(dtype=int)].astype(np.float64)
)


In [17]:
# Inspect the per-symbol predictions that were just attached.
test_symbols['prediction']

0       0.002341
1       0.003818
2       0.002789
3       0.002761
4       0.002929
          ...   
3438    0.002079
3439    0.002079
3440    0.002079
3441    0.002079
3442    0.002079
Name: prediction, Length: 3443, dtype: float64

In [18]:
# Rank the funds by predicted return: rank 1 is the lowest prediction and ties
# are broken by row order (method='first').
# Note: this cell overwrites `prediction`, so re-running it requires
# re-running the cell that fills that column first.
test_symbols['rank_id'] = test_symbols['prediction'].rank(ascending=True, method='first').astype(int)

# Threshold at the 90th percentile, i.e. the cut-off for the top 10%.
threshold = np.percentile(test_symbols['prediction'], 90)

# Keep predictions strictly above the threshold and zero out the rest.
test_symbols.loc[test_symbols['prediction'] <= threshold, 'prediction'] = 0

# Normalise the surviving predictions into portfolio weights that sum to 1.
total_weight = test_symbols['prediction'].sum()
if not np.isfinite(total_weight) or total_weight == 0:
    # Dividing by such a sum would silently write NaN/inf weights.
    raise ValueError(
        f"cannot normalise portfolio weights: selected predictions sum to {total_weight}"
    )
test_symbols['portfolio'] = test_symbols['prediction'] / total_weight

In [19]:
# Display the table with the rank and portfolio weight columns.
test_symbols

Unnamed: 0,symbol,prediction,rank_id,portfolio
0,1,0.000000,3074,0.000000
1,6,0.003818,3363,0.003443
2,11,0.002789,3238,0.002515
3,17,0.002761,3229,0.002489
4,20,0.002929,3271,0.002641
...,...,...,...,...
3438,970068,0.000000,2704,0.000000
3439,970073,0.000000,2705,0.000000
3440,970094,0.000000,2706,0.000000
3441,970101,0.000000,2707,0.000000


In [20]:
# Save the result as a CSV file: only the submission columns, without the row index.
test_symbols.to_csv(
    'predict_table.csv',
    columns=['symbol', 'portfolio', 'rank_id'],
    index=False,
)

In [21]:
# Validate the written submission against the fund pool with the project helper.
# NOTE(review): the pool is read from './data/fund_pool.csv' elsewhere in this
# notebook but is passed here as 'fund_pool.csv' — confirm how check_submission
# resolves this path.
check_submission('predict_table.csv', 'fund_pool.csv')

'The submission file is valid.'

In [22]:
def compare_files(file1_path, file2_path):
    """Return True when both files have identical text content, else False.

    Each file is opened in text mode and read completely before comparing.
    """
    with open(file1_path, 'r') as first, open(file2_path, 'r') as second:
        return first.read() == second.read()

# Check that the freshly written submission matches the reference result file.
file1_path = 'predict_table.csv'
file2_path = 'final_result.csv'

result = compare_files(file1_path, file2_path)

# Print the "files are identical" message on a match, the "files differ" one otherwise.
print("两个文件相同" if result else "两个文件不同")


两个文件相同
