In [None]:
import numpy as np
import pandas as pd

# 获取基础数据

In [None]:
# 这里我们以中证500成份股为股票池
# 日期：20190102 - 20220630
# csv里为按日期和股票整理好的因子矩阵，是真实数据，因子值未经预处理

In [None]:
# Load the ROA data; the first CSV column is used as the row index.
roa = pd.read_csv(filepath_or_buffer='data/roa_ttm_zz500.csv', index_col=0)
# Preview the first two rows.
roa.iloc[:2]

In [None]:
# Check whether the frame contains any missing value at all
# (axis=None reduces over rows and columns in one step).
roa.isna().any(axis=None)

In [None]:
# Load the CITIC level-1 industry classification info; the first CSV column is used as the row index.
industry_info = pd.read_csv(filepath_or_buffer='data/info_zz500.csv', index_col=0)
# Preview the first two rows.
industry_info.iloc[:2]

# 数据预处理

In [None]:
# Before preprocessing: transpose so that describe() summarizes each original row.
roa.transpose().describe()

## 去极值

In [None]:
# Define the function. The per-row work is done with vectorized pandas
# operations rather than an explicit Python loop over rows.

def winsorize(df, n_mad=5):
    """Clip each row to within ``n_mad`` median absolute deviations of its median.

    For every row, the median and the median absolute deviation (MAD, i.e.
    the median of ``|x - median|``) are computed with pandas' default
    NaN-skipping reductions. Values outside ``median ± n_mad * MAD`` are
    replaced by the nearest bound; NaN entries stay NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data; each row is winsorized independently.
    n_mad : float, default 5
        Half-width of the allowed band, in units of the row's MAD.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``.

    Notes
    -----
    If a row's MAD is 0 (e.g. most of its values are identical), the band
    collapses and every value in that row is clipped to the row median.
    """
    # Nothing to clip; the original loop-based version failed on a frame
    # with no rows because there was nothing to concatenate.
    if df.empty:
        return df.copy()

    row_median = df.median(axis=1)
    mad = df.sub(row_median, axis=0).abs().median(axis=1)

    lower = row_median - n_mad * mad
    upper = row_median + n_mad * mad
    # axis=0 matches the per-row bounds to the frame's index.
    return df.clip(lower=lower, upper=upper, axis=0)

In [None]:
# Apply winsorization to the raw ROA frame and preview the first three rows.
roa1 = roa.pipe(winsorize)
roa1.iloc[:3]

In [None]:
# Summary statistics after winsorization (transposed so each original row is summarized).
roa1.transpose().describe()

## 标准化

In [None]:
# Define the function. The per-row work is done with vectorized pandas
# operations rather than an explicit Python loop over rows.

def standardize(df):
    """Z-score each row: subtract the row mean and divide by the row standard deviation.

    The mean and standard deviation use pandas' defaults (NaN values are
    skipped; the standard deviation is the sample one, ``ddof=1``). NaN
    entries stay NaN in the output.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data; each row is standardized independently.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``.

    Notes
    -----
    A row whose standard deviation is 0 or undefined (constant row, or
    fewer than two non-NaN values) is not special-cased: the division
    yields NaN/inf for that row, which callers must handle downstream.
    """
    # Nothing to scale; the original loop-based version failed on a frame
    # with no rows because there was nothing to concatenate.
    if df.empty:
        return df.copy()

    mu = df.mean(axis=1)
    sigma = df.std(axis=1)
    # axis=0 matches the per-row statistics to the frame's index.
    return df.sub(mu, axis=0).div(sigma, axis=0)

In [None]:
# Standardize the winsorized frame and preview the first three rows.
roa2 = roa1.pipe(standardize)
roa2.iloc[:3]

In [None]:
# Summary statistics after standardization (transposed so each original row is summarized).
roa2.transpose().describe()

## 缺失值处理（0填充）

In [None]:
# Replace remaining missing values with 0, then summarize each original row.
roa3 = roa2.fillna(value=0)
roa3.transpose().describe()