In [7]:
import pandas as pd

In [8]:
# Load the two step-1 tables; the first CSV column is used as the index.
finance, returns = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/step1/finance.csv', '../data/step1/returns.csv')
)
# Parse the 'Time' column of both tables into datetimes (later code filters on `.dt`).
for df in (finance, returns):
    df['Time'] = pd.to_datetime(df['Time'])

In [9]:
# Portfolio-formation years: one labelled CSV is written per year in T.
T = list(range(2012, 2022))


def rows_at(frame, year, month):
    """Return the rows of `frame` whose 'Time' falls in the given calendar year and month.

    `frame` must have a datetime-typed 'Time' column.
    """
    when = frame['Time'].dt
    return frame[(when.year == year) & (when.month == month)]


def label_by_quantiles(values, quantiles, labels):
    """Assign each value to a group delimited by quantile breakpoints of `values`.

    Parameters
    ----------
    values : pd.Series
        Numeric indicator values (expected to contain no NaN).
    quantiles : sequence of float
        Ascending quantile levels, e.g. ``[0.3, 0.7]``.
    labels : sequence of str
        ``len(quantiles) + 1`` group labels, lowest group first.

    Returns
    -------
    pd.Series
        Object-dtype labels aligned with ``values.index``. A value strictly
        below the i-th breakpoint (and not below any lower one) receives
        ``labels[i]``; values at or above the last breakpoint receive
        ``labels[-1]``.
    """
    result = pd.Series(labels[-1], index=values.index, dtype=object)
    # Apply breakpoints from highest to lowest so that lower groups overwrite
    # higher ones, reproducing an ``if < q_lo ... elif < q_hi ... else`` chain.
    for level, label in reversed(list(zip(quantiles, labels))):
        result = result.mask(values < values.quantile(level), label)
    return result


for t in T:

    # To build the factors, first compute each indicator for period t and group
    # stocks on the indicators' quantile breakpoints; the groups define the
    # portfolios for period t (July of year t through June of year t+1).
    #   Size: float market cap of stock i at the end of June of year t.
    #   BM:   "book value / float market cap of stock i" at the end of year t-1.
    #   OP:   "operating profit / total shareholders' equity" at the end of year t-1.
    #   INV:  increase in total assets from end of year t-2 to end of year t-1,
    #         divided by total assets at the end of year t-2.

    # Year-end (December) financial rows, filtered once and reused below.
    fin_prev = rows_at(finance, t - 1, 12)
    fin_prev2 = rows_at(finance, t - 2, 12)

    # Size
    # NOTE(review): the description above says end of June, but month 7 is
    # selected here. Kept as-is; confirm how step 1 timestamps 'Msmvosd'.
    size = rows_at(returns, t, 7)[['Stkcd', 'Msmvosd']].rename(columns={'Msmvosd': 'Size'})

    # BM
    # NOTE(review): the market cap is taken from January of year t rather than
    # the end of year t-1, and 'A001000000' (used as total assets for INV below)
    # serves as the book value. Kept as-is; confirm both against the data
    # dictionary.
    market_cap = rows_at(returns, t, 1)[['Stkcd', 'Msmvosd']]
    bm = pd.merge(fin_prev[['Stkcd', 'A001000000']], market_cap, on='Stkcd')
    bm = bm.assign(BM=bm['A001000000'] / bm['Msmvosd'])[['Stkcd', 'BM']]

    # OP
    # Both inputs come from the same year-end rows, so the ratio is computed
    # directly instead of self-merging the frame on 'Stkcd'
    # (assumes one row per 'Stkcd' per year-end -- TODO confirm).
    op = fin_prev[['Stkcd']].assign(OP=fin_prev['B001300000'] / fin_prev['A003000000'])

    # INV
    assets = pd.merge(
        fin_prev[['Stkcd', 'A001000000']].rename(columns={'A001000000': 'A1'}),
        fin_prev2[['Stkcd', 'A001000000']].rename(columns={'A001000000': 'A2'}),
        on='Stkcd',
    )
    inv = assets.assign(INV=(assets['A1'] - assets['A2']) / assets['A2'])[['Stkcd', 'INV']]

    # Merge Stkcd, Size, BM, OP, INV (inner joins: only stocks with all four indicators).
    indicators = size.merge(bm, on='Stkcd').merge(op, on='Stkcd').merge(inv, on='Stkcd')

    # A missing indicator compares False against every breakpoint and would
    # silently land in the top group, so such rows are dropped before labelling.
    indicators = indicators.dropna(subset=['Size', 'BM', 'OP', 'INV']).reset_index(drop=True)

    # Label: Size is split at the median into A (small) / B (big); BM, OP and
    # INV are split at the 30% / 70% quantiles into a / b / c.
    labelled = pd.DataFrame({'Stkcd': indicators['Stkcd']})
    labelled['Size'] = label_by_quantiles(indicators['Size'], [0.5], ['A', 'B'])
    for item in ['BM', 'OP', 'INV']:
        labelled[item] = label_by_quantiles(indicators[item], [0.3, 0.7], ['a', 'b', 'c'])

    labelled.to_csv('../data/step2/{}.csv'.format(t), index=False)