# 因子选股

In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import akshare as ak
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [None]:
# Fetch stock data
def _to_yyyymmdd(value, fallback):
    """Return *value* as a ``YYYYMMDD`` string, using *fallback* when it is ``None``."""
    if value is None:
        value = fallback
    if isinstance(value, (dt.datetime, dt.date)):
        return value.strftime("%Y%m%d")
    return str(value)


def _lookup_info(df_info, *labels):
    """Return the ``value`` of the first ``item`` row matching one of *labels*.

    Raises ``KeyError`` when none of the labels is present, so the caller's
    per-stock ``except`` can skip the stock.
    """
    for label in labels:
        hit = df_info.loc[df_info['item'] == label, 'value']
        if not hit.empty:
            return hit.iloc[0]
    raise KeyError(f"none of {labels} found in stock info")


def fetch_stock_data(stock_codes, start_date=None, end_date=None):
    """
    Download and combine daily bars, float shares, financials and ST/industry
    information for a list of A-share stocks.

    :param stock_codes: list of codes with exchange suffix, e.g. ['600519.SH', '300750.SZ']
    :param start_date: first date, ``YYYYMMDD`` string or date/datetime object;
                       defaults to five years before the call
    :param end_date: last date, ``YYYYMMDD`` string or date/datetime object;
                     defaults to the moment of the call
    :return: one DataFrame sorted by (code, trade_date) with columns
             trade_date, code, open, high, low, close, volume, amount, vwap,
             turnover_rate, is_st, industry, report_date, total_assets,
             net_profit, operating_revenue, bps, debt_to_equity.
             An empty frame with those columns is returned when nothing could
             be downloaded.

    Stocks whose download or processing fails are reported on stdout and skipped.
    """
    # Resolve the defaults at call time: a default evaluated in the signature
    # is frozen when the module is imported.
    now = dt.datetime.now()
    start_date = _to_yyyymmdd(start_date, now - dt.timedelta(days=365 * 5))
    end_date = _to_yyyymmdd(end_date, now)

    keep_cols = ['trade_date', 'code', 'open', 'high', 'low', 'close', 'vol',
                 'amount', 'vwap', 'turnover_rate', 'is_st', 'industry', 'report_date',
                 'total_assets', 'n_income', 'revenue', 'bps', 'debt_to_equity']
    renames = {
        'vol': 'volume',
        'n_income': 'net_profit',
        'revenue': 'operating_revenue'
    }

    all_data = []

    for code in tqdm(stock_codes, desc='正在获取数据'):
        try:
            # AKShare is queried with the bare numeric symbol (no exchange suffix)
            symbol = code.split('.')[0]

            # === 1. Daily bars ===
            # NOTE(review): `adj='qfq'` is passed through as in the original code;
            # confirm that pro.daily honours it (otherwise prices are unadjusted).
            df_price = pro.daily(ts_code=code, start_date=start_date, end_date=end_date, adj='qfq')
            df_price['trade_date'] = pd.to_datetime(df_price['trade_date'], format='%Y%m%d')
            df_price = df_price.sort_values('trade_date')

            # === 2. VWAP ===
            # NOTE(review): this is a cumulative close*volume average since the
            # first row, not an intraday VWAP — confirm which one is intended.
            df_price['vwap'] = (df_price['close'] * df_price['vol']).cumsum() / df_price['vol'].cumsum()

            # === 3. Float shares (for the turnover rate) ===
            df_share = pro.daily_basic(ts_code=code, start_date=start_date, end_date=end_date,
                                       fields='trade_date,float_share')
            df_share['trade_date'] = pd.to_datetime(df_share['trade_date'], format='%Y%m%d')
            df_price = df_price.merge(df_share, on='trade_date', how='left')
            # NOTE(review): assumes vol and float_share use compatible units — verify.
            df_price['turnover_rate'] = df_price['vol'] / df_price['float_share'] * 100

            # === 4. Financial statements ===
            df_balance = pro.balancesheet(ts_code=code, start_date=start_date,
                                          fields='end_date,total_assets,total_liab')
            df_balance['report_date'] = pd.to_datetime(df_balance['end_date'], format='%Y%m%d')

            df_income = pro.income(ts_code=code, start_date=start_date,
                                   fields='end_date,revenue,n_income')
            df_income['report_date'] = pd.to_datetime(df_income['end_date'], format='%Y%m%d')

            df_finance = pd.merge(df_balance, df_income, on='report_date', how='outer')
            df_finance['debt_to_equity'] = df_finance['total_liab'] / (df_finance['total_assets'] - df_finance['total_liab'])
            # merge_asof needs a sorted key; keep one row per report date
            df_finance = (df_finance.dropna(subset=['report_date'])
                                    .sort_values('report_date')
                                    .drop_duplicates('report_date', keep='last'))

            # === 5. ST flag and industry (AKShare) ===
            # NOTE(review): the item labels may differ between akshare versions,
            # so both spellings are tried — confirm against the installed version.
            df_info = ak.stock_individual_info_em(symbol=symbol)
            is_st = "ST" in str(_lookup_info(df_info, '证券简称', '股票简称'))
            industry = _lookup_info(df_info, '所属行业', '行业')

            # === 6. Combine ===
            # Attach to every trading day the latest report whose date is not
            # later than that day.  An exact-date join would drop every report
            # whose period end is not a trading day.
            # NOTE(review): report_date is the period end, not the announcement
            # date, so values appear before they were public — consider ann_date.
            df_full = pd.merge_asof(df_price, df_finance,
                                    left_on='trade_date', right_on='report_date',
                                    direction='backward')
            df_full['is_st'] = is_st
            df_full['industry'] = industry
            df_full['code'] = code
            # Placeholder ratio kept from the original; a real book value per
            # share still has to be computed.
            df_full['bps'] = df_full['total_assets'] / df_full['total_liab']

            all_data.append(df_full[keep_cols].rename(columns=renames))

        except Exception as e:
            print(f"股票 {code} 获取失败: {str(e)}")
            continue

    if not all_data:
        # Nothing was downloaded: return an empty frame instead of letting
        # pd.concat raise on an empty list.
        return pd.DataFrame(columns=[renames.get(col, col) for col in keep_cols])

    final_df = (pd.concat(all_data, ignore_index=True)
                  .sort_values(['code', 'trade_date'])
                  .reset_index(drop=True))

    # Forward-fill within each stock.  GroupBy.ffill keeps the original row
    # index and leaves the `code` column itself untouched.
    value_cols = [col for col in final_df.columns if col != 'code']
    final_df[value_cols] = final_df.groupby('code')[value_cols].ffill()

    return final_df





def del_cy_kc(tickers):
    """Drop ChiNext/STAR-market codes and add exchange suffixes, in place.

    Rules, applied to the ``code`` column of *tickers*:

    - codes already ending in ``.SH`` or ``.SZ`` are kept unchanged;
    - codes starting with ``300`` or ``688`` are removed;
    - remaining codes starting with ``6`` get ``.SH`` appended;
    - remaining codes starting with ``0`` get ``.SZ`` appended;
    - every other code is removed.

    :param tickers: DataFrame with a ``code`` column
    :return: the same DataFrame object, modified in place
    """
    # Classify every row up front with vectorised string tests instead of
    # dropping rows from the frame while iterating over its column.
    codes = tickers['code'].astype(str)

    has_suffix = codes.str.endswith('.SH') | codes.str.endswith('.SZ')
    excluded_board = codes.str.startswith('300') | codes.str.startswith('688')
    plain = ~has_suffix & ~excluded_board
    to_sh = plain & codes.str.startswith('6')
    to_sz = plain & codes.str.startswith('0')
    keep = has_suffix | to_sh | to_sz

    # .to_numpy() makes the assignment positional, so duplicate index labels
    # cannot break the alignment.
    if to_sh.any():
        tickers.loc[to_sh, 'code'] = (codes[to_sh] + '.SH').to_numpy()
    if to_sz.any():
        tickers.loc[to_sz, 'code'] = (codes[to_sz] + '.SZ').to_numpy()

    tickers.drop(index=tickers.index[~keep], inplace=True)
    return tickers
        


关于因子计算所需要的数据
- 交易日:trade_date
- 股票代码:code
- 开盘价（前复权）:open
- 最高价（前复权）:high
- 最低价（前复权）:low
- 收盘价（前复权）:close
- 成交量（股）:volume
- 成交金额（元）:amount
- 日内成交量加权平均价:vwap
- 换手率:turnover_rate
- 是否是ST股:is_st
- 财报报告期截止日（取自财报的 end_date，并非公告日）:report_date
- 公司总资产:total_assets
- 净利润:net_profit
- 营业收入:operating_revenue
- 基本每股收益:eps_basic
- 每股净资产（book value per share）:bps
- 债务股本比:debt_to_equity

In [None]:
def IC(df,factors,target='future_return'):
    """Placeholder, presumably for an information-coefficient calculation.

    Not implemented yet: all arguments are accepted but ignored and ``None``
    is returned.
    """
    return None

# 计算因子
import pandas as pd
import numpy as np
from scipy.stats import rankdata

def calculate_wq_factors(df):
    """
    Compute five alpha factors for a multi-stock daily panel.

    Input: the DataFrame returned by ``fetch_stock_data``; it must contain the
    columns trade_date, code, open, close, volume, vwap and industry.
    Output: a copy of *df* with the columns alpha6, alpha42, alpha53, alpha70
    and alpha4 appended, rows in the same order as the input.

    Every time-series window is evaluated per stock (grouped by ``code`` and
    ordered by ``trade_date``), so windows never span two stocks.
    Cross-sectional steps are evaluated per ``trade_date``.

    :raises KeyError: if a required column is missing
    """
    required = ['trade_date', 'code', 'open', 'close', 'volume', 'vwap', 'industry']
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"calculate_wq_factors: missing required columns {missing}")

    factor_names = ['alpha6', 'alpha42', 'alpha53', 'alpha70', 'alpha4']
    result = df.copy()
    if df.empty:
        for name in factor_names:
            result[name] = np.nan
        return result

    # Work on a positional index so the factors can be written back row by row
    # even when df.index is unsorted or contains duplicates.
    work = df[required].reset_index(drop=True)
    work = work.sort_values(['code', 'trade_date'], kind='mergesort')
    by_code = work.groupby('code', sort=False)

    factors = pd.DataFrame(index=work.index)

    # === Alpha #6: price/volume divergence ===
    # Negative 10-day correlation of open and volume.  shift(1) reproduces the
    # original window, which covered the 10 rows *before* the current one.
    roll_window = 10
    corr_parts = [
        grp['open'].rolling(roll_window).corr(grp['volume']).shift(1)
        for _, grp in by_code
    ]
    corr = pd.concat(corr_parts) if corr_parts else pd.Series(dtype=float)
    factors['alpha6'] = -corr

    # === Alpha #42: VWAP momentum ===
    # Cross-sectional rank of (vwap - close) on each date, then a 5-day mean
    # of that rank per stock.
    vwap_diff = work['vwap'] - work['close']
    cs_rank = vwap_diff.groupby(work['trade_date']).rank()
    factors['alpha42'] = cs_rank.groupby(work['code'], sort=False).transform(
        lambda s: s.rolling(5).mean())

    # === Alpha #53: liquidity reversal ===
    # 5-day return times the 20-day mean volume, per stock.
    ret_5 = work['close'] / by_code['close'].shift(5) - 1
    vol_ma20 = by_code['volume'].transform(lambda s: s.rolling(20).mean())
    factors['alpha53'] = ret_5 * vol_ma20

    # === Alpha #70: industry momentum ===
    # 5-day return of the stock minus the mean 5-day return of its industry on
    # the same date.
    industry_mean = ret_5.groupby([work['trade_date'], work['industry']]).transform('mean')
    factors['alpha70'] = ret_5 - industry_mean

    # === Alpha #4: volatility adjustment ===
    # 10-day standard deviation of close divided by its 10-day mean.
    std_10 = by_code['close'].transform(lambda s: s.rolling(10).std())
    ma_10 = by_code['close'].transform(lambda s: s.rolling(10).mean())
    factors['alpha4'] = std_10 / ma_10

    # Back to the caller's row order, then attach positionally.
    factors = factors.sort_index()
    for name in factor_names:
        result[name] = factors[name].to_numpy()
    return result


# Orthogonalise the factors with PCA to reduce multicollinearity
def factor_pca_optimization(df, n_components=3):
    """
    Standardise the five alpha factors and replace them with principal components.

    Input: DataFrame containing alpha6, alpha42, alpha53, alpha70 and alpha4.
    Output: a new DataFrame holding the rows with complete, finite factor
    values, with the columns pca_factor_1 .. pca_factor_k appended.  The
    explained-variance ratios are printed.

    :param n_components: passed straight to ``sklearn.decomposition.PCA`` —
                         an int keeps that many components, a float in (0, 1)
                         keeps enough components to explain that share of the
                         variance.  The default of 3 yields the
                         pca_factor_1..3 columns the later pipeline stages read.
    :raises ValueError: if no row has complete factor values
    """
    factor_cols = ['alpha6', 'alpha42', 'alpha53', 'alpha70', 'alpha4']

    # 1. Clean: treat +/-inf (e.g. from a division by zero) like missing data,
    #    and copy so that adding columns below never writes into the caller's frame.
    df_clean = df.copy()
    df_clean[factor_cols] = df_clean[factor_cols].replace([np.inf, -np.inf], np.nan)
    df_clean = df_clean.dropna(subset=factor_cols)
    if df_clean.empty:
        raise ValueError("factor_pca_optimization: no rows with complete factor values")

    # 2. Standardise
    X_scaled = StandardScaler().fit_transform(df_clean[factor_cols])

    # 3. PCA, honouring the caller's n_components
    pca = PCA(n_components=n_components)
    pca_features = pca.fit_transform(X_scaled)

    # 4. One column per retained component
    for i in range(pca.n_components_):
        df_clean[f'pca_factor_{i+1}'] = pca_features[:, i]

    # Report the explained variance
    print(f"各主成分解释方差比: {pca.explained_variance_ratio_}")

    return df_clean

# Blend the PCA factors with weights that depend on the volatility regime
def dynamic_weight(df):
    """
    Add ``volatility``, ``vol_bin`` and ``weight`` columns to *df* (in place).

    - ``volatility``: 20-row rolling standard deviation of ``close``, computed
      per stock when a ``code`` column is present so windows never span two stocks.
    - ``vol_bin``: tercile of ``volatility`` over the whole frame
      ('low' / 'mid' / 'high').
    - ``weight``: 0.6*pca_factor_1 + 0.4*pca_factor_2 in the 'high' bin,
      otherwise 0.3*pca_factor_1 + 0.7*pca_factor_3 (rows without a
      volatility value use the second formula).

    :param df: DataFrame with close, pca_factor_1, pca_factor_2, pca_factor_3
               and optionally code
    :return: the same DataFrame object
    """
    # Volatility
    if 'code' in df.columns:
        df['volatility'] = df.groupby('code')['close'].transform(
            lambda s: s.rolling(20).std())
    else:
        df['volatility'] = df['close'].rolling(20).std()

    # Volatility terciles
    df['vol_bin'] = pd.qcut(df['volatility'], q=3, labels=['low','mid','high'])

    # High volatility leans on pca_factor_2, everything else on pca_factor_3.
    # NOTE(review): despite its name, `weight` holds the blended factor value.
    df['weight'] = np.where(df['vol_bin'] == 'high',
                            df['pca_factor_1']*0.6 + df['pca_factor_2']*0.4,
                            df['pca_factor_1']*0.3 + df['pca_factor_3']*0.7)
    return df

# Industry- and size-neutralise a factor column
def neutralize_factor(df, factor_col):
    """
    Add ``size`` and ``<factor_col>_neutral`` columns to *df* (in place).

    - industry-neutral part: factor minus its mean within (trade_date, industry);
    - size-neutral part: factor minus the log-market-cap-weighted mean of the
      factor on the same trade_date;
    - ``<factor_col>_neutral`` is the average of the two parts.

    Both parts are computed with index-preserving transforms so they line up
    row by row with *df*.

    :param df: DataFrame with trade_date, industry, market_cap and *factor_col*
    :param factor_col: name of the factor column to neutralise
    :return: the same DataFrame object
    """
    factor = df[factor_col]

    # Industry neutral
    ind_neutral = factor - factor.groupby([df['trade_date'], df['industry']]).transform('mean')

    # Size neutral (needs market-cap data)
    df['size'] = np.log(df['market_cap'])
    size = df['size']
    weighted_sum = (size * factor).groupby(df['trade_date']).transform('sum')
    size_sum = size.groupby(df['trade_date']).transform('sum')
    # NOTE(review): this removes a size-weighted mean; it is not a regression
    # residual on size — confirm that this is the intended neutralisation.
    size_neutral = factor - weighted_sum / size_sum

    df[f'{factor_col}_neutral'] = (ind_neutral + size_neutral)/2
    return df



def xgb_enhance(df):
    """
    Fit an XGBoost regressor on the three PCA factors and score every row.

    The target is the 5-row-ahead return of ``close``, computed per stock when
    a ``code`` column is present so it never compares two different stocks.
    Rows without a target (the last five of each stock) are excluded from
    training but still receive a prediction.

    :param df: DataFrame with close, pca_factor_1, pca_factor_2, pca_factor_3
               and optionally code
    :return: the same DataFrame object with an ``xgb_score`` column added
    """
    X = df[['pca_factor_1','pca_factor_2','pca_factor_3']]

    # Forward 5-row return
    if 'code' in df.columns:
        future_close = df.groupby('code')['close'].shift(-5)
    else:
        future_close = df['close'].shift(-5)
    y = future_close / df['close'] - 1

    # Keep X and y the same length: train only where the target exists.
    has_target = y.notna()

    # NOTE(review): shuffle=False keeps the frame's row order; that is only a
    # chronological split if the rows are ordered by date — confirm.
    X_train, _, y_train, _ = train_test_split(
        X[has_target], y[has_target], test_size=0.2, shuffle=False)

    model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
    model.fit(X_train, y_train)

    # Scores cover all rows, including the ones the model was trained on.
    df['xgb_score'] = model.predict(X)
    return df

# 这是对因子法的测试

In [None]:
# End-to-end pipeline example: each stage consumes the previous stage's output
raw_data = fetch_stock_data(['600519.SH','000001.SZ'])  # data download function
df_factors = calculate_wq_factors(raw_data)           # compute the alpha factors
df_pca = factor_pca_optimization(df_factors)          # PCA dimensionality reduction
df_weight = dynamic_weight(df_pca)                    # volatility-dependent weighting
df_neutral = neutralize_factor(df_weight, 'pca_factor_1') # neutralisation
df_final = xgb_enhance(df_neutral)                    # non-linear enhancement

# Final signal: 70% XGBoost score blended with 30% neutralised first PCA factor
df_final['final_signal'] = df_final['xgb_score'] * 0.7 + df_final['pca_factor_1_neutral'] * 0.3

In [None]:
# Backtest framework
class BacktestEngine:
    """Daily, equal-weight, top-N long-only backtester.

    On every trade date after the first, the ``max_position`` stocks with the
    highest ``signal`` are selected and the portfolio is rebalanced to equal
    weights at that date's close.  Stocks that cannot be traded (no price on
    the date, or a move beyond ``stop_limit_rate``) are neither bought nor
    sold: existing holdings in them are carried unchanged.

    Results are stored in ``self.portfolio`` (one row per trade date with the
    columns ``cash``, ``value`` and ``positions``, where ``positions`` is a
    ``{code: shares}`` dict).
    """

    def __init__(self, data, initial_capital=1e6):
        """
        :param data: DataFrame that must contain the columns
                    - trade_date: date
                    - code: stock code
                    - close: closing price
                    - signal: factor signal value
                    - is_st: ST flag
                    - is_suspended: trading-halt flag (True/False)
        :param initial_capital: starting cash
        """
        # sort_values returns a new frame, so the caller's data is not mutated
        self.data = data.sort_values(['trade_date', 'code'])
        self.initial_capital = initial_capital
        self.trade_dates = self.data['trade_date'].unique()
        self.n_dates = len(self.trade_dates)

        # Parameters
        self.commission_rate = 0.0005  # one-way commission + stamp duty
        self.stop_limit_rate = 0.097    # daily price-limit threshold
        self.leverage = 1.0            # leverage
        self.max_position = 10         # maximum number of holdings

        # Last seen close per code, used to value holdings on days without a price
        self._last_prices = {}

    def run(self):
        """Run the backtest and return the performance statistics.

        :raises ValueError: if the data contains no trade dates
        """
        if self.n_dates == 0:
            raise ValueError("BacktestEngine.run: data contains no trade dates")

        self._prepare_data()
        self._init_portfolio()

        for i in tqdm(range(1, self.n_dates)):
            current_date = self.trade_dates[i]

            # Cash carried over from the previous trade date
            current_cash = self.portfolio['cash'].iloc[i - 1]

            # NOTE(review): the signal of `current_date` is traded at the same
            # date's close; the original comment described T-close signal with
            # T+1-open execution — confirm which is intended.
            signal_df = self._generate_signal(current_date)

            self._rebalance_portfolio(current_date, signal_df, current_cash)

        return self._calculate_stats()

    def _prepare_data(self):
        """Drop ST and suspended rows, then add the per-stock daily return."""
        tradable = self.data['is_st'].eq(False) & self.data['is_suspended'].eq(False)
        # .copy() so that adding the 'return' column does not write into a slice
        self.data = self.data.loc[tradable].copy()
        self.data['return'] = self.data.groupby('code')['close'].pct_change()

    def _init_portfolio(self):
        """Create the portfolio log and record the starting state on the first date."""
        self.portfolio = pd.DataFrame(index=self.trade_dates,
                                      columns=['cash', 'value', 'positions'])
        self._last_prices = {}
        if self.n_dates:
            self._record(0, self.initial_capital, self.initial_capital, {})

    def _record(self, row, cash, value, positions):
        """Write one portfolio row cell by cell (a dict can then be stored as a value)."""
        cols = self.portfolio.columns
        self.portfolio.iat[row, cols.get_loc('cash')] = cash
        self.portfolio.iat[row, cols.get_loc('value')] = value
        self.portfolio.iat[row, cols.get_loc('positions')] = positions

    def _generate_signal(self, date):
        """Return equal target weights for the top ``max_position`` signals on *date*.

        The result is a Series indexed by stock code; it is empty when no
        stock has a usable signal on that date.
        """
        daily_data = self.data[self.data['trade_date'] == date]
        signals = daily_data.set_index('code')['signal']

        # Winsorise, then rank (1 = strongest signal)
        signals = self._winsorize(signals)
        ranked = signals.rank(ascending=False, method='first')
        selected = ranked[ranked <= self.max_position].index.tolist()

        if not selected:
            return pd.Series(dtype=float)
        return pd.Series(1/len(selected), index=selected)

    def _rebalance_portfolio(self, date, signal, current_cash):
        """Rebalance to *signal* at the closes of *date* and log the result.

        :param date: trade date being processed
        :param signal: Series of target weights indexed by stock code
        :param current_cash: cash carried over from the previous trade date
        """
        row = self.portfolio.index.get_loc(date)

        # Holdings come from the previous row; the row for `date` is still empty.
        held = self.portfolio['positions'].iloc[row - 1] if row > 0 else {}
        if not isinstance(held, dict):
            held = {}

        # Closing prices of the date, indexed by stock code
        today = self.data[self.data['trade_date'] == date]
        prices = today.drop_duplicates('code', keep='last').set_index('code')['close'].dropna()
        self._last_prices.update(prices.to_dict())

        def mark(code):
            # Latest known close; 0 if the code has never been priced
            return self._last_prices.get(code, 0.0)

        # Evaluate tradability once per code (each check scans the data)
        candidates = list(held) + [code for code in signal.index if code not in held]
        can_trade = {
            code: bool(code in prices.index
                       and prices[code] > 0
                       and not self._is_stop_limit(code, date))
            for code in candidates
        }

        # Size the portfolio from total equity, not from cash alone
        equity = current_cash + sum(shares * mark(code) for code, shares in held.items())

        # Holdings that cannot be traded today are carried unchanged
        new_pos = {code: shares for code, shares in held.items() if not can_trade[code]}
        locked_value = sum(shares * mark(code) for code, shares in new_pos.items())
        investable = max(equity * self.leverage - locked_value, 0.0)

        # Target shares for tradable selections; the weight of an untradable
        # selection simply stays in cash.
        targets = {
            code: investable * weight / prices[code]
            for code, weight in signal.items()
            if can_trade[code]
        }

        total_commission = 0.0
        for code in candidates:
            if not can_trade[code]:
                continue
            current_shares = held.get(code, 0)
            target_shares = targets.get(code, 0)

            # Commission on the traded notional (covers full exits as well)
            trade_amount = abs(target_shares - current_shares) * prices[code]
            total_commission += trade_amount * self.commission_rate

            if target_shares:
                new_pos[code] = target_shares

        # Cash may go slightly negative (commission, or leverage > 1)
        invested = sum(shares * mark(code) for code, shares in new_pos.items())
        new_cash = equity - invested - total_commission

        self._record(row, new_cash, invested + new_cash, new_pos)

    def _is_stop_limit(self, code, date):
        """Return True if *code* moved more than ``stop_limit_rate`` on *date*.

        Returns False when there is no row for the pair or the return is NaN.
        """
        daily_data = self.data[(self.data['code'] == code) &
                               (self.data['trade_date'] == date)]
        if daily_data.empty:
            return False
        ret = daily_data['return'].values[0]
        return bool(abs(ret) > self.stop_limit_rate)

    def _winsorize(self, series, sigma=3):
        """Clip *series* to mean +/- ``sigma`` standard deviations."""
        mean, std = series.mean(), series.std()
        return series.clip(lower=mean-sigma*std, upper=mean+sigma*std)

    def _calculate_stats(self):
        """Compute the performance statistics and plot the NAV curve."""
        # The log is an object-dtype frame; cast before doing arithmetic
        nav = self.portfolio['value'].astype(float) / self.initial_capital
        returns = nav.pct_change().fillna(0)

        final_nav = nav.iloc[-1]
        volatility = returns.std()
        if volatility > 0:
            sharpe = returns.mean() / volatility * np.sqrt(252)
        else:
            sharpe = np.nan  # undefined for a flat or single-point NAV

        stats = {
            'Total Return': final_nav - 1,
            'Annualized Return': final_nav**(252/len(nav)) - 1,
            'Sharpe Ratio': sharpe,
            'Max Drawdown': (nav / nav.cummax() - 1).min(),
            'Turnover Rate': self._calculate_turnover()
        }

        # Plot the NAV curve
        plt.figure(figsize=(10,6))
        nav.plot(title='Strategy Net Asset Value')
        plt.xlabel('Date')
        plt.ylabel('NAV')
        plt.grid(True)

        return pd.Series(stats)

    def _calculate_turnover(self):
        """Mean number of names entering or leaving per date, over ``max_position``.

        Returns 0.0 when fewer than two dates are logged.
        """
        holdings = [
            set(pos) if isinstance(pos, dict) else set()
            for pos in self.portfolio['positions']
        ]
        turnover = [
            len(prev.symmetric_difference(curr)) / self.max_position
            for prev, curr in zip(holdings[:-1], holdings[1:])
        ]
        return float(np.mean(turnover)) if turnover else 0.0

# 混合专家模型预测

In [None]:
from keras.models import Sequential
from keras.layers import LSTM,Dense,Attention
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.arima.model import ARIMA

In [None]:
# Feature engineering
def build_features(df):
    """Drop every row that contains a missing value, in place, and return *df*.

    Feature construction itself is still to be written.
    """
    # TODO: build the actual features before the clean-up below
    df.dropna(axis=0, how='any', inplace=True)
    return df

In [None]:
# Build the LSTM model
def build_lstm_model(input_shape):
    """Build and compile an LSTM + self-attention regressor.

    Architecture: LSTM(30, full sequence) -> self-attention over the LSTM
    outputs -> average over time -> Dense(1).  The model is compiled with the
    Adam optimiser and mean-squared-error loss.

    The Keras ``Attention`` layer takes a list of inputs ``[query, value]``,
    so it cannot be stacked as a single-input layer inside a ``Sequential``
    model; the functional API is used instead.

    :param input_shape: ``(timesteps, features)`` of one sample
    :return: a compiled Keras model producing one value per sample
    """
    # Local imports: these names are not part of the file's top-level keras imports.
    from keras.layers import GlobalAveragePooling1D, Input
    from keras.models import Model

    inputs = Input(shape=input_shape)
    sequence = LSTM(30, return_sequences=True)(inputs)   # 30 units, one output per timestep
    attended = Attention()([sequence, sequence])         # self-attention: query = value
    pooled = GlobalAveragePooling1D()(attended)          # collapse the time axis
    outputs = Dense(1)(pooled)                           # single regression output

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Build the random-forest model
def build_random_forest_model():
    """Return an unfitted RandomForestRegressor with 100 trees and random_state 42."""
    return RandomForestRegressor(random_state=42, n_estimators=100)

# Build the ARIMA model
def build_arima(series,order=(1, 1, 1)):
    """Return an unfitted ARIMA model for *series* with the given ``order``."""
    return ARIMA(series, order=order)

def hybrid_predict(lstm_pred,rf_pred,arima_pred):
    """Combine the three model predictions with an equal-weight average.

    A weighted average or another combination rule could be substituted here.
    """
    total = lstm_pred + rf_pred + arima_pred
    return total / 3

def select_stock(predictions,top_n=10):
    """Return the index labels of the *top_n* rows with the highest ``predicted_return``.

    :param predictions: DataFrame with a ``predicted_return`` column
    :param top_n: number of labels to return (fewer if the frame is shorter)
    :return: list of index labels, best first
    """
    ranked = predictions.sort_values('predicted_return', ascending=False)
    return ranked.index[:top_n].tolist()

# 回测框架

In [None]:
import backtrader as bt

In [None]:
class MyStratege(bt.Strategy):
    """Skeleton backtrader strategy; the trading logic is not implemented yet."""

    def __init__(self):
        # Keep a handle on the close line of the first data feed
        primary_feed = self.datas[0]
        self.dataclose = primary_feed.close

    def next(self):
        """Placeholder for the prediction and rebalancing logic; currently does nothing."""
        return None

    

# 主函数接口

In [None]:
if __name__=='__main__':
    # Load the table of A-share codes and names
    tickers = ak.stock_info_a_code_name()

    # Add the exchange suffixes and remove ChiNext / STAR-market stocks
    tickers = del_cy_kc(tickers)
    print(len(tickers))  # number of remaining stocks

    # Code -> name mapping
    tickers_mapping = tickers.set_index('code')['name'].to_dict()

    # Codes as a plain list
    codes = tickers['code'].tolist()

    # Five years of history; the fetch function is given YYYYMMDD strings
    end_date = dt.datetime.now()
    start_date = end_date - dt.timedelta(days=365*5)
    stock_data = fetch_stock_data(codes,
                                  start_date.strftime('%Y%m%d'),
                                  end_date.strftime('%Y%m%d'))
    print(type(stock_data))  # type of the result
    print(len(stock_data))   # number of rows

    # 将stock_data转换为DataFrame
    
    


    # # 计算因子并正交化
    # # 待补充

    # # 特征工程
    # for ticker in tickers:
    #     stock_data[ticker]['factor']=compute_factor(stock_data[ticker])

    # # 训练模型（划分训练、测试集）
    # train_data = {}
    # test_data = {}
    # for ticker in tickers:
    #     data = stock_data[ticker]
    #     train_size = int(len(data) * 0.8)
    #     train_data[ticker] = data[:train_size]
    #     test_data[ticker] = data[train_size:]
    
    # # LSTM模型
    # lstm_model = build_lstm_model(input_shape=(train_data.shape[1], train_data.shape[2]))
    # lstm_model.fit(train_data, epochs=10, batch_size=32) # 训练模型
    # lstm_pred = lstm_model.predict(test_data) # 预测
    # # 随机森林模型
    # rf_model = build_random_forest_model()
    # rf_model.fit(train_data, train_data['target']) # 训练模型
    # rf_pred = rf_model.predict(test_data) # 预测
    # # ARIMA模型
    # arima_model = build_arima(train_data['target'])
    # arima_model_fit = arima_model.fit() # 训练模型
    # arima_pred = arima_model_fit.forecast(steps=len(test_data)) # 预测
    # # 组合预测
    # combined_pred = hybrid_predict(lstm_pred, rf_pred, arima_pred)
    # # 选股
    # selected_stocks = select_stock(combined_pred)

    # # 回测
    




In [None]:
import tushare as ts
import akshare as ak
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime

# Initialise the Tushare Pro client (requires a registered token).
# The token is read from the TUSHARE_TOKEN environment variable so that no
# credential lives in the source file.  The token that used to be hard-coded
# here should be treated as leaked and rotated.
# NOTE(review): with an empty token, tushare is expected to fall back to a
# token previously stored via ts.set_token — confirm for the installed version.
import os

pro = ts.pro_api(os.environ.get('TUSHARE_TOKEN', ''))



# ===== Usage example =====
if __name__ == "__main__":
    # Stock codes with exchange suffix
    stock_list = ['600519.SH', '300750.SZ', '688981.SH']

    # Download the data (the fetch function in this file is `fetch_stock_data`)
    df = fetch_stock_data(stock_list, start_date='20230101')

    # Save the result
    df.to_csv('A股因子数据.csv', index=False)

In [None]:
# Read the Tushare token from the environment instead of hard-coding it; the
# token that used to be written here should be treated as leaked and rotated.
import os

_tushare_token = os.environ.get('TUSHARE_TOKEN')
if _tushare_token:
    ts.set_token(_tushare_token)
# Without the variable, pro_api() relies on a token stored by an earlier set_token call
pro = ts.pro_api()
