In [132]:
#导入模块
import  numpy as np # numpy 库用于矩阵运算
import pandas as pd # pandas 库用于读取数据
from   pandas import DataFrame
import pandas_ta as ta # pandas_ta 库用于指标计算
import plotly # plotly 库用于绘制动态图形
import plotly.express as px # plotly.express 库用于绘制动态图形
import plotly.graph_objects as go # plotly.graph_objects 库用于绘制动态图形
import baostock as bs # 用于获取股票数据
from ydata_profiling import * # 用于数据探索
import datetime # 用于时间处理
import re
import time # 用于时间处理
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error



In [71]:
#### 获取沪深A股历史K线数据 ####
# 详细指标参数，参见“历史行情指标参数”章节；“分钟线”参数与“日线”参数不同。“分钟线”不包含指数。
# 分钟线指标：date,time,code,open,high,low,close,volume,amount,adjustflag
# 周月线指标：date,code,open,high,low,close,volume,amount,adjustflag,turn,pctChg
# 以下函数默认获取上证综合指数从2023-01-01开始，到2023-01-31结束的日K线数据，仅用作测试接口是否正常。正式使用时，可以调整get_stock_data函数的参数。
# 其中code为股票代码，默认为sh.000001，即上证综合指数。start_date为开始日期，默认为2023-01-01。end_date为结束日期，默认为2023-01-31。

def get_stock_data(code='sz.000002', index:str = "" , start_date = '2023-01-01', end_date = '2023-01-31', adjustflag="2"):
    # 登陆系统
    lg = bs.login()
    # 显示登陆返回信息
    print('login respond error_code:' + lg.error_code)
    print('login respond  error_msg:' + lg.error_msg)

    # 获取沪深A股历史K线数据
    stock_code = code
    stock_start_date = start_date #baostock包可以获取1990年12月19日的所有数据
    stock_end_date = end_date
    f_start_date = datetime.datetime.strptime(stock_start_date,'%Y-%m-%d').strftime('%Y%m%d')
    f_end_date = datetime.datetime.strptime(stock_end_date,'%Y-%m-%d').strftime('%Y%m%d')
    if index:
        stock_index = index
    else:
        stock_index = "date,code,open,high,low,close,preclose,volume,amount,turn,tradestatus,pctChg,peTTM,pbMRQ,psTTM,pcfNcfTTM"
    """
    date:  交易日日期
    code:  股票代码
    open:  开盘价
    high:  最高价
    low:   最低价
    close: 收盘价
    preclose: 前收盘价 说明：http://baostock.com/baostock/index.php/A%E8%82%A1K%E7%BA%BF%E6%95%B0%E6%8D%AE
    volume:	成交量（累计 单位：股）	
    amount:	成交额（单位：人民币元）	
    adjustflag: 复权状态(1：后复权， 2：前复权，3：不复权）	
    turn:   换手率 [指定交易日的成交量(股)/指定交易日的股票的流通股总股数(股)]*100%
    tradestatus: 交易状态(1：正常交易 0：停牌）	
    pctChg:	涨跌幅（百分比）	日涨跌幅=[(指定交易日的收盘价-指定交易日前收盘价)/指定交易日前收盘价]*100%
    peTTM:	滚动市盈率	(指定交易日的股票收盘价/指定交易日的每股盈余TTM)=(指定交易日的股票收盘价*截至当日公司总股本)/归属母公司股东净利润TTM
    pbMRQ:	市净率	(指定交易日的股票收盘价/指定交易日的每股净资产)=总市值/(最近披露的归属母公司股东的权益-其他权益工具)
    psTTM:  滚动市销率	(指定交易日的股票收盘价/指定交易日的每股销售额)=(指定交易日的股票收盘价*截至当日公司总股本)/营业总收入TTM
    pcfNcfTTM: 滚动市现率	(指定交易日的股票收盘价/指定交易日的每股现金流TTM)=(指定交易日的股票收盘价*截至当日公司总股本)/现金以及现金等价物净增加额TTM
    isST: 是否ST股，1是，0否
    
    """
    
    rs = bs.query_history_k_data_plus(stock_code, stock_index, start_date = stock_start_date, end_date = stock_end_date, frequency = 'd', adjustflag = adjustflag)
    # frequency：数据类型，默认为d，日k线；d=日k线、w=周、m=月、5=5分钟、15=15分钟、30=30分钟、60=60分钟k线数据，不区分大小写；指数没有分钟线数据；周线每周最后一个交易日才可以获取，月线每月最后一个交易日才可以获取。
    # adjustflag：复权类型，默认不复权：3；1：后复权；2：前复权。已支持分钟线、日线、周线、月线前后复权。 BaoStock提供的是涨跌幅复权算法复权因子，具体介绍见：复权因子简介或者BaoStock复权因子简介。
    print('query_history_k_data_plus respond error_code:'+rs.error_code)
    print('query_history_k_data_plus respond  error_msg:'+rs.error_msg)

    #### 打印结果集 ####
    data_list = []
    while (rs.error_code == '0') & rs.next():
        # 获取一条记录，将记录合并在一起
        data_list.append(rs.get_row_data())
    result = pd.DataFrame(data_list, columns=rs.fields)
    #### 结果集输出到csv文件 ####
    print(f_start_date, f_end_date)
    file_name = f"-{stock_code}-f{f_start_date}-t{f_end_date}"
    result.to_csv(f".\history_k_data{file_name}.csv", encoding="gbk", index=False)
    print(result)
    return file_name

fn = get_stock_data(start_date="2013-1-1",end_date="2023-12-31")

login success!
login respond error_code:0
login respond  error_msg:success
query_history_k_data_plus respond error_code:0
query_history_k_data_plus respond  error_msg:success
20130101 20231231
            date       code           open           high            low  \
0     2013-01-04  sz.000002   6.4056361600   6.4056361600   6.4056361600   
1     2013-01-07  sz.000002   6.4056361600   6.4056361600   6.4056361600   
2     2013-01-08  sz.000002   6.4056361600   6.4056361600   6.4056361600   
3     2013-01-09  sz.000002   6.4056361600   6.4056361600   6.4056361600   
4     2013-01-10  sz.000002   6.4056361600   6.4056361600   6.4056361600   
...          ...        ...            ...            ...            ...   
2667  2023-12-25  sz.000002  10.3400000000  10.3800000000  10.2800000000   
2668  2023-12-26  sz.000002  10.3000000000  10.3300000000  10.1400000000   
2669  2023-12-27  sz.000002  10.1700000000  10.2500000000  10.0900000000   
2670  2023-12-28  sz.000002  10.1500000000  10.

In [82]:
pattern = r'-f(\d+)-t(\d+)'
match = re.search(pattern, fn)
start_date = datetime.datetime.strptime(match.group(1), '%Y%m%d').strftime('%Y-%m-%d')
end_date = datetime.datetime.strptime(match.group(2), '%Y%m%d').strftime('%Y-%m-%d')

stock_dataframe = pd.read_csv(f".\history_k_data{fn}.csv")
stock_dataframe['date'] = pd.to_datetime(stock_dataframe['date'])
stock_dataframe.set_index("date", inplace=True)

# 增加 5日 10日 20日 60日 120日 200日均线
stock_dataframe['mean5'] = stock_dataframe.close.rolling(5).mean()
stock_dataframe['mean10'] = stock_dataframe.close.rolling(10).mean()
stock_dataframe['mean20'] = stock_dataframe.close.rolling(20).mean()
stock_dataframe['mean60'] = stock_dataframe.close.rolling(60).mean()
stock_dataframe['mean120'] = stock_dataframe.close.rolling(120).mean()
stock_dataframe['mean200'] = stock_dataframe.close.rolling(200).mean()
stock_dataframe

Unnamed: 0_level_0,code,open,high,low,close,preclose,volume,amount,turn,tradestatus,...,peTTM,pbMRQ,psTTM,pcfNcfTTM,mean5,mean10,mean20,mean60,mean120,mean200
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-04,sz.000002,6.405636,6.405636,6.405636,6.405636,6.405636,0,0.000000e+00,,0,...,10.005978,1.975702,1.255889,-110.183797,,,,,,
2013-01-07,sz.000002,6.405636,6.405636,6.405636,6.405636,6.405636,0,0.000000e+00,,0,...,10.005978,1.975702,1.255889,-110.183797,,,,,,
2013-01-08,sz.000002,6.405636,6.405636,6.405636,6.405636,6.405636,0,0.000000e+00,,0,...,10.005978,1.975702,1.255889,-110.183797,,,,,,
2013-01-09,sz.000002,6.405636,6.405636,6.405636,6.405636,6.405636,0,0.000000e+00,,0,...,10.005978,1.975702,1.255889,-110.183797,,,,,,
2013-01-10,sz.000002,6.405636,6.405636,6.405636,6.405636,6.405636,0,0.000000e+00,,0,...,10.005978,1.975702,1.255889,-110.183797,6.405636,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-25,sz.000002,10.340000,10.380000,10.280000,10.300000,10.380000,27320474,2.817907e+08,0.2812,1,...,6.418407,0.485894,0.269208,-8.439585,10.348000,10.507,10.8125,11.609500,12.633149,13.326813
2023-12-26,sz.000002,10.300000,10.330000,10.140000,10.170000,10.300000,35102346,3.573613e+08,0.3613,1,...,6.337398,0.479761,0.265810,-8.333066,10.306000,10.423,10.7335,11.560000,12.605745,13.297721
2023-12-27,sz.000002,10.170000,10.250000,10.090000,10.170000,10.170000,34363764,3.489655e+08,0.3537,1,...,6.337398,0.479761,0.265810,-8.333066,10.288000,10.376,10.6690,11.511667,12.579371,13.270340
2023-12-28,sz.000002,10.150000,10.560000,10.100000,10.520000,10.170000,78980828,8.216411e+08,0.8128,1,...,6.555499,0.496272,0.274958,-8.619848,10.308000,10.378,10.6230,11.469500,12.556310,13.246231


## 清洗数据

In [102]:
stock_code = stock_dataframe.code
stock_dataframe = stock_dataframe[stock_dataframe['tradestatus']!=0] # 删除停牌日期数据
stock_dataframe = stock_dataframe.drop(['tradestatus','code'] ,axis=1) # 删除高度重复的交易状态数据
stock_dataframe.bfill(inplace=True)


AttributeError: 'DataFrame' object has no attribute 'code'

In [103]:
stock_dataframe

Unnamed: 0_level_0,open,high,low,close,preclose,volume,amount,turn,pctChg,peTTM,pbMRQ,psTTM,pcfNcfTTM,mean5,mean10,mean20,mean60,mean120,mean200
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2013-01-21,7.044934,7.044934,7.044934,7.044934,6.405636,23430893,2.607858e+08,0.242501,9.980240,11.004598,2.172881,1.381229,-121.180402,6.533496,6.469566,6.915175,7.057699,7.058132,6.701828
2013-01-22,7.399396,7.709550,7.323440,7.443704,7.044934,464332645,5.468736e+09,4.805664,5.660378,11.627500,2.295875,1.459412,-128.039670,6.741109,6.573373,6.915175,7.057699,7.058132,6.701828
2013-01-23,7.469022,7.488011,7.279132,7.399396,7.443704,129945195,1.514145e+09,1.344883,-0.595244,11.558288,2.282209,1.450725,-127.277529,6.939861,6.672749,6.915175,7.057699,7.058132,6.701828
2013-01-24,7.348758,7.734869,7.323440,7.563968,7.399396,219041827,2.623809e+09,2.266999,2.224125,11.815359,2.332968,1.482991,-130.108338,7.171527,6.788582,6.915175,7.057699,7.058132,6.701828
2013-01-25,7.582957,7.608275,7.323440,7.405726,7.563968,86321707,1.013224e+09,0.893396,-2.092050,11.568176,2.284161,1.451966,-127.386406,7.371545,6.888591,6.915175,7.057699,7.058132,6.701828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-25,10.340000,10.380000,10.280000,10.300000,10.380000,27320474,2.817907e+08,0.281200,-0.770700,6.418407,0.485894,0.269208,-8.439585,10.348000,10.507000,10.812500,11.609500,12.633149,13.326813
2023-12-26,10.300000,10.330000,10.140000,10.170000,10.300000,35102346,3.573613e+08,0.361300,-1.262100,6.337398,0.479761,0.265810,-8.333066,10.306000,10.423000,10.733500,11.560000,12.605745,13.297721
2023-12-27,10.170000,10.250000,10.090000,10.170000,10.170000,34363764,3.489655e+08,0.353700,0.000000,6.337398,0.479761,0.265810,-8.333066,10.288000,10.376000,10.669000,11.511667,12.579371,13.270340
2023-12-28,10.150000,10.560000,10.100000,10.520000,10.170000,78980828,8.216411e+08,0.812800,3.441500,6.555499,0.496272,0.274958,-8.619848,10.308000,10.378000,10.623000,11.469500,12.556310,13.246231


## 可视化显示股票收盘价

In [104]:
fig_close = px.line(stock_dataframe, x=stock_dataframe.index, y="close", 
              template="plotly_dark",  title="股票收盘价",
              hover_data=[stock_dataframe.index,'open','close','high','low'],
              width=970, height=700,
              range_x=[f'{start_date}',f'{end_date}'])
fig_close.update_xaxes(ticklabelmode="period",minor=dict(ticks="inside", showgrid=True,ticklen=4,  
                     dtick=7*24*60*60*1000,  
                     tick0=f"{start_date}"))
fig_close.show()


In [105]:
fig_candle =  go.Figure(data=[go.Candlestick(x=stock_dataframe.index, # 绘制蜡烛图
                                             open=stock_dataframe.open, high=stock_dataframe.high, 
                                             close=stock_dataframe.close, low=stock_dataframe.low,
                                             name='日线'),
                              # 绘制均线
                              go.Scatter(x=stock_dataframe.index, y=stock_dataframe.mean5, name='MA5'),
                              go.Scatter(x=stock_dataframe.index, y=stock_dataframe.mean10, name='MA10'), 
                              go.Scatter(x=stock_dataframe.index, y=stock_dataframe.mean20, name='MA20'),
                              go.Scatter(x=stock_dataframe.index, y=stock_dataframe.mean60, name='MA60'),
                              go.Scatter(x=stock_dataframe.index, y=stock_dataframe.mean120, name='MA120'),
                              go.Scatter(x=stock_dataframe.index, y=stock_dataframe.mean200, name='MA200'),],
                               layout=go.Layout(title=f"股票{stock_code}  K线图",autosize=True))
fig_candle.update_layout(xaxis_range=[f'{start_date}',f'{end_date}'])
fig_candle.update_yaxes(autorange=True)
fig_candle.show()

## 归一化处理

In [97]:
column_to_scale = ['open','high','low','close','preclose','volume','amount','turn','pctChg','peTTM','pbMRQ','psTTM','pcfNcfTTM']
scaler = MinMaxScaler(feature_range=(-1, 1))

stock_dataframe_scaler = pd.DataFrame(stock_dataframe[column_to_scale])
stock_dataframe_scaler[column_to_scale] = scaler.fit_transform(stock_dataframe_scaler[column_to_scale])
stock_dataframe_scaler


Unnamed: 0_level_0,open,high,low,close,preclose,volume,amount,turn,pctChg,peTTM,pbMRQ,psTTM,pcfNcfTTM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2013-01-04,-0.837606,-0.848580,-0.832295,-0.837651,-0.837651,-1.000000,-1.000000,,-0.000769,-0.241382,-0.125647,0.220880,0.079257
2013-01-07,-0.837606,-0.848580,-0.832295,-0.837651,-0.837651,-1.000000,-1.000000,,-0.000769,-0.241382,-0.125647,0.220880,0.079257
2013-01-08,-0.837606,-0.848580,-0.832295,-0.837651,-0.837651,-1.000000,-1.000000,,-0.000769,-0.241382,-0.125647,0.220880,0.079257
2013-01-09,-0.837606,-0.848580,-0.832295,-0.837651,-0.837651,-1.000000,-1.000000,,-0.000769,-0.241382,-0.125647,0.220880,0.079257
2013-01-10,-0.837606,-0.848580,-0.832295,-0.837651,-0.837651,-1.000000,-1.000000,,-0.000769,-0.241382,-0.125647,0.220880,0.079257
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-25,-0.544494,-0.560601,-0.538548,-0.547601,-0.541643,-0.946867,-0.971970,-0.955019,-0.077745,-0.734556,-0.996415,-0.995810,0.169776
2023-12-26,-0.547474,-0.564223,-0.549163,-0.557283,-0.547601,-0.931732,-0.964453,-0.939832,-0.126825,-0.745693,-1.000000,-1.000000,0.169871
2023-12-27,-0.557159,-0.570020,-0.552954,-0.557283,-0.557283,-0.933169,-0.965288,-0.941273,-0.000769,-0.745693,-1.000000,-1.000000,0.169871
2023-12-28,-0.558649,-0.547558,-0.552195,-0.531216,-0.557283,-0.846396,-0.918271,-0.854232,0.342962,-0.715711,-0.990350,-0.988719,0.169616


## 原始数据探索

In [None]:
profile = ProfileReport(stock_dataframe_scaler, title = 'Stock Profiling')
profile.to_notebook_iframe()
profile.to_file("data_profile_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

## 主要成分分析（PCA）

In [123]:
# 提取特征
features = ['open', 'high', 'low', 'close', 'preclose', 'volume', 'amount', 'turn', 'pctChg']
X = stock_dataframe_scaler[features] 
X.bfill(inplace=True)

# 标准化特征（（特征值-均值）/标准差）
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)
# 提交主成分分析
pca = PCA()
X_pca = pca.fit_transform(X_standardized)

# Explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_

# Cumulative explained variance
cumulative_explained_variance = explained_variance_ratio.cumsum()

# Create a DataFrame for plotting
plot_data = pd.DataFrame({
    'Number of Principal Components': range(1, len(explained_variance_ratio) + 1),
    'Cumulative Explained Variance': cumulative_explained_variance
})

# Plotting using Plotly
fig = px.line(plot_data,
              x='Number of Principal Components',
              y='Cumulative Explained Variance',
              title='Cumulative Explained Variance by Principal Components',
              labels={'Cumulative Explained Variance': 'Cumulative Explained Variance'},
              markers=True)


fig.show()
print(cumulative_explained_variance)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



[0.5770055  0.89128776 0.99478024 0.9996701  0.99980586 0.99989003
 0.999941   0.99998269 1.        ]


In [130]:
pca2 = PCA(n_components=2)
X_pca2 = pca2.fit_transform(X_standardized)

loadings = pd.DataFrame(pca2.components_.T, columns=[f'PC{i+1}'for i in range(pca3.n_components_)],index=features)
loadings_squared = loadings ** 2

cumulative_contribution_rate = loadings_squared.cumsum() / loadings_squared.sum() # 计算每个主要成分的累积贡献率
print("Cumulative Contribution Rate for Each Principal Component:")
print(cumulative_contribution_rate)

# Calculate individual contribution
individual_contribution = cumulative_contribution_rate.diff().fillna(cumulative_contribution_rate)
# 
# for i in range(1, len(cumulative_contribution_rate.columns)):
#     individual_contribution.iloc[:, i] -= cumulative_contribution_rate.iloc[:, i - 1]

print("Individual Contribution Rate for Each Principal Component:")
print(individual_contribution)


Cumulative Contribution Rate for Each Principal Component:
               PC1       PC2
open      0.188427  0.007181
high      0.375594  0.016919
low       0.564335  0.023625
close     0.751928  0.032484
preclose  0.940330  0.039495
volume    0.969658  0.331928
amount    0.969671  0.672376
turn      0.999503  0.963678
pctChg    1.000000  1.000000
Individual Contribution Rate for Each Principal Component:
               PC1       PC2
open      0.188427  0.007181
high      0.187168  0.009738
low       0.188740  0.006706
close     0.187593  0.008859
preclose  0.188402  0.007011
volume    0.029328  0.292433
amount    0.000013  0.340448
turn      0.029832  0.291302
pctChg    0.000497  0.036322


In [133]:
X_train, X_test, y_train, y_test = train_test_split(X_pca2, stock_dataframe_scaler['close'], test_size=0.2, random_state=42)
l_regress_model = LinearRegression()
l_regress_model.fit(X_train,y_train)

y_pred = l_regress_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared error:{mse}')


Mean Squared error:0.00014495960974547504
