## 数据校验

使用新的工具 `pandera` 来快速实现数据校验：校验通过时返回原来的 DataFrame，校验不通过则会报错。

In [8]:
import pandas as pd
import pandera as pa
import akshare as ak

# Report the installed pandas / pandera / akshare versions, one per line.
print(pd.__version__, pa.__version__, ak.__version__, sep="\n")

# Sample frame fed to the schema check: one integer, one float and one
# string column, five rows each.
df = pd.DataFrame(
    data={
        "column1": [1, 4, 0, 10, 9],
        "column2": [-1.3, -1.4, -2.9, -10.1, -20.4],
        "column3": ["value_1", "value_2", "value_3", "value_2", "value_1"],
    },
)

# Last expression of the cell: display the dtype pandas inferred per column.
df.dtypes


1.5.3
0.14.5
1.8.61


column1      int64
column2    float64
column3     object
dtype: object

In [5]:
# Schema for the sample frame: one Column entry (dtype plus checks) per column.
schema = pa.DataFrameSchema(
    columns={
        "column1": pa.Column(dtype=int, checks=pa.Check.le(10)),
        "column2": pa.Column(dtype=float, checks=pa.Check.lt(-1.2)),
        "column3": pa.Column(
            dtype=str,
            checks=[
                pa.Check.str_startswith("value_"),
                # Custom check: the callable receives the column as a Series.
                # Here it evaluates to a single boolean that is true when
                # splitting the values on "_" expands to exactly two columns.
                pa.Check(lambda s: s.str.split("_", expand=True).shape[1] == 2),
            ],
        ),
    },
)

# Run the validation and print the frame it hands back.
validated_df = schema.validate(df)
print(validated_df)

   column1  column2  column3
0        1     -1.3  value_1
1        4     -1.4  value_2
2        0     -2.9  value_3
3       10    -10.1  value_2
4        9    -20.4  value_1


## AKshare 测试

快速数据类型转换：`df = df.convert_dtypes()`。

In [21]:
import pandas as pd
import pandera as pa
import akshare as ak

# Fetch daily history for symbol 300059 between 2010-01-01 and 2011-01-01
# from AKShare and show the dtypes of the frame as returned.
stock_zh_a_hist_df = ak.stock_zh_a_hist(
    symbol="300059",
    period="daily",
    start_date="20100101",
    end_date="20110101",
    adjust="qfq",
)
print(stock_zh_a_hist_df.dtypes)

# Let pandas pick its preferred dtypes for every column, then show them again
# so the before/after can be compared.
stock_zh_a_hist_df = stock_zh_a_hist_df.convert_dtypes()
print(stock_zh_a_hist_df.dtypes)

# Only two columns are constrained: every date string must start with "2010"
# and every daily low must be greater than -2.
schema = pa.DataFrameSchema(
    columns={
        "日期": pa.Column(dtype="string", checks=pa.Check.str_startswith("2010")),
        "最低": pa.Column(dtype="float64", checks=pa.Check.greater_than(-2)),
    },
)

# Validate the converted frame and print the result.
validated_data = schema.validate(stock_zh_a_hist_df)
print(validated_data)

日期      object
开盘     float64
收盘     float64
最高     float64
最低     float64
成交量      int64
成交额    float64
振幅     float64
涨跌幅    float64
涨跌额    float64
换手率    float64
dtype: object
日期      string
开盘     Float64
收盘     Float64
最高     Float64
最低     Float64
成交量      Int64
成交额      Int64
振幅     Float64
涨跌幅    Float64
涨跌额    Float64
换手率    Float64
dtype: object
             日期    开盘    收盘    最高    最低     成交量         成交额     振幅    涨跌幅  \
0    2010-03-19  0.56  0.53  0.58  0.52  197373  1182393994   20.0  76.67   
1    2010-03-22  0.56   0.6   0.6  0.55  110104   693595698   9.43  13.21   
2    2010-03-23  0.59  0.61  0.62  0.58   85522   547135876   6.67   1.67   
3    2010-03-24  0.61  0.66  0.69  0.61   72530   491778563  13.11    8.2   
4    2010-03-25  0.66  0.67   0.7  0.65   60225   419010108   7.58   1.52   
..          ...   ...   ...   ...   ...     ...         ...    ...    ...   
184  2010-12-27  0.47  0.45  0.48  0.45    5515    29590932   6.38  -4.26   
185  2010-12-28  0.44  0.4