In [1]:
import requests
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

### get_html


示例：`http://fund.eastmoney.com/f10/F10DataApi.aspx?type=lsjz&code=050026&page=1&sdate=2020-01-01&edate=2020-03-01&per=20`

type：lsjz表示历史净值

code：表示基金代码，如050026表示博时医疗保健行业混合

page：表示获取的数据的页码

per：表示获取的数据每页显示的条数

sdate：表示开始时间

edate：表示结束时间


In [2]:
def get_html(code, start_date, end_date, page=1, per=20, timeout=10):
    """Download one page of a fund's historical net-value listing as text.

    Sends a GET request to the Eastmoney ``F10DataApi.aspx`` endpoint with
    ``type=lsjz`` and returns the response body unparsed.

    Parameters
    ----------
    code : str
        Fund code, sent as the ``code`` query parameter (e.g. "050026").
    start_date, end_date : str
        Sent as ``sdate`` / ``edate`` (e.g. "2020-01-01").
    page : int, default 1
        Page number to request.
    per : int, default 20
        Number of records per page.
    timeout : float, default 10
        Seconds to wait for the server before giving up.

    Returns
    -------
    str
        The decoded response body.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.exceptions.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    # Same parameters, in the same order, as the hand-built query string this
    # replaces; requests takes care of URL-encoding the values.
    params = {
        'type': 'lsjz',
        'code': code,
        'page': page,
        'sdate': start_date,
        'edate': end_date,
        'per': per,
    }
    # Without a timeout a stalled connection would block the notebook forever.
    rsp = requests.get('http://fund.eastmoney.com/f10/F10DataApi.aspx',
                       params=params, timeout=timeout)
    # Fail loudly instead of handing an error page to the HTML parser.
    rsp.raise_for_status()
    return rsp.text

def _parse_fund_rows(soup, n_cols):
    """Return the data rows of the first <tbody> in ``soup`` as lists of cells.

    Empty cells become ``np.nan``. Rows whose cell count differs from
    ``n_cols`` (for example a single-cell placeholder row) are skipped so the
    result is always rectangular.
    """
    tbody = soup.find("tbody")
    if tbody is None:
        return []
    rows = []
    for tr in tbody.find_all("tr"):
        cells = [td.get_text(strip=True) for td in tr.find_all("td")]
        if len(cells) != n_cols:
            continue
        rows.append([cell if cell else np.nan for cell in cells])
    return rows


def get_fund(code, start_date, end_date, page=1, per=20):
    """Download every page of a fund's net-value history into a DataFrame.

    One page is requested first to learn the total page count and the table
    header; all pages from 1 to that count are then collected.

    Parameters
    ----------
    code : str
        Fund code, forwarded to ``get_html``.
    start_date, end_date : str
        Date range, forwarded to ``get_html``.
    page : int, default 1
        Page used for the initial probe request only; the returned frame
        always covers all pages.
    per : int, default 20
        Records per page, forwarded to ``get_html``.

    Returns
    -------
    pandas.DataFrame
        Indexed by '净值日期' (datetime, ascending). '单位净值', '累计净值' and
        '日增长率' are floats ('%' stripped from the latter); the remaining
        columns keep the cell text, with real NaN for empty cells.

    Raises
    ------
    ValueError
        If the response does not contain a ``pages:<number>`` marker.

    Notes
    -----
    Prints the total page count as a side effect.
    """
    first_html = get_html(code, start_date, end_date, page, per)

    # Total page count: match digits only so the capture cannot run on into
    # later fields of the response.
    match = re.search(r'pages:\s*(\d+)', first_html)
    if match is None:
        raise ValueError(
            "Could not find the total page count ('pages:<n>') in the response "
            "for fund {0!r}".format(code))
    total_page = int(match.group(1))
    print("Total pages:", total_page)

    # Table header
    first_soup = BeautifulSoup(first_html, 'html.parser')
    heads = [th.get_text(strip=True) for th in first_soup.find_all("th")]

    # Collect the rows of every page, reusing the probe page rather than
    # downloading it a second time.
    records = []
    for current_page in range(1, total_page + 1):
        if current_page == page:
            soup = first_soup
        else:
            soup = BeautifulSoup(
                get_html(code, start_date, end_date, current_page, per),
                'html.parser')
        records.extend(_parse_fund_rows(soup, len(heads)))

    # Build the frame straight from the row lists. Going through np.array
    # would coerce the np.nan placeholders into the string 'nan'.
    fund_df = pd.DataFrame(records, columns=heads)

    # Sort by date. No explicit format string, so parsing does not depend on
    # which date separator the endpoint emits.
    fund_df['净值日期'] = pd.to_datetime(fund_df['净值日期'])
    fund_df = fund_df.sort_values(by='净值日期', ascending=True).set_index('净值日期')

    # Numeric conversions
    fund_df['单位净值'] = fund_df['单位净值'].astype(float)
    fund_df['累计净值'] = fund_df['累计净值'].astype(float)
    fund_df['日增长率'] = fund_df['日增长率'].str.strip('%').astype(float)

    return fund_df

In [8]:
def english_fund_df(fund_df, code=None, start_date=None):
    """Return an English-labelled copy of a fund frame and save it as CSV.

    The index is named "date", the Chinese column names are renamed to
    English ones, and the known subscription/redemption status strings are
    turned into booleans. Status values that are not recognised are kept
    unchanged. The input frame is not modified.

    Parameters
    ----------
    fund_df : pandas.DataFrame
        Frame with the Chinese column names renamed below; the two status
        columns ('申购状态', '赎回状态') must be present.
    code : str, optional
        Fund code used in the CSV file name. When omitted, the module-level
        variable ``code`` is used.
    start_date : str, optional
        Start date used in the CSV file name. When omitted, the module-level
        variable ``start_date`` is used.

    Returns
    -------
    pandas.DataFrame
        The converted copy.

    Raises
    ------
    NameError
        If ``code`` / ``start_date`` is neither passed nor defined at module
        level.
    KeyError
        If a status column is missing.

    Notes
    -----
    Writes ``net_value_fund<code>_<start_date>.csv`` to the current working
    directory.
    """
    # Backward compatibility: earlier callers passed only the frame and relied
    # on notebook-level `code` / `start_date` variables.
    module_ns = globals()
    if code is None:
        if "code" not in module_ns:
            raise NameError("pass code=... or define a module-level 'code'")
        code = module_ns["code"]
    if start_date is None:
        if "start_date" not in module_ns:
            raise NameError(
                "pass start_date=... or define a module-level 'start_date'")
        start_date = module_ns["start_date"]

    column_names = {
        '单位净值': 'unit_net_value',
        '累计净值': 'cum_net_value',
        '日增长率': 'daily_return',
        '申购状态': 'subscription_status',
        '赎回状态': 'redemption_status',
        '分红送配': 'dividend',
    }
    status_flags = {
        'redemption_status': {'开放赎回': True, '封闭期': False},
        'subscription_status': {'开放申购': True, '封闭期': False},
    }

    # rename / rename_axis return new objects, so the caller's frame is untouched.
    result = fund_df.rename(columns=column_names).rename_axis("date")

    # Assign the converted column back instead of calling replace(inplace=True)
    # on a column selection, which is not guaranteed to write through to the
    # frame. Unknown status values pass through unchanged.
    for column, flags in status_flags.items():
        result[column] = result[column].map(
            lambda value, flags=flags: flags.get(value, value))

    result.to_csv("net_value_fund{0}_{1}.csv".format(code, start_date))
    return result

In [11]:
# Inputs for this run: the fund to download and the date window to cover.
code = "000404"
# Original author's note: 9.30, 12.30, 3.30, 6.30
# (presumably alternative start dates to try; confirm).
start_date = "2019-09-30"
end_date = "2020-08-24"

# page and per are left at get_fund's defaults (page=1, per=20).
fund_df = get_fund(code, start_date, end_date)

Total pages: 11


In [12]:
# Switch to English labels and boolean status flags. This call also writes
# net_value_fund<code>_<start_date>.csv, named from the `code` and
# `start_date` variables set in the cell above.
fund_df = english_fund_df(fund_df)

In [13]:
# Inspect the column dtypes of the converted frame.
fund_df.dtypes

unit_net_value         float64
cum_net_value          float64
daily_return           float64
subscription_status       bool
redemption_status         bool
dividend                object
dtype: object