In [1]:
# 通过新浪接口获取如下数据
# 1.目标股票的最新价格
# 2.三张财务报表
# 3.目标股票历史明细行情（每3秒的撮合分笔交易数据）仅限深圳股市，沪市只提供当日数据
import numpy as np
import pandas as pd
import requests 

# target1 = 'http://market.finance.sina.com.cn/downxls.php?date=2021-07-08&symbol=sh600900'    # 服务已下线
# target2 = 'http://vip.stock.finance.sina.com.cn/quotes_service/view/cn_price.php?symbol=sh600900'


#### 新浪接口---最新成交价

In [2]:
# 方法1：基于requests模块 
def getTick():
    #获取某股票最新成交价 
    url = "http://hq.sinajs.cn/list=sh600519" 
    headers={'Referer':'https://finance.sina.com.cn/'}     # 需要设置请求头
    page = requests.get(url,headers=headers)     # page是HTTP的response结构
    stock_info = page.text     # 提取返回的字符串内容
    print(stock_info) 
    mt_info = stock_info.split(",")
    #爬取到数据信息 
    last = float(mt_info[1]) 
    trade_datetime = mt_info[30] + ' '+ mt_info[31]
    #交易时间 
    tick = (last,trade_datetime) 
    return tick
    
#得到股票最新成交价、时间 
last_tick = getTick()
#进入网页获取数据 
print(last_tick)

var hq_str_sh600519="贵州茅台,1859.000,1854.200,1885.000,1888.000,1854.200,1885.000,1885.070,2192534,4107597895.000,11098,1885.000,1400,1884.890,400,1884.880,200,1884.870,200,1884.850,100,1885.070,300,1885.100,100,1885.160,100,1885.200,100,1885.440,2022-08-25,15:00:00,00,";

(1859.0, '2022-08-25 15:00:00')


In [3]:
# 方法2：基于urllib模块获取最新的股票价格
from urllib.request import Request, urlopen
 

In [4]:
url = "http://hq.sinajs.cn/list=sh600519"
req = Request(url)    # urllib请求包结构
req.add_header('Referer', 'https://finance.sina.com.cn/')    # 向请求包添加头
resp = urlopen(req)    # HTTPResponse结构
content = resp.read().decode('gbk')     # 提取字符串结构

In [5]:
stock_info = content.split(',')     # 基于‘，’分割成list结构
price = stock_info[1]
print(price)

1900.000


#### 新浪财经接口---财务报表

In [6]:
server = "http://money.finance.sina.com.cn/"
path = "corp/go.php/"
path2 = "displaytype/4/stockid/"    # 4代表包含所有季报
table1 = "vDOWN_BalanceSheet/"
table2 = "vDOWN_ProfitStatement/"
table3 = "vDOWN_CashFlow/"

type = "/ctrl/all.phtml"

In [130]:
stock = "000027"    # 根据需求变更stock中的股票代码, 600585, 600900, 300059, 300846, 300070, 000027

In [131]:
url1 = server + path + table1 + path2 + stock + type
url2 = server + path + table2 + path2 + stock + type
url3 = server + path + table3 + path2 + stock + type

print(url1)
print(url2)
print(url3)

http://money.finance.sina.com.cn/corp/go.php/vDOWN_BalanceSheet/displaytype/4/stockid/000027/ctrl/all.phtml
http://money.finance.sina.com.cn/corp/go.php/vDOWN_ProfitStatement/displaytype/4/stockid/000027/ctrl/all.phtml
http://money.finance.sina.com.cn/corp/go.php/vDOWN_CashFlow/displaytype/4/stockid/000027/ctrl/all.phtml


In [132]:
df1 = pd.read_csv(url1, header=0, sep='\t',encoding="gb18030")
df2 = pd.read_csv(url2, header=0, sep='\t',encoding="gb18030")
df3 = pd.read_csv(url3, header=0, sep='\t',encoding="gb18030")


In [133]:
# 行列转换
bs = pd.DataFrame(df1.values.T, columns=df1['报表日期'], index=df1.columns)
pl = pd.DataFrame(df2.values.T, columns=df2['报表日期'], index=df2.columns)
cf = pd.DataFrame(df3.values.T, columns=df3['报表日期'], index=df3.columns)

# 删除不需要的行与列
bs.drop('报表日期', inplace=True)    # 删除第一行
bs.drop(bs.tail(1).index, inplace=True)    # 删除最后一行
bs.drop('单位', axis=1, inplace=True)    # 删除'单位'这一列
pl.drop('报表日期', inplace=True)    # 删除第一行
pl.drop(pl.tail(1).index, inplace=True)    # 删除最后一行
pl.drop('单位', axis=1, inplace=True)    # 删除'单位'这一列
cf.drop('报表日期', inplace=True)    # 删除第一行
cf.drop(cf.tail(1).index, inplace=True)    # 删除最后一行
cf.drop('单位', axis=1, inplace=True)    # 删除'单位'这一列

# 把字符串转换为数值
headers_bs = bs.columns    # 获取表头column清单
headers_pl = pl.columns
headers_cf = cf.columns

for head in headers_bs:
    bs[head] = pd.to_numeric(bs[head], errors='coerce').fillna(0) # 不符合转换条件的设为‘Nan’，再用0替换
for head in headers_pl:
    pl[head] = pd.to_numeric(pl[head], errors='coerce').fillna(0)
for head in headers_cf:
    cf[head] = pd.to_numeric(cf[head], errors='coerce').fillna(0)

bs.index = pd.to_datetime(bs.index)    # 把index的数据格式从text转换为日期格式
pl.index = pd.to_datetime(pl.index)    # 把index的数据格式从text转换为日期格式
cf.index = pd.to_datetime(cf.index)    # 把index的数据格式从text转换为日期格式

bs.insert(0, '股票代码', stock)    # 添加股票代码列
pl.insert(0, '股票代码', stock)    # 添加股票代码列
cf.insert(0, '股票代码', stock)    # 添加股票代码列

In [134]:
pl.head()

报表日期,股票代码,一、营业总收入,营业收入,二、营业总成本,营业成本,营业税金及附加,销售费用,管理费用,财务费用,研发费用,...,五、净利润,归属于母公司所有者的净利润,少数股东损益,六、每股收益,基本每股收益(元/股),稀释每股收益(元/股),七、其他综合收益,八、综合收益总额,归属于母公司所有者的综合收益总额,归属于少数股东的综合收益总额
2022-06-30,27,16284220000.0,16284220000.0,15358660000.0,13411460000.0,90572990.0,57134320.0,645768700.0,1137107000.0,16610050.0,...,1108726000.0,1108726000.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.0
2022-03-31,27,7766934000.0,7766934000.0,7250923000.0,6286731000.0,32939570.0,24452510.0,311769100.0,587870000.0,7160825.0,...,530071200.0,517808900.0,12262340.0,0.0,0.0841,0.0841,-362793800.0,167277400.0,155015100.0,12262340.0
2021-12-31,27,31569550000.0,31569550000.0,29538200000.0,25425400000.0,245301900.0,116841300.0,1506990000.0,1998905000.0,244770800.0,...,2109591000.0,2128517000.0,-18926910.0,0.0,0.32,0.32,74670710.0,2184261000.0,2210071000.0,-25810160.0
2021-09-30,27,20571690000.0,20571690000.0,18058060000.0,15383940000.0,164277700.0,78085220.0,967932800.0,1398422000.0,65403340.0,...,2784919000.0,2532378000.0,252541100.0,0.0,0.479,0.479,-185049400.0,2599870000.0,2347299000.0,252571200.0
2021-06-30,27,13262880000.0,13262880000.0,11518980000.0,9615526000.0,117008400.0,52094810.0,715720700.0,969081500.0,49544610.0,...,1876814000.0,1663014000.0,213799800.0,0.0,0.3496,0.3496,-283169600.0,1593644000.0,1379844000.0,213799800.0


In [116]:
# 在本机上建立MySQL数据库用来存放采集自the Internet的数据
import pandas as pd
import pymysql  
from sqlalchemy import create_engine

In [103]:
engine = create_engine('mysql+pymysql://buffett:@localhost:3306/test')

In [135]:
sql_bs = 'select * from balance_sheet where 股票代码 = ' + stock
sql_pl = 'select * from profit_loss where 股票代码 = ' + stock
sql_cf = 'select * from cash_flow where 股票代码 = ' + stock

# index不要用blob或text，先转换成日期格式
# 若表中有重复数据可以利用df.drop_dupliates()去重
df = pd.read_sql(sql_bs, engine)
if df.empty == True:
    bs.to_sql('balance_sheet', engine, if_exists='append', index_label='报表日期')   # replace: Drop the table before inserting new values.
else:
     print("该股票已存在,放弃导入bs数据库!")

df = pd.read_sql(sql_pl, engine)
if df.empty == True:
    pl.to_sql('profit_loss', engine, if_exists='append', index_label='报表日期')    # append: Insert new values to the existing table.
else:
     print("该股票已存在,放弃导入pl数据库!")    

df = pd.read_sql(sql_cf, engine)
if df.empty == True:
    cf.to_sql('cash_flow', engine, if_exists='append', index_label='报表日期')
else:
     print("该股票已存在,放弃导入cf数据库!")      


该股票已存在,放弃导入bs数据库!


In [136]:
engine.dispose()    # 关闭数据库连接引擎

#### 新浪财经接口---行情

历史成交明细

In [2]:
server = "http://market.finance.sina.com.cn/"
path = "transHis.php?symbol="
stock = "sz300846"    # 根据需求变更stock中的股票代码，目前只提供深圳的，上海的2021年7以后就没有了，而且页面数据每天0时清空
date = "2022-08-15"

# target4 = "http://market.finance.sina.com.cn/transHis.php?symbol=sz300846&date=2022-08-15&page=1"

In [6]:
# 可以获得完整的深圳股市的完整历史明细行情
df_tick = pd.DataFrame()
# 循环调用page，拼接成一个DataFrame
for i in range(1, 99):
    page = str(i)
    target4 = server + path + stock + "&date=" + date + "&page=" + page
    req = urlopen(target4)
    html = req.read()    # 返回一个静态的html网页
    data1 = pd.read_html(html)
    if data1[0].empty:
        print("it is the end of tick log: %d" %i)
        break
    df_tick = pd.concat([df_tick, data1[0]], axis=0, ignore_index=True)

it is the end of tick log: 38


In [11]:
# 当日沪市成交明细，当晚0：00后数据清空。
target5 = "https://vip.stock.finance.sina.com.cn/quotes_service/view/vMS_tradedetail.php?symbol=sh600036"
req5 = urlopen(target5)
html5 = req5.read().decode('gbk') 

In [13]:
# 无法下载完整的当日数据
data5 = pd.read_html(html5)
data5[3]

Unnamed: 0,成交时间,成交价,涨跌幅,价格变动,成交量(手),成交额(元),性质
0,15:00:00,33.91,+0.53%,--,0,0,买盘
1,15:00:00,33.91,+0.53%,+0.04,7378,25018798,卖盘
2,14:59:54,33.87,+0.42%,--,0,0,卖盘
3,14:59:30,33.87,+0.42%,--,0,0,卖盘
4,14:59:27,33.87,+0.42%,--,0,0,卖盘
5,14:59:24,33.87,+0.42%,--,0,0,卖盘
6,14:59:21,33.87,+0.42%,--,0,0,卖盘
7,14:59:00,33.87,+0.42%,--,0,0,卖盘
8,14:58:54,33.87,+0.42%,--,0,0,卖盘
9,14:58:51,33.87,+0.42%,--,0,0,卖盘


In [7]:
df_tick

Unnamed: 0,成交时间,成交价,价格变动,成交量(手),成交额(元),性质
0,15:00:03,12.37,-0.01,959,1186283,卖盘
1,14:57:03,12.38,--,5,6190,买盘
2,14:56:57,12.38,--,2,2476,买盘
3,14:56:54,12.38,--,52,64376,卖盘
4,14:56:51,12.38,--,2,2476,卖盘
5,14:56:48,12.38,0.01,1,1238,中性盘
6,14:56:45,12.37,--,108,133596,中性盘
7,14:56:39,12.37,-0.01,8,9896,卖盘
8,14:56:36,12.38,0.02,109,134942,买盘
9,14:56:33,12.36,-0.01,82,101352,卖盘
