# 主数据获取器

# 参考资料
* [1] [知乎 - 抓取数据的代码](https://zhuanlan.zhihu.com/p/34956727)
* [2] [CSDN - macOS下使用Automator转换CSV编码格式](https://blog.csdn.net/wqdwin/article/details/76058154)
* [3] [CSDN - 带有搜索框的爬取](https://blog.csdn.net/hguo11/article/details/69813583)
* [4] [CSDN - PhantomJS, Selenium, Python3配置](https://blog.csdn.net/zxy987872674/article/details/53082896)
* [5] [CSDN - Beauttifulsoup爬取网站table](https://blog.csdn.net/belldeep/article/details/78887318)
* [6] [CSDN - Python爬取类似股票表格](https://blog.csdn.net/mini_mooned/article/details/53575289)
* [7] [CSDN - 使用Python+selenium+BeautifulSoup抓取动态网页的关键信息](https://blog.csdn.net/vincentluo91/article/details/52947214)
* [8] [CSDN - 使用Decimal进行精确计算](https://blog.csdn.net/weixin_37989267/article/details/79473706)
* [9] [CNBLOGS - Python异常处理](https://www.cnblogs.com/cui0x01/p/6196378.html)

# 数据分析方向
### 目标 (Main Goal)：
* 主要行业：电子信息，新能源，新材料，新技术 （OK）
* 金叉（MACD上穿）
* 成交量环比增幅30%以上（OK）
* 换手率大于5%（OK）
* 营业收入增加30%以上（同年）
* 净利润增加30%以上（同年）

### 各列中英文对应表
* code = 代码，name = 名称，close = 最新价
* percent_chg = 涨跌幅，change = 涨跌额
* volume = 成交量，turn_volume = 成交额，amplitude = 振幅
* high = 最高，low = 最低
* now_open = 今开，previous_close = 昨收
* volume_rate = 量比，turnover_rate = 换手率，pr_ratio = 市盈率(实时变化 暂不采用)

### SFrame命名中英文对应表
* info = 电子信息
* energy = 新能源
* material = 新材料
* tech = 全息技术

# 为每个子分区建立不同的list

In [1]:
# Import Statement
from selenium import webdriver
from bs4 import BeautifulSoup
from decimal import Decimal
from selenium.common.exceptions import ElementNotVisibleException
import time
from time import sleep
import urllib
import re
import requests
import csv
import pymysql
import os
import os,sys
import turicreate as tc
import pandas as pd

In [2]:
# 定义要搜索的URL信息
search_area = {'电子信息' : 'http://quote.eastmoney.com/center/boardlist.html#boards-BK04471', 
               '新能源' : 'http://quote.eastmoney.com/center/boardlist.html#boards-BK04931', 
               '新材料':'http://quote.eastmoney.com/center/boardlist.html#boards-BK05231', 
               '全息技术':'http://quote.eastmoney.com/center/boardlist.html#boards-BK06991'}


# 解析表格

In [3]:
# 一个从页面获取页数的函数
def getPageNumber(bs):
    all_buttons = bs.findAll(class_ = "paginate_button")
    if len(all_buttons) == 2:
        return 1  # 处理只有一页的情况
    else:
        return len(all_buttons) - 2  # 下一页和Go按钮

# 一个自动判断量词的函数
def smartMultiply(string):
    if(string[len(string)-1:len(string)] == '万'):
        string = Decimal(string[0:len(string)-1])
        string = float(string) * 10000
    elif(string[len(string)-1:len(string)] == '亿'):
        string = Decimal(string[0:len(string)-1])
        string = float(string) * 100000000
    elif(string[len(string)-1:len(string)] == '%'):
        string = Decimal(string[0:len(string)-1])
        string = float(string) * 0.01
    else:
        string = float(string)
    return string

# 从一个静态BeautifulSoup页面解析表格并存储进SFrame
def grabData(bs, SFrame):
    # 解出表格
    table = bs.findAll(role = 'row')
    table = table[7: len(table)-1]
    # 分析每个表格
    counter = 0

    while counter < len(table):

        row_sframe = tc.SFrame({'code':[str(table[counter].find(class_ = ' listview-col-Code').string)],
                                'name':[str(table[counter].find(class_ = ' listview-col-Name').string)], 
                                'close':[smartMultiply(table[counter].find(class_ = ' listview-col-Close').string)], 
                                'percent_chg':[smartMultiply(table[counter].find(class_ = 'listview-col-ChangePercent sorting_1').string)],
                                'change':[smartMultiply(table[counter].find(class_ = ' listview-col-Change').string)],
                                'volume':[smartMultiply(table[counter].find(class_ = ' listview-col-Volume').string)],
                                'turn_volume':[smartMultiply(table[counter].find(class_ = ' listview-col-Amount').string)], 
                                'amplitude':[smartMultiply(table[counter].find(class_ = ' listview-col-Amplitude').string)], 
                                'high':[smartMultiply(table[counter].find(class_ = ' listview-col-High').string)], 
                                'low':[smartMultiply(table[counter].find(class_ = ' listview-col-Low').string)], 
                                'now_open':[smartMultiply(table[counter].find(class_ = ' listview-col-Open').string)], 
                                'previous_close':[smartMultiply(table[counter].find(class_ = ' listview-col-PreviousClose').string)],
                                'volume_rate':[smartMultiply(table[counter].find(class_ = ' listview-col-VolumeRate').string)],
                                'turnover_rate':[smartMultiply(table[counter].find(class_ = ' listview-col-TurnoverRate').string)], 
                                'report_url':['http://emweb.securities.eastmoney.com/f10_v2/FinanceAnalysis.aspx?type=web&code=sz' + table[counter].find(class_ = ' listview-col-Code').string + '#lrb-0'], 
                                })
        counter += 1
        # print(row_sframe)
        SFrame = SFrame.append(row_sframe)
        
    return SFrame

# 自动处理数据的主程序
def makeData(topic, SFrame):
    browser = webdriver.Chrome() # Get local session of chrome
    url = search_area[topic]  # Example: '电子信息'
    browser.get(url) #Load page
    browser.implicitly_wait(2) #智能等待2秒
    
    # 第一次访问时判定菜单数量来决定浏览多少次表格
    bs = BeautifulSoup(browser.page_source, "lxml")
    page_number = getPageNumber(bs)
    
    # 循环浏览页面直到搜集完毕所有table
    counter = 0
    while counter < page_number:
        SFrame = grabData(bs, SFrame)
        try:
            browser.find_element_by_id('main-table_next').click()
        except ElementNotVisibleException:
            print('Warning: Some data are out of reach.')
        bs = BeautifulSoup(browser.page_source, "lxml")
        counter += 1
     
    SFrame = SFrame[1:len(SFrame)] # 删掉占位符
    SFrame = SFrame.unique()
    return SFrame

# 创建占位符的函数, 因为SFrame不允许创建空行，于是预先准备占位符用于定义各列数据类型。
def initSFrame():
    sframe = tc.SFrame({'code':['000000'],'name':['哔哩哔哩'],
                      'close':[0.0],'percent_chg':[0.0],
                      'change':[0.0],'volume':[0.0],'turn_volume':[0.0], 'amplitude':[0.0], 
                      'high':[0.0], 'low':[0.0], 
                      'now_open':[0.0], 'previous_close':[0.0], 'volume_rate':[0.0],
                      'turnover_rate':[0.0], 'report_url':['http://www.bilibili.com']})
    return sframe

In [4]:
# 创建四个空SFrame，以占位行开头
info = initSFrame()
energy = initSFrame()
material = initSFrame()
tech = initSFrame()

# 获取信息
info = makeData('电子信息', info)
energy = makeData('新能源', energy)
material = makeData('新材料', material)
tech = makeData('全息技术', tech)



# 初步数据分析

In [5]:
# 初步筛选分析程序
def analyze_stock(SFrame):
    SFrame = analysis_turnover_rate(SFrame)
    SFrame = analysis_volume_rate(SFrame)
    return SFrame

# 返回所有换手率大于5%的行
def analysis_turnover_rate(SFrame):
    return SFrame[SFrame['turnover_rate'] > 0.05]

# 返回所有量比大于30%的行
def analysis_volume_rate(SFrame):
    return SFrame[ SFrame['volume_rate'] > 0.3]


analyze_info = analyze_stock(info)
analyze_energy = analyze_stock(energy)
analyze_material = analyze_stock(material)
analyze_tech = analyze_stock(tech)

# analyze_tech.show()  # Debug

# 深度分析报表

In [24]:
def getReport(url, income_limit, profit_limit):
    browser = webdriver.Chrome() # Get local session of chrome
    browser.get(url) #Load page
    soup = BeautifulSoup(browser.page_source, "lxml")
    browser.close()

    ulist = []
    trs = soup.find_all('tr')
    for tr in trs:
        ui = []
        for td in tr:
            ui.append(td.string)
        ulist.append(ui)


    income_increase = 0
    profit_increase = 0
    for element in ulist:
        if ('营业总收入' in element):
            income_data_list = element
            now_data = smartMultiply(income_data_list[3])
            past_data = smartMultiply(income_data_list[11])
            income_increase = (now_data - past_data) / past_data
            # print('现营业总收入', now_data)
            # print('一年前营业总收入', past_data)
            # print('营业总收入增长', income_increase)
        elif('净利润' in element):
            profit_data_list = element
            now_data = smartMultiply(profit_data_list[3])
            past_data = smartMultiply(profit_data_list[11])
            profit_increase = (now_data - past_data) / past_data
            # print('现净利润', now_data)
            # print('一年前净利润', past_data)
            # print('净利润增长', income_increase)
    # increase_list = [income_increase, profit_increase]  # [营业总收入增长, 净利润增长]
    
    if(income_increase > income_limit and profit_increase > profit_limit):
        print('营业总收入增长', income_increase)
        print('净利润增长', profit_increase)
    return income_increase > income_limit and profit_increase > profit_limit

# 跑一下
# url = 'http://emweb.securities.eastmoney.com/f10_v2/FinanceAnalysis.aspx?type=web&code=sz002195#lrb-0'
# getReport(url)

# 推荐股票

In [25]:
def recommendStock(SFrame):
    income_limit = 0.25
    profit_limit = 0.25
    counter = 0
    while counter < len(SFrame):
        if getReport(SFrame[counter]['report_url'], income_limit, profit_limit):
            print(SFrame[counter]['name'], SFrame[counter]['code'])
        counter += 1

In [26]:
recommendStock(analyze_info)

营业总收入增长 0.5740740740740741
净利润增长 0.8906560636182903
优博讯


In [27]:
recommendStock(analyze_energy)

In [28]:
recommendStock(analyze_material)

In [29]:
recommendStock(analyze_tech)

# ============================================================
# ----------------------------------------TRASH----------------------------------------
# ============================================================

In [8]:
analyze_info['increase'] = analyze_info['report_url'].apply(getReport)

营业总收入增长 0.186473429951691
净利润增长 0.186473429951691
营业总收入增长 0.17987860952731285
净利润增长 0.17987860952731285
营业总收入增长 -0.6056915470879055
净利润增长 -0.6056915470879055
营业总收入增长 -0.08842105263157894
净利润增长 -0.08842105263157894
营业总收入增长 0.5740740740740741
净利润增长 0.5740740740740741
营业总收入增长 0.2306940371456501
净利润增长 0.2306940371456501
营业总收入增长 0.11932724372915761
净利润增长 0.11932724372915761


In [7]:
url = 'http://emweb.securities.eastmoney.com/f10_v2/FinanceAnalysis.aspx?type=web&code=sz002195#lrb-0'
browser = webdriver.Chrome() # Get local session of chrome
browser.get(url) #Load page
soup = BeautifulSoup(browser.page_source, "lxml")
browser.close()

ulist = []
trs = soup.find_all('tr')
for tr in trs:
    ui = []
    for td in tr:
        ui.append(td.string)
    ulist.append(ui)

    
income_increase = 0
profit_increase = 0
for element in ulist:
    if ('营业总收入' in element):
        income_data_list = element
        now_data = smartMultiply(income_data_list[3])
        past_data = smartMultiply(income_data_list[11])
        income_increase = (now_data - past_data) / past_data
        print('现营业总收入', now_data)
        print('一年前营业总收入', past_data)
        print('营业总收入增长', income_increase)
    elif('净利润' in element):
        profit_data_list = element
        now_data = smartMultiply(profit_data_list[3])
        past_data = smartMultiply(profit_data_list[11])
        profit_increase = (now_data - past_data) / past_data
        print('现净利润', now_data)
        print('一年前净利润', past_data)
        print('净利润增长', income_increase)

现营业总收入 439200000.00000006
一年前营业总收入 420500000.0
营业总收入增长 0.04447086801426887
现净利润 216900000.0
一年前净利润 209800000.0
净利润增长 0.04447086801426887


In [None]:
# 参考了[7], 利用selenium解析出来的page source抓取表格
# 居然有urllib解析不出的表格！！我去买彩票算了！！

browser = webdriver.Chrome() # Get local session of chrome
url = search_area['电子信息']
browser.get(url) #Load page
browser.implicitly_wait(2) #智能等待xx秒
time.sleep(5) #加载时间较长，等待加载完毕
# 把selenium的webdriver调用page_source函数在传入BeautifulSoup中，就可以用BeautifulSoup解析网页了
bs = BeautifulSoup(browser.page_source, "lxml")

In [None]:
code = table[0].find(class_ = ' listview-col-Code').string  # code
print('代码', code)

name = table[0].find(class_ = ' listview-col-Name').string # name
print('名称', name)

close = smartMultiply(table[0].find(class_ = ' listview-col-Close').string)
print('最新价', close)

percent_chg = smartMultiply(table[0].find(class_ = 'listview-col-ChangePercent sorting_1').string)
print('涨跌幅', percent_chg)

change = smartMultiply(table[0].find(class_ = ' listview-col-Change').string)
print('涨跌额', change)

volume = smartMultiply(table[0].find(class_ = ' listview-col-Volume').string)
print('成交量', volume)

turn_volume = smartMultiply(table[0].find(class_ = ' listview-col-Amount').string)
print('成交额', turn_volume)

amplitude = smartMultiply(table[0].find(class_ = ' listview-col-Amplitude').string)
print('振幅', amplitude)

high = smartMultiply(table[0].find(class_ = ' listview-col-High').string)
print('最高', high)

low = smartMultiply(table[0].find(class_ = ' listview-col-Low').string)
print('最低', low)

now_open = smartMultiply(table[0].find(class_ = ' listview-col-Open').string)
print('今开', now_open)

previous_close = smartMultiply(table[0].find(class_ = ' listview-col-PreviousClose').string)
print('昨收', previous_close)

volume_rate = smartMultiply(table[0].find(class_ = ' listview-col-VolumeRate').string)
print('量比', volume_rate)

turnover_rate = smartMultiply(table[0].find(class_ = ' listview-col-TurnoverRate').string)
print('换手率', turnover_rate)

pr_rate = smartMultiply(table[0].find(class_ = ' listview-col-PERation').string)
print('市盈率', pr_rate)

In [None]:
def get_allele_feq(browser, snp):
    browser.get(
    'https://www.ncbi.nlm.nih.gov/variation/tools/1000genomes/?q=%s' %snp) #Load page
    # browser.implicitly_wait(60) #智能等待xx秒
    time.sleep(30) #加载时间较长，等待加载完毕
    # browser.find_element_by_css_selector("div[title=\"Han Chinese in Bejing, China\"]") #use selenium function to find elements

    # 把selenium的webdriver调用page_source函数在传入BeautifulSoup中，就可以用BeautifulSoup解析网页了
    bs = BeautifulSoup(browser.page_source, "lxml")
    # bs.find_all("div", title="Han Chinese in Bejing, China")
    try:
        race = bs.find(string="CHB")
        race_data = race.find_parent("div").find_parent(
            "div").find_next_sibling("div")
        # print race_data
        race_feq = race_data.find("span", class_="gt-selected").find_all("li") # class_ 防止Python中类关键字重复，产生语法错误
        base1_feq = race_feq[0].text  #获取标签的内容
        base2_feq = race_feq[1].text
        return snp, base1_feq, base2_feq  # T=0.1408 C=0.8592

    except NoSuchElementException:
        return "%s:can't find element" %snp 

In [None]:
browser = webdriver.Chrome() # Get local session of chrome
fh = open("./4diseases_snps_1kCHB_allele_feq.list2", 'w')
snps = open("./4diseases_snps.list.uniq2",'r')
for line in snps:
    snp = line.strip()
    response = get_allele_feq(browser, snp)
    time.sleep(1)
    fh.write("\t".join(response)) #unicode 编码的对象写到文件中后相当于print效果
    fh.write("\n")
    print "\t".join(response)
    time.sleep(1)  # sleep a few seconds
fh.close()
browser.quit()  # 退出并关闭窗口的每一个相关的驱动程序


In [None]:
#coding:utf-8
# 用搜索框获取URL模块[3]（暂时不用）

#这里设置用哪个，关于具体的使用可以百度，建议用phantomjs.exe读者可以做对比
driver = webdriver.Chrome('/usr/local/bin/chromedriver')
driver.get('http://www.eastmoney.com')

# 找到输入框，并输入文字
driver.find_element_by_id('code_suggest').send_keys('新能源')
sleep(2)

# 点击搜索按钮
driver.find_element_by_id('search_view_btn3').click()

#获取当前的URL的地址
print(driver.current_url)

#关闭浏览器
driver.close()

In [None]:
# 导入需要使用到的模块 


# 爬虫抓取网页函数 
def getHtml(url):
    html = urllib.request.urlopen(url).read()
    html = html.decode('gbk')
    return html


# 抓取网页股票代码函数 
def getStackCode(html):
    s = r'<li><a target="_blank" href="http://quote.eastmoney.com/\S\S(.*?).html">'
    pat = re.compile(s)
    code = pat.findall(html)
    return code

Url = 'http://quote.eastmoney.com/stocklist.html'  # 东方财富网股票数据连接地址
filepath = '../Datasets/Eastmoney/Stock_History/'  # 定义数据文件保存路径

In [None]:
# 实施抓取
code = getStackCode(getHtml(Url)) 
# 获取所有股票代码（以6开头的，应该是沪市数据）集合
CodeList = []
for item in code:
    if item[0] == '6':
        CodeList.append(item)
# 抓取数据并保存到本地csv文件 
for code in CodeList:
    print('正在获取股票%s数据'%code)
    url = 'http://quotes.money.163.com/service/chddata.html?code=0'+code+\
        '&end=20161231&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;TURNOVER;VOTURNOVER;VATURNOVER;TCAP;MCAP'
    urllib.request.urlretrieve(url, filepath+code+'.csv')

In [None]:
news_data = tc.SFrame('../Datasets/Eastmoney/Stock_History/600000.csv', decode='utf-8')