In [109]:
import sys, os

In [110]:
# Make the project packages importable. When the notebook is started from a
# sub-directory they are not on sys.path, so the parent of the working
# directory is appended first. Only ImportError triggers the fallback, so an
# unrelated failure inside a package is not hidden behind a blind retry.
try:
    import dao, backtesting, strategies
except ImportError:
    sys.path.append(os.path.join(os.getcwd(), '..'))

from dao import fromDB
from backtesting import test
from backtesting import recorder
from strategies import Buyer
from strategies import Seller

In [None]:
import pandas as pd
pd.set_option('display.precision', 4)
pd.__version__

In [None]:
code='300393'
test_df = fromDB.get_stock(code, 'qfq')
test_record = test.single_stock_maxsize(test_df, Buyer.DadBuyer_1, Seller.DadSeller_1, base_period='w', MA5_p=0.99)

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib import dates as mdates
import mpld3
import numpy as np

In [None]:
# Backtest overview in three stacked panels sharing one date axis:
# recorded profits (top), close price with moving averages and BUY/SELL
# markers (middle) and MACD bars built from the '*m' columns (bottom).
gs = GridSpec(3, 1, height_ratios=[1, 4, 1], left=0.05, right=0.95, bottom=0.15, top=0.975, hspace=0)
base_period = test_record.base_period
# Keep the columns whose name ends with the base-period suffix, drop the
# duplicated rows and index the result by that period's date column.
relevant_df = (test_record.df[[col_name for col_name in test_record.df.columns
                              if col_name[-1]==base_period]]
               .drop_duplicates()
               .set_index('Date_'+base_period))
fig = plt.figure(figsize=(12,10))
ax_macd = fig.add_subplot(gs[2,0])
ax_close = fig.add_subplot(gs[1,0], sharex=ax_macd)
ax_profit = fig.add_subplot(gs[0, 0], sharex=ax_macd)
# Only the bottom panel keeps its x tick labels.
plt.setp(ax_close.get_xticklabels(), visible=False);
plt.setp(ax_profit.get_xticklabels(), visible=False);

macd_df = (test_record.df[[col_name for col_name in test_record.df.columns
                           if col_name[-1]=='m']]
           .drop_duplicates()
           .set_index('Date_m'))
# Positive and negative MACD values are drawn as separately coloured bars.
macd_pos = macd_df[macd_df.MACD_m>0]
macd_neg = macd_df[macd_df.MACD_m<0]
ax_macd.bar(macd_pos.index.values, macd_pos.MACD_m.values, width=15, color='salmon')
ax_macd.bar(macd_neg.index.values, macd_neg.MACD_m.values, width=15, color='steelblue')

close = relevant_df['Close_'+base_period]
ax_close.plot_date(relevant_df.index.values, close.values, fmt='r-', label='Close')
ax_close.plot_date(relevant_df.index.values, relevant_df['MA_2_'+base_period].values, fmt='b--', lw=2, label='MA_2')
ax_close.plot_date(relevant_df.index.values, relevant_df['MA_5_'+base_period].values, fmt='m:', lw=2, label='MA_5')

profits_dates = np.array(test_record.profits_date)
profits = np.array(test_record.profits)
gain = profits>=0
loss = profits<0
ax_profit.plot_date(profits_dates[gain], profits[gain], color='r', ms=8)
ax_profit.plot_date(profits_dates[loss], profits[loss], color='g', ms=8)
# Label-based lookup of the close price on each trade date. `.ix` was removed
# from pandas; `.loc` is the label-based replacement.
# NOTE(review): assumes every buy/sell date is present in the index - confirm.
ax_close.plot_date(test_record.buy_date, close.loc[test_record.buy_date].values,
                   '^', ms=12, label='BUY')
ax_close.plot_date(test_record.sell_date, close.loc[test_record.sell_date].values,
                   'v', ms=12, mfc='salmon', label='SELL')
# Zero-profit reference line across the whole date range.
ax_profit.plot_date([relevant_df.index.values[0], relevant_df.index.values[-1]], [0, 0], 'k--')
# Dashed vertical guide on every profit date, in both upper panels.
for profit_date in profits_dates:
    ax_close.plot_date([profit_date, profit_date], ax_close.get_ylim(), ls='--', color='k', lw=1)
    ax_profit.plot_date([profit_date, profit_date], ax_profit.get_ylim(), ls='--', color='k', lw=1)
ax_macd.grid(True)
ax_close.legend(loc='best');
mpld3.display()

In [None]:
# Variant of the overview figure: profits (top) and close price (bottom) in a
# 2-row grid, with the MACD bars drawn in an extra axes laid over the lower
# fifth of the close-price panel.
gs = GridSpec(2, 1, height_ratios=[1, 5], left=0.05, right=0.95, bottom=0.15, top=0.975, hspace=0)
base_period = test_record.base_period
# Keep the columns whose name ends with the base-period suffix, drop the
# duplicated rows and index the result by that period's date column.
relevant_df = (test_record.df[[col_name for col_name in test_record.df.columns
                              if col_name[-1]==base_period]]
               .drop_duplicates()
               .set_index('Date_'+base_period))
fig = plt.figure(figsize=(12,10))
ax_close = fig.add_subplot(gs[1,0])
ax_profit = fig.add_subplot(gs[0, 0], sharex=ax_close)
plt.setp(ax_profit.get_xticklabels(), visible=False);


close = relevant_df['Close_'+base_period]
ax_close.plot_date(relevant_df.index.values, close.values, fmt='r-', label='Close')
ax_close.plot_date(relevant_df.index.values, relevant_df['MA_2_'+base_period].values, fmt='b--', lw=2, label='MA_2')
ax_close.plot_date(relevant_df.index.values, relevant_df['MA_5_'+base_period].values, fmt='m:', lw=2, label='MA_5')
# Start the price axis at zero, keeping the automatically chosen upper limit.
ax_close.set_ylim([0, ax_close.get_ylim()[1]])
profits_dates = np.array(test_record.profits_date)
profits = np.array(test_record.profits)
gain = profits>=0
loss = profits<0
ax_profit.plot_date(profits_dates[gain], profits[gain], color='r', ms=8)
ax_profit.plot_date(profits_dates[loss], profits[loss], color='g', ms=8)
# Label-based lookup of the close price on each trade date. `.ix` was removed
# from pandas; `.loc` is the label-based replacement.
# NOTE(review): assumes every buy/sell date is present in the index - confirm.
ax_close.plot_date(test_record.buy_date, close.loc[test_record.buy_date].values,
                   '^', ms=12, label='BUY')
ax_close.plot_date(test_record.sell_date, close.loc[test_record.sell_date].values,
                   'v', ms=12, mfc='salmon', label='SELL')
# Zero-profit reference line across the whole date range.
ax_profit.plot_date([relevant_df.index.values[0], relevant_df.index.values[-1]], [0, 0], 'k--')
# Dashed vertical guide on every profit date, in both panels.
for profit_date in profits_dates:
    ax_close.plot_date([profit_date, profit_date], ax_close.get_ylim(), ls='--', color='k', lw=1)
    ax_profit.plot_date([profit_date, profit_date], ax_profit.get_ylim(), ls='--', color='k', lw=1)
ax_close.legend(loc='best')

# Inset axes: same left/bottom/width as the close panel, one fifth of its height.
ax_close_pos = ax_close.get_position()
ax_macd_pos = [ax_close_pos.x0, ax_close_pos.y0, ax_close_pos.width, ax_close_pos.height/5.0]
ax_macd = fig.add_axes(ax_macd_pos, frameon=True, sharex=ax_close)
macd_df = (test_record.df[[col_name for col_name in test_record.df.columns
                           if col_name[-1]=='m']]
           .drop_duplicates()
           .set_index('Date_m'))
# Positive and negative MACD values are drawn as separately coloured bars.
macd_pos = macd_df[macd_df.MACD_m>0]
macd_neg = macd_df[macd_df.MACD_m<0]
ax_macd.bar(macd_pos.index.values, macd_pos.MACD_m.values, width=15, color='salmon')
ax_macd.bar(macd_neg.index.values, macd_neg.MACD_m.values, width=15, color='steelblue')
ax_macd.grid(True)
mpld3.display()

In [None]:
reload(recorder)
reload(test)

In [None]:
# Re-run the backtest and let the record object build the base figure, then
# add the moving averages and an inset MACD panel on top of it.
code='300393'
test_df = fromDB.get_stock(code, 'qfq')
test_record = test.single_stock_maxsize(test_df, Buyer.DadBuyer_1, Seller.DadSeller_1, base_period='w', MA5_p=0.99)
fig, ax_close, ax_profit = test_record.get_plot()

# Take the period suffix from the record itself instead of relying on a
# `base_period` variable left behind by an earlier cell.
base_period = test_record.base_period
relevant_df = test_record.get_relevant_df()
ax_close.plot_date(relevant_df.index.values, relevant_df['MA_2_'+base_period].values, fmt='b--', lw=2, label='MA_2')
ax_close.plot_date(relevant_df.index.values, relevant_df['MA_5_'+base_period].values, fmt='m:', lw=2, label='MA_5')
# Inset axes: same left/bottom/width as the close panel, one fifth of its height.
ax_close_pos = ax_close.get_position()
ax_macd_pos = [ax_close_pos.x0, ax_close_pos.y0, ax_close_pos.width, ax_close_pos.height/5.0]
ax_macd = fig.add_axes(ax_macd_pos, frameon=True, sharex=ax_close)
macd_df = (test_record.df[[col_name for col_name in test_record.df.columns
                           if col_name[-1]=='m']]
           .drop_duplicates()
           .set_index('Date_m'))
# Positive and negative MACD values are drawn as separately coloured bars.
macd_pos = macd_df[macd_df.MACD_m>0]
macd_neg = macd_df[macd_df.MACD_m<0]
ax_macd.bar(macd_pos.index.values, macd_pos.MACD_m.values, width=15, color='salmon')
ax_macd.bar(macd_neg.index.values, macd_neg.MACD_m.values, width=15, color='steelblue')
ax_macd.grid(True)
mpld3.display()

# Checking the qfq data: can the `Close` ratio alone be used to derive all the other prices?

In [None]:
from dao.Const import Const

In [None]:
# Read the same TDX export twice - once from the 'qfq' folder and once from
# the 'bfq' folder - with identical parsing options.
code = 'SZ#300393.txt'
file_name_qfq, file_name_bfq = (os.path.join(Const.tdx_base_path, adj_dir, code)
                                for adj_dir in ('qfq', 'bfq'))
# No header row, schema taken from Const, last line of the file skipped and
# the first column parsed as dates.
tdx_read_opts = dict(header=None, names=Const.tdx_csv_schema, skipfooter=1,
                     parse_dates=[0], infer_datetime_format=True, engine='python')
df_qfq = pd.read_csv(file_name_qfq, **tdx_read_opts)
df_bfq = pd.read_csv(file_name_bfq, **tdx_read_opts)

In [None]:
# Tolerance for the comparison cell that follows, in the price units of the CSV files.
epsilon = 0.01

# Scale the bfq Open/High/Low by the per-row qfq/bfq Close ratio and measure
# how far the result is from the qfq values actually stored in the file.
price_cols = ['Open', 'High', 'Low']
ratio = df_qfq.Close / df_bfq.Close
estimate = pd.DataFrame()
for price_col in price_cols:
    estimate.loc[:, price_col] = ratio * df_bfq[price_col]
diff = (df_qfq[price_cols] - estimate[price_cols]).abs()

In [None]:
(diff>epsilon).sum()

In [None]:
df_qfq.head()

In [None]:
df_bfq.tail()

# qfq bfq is more complicated than I thought

In [None]:
import tushare as ts
# Read the tushare API token from the first line of tk.key. readline() keeps
# the line terminator, so strip it before handing the token to tushare.
with open('tk.key', 'r') as f:
    token = f.readline().strip()
ts.set_token(token)

In [None]:
from_ts = ts.get_h_data('300393', start='2013-12-30', end = '2016-12-23', drop_factor=False, autype='bfq')

In [None]:
from_ts.head()

In [None]:
from dao import fromTDX

In [None]:
df = fromTDX.get_stock('SZ#000563.txt', 'qfq')

# use 雪球

In [None]:
import urllib2,csv,cookielib
import contextlib
import os

# For every symbol listed in <Exchange>.csv, download its historical-price CSV
# from xueqiu.com and store it as <Exchange>/<symbol>.csv (Python 2: urllib2).

#site = "http://xueqiu.com/S/AAPL/historical.csv"
#site= "http://www.nseindia.com/live_market/dynaContent/live_watch/get_quote/getHistoricalData.jsp?symbol=JPASSOCIAT&fromDate=1-JAN-2012&toDate=1-AUG-2012&datePeriod=unselected&hiddDwnld=true"
# Browser-like request headers sent with every download.
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}

#req = urllib2.Request(site, headers=hdr)
symbolTest = 'APPL'
Exchange = 'NASDAQ'

# The per-symbol files are written below <Exchange>/; create the directory up
# front so the first open() does not fail when it is missing.
if not os.path.isdir(Exchange):
    os.makedirs(Exchange)

try:
    with open(Exchange +'.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            print(row['Symbol'], row['Name'])
            symbol = row['Symbol'].strip()

            if '^' not in symbol:
                site = "http://xueqiu.com/S/" + symbol + "/historical.csv"
                req = urllib2.Request(site, headers=hdr)
                # closing() releases the HTTP response even if the write fails.
                with contextlib.closing(urllib2.urlopen(req)) as page:
                    # Binary mode: the response bytes are stored unchanged.
                    with open(Exchange + '/'+symbol+'.csv','wb') as symbolCSV:
                        symbolCSV.write(page.read())
            else:
                print('symbol contains ^, not valid, passed...')

# An HTTP error stops the whole run; the server's error body is printed.
except urllib2.HTTPError as e:
    print(e.fp.read())

In [None]:
import contextlib
import urllib2
import pandas as pd

# Download one symbol's historical CSV from xueqiu.com straight into a
# DataFrame, sending browser-like headers with the request.
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}

site = "http://xueqiu.com/S/" + "SZ000030" + "/historical.csv"
req = urllib2.Request(site, headers=hdr)
# closing() makes sure the HTTP response is released once pandas has read it.
with contextlib.closing(urllib2.urlopen(req)) as page:
    df = pd.read_csv(page)

In [None]:
df.head()

# Use requests session

In [None]:
import requests
import io

# Same download through a requests session. `hdr` and `site` come from the
# urllib2 cell. The decoded body is kept in `csv_text` so the name `csv` stays
# bound to the csv module that the bulk-download cell uses.
with requests.Session() as s:
    s.headers.update(hdr)
    r = s.get(site)
    if r.status_code == 200:
        csv_text = r.content.decode('utf8')
        df = pd.read_csv(io.StringIO(csv_text), parse_dates=['date'], infer_datetime_format=True)
        print(df.head())
    else:
        print(r.status_code)

# Parse js generated webpage

In [None]:
import pandas as pd
from pandas.io import html as pd_html
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from time import sleep

# Page through the xueqiu FHPS table with a real Chrome window, collecting one
# DataFrame per page until there is no "next page" link left.
driver = webdriver.Chrome() # ChromeDriver need be installed by Homebrew
dividen_dfs = []
# try/finally: the browser is shut down even when scraping raises.
try:
    driver.implicitly_wait(10) # element look-ups retry for up to 10 seconds
    url_test = 'https://xueqiu.com/S/SZ000030/FHPS'
    driver.get(url_test)

    while True:
        try:
            table = driver.find_element_by_xpath('//table[@class="dataTable table table-bordered"]')
            table_html = table.get_attribute('outerHTML')
            df = pd_html.read_html(table_html, na_values = '-')

            # Transpose the table and promote its first row to the header.
            processed_df = df[0].T
            processed_df.columns = processed_df.iloc[0,:]
            processed_df = processed_df.drop(0, axis=0)
            processed_df = processed_df.reset_index(drop=True)
            dividen_dfs.append(processed_df)

            # get the link and click on the link
            link = driver.find_element_by_link_text(u'下一页')
            link.click()
            sleep(0.1)
        except NoSuchElementException:
            # no more 'next page'
            break
        except StaleElementReferenceException:
            # The new table is not loaded yet: wait and read the page again.
            # A page read twice is removed by drop_duplicates() below.
            print('stale')
            sleep(0.1)
finally:
    driver.quit()
dividen_df = pd.concat(dividen_dfs).drop_duplicates().reset_index(drop=True)

In [None]:
dividen_df

In [None]:
from contextlib import contextmanager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver



@contextmanager
def wait_for_page_load(driver, timeout=10):
    """Block on exit until the table cell seen on entry has gone stale.

    On entry the status/table container is read. If it shows the
    "no data" text (u'暂无数据') nothing is tracked and leaving the ``with``
    block returns immediately. Otherwise the first row's second cell is
    remembered and, when the ``with`` block finishes normally, the call
    waits up to ``timeout`` seconds for that cell to become stale.
    """
    # XPaths were copied from Chrome's inspector.
    container_xpath = '//*[@id="center"]/div[2]/div[2]/div[2]/div'
    first_cell_xpath = '//*[@id="center"]/div[2]/div[2]/div[2]/div[1]/table/tbody/tr[1]/td[2]'

    container = driver.find_element_by_xpath(container_xpath)
    showing_no_data = container.text == u'暂无数据'
    old_td = None if showing_no_data else driver.find_element_by_xpath(first_cell_xpath)

    # Nothing is handed to the caller; the generator only keeps old_td alive
    # across the body of the with-block.
    yield

    if old_td:
        WebDriverWait(driver, timeout=timeout).until(EC.staleness_of(old_td))
    

@contextmanager
def open_driver():
    """Yield a Chrome WebDriver and always quit it afterwards.

    The driver is configured with a 10 second implicit wait, so element
    look-ups retry for up to 10 seconds before failing. ``quit()`` runs in a
    ``finally`` clause so the browser is shut down even when the body of the
    ``with`` block raises.
    """
    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(10)
        yield driver
    finally:
        driver.quit()



In [None]:
import pandas as pd
from pandas.io import html as pd_html
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException



# driver = webdriver.Chrome() # ChromeDriver need be installed by Homebrew
# driver.implicitly_wait(10) # This line will cause it to search for 10 seconds
# Page through the xueqiu FHPS table. Each iteration runs inside
# wait_for_page_load, so the next iteration only starts once the page that
# was just read has been replaced. The loop ends when the page shows the
# "no data" text.
with open_driver() as driver:
    url_test = 'https://xueqiu.com/S/SH600000/FHPS'
    driver.get(url_test)

    dividen_dfs = []
    while True:
        with wait_for_page_load(driver):
            status = driver.find_element_by_xpath('//*[@id="center"]/div[2]/div[2]/div[2]/div')
            if status.text == u'暂无数据':
                break

            table_el = driver.find_element_by_xpath('//table[@class="dataTable table table-bordered"]')
            page_tables = pd_html.read_html(table_el.get_attribute('outerHTML'), na_values = '-')

            # Transpose the table and promote its first row to the header.
            transposed = page_tables[0].T
            transposed.columns = transposed.iloc[0,:]
            dividen_dfs.append(transposed.drop(0, axis=0).reset_index(drop=True))

            # Move on to the next page. An earlier attempt, kept here as a
            # note, stopped when 'noClick' appeared in the link's class.
            driver.find_element_by_link_text(u'下一页').click()

dividen_df = pd.concat(dividen_dfs).drop_duplicates().reset_index(drop=True)
dividen_df

In [None]:
# Pick six columns of the scraped table by position, give them short names
# and convert them from text to proper dtypes.
cleaned_df = dividen_df.iloc[:,[3,4,5,7,11,12]].copy()
cleaned_df.columns=['Sg_ratio', 'Zg_ratio', 'Date_regi', 'Dividend', 'Date_Zg', 'Date_Sg']
for col_name in cleaned_df.columns:
    # Plain column assignment replaces the column together with its dtype.
    # `.loc[:, col] = ...` writes into the existing column and, on recent
    # pandas, would leave the converted values in an object column.
    if col_name.startswith('Date'):
        # Unparseable dates become NaT.
        cleaned_df[col_name] = pd.to_datetime(cleaned_df[col_name], errors='coerce', infer_datetime_format=True)
    elif col_name.endswith('ratio'):
        # Missing ratios count as 0; values are divided by 10
        # (presumably quoted per 10 shares - TODO confirm).
        cleaned_df[col_name] = cleaned_df[col_name].astype(float).fillna(0)/float(10)
    elif col_name == 'Dividend':
        cleaned_df[col_name] = cleaned_df[col_name].astype(float).fillna(0)
# cleaned_df.columns = ['fhpg.{}'.format(name) for name in cleaned_df.columns]
cleaned_df.to_dict('records')

In [None]:
[dict(v.dropna()) for k, v in cleaned_df.iterrows()]

# Testing nested data in Mongodb

In [173]:
import pymongo
client = pymongo.MongoClient(host='localhost', port=27017)
db = client['personalTrader']
symbol_coll = db['symbol']

## Insert dataframe into an array of embedded sub-documents

In [218]:
# Store the Date/Fh/Ps rows of fhps_df as an array of embedded sub-documents
# under the 'fhps' key of the symbol document (NaN fields are left out), then
# read the document back. upsert=False: only an existing document is updated.
fhps_docs = [dict(row.dropna()) for idx, row in fhps_df[['Date', 'Fh', 'Ps']].iterrows()]
symbol_coll.update_one({'code': code}, {'$set': {'fhps': fhps_docs}}, upsert=False)
symbol_coll.find_one({'code': code})

{u'TDXname': u'SH#600589.txt',
 u'_id': ObjectId('58622523208c8e0a0b0d966b'),
 u'area': u'\u5e7f\u4e1c',
 u'bvps': 4.18,
 u'code': u'600589',
 u'data_end': datetime.datetime(2016, 12, 27, 0, 0),
 u'data_start': datetime.datetime(2001, 6, 12, 0, 0),
 u'esp': 0.11199999999999999,
 u'fhps': [{u'Date': datetime.datetime(2002, 6, 6, 0, 0),
   u'Fh': 0.15,
   u'Ps': 0.0},
  {u'Date': datetime.datetime(2003, 6, 5, 0, 0), u'Fh': 0.2, u'Ps': 0.0},
  {u'Date': datetime.datetime(2004, 6, 10, 0, 0), u'Fh': 0.25, u'Ps': 0.2},
  {u'Date': datetime.datetime(2005, 6, 22, 0, 0), u'Fh': 0.2, u'Ps': 0.0},
  {u'Date': datetime.datetime(2006, 6, 26, 0, 0), u'Fh': 0.1, u'Ps': 0.0},
  {u'Date': datetime.datetime(2007, 6, 6, 0, 0), u'Fh': 0.05, u'Ps': 0.5},
  {u'Date': datetime.datetime(2008, 5, 14, 0, 0), u'Fh': 0.05, u'Ps': 0.5},
  {u'Date': datetime.datetime(2009, 5, 14, 0, 0), u'Fh': 0.02, u'Ps': 0.0},
  {u'Date': datetime.datetime(2010, 6, 1, 0, 0), u'Fh': 0.02, u'Ps': 0.0},
  {u'Date': datetime.datetime

## get the dataframe back from mongodb

### By `$project`

In [189]:
# Match the symbol, project only the 'fhps' array and build a DataFrame from
# the array of the first (only) returned document.
pipeline = [
    {'$match': {'code': code}},
    {'$project': {'fhps': 1, '_id': 0}},
]
matched_docs = list(symbol_coll.aggregate(pipeline))
df = pd.DataFrame(matched_docs[0]['fhps'])
df.head()

Unnamed: 0,Date,Fh,Ps
0,2002-06-06,0.15,0.0
1,2003-06-05,0.2,0.0
2,2004-06-10,0.25,0.2
3,2005-06-22,0.2,0.0
4,2006-06-26,0.1,0.0


### By `$unwind`

In [196]:
# Same data, but $unwind emits one result document per element of the 'fhps'
# array; each of them carries a single sub-document under 'fhps'.
pipeline = [
    {'$match': {'code': code}},
    {'$project': {'fhps': 1, '_id': 0}},
    {'$unwind': {'path': '$fhps'}},
]
unwound_docs = symbol_coll.aggregate(pipeline)
df = pd.DataFrame([record['fhps'] for record in unwound_docs])
df.head()

Unnamed: 0,Date,Fh,Ps
0,2002-06-06,0.15,0.0
1,2003-06-05,0.2,0.0
2,2004-06-10,0.25,0.2
3,2005-06-22,0.2,0.0
4,2006-06-26,0.1,0.0


## get only one column of the dataframe back from mongodb 

In [201]:
# Unwind the 'fhps' array first, then project only the Date of each element
# so that every result document has the single field 'Date'.
pipeline = [
    {'$match': {'code': code}},
    {'$unwind': {'path': '$fhps'}},
    {'$project': {'Date': '$fhps.Date', '_id': 0}},
]
date_docs = list(symbol_coll.aggregate(pipeline))
df = pd.DataFrame(date_docs)

Unnamed: 0,Date
0,2002-06-06
1,2003-06-05
2,2004-06-10
3,2005-06-22
4,2006-06-26


### Get matching rows of the dataframe back from mongodb

#### The following two methods only return the first match

In [205]:
symbol_coll.find_one({'code': code}, 
                     {'fhps': {'$elemMatch': {'Fh': {'$gt': 0.01}}}, '_id':0})

{u'fhps': [{u'Date': datetime.datetime(2002, 6, 6, 0, 0),
   u'Fh': 0.15,
   u'Ps': 0.0}]}

In [210]:
symbol_coll.find_one({'code':code, 
                      'fhps.Date':{'$lt':date_parser.parse('2012-06-25')}},
                     {'fhps.$':1, '_id':0})

{u'fhps': [{u'Date': datetime.datetime(2002, 6, 6, 0, 0),
   u'Fh': 0.15,
   u'Ps': 0.0}]}

#### The correct method

In [219]:
# Return every element of the 'fhps' array whose Date is later than the
# cut-off, using $filter inside $project.
cutoff = date_parser.parse('2012-06-25')
later_than_cutoff = {'$filter': {'input': '$fhps',
                                 'as': 'fhps_f',
                                 'cond': {'$gt': ['$$fhps_f.Date', cutoff]}}}
pipeline = [
    {'$match': {'code': code}},
    {'$project': {'fhps': later_than_cutoff, '_id': 0}},
]
df = pd.DataFrame(list(symbol_coll.aggregate(pipeline))[0]['fhps'])
df

Unnamed: 0,Date,Fh,Ps
0,2012-07-05,0.015,0.0
1,2013-07-12,0.055,0.0
2,2014-07-16,0.028,0.0
3,2015-07-14,0.0155,0.0
4,2016-07-20,0.009,0.0


## Remove the dataframe

In [216]:
res = symbol_coll.update_one({'code':code}, {'$unset':{'fhps':''}}) # note the empty ''

# Testing of Yahoo API

## <font color='red'>雅虎的API里面没有增股的数据，只有送股的数据，不用</font>

In [77]:
import requests
import pandas as pd
from dateutil import parser as date_parser

# Fetch the dividend/split listing for one symbol from the Yahoo ichart
# endpoint and turn it into a Date/Fh/Ps table.
base_url = 'http://ichart.finance.yahoo.com/x?'
ncode = '600033.SS'
params = {'g':'v',
          's':ncode}
r = requests.get(base_url, params=params)
# Fail loudly on an HTTP error instead of silently parsing an error page.
r.raise_for_status()

fhps_records = []
for line in r.iter_lines():
    # iter_lines() may yield bytes; compare against text on any Python version.
    if not isinstance(line, str):
        line = line.decode('utf-8')
    tokens = [token.strip() for token in line.strip().split(',')]
    if line[:5]=='DIVID':
        fhps_records.append({'Date': date_parser.parse(tokens[1]),
                             'Fh': float(tokens[2])})
    elif line[:5] == 'SPLIT':
        # The third field has the form "<new>:<old>"; the bonus ratio is
        # (new - old) / old.
        new_shares, old_shares = [part.strip() for part in tokens[2].split(':')]
        ps_amt = (int(new_shares) - int(old_shares)) / float(old_shares)
        fhps_records.append({'Date': date_parser.parse(tokens[1]), 'Ps': ps_amt})

# Fixing the columns up front guarantees Date/Fh/Ps exist even when there are
# no dividend lines, no split lines, or no records at all.
fhps_df = pd.DataFrame(fhps_records, columns=['Date', 'Fh', 'Ps'])
fhps_df[['Fh', 'Ps']] = fhps_df[['Fh', 'Ps']].fillna(0.0)
# A dividend and a split on the same date are merged into one row.
fhps_df = fhps_df.groupby('Date').sum().reset_index().sort_values('Date')
fhps_df

Unnamed: 0,Date,Fh,Ps
0,2002-06-27,0.12,0.0
1,2003-04-30,0.05,0.2
2,2004-06-03,0.15,0.2
3,2005-06-16,0.35,0.0
4,2006-06-01,0.35,0.0
5,2007-07-05,0.25,0.0
6,2008-04-17,0.25,0.0
7,2009-05-21,0.15,0.0
8,2010-06-17,0.09,0.0
9,2011-08-08,0.1,0.0


In [None]:
for line in r.iter_lines():
    print line

# Use Sohu API

In [75]:
import re
from dateutil import parser as date_parser
import pandas as pd
from contextlib import contextmanager
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

@contextmanager
def open_fast_driver():
    """Yield a Firefox WebDriver with a trimmed-down profile and always quit it.

    The profile sets 'permissions.default.image' to 2 and
    'network.http.connection-timeout' to 1. ``quit()`` runs in a ``finally``
    clause so the browser is shut down even when the body of the ``with``
    block raises.
    """
    firefox_p = webdriver.FirefoxProfile()
    firefox_p.set_preference('permissions.default.image', 2)
    firefox_p.set_preference('network.http.connection-timeout', 1)
    driver = webdriver.Firefox(firefox_profile=firefox_p)
    try:
        yield driver
    finally:
        driver.quit()
    

zg_re = re.compile(ur'转增(\d+)股')
sg_re = re.compile(ur'送(\d+)股')
fh_re = re.compile(ur'派息(.+)元')
date_re = re.compile(ur'(\d{4}-\d{2}-\d{2})')


base_url = 'http://q.stock.sohu.com/cn/{}/fhsp.shtml'
code = '600589'
url = base_url.format(code)
table_xpath = '/html/body/div[4]/div[2]/div[2]/div[2]/div/div[2]/table'
table = None
with open_fast_driver() as driver:
    driver.get(url)
    d_table = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH,table_xpath)))
    table = d_table.text

if table:
    line_iter = iter(table.split('\n'))
    records = []
    while True:
        record = dict()
        date_m = None
        try:
            line = line_iter.next()
            tokens = line.strip().split()
            if len(tokens) == 1:
                continue
            if tokens[0] != u'除权除息日':
                continue
            if len(tokens) == 2:
                next_line = line_iter.next()
                date_m = date_re.search(next_line)
            if not date_m:
                date_m = date_re.search(line)
            zg_m = zg_re.search(line)
            sg_m = sg_re.search(line)
            fh_m = fh_re.search(line)
            record['Date'] = date_parser.parse(date_m.group(1))
            if zg_m:
                record['Zg'] = float(zg_m.group(1))/10.
            if sg_m:
                record['Sg'] = float(sg_m.group(1))/10.
            if fh_m:
                record['Fh'] = float(fh_m.group(1))/10.
            records.append(record)
        except StopIteration:
            break
    fhps_df = pd.DataFrame(records)
    for field in ['Fh', 'Zg', 'Sg']:
        if field not in fhps_df:
            fhps_df.loc[:, field] = 0.0
    fhps_df.loc[:, ['Fh', 'Zg', 'Sg']] = fhps_df.loc[:, ['Fh', 'Zg', 'Sg']].fillna(0.0)
    fhps_df.loc[:, 'Ps'] = fhps_df['Zg']+fhps_df['Sg']
    fhps_df = fhps_df.sort_values('Date')

fhps_df[['Date', 'Fh', 'Ps']]

Unnamed: 0,Date,Fh,Ps
14,2002-06-06,0.15,0.0
13,2003-06-05,0.2,0.0
12,2004-06-10,0.25,0.2
11,2005-06-22,0.2,0.0
10,2006-06-26,0.1,0.0
9,2007-06-06,0.05,0.5
8,2008-05-14,0.05,0.5
7,2009-05-14,0.02,0.0
6,2010-06-01,0.02,0.0
5,2011-07-08,0.025,0.0


# Try use non-GUI webkit to speed up - phantomJS

In [185]:
from contextlib import contextmanager
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import re
from dateutil import parser as date_parser
import pandas as pd

@contextmanager
def open_phantomJS_driver():
    """Yield a remote PhantomJS WebDriver and always quit it afterwards.

    A copy of the stock PHANTOMJS capabilities is extended with page settings
    (no image loading, web security off, scripts may not open or close
    windows, custom user agent) and used to open a session on the hub at
    http://localhost:4444/wd/hub. ``quit()`` runs in a ``finally`` clause so
    the session is closed even when the body of the ``with`` block raises.
    """
    capabilities = webdriver.DesiredCapabilities.PHANTOMJS.copy()
    capabilities['phantomjs.page.settings.loadImages'] = False
    capabilities['phantomjs.page.settings.webSecurityEnabled'] = False
    capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False
    capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False
    capabilities['phantomjs.page.settings.userAgent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/538.1 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/538.1'

    driver = webdriver.Remote("http://localhost:4444/wd/hub", capabilities)
    try:
        yield driver
    finally:
        driver.quit()

zg_re = re.compile(ur'转增(\d+)股')
sg_re = re.compile(ur'送(\d+)股')
fh_re = re.compile(ur'派息(.+)元')
date_re = re.compile(ur'(\d{4}-\d{2}-\d{2})')


base_url = 'http://q.stock.sohu.com/cn/{}/fhsp.shtml'
code = '600589'
url = base_url.format(code)
table_xpath = '/html/body/div[4]/div[2]/div[2]/div[2]/div/div[2]/table'
table = None

with open_phantomJS_driver() as driver:
    try:
        driver.get(url)
        d_table = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH,table_xpath)))
        table = d_table.text
    except Exception as e:
        print e.msg
        for line in e.stacktrace:
            print line

if table:
    line_iter = iter(table.split('\n'))
    records = []
    while True:
        record = dict()
        date_m = None
        try:
            line = line_iter.next()
            tokens = line.strip().split()
            if len(tokens) == 1:
                continue
            if tokens[0] != u'除权除息日':
                continue
            if len(tokens) == 2:
                next_line = line_iter.next()
                date_m = date_re.search(next_line)
            if not date_m:
                date_m = date_re.search(line)
            zg_m = zg_re.search(line)
            sg_m = sg_re.search(line)
            fh_m = fh_re.search(line)
            record['Date'] = date_parser.parse(date_m.group(1))
            if zg_m:
                record['Zg'] = float(zg_m.group(1))/10.
            if sg_m:
                record['Sg'] = float(sg_m.group(1))/10.
            if fh_m:
                record['Fh'] = float(fh_m.group(1))/10.
            records.append(record)
        except StopIteration:
            break
    if records:
        fhps_df = pd.DataFrame(records)
        for field in ['Fh', 'Zg', 'Sg']:
            if field not in fhps_df:
                fhps_df.loc[:, field] = 0.0
        fhps_df.loc[:, ['Fh', 'Zg', 'Sg']] = fhps_df.loc[:, ['Fh', 'Zg', 'Sg']].fillna(0.0)
        fhps_df.loc[:, 'Ps'] = fhps_df['Zg']+fhps_df['Sg']
        fhps_df = fhps_df.sort_values('Date')
fhps_df[['Date', 'Fh', 'Ps']]

Unnamed: 0,Date,Fh,Ps
14,2002-06-06,0.15,0.0
13,2003-06-05,0.2,0.0
12,2004-06-10,0.25,0.2
11,2005-06-22,0.2,0.0
10,2006-06-26,0.1,0.0
9,2007-06-06,0.05,0.5
8,2008-05-14,0.05,0.5
7,2009-05-14,0.02,0.0
6,2010-06-01,0.02,0.0
5,2011-07-08,0.025,0.0


# Test of using Sina API

In [453]:
from contextlib import contextmanager
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import re
from dateutil import parser as date_parsere
from pandas.io import html as pd_html

@contextmanager
def open_phantomJS_driver():
    """Yield a remote PhantomJS WebDriver and always quit it afterwards.

    A copy of the stock PHANTOMJS capabilities is extended with page settings
    (no image loading, web security off, scripts may not open or close
    windows, custom user agent) and used to open a session on the hub at
    http://localhost:4444/wd/hub. ``quit()`` runs in a ``finally`` clause so
    the session is closed even when the body of the ``with`` block raises.
    """
    capabilities = webdriver.DesiredCapabilities.PHANTOMJS.copy()
    capabilities['phantomjs.page.settings.loadImages'] = False
    capabilities['phantomjs.page.settings.webSecurityEnabled'] = False
    capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False
    capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False
    capabilities['phantomjs.page.settings.userAgent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/538.1 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/538.1'

    driver = webdriver.Remote("http://localhost:4444/wd/hub", capabilities)
    try:
        yield driver
    finally:
        driver.quit()

# Load one quarter of daily quotes for one stock from Sina's MarketHistory
# page: the URL takes the stock id, the year and the quarter ("jidu").
base_url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/{}.phtml?year={}&jidu={}'
code = '600033'
year = '2011'
jidu = '1'

url = base_url.format(code, year, jidu)
table_xpath = '//*[@id="FundHoldSharesTable"]'
year_list_xpath = '//*[@id="con02-4"]/table/tbody/tr/td/form/select[1]'
# `tables` holds the raw read_html() result (a list of frames); `df` only
# ever holds the cleaned frame, or None when the fetch failed.
tables = None
df = None

with open_phantomJS_driver() as driver:
    try:
        driver.get(url)

        # Wait up to 30 seconds for the quotes table to be present.
        d_table = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, table_xpath)))
        tables = pd_html.read_html(d_table.get_attribute('outerHTML'), header=1)
    except Exception as e:
        print(e)

if tables:
    df = tables[0]
    df.columns = ['Date', 'Open', 'High', 'Close', 'Low', 'Volume', 'Amount']
    df['Date'] = pd.to_datetime(df['Date'])
    df['Volume'] = df['Volume'].astype(int)
    df = df.set_index('Date').sort_index()

# Nothing to show when the page could not be loaded or parsed.
df.head() if df is not None else None

#         year_list_options = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, year_list_xpath)))
#         year_list_html = year_list_options.get_attribute('innerHTML')
#     except Exception as e:
#         print e
        
# print year_list_html
# soup = BeautifulSoup(year_list_html)

# for year in soup.find_all(name = 'option'):
#     print year.text


					<option value="2017">2017</option>
					<option value="2016">2016</option>
					<option value="2015">2015</option>
					<option value="2014">2014</option>
					<option value="2013">2013</option>
					<option value="2012">2012</option>
					<option value="2011" selected="">2011</option>
					<option value="2010">2010</option>
					<option value="2009">2009</option>
					<option value="2008">2008</option>
					<option value="2007">2007</option>
					<option value="2006">2006</option>
					<option value="2005">2005</option>
					<option value="2004">2004</option>
					<option value="2003">2003</option>
					<option value="2002">2002</option>
					<option value="2001">2001</option>
			


## Another, better Sina API?
Not really: it lacks the Amount data, but it is much, much faster!

In [488]:
import numpy as np

# Fetch daily bars from Sina's kline_data endpoint. The URL slots are the
# exchange-prefixed symbol, begin date and end date (YYYYMMDD strings).
hq_baseURL = 'http://biz.finance.sina.com.cn/stock/flash_hq/kline_data.php?symbol={}&begin_date={}&end_date={}'

code = 'sh600589'
start='20140901'
end='20150101'
url = hq_baseURL.format(code, start, end)

r = requests.get(url)
page = r.content
soup = BeautifulSoup(page)

# Each <content> element carries one bar in its attributes:
# c/o/h/l = close/open/high/low, v = volume, d = date.
hq_list = []
for el in soup.find_all(name='content'):
    hq_list.append({
        'Close': float(el.attrs['c']),
        'Open': float(el.attrs['o']),
        'Volume': int(el.attrs['v']),
        'High': float(el.attrs['h']),
        'Low': float(el.attrs['l']),
        'Date': date_parser.parse(el.attrs['d']),
    })

# Fix the column order explicitly so it does not depend on dict ordering and
# so an empty response still yields a frame with the expected columns.
df = pd.DataFrame(hq_list, columns=['Close', 'Date', 'High', 'Low', 'Open', 'Volume'])
# This endpoint provides no turnover amount; keep the column as missing.
# np.nan is the spelling that survives NumPy 2.0 (np.NaN was removed).
df['Amount'] = np.nan
df

Unnamed: 0,Close,Date,High,Low,Open,Volume,Amount
0,5.39,2014-09-01,5.43,5.28,5.32,72819,
1,5.49,2014-09-02,5.52,5.38,5.40,111495,
2,5.59,2014-09-03,5.65,5.45,5.51,170968,
3,5.64,2014-09-04,5.68,5.53,5.60,133587,
4,5.61,2014-09-05,5.67,5.55,5.67,117608,
5,5.73,2014-09-09,5.75,5.56,5.60,131686,
6,5.72,2014-09-10,5.84,5.66,5.68,115630,
7,5.66,2014-09-11,5.77,5.60,5.72,135807,
8,5.81,2014-09-12,5.82,5.60,5.65,131828,
9,5.83,2014-09-15,5.93,5.78,5.80,162283,


### FHPS

In [415]:
# Scrape the share-bonus ("FHPS") table for one stock from Sina.
fhps_base_url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vISSUE_ShareBonus/stockid/{}.phtml'
code ='000022'
url = fhps_base_url.format(code)

fhps_table_xpath = '//*[@id="sharebonus_1"]'
# Stays None when the page could not be loaded or parsed.
df = None

with open_phantomJS_driver() as driver:
    try:
        driver.get(url)
        # Wait up to 30 s for the bonus table to appear in the DOM.
        d_table = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, fhps_table_xpath)))
        df = pd_html.read_html(d_table.get_attribute('outerHTML'))
    except Exception as e:
        # Best effort: report the failure and leave df as None.
        print(e)


if df:
    # Copy the positional slice so the assignments below modify an independent
    # frame instead of a view (avoids SettingWithCopyWarning).
    df = df[0].iloc[:, [1, 2, 3, 5]].copy()
    df.columns = ['Sg', 'Zg', 'Fh', 'Date']
    # NOTE(review): the division by 10 presumably converts per-10-share
    # figures to per-share values -- confirm against the page.
    df[['Sg', 'Zg', 'Fh']] = df[['Sg', 'Zg', 'Fh']].astype(float).fillna(0)/10
    # Unparseable dates become NaT rather than raising.
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df['Ps'] = df['Sg'] + df['Zg']

# Only preview when the scrape succeeded; indexing None would raise.
df[['Date', 'Fh', 'Ps']].set_index('Date').sort_index() if df is not None else None

Unnamed: 0_level_0,Fh,Ps
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1994-06-13,0.081,0.1
1995-05-30,0.197,0.0
1996-06-24,0.195,0.0
1997-07-04,0.169,0.0
1998-07-20,0.146,0.0
1999-07-21,0.087,0.0
2000-07-24,0.116,0.0
2001-07-16,0.129,0.0
2002-07-08,0.102,0.0
2003-07-24,0.288,0.0


# Testing the Sohu API

In [369]:
# Sohu historical-quote endpoint; the three slots are filled, in order,
# with the stock code, the start date and the end date.
sohu_base_url = 'http://q.stock.sohu.com/hisHq?&code=cn_{}&start={}&end={}&t=d&rt=json'
code, start_date, end_date = '600202', '20000101', '20161201'
url = sohu_base_url.format(code, start_date, end_date)
# Echo the final request URL as the cell output.
url

'http://q.stock.sohu.com/hisHq?&code=cn_600202&start=20000101&end=20161201&t=d&rt=json'

In [371]:
import json
from StringIO import StringIO
# Download the Sohu history payload and load its 'hq' rows into a frame.
r = requests.get(url)
print(r.status_code)
# Drop the first character and the last two before parsing.
# NOTE(review): presumably this strips an enclosing '[' ... ']' plus a
# trailing newline around the JSON object -- confirm against a live response.
page = r.content[1:-2]
# json.loads parses the payload directly; no file-like wrapper is needed.
data = json.loads(page)
df = pd.DataFrame(data['hq'])

200


In [372]:
from datetime import datetime

In [373]:
# Column labels for the Sohu frame and the target type of each column.
# The datetime entry is only a placeholder: 'Date' is converted with
# pd.to_datetime below, and the '*ratio' columns are parsed separately.
sohu_name = [
    'Date', 'Open', 'Close', 'Fluctuation', 'Fluctuation_ratio',
    'Low', 'High', 'Volume', 'Amount', 'Change_ratio',
]
sohu_dtypes = [
    datetime, float, float, float, float,
    float, float, int, float, float,
]
df.columns = sohu_name


def _ratio_to_fraction(ratio):
    """Drop the last character (presumably a '%' sign) and divide by 100."""
    return float(ratio[:-1])/100


for dtype, col in zip(sohu_dtypes, sohu_name):
    if col == 'Date':
        # Unparseable dates become NaT rather than raising.
        converted = pd.to_datetime(df[col], errors='coerce')
    elif col.endswith('ratio'):
        converted = df[col].apply(_ratio_to_fraction)
    else:
        converted = df[col].astype(dtype)
    df[col] = converted

# Index by date in chronological order, then preview.
df = df.set_index('Date').sort_index()
df.head()

Unnamed: 0_level_0,Open,Close,Fluctuation,Fluctuation_ratio,Low,High,Volume,Amount,Change_ratio
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2000-01-04,13.42,13.58,0.21,0.0157,13.13,13.6,2104,282.76,0.007
2000-01-05,13.6,13.5,-0.08,-0.0059,13.28,13.8,2955,401.99,0.0098
2000-01-06,13.4,13.85,0.35,0.0259,13.3,13.9,3017,411.93,0.01
2000-01-07,13.9,14.11,0.26,0.0188,13.8,14.32,9695,1361.83,0.0323
2000-01-10,14.13,14.26,0.15,0.0106,13.92,14.39,5827,820.76,0.0194


# Testing Tushare

There are two different data sources:
1. `get_h_data`
2. `get_k_data`

[reference link](http://mp.weixin.qq.com/s?__biz=MzAwOTgzMDk5Ng==&mid=2650833972&idx=1&sn=4de9f9ee81bc8bf85d1e0a4a8f79b0de&chksm=80adb30fb7da3a19817c72ff6f715ee91d6e342eb0402e860e171993bb0293bc4097e2dc4fe9&mpshare=1&scene=1&srcid=1106BPAdPiPCnj6m2Xyt5p2M#wechat_redirect)

In [374]:
import tushare as ts

In [381]:
# Fetch bars through tushare's get_k_data, passing only the stock code
# (no explicit date range or other options).
code = '600202'
df = ts.get_k_data(code)

In [382]:
# Preview the first rows of the current frame.
df.head()

Unnamed: 0,date,open,close,high,low,volume,code
0,2015-02-11,7.202,7.222,7.272,7.182,35192.0,600202
1,2015-02-12,7.222,7.362,7.392,7.202,44545.0,600202
2,2015-02-13,7.372,7.452,7.512,7.372,59613.0,600202
3,2015-02-16,7.462,7.612,7.632,7.452,55791.0,600202
4,2015-02-17,7.622,7.582,7.662,7.492,55353.0,600202


In [378]:
# Preview the last rows of the current frame.
df.tail()

Unnamed: 0,date,open,close,high,low,volume,code
375,2016-12-26,11.23,11.07,11.23,10.8,82667.0,600202
376,2016-12-27,11.01,11.1,11.2,11.0,36503.0,600202
377,2016-12-28,11.16,11.03,11.18,10.92,49169.0,600202
378,2016-12-29,11.0,11.56,11.88,10.94,122373.0,600202
379,2016-12-30,11.7,12.1,12.58,11.58,177582.0,600202


In [379]:
# Fetch history for the same code through tushare's get_h_data (this
# overwrites df) and preview the last rows of the result.
df = ts.get_h_data(code)
df.tail()

[Getting data:]####

  df = df.sort('date', ascending=False)


Unnamed: 0_level_0,open,high,close,low,volume,amount
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-12,10.35,11.83,11.46,9.78,53176530.0,554440000.0
2016-01-11,11.5,11.66,10.87,10.87,20338457.0,226045768.0
2016-01-08,12.79,12.79,12.08,11.26,27782574.0,339482984.0
2016-01-07,13.52,13.58,12.48,12.48,8926180.0,115109122.0
2016-01-06,13.29,14.0,13.87,13.02,50380531.0,684744925.0


In [380]:
# Preview the first rows of the current frame.
df.head()

Unnamed: 0_level_0,open,high,close,low,volume,amount
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-12-30,11.7,12.58,12.1,11.58,17758221.0,215324754.0
2016-12-29,11.0,11.88,11.56,10.94,12237356.0,139999314.0
2016-12-28,11.16,11.18,11.03,10.92,4916912.0,54207619.0
2016-12-27,11.01,11.2,11.1,11.0,3650327.0,40613821.0
2016-12-26,11.23,11.23,11.07,10.8,8266760.0,90642291.0


# Testing Tencent

In [406]:
# Fetch the distribution ("FHPS") page for one stock from Tencent and
# reduce it to a per-date table of Fh and Ps.
fhps_url_base = 'http://stock.finance.qq.com/corp1/distri.php?zqdm={}'
code = '600589'
url = fhps_url_base.format(code)
r = requests.get(url)
print(r.status_code)
# '--' cells on the page are treated as missing values.
df_all = pd.read_html(r.content, na_values='--')
# Take the fourth table on the page, skip its first four rows and keep four
# columns by position. Copy the slice so the assignments below modify an
# independent frame instead of a view (avoids SettingWithCopyWarning).
df = df_all[3].iloc[4:, [2, 3, 4, 6]].copy()
df.columns = ['Sg', 'Zg', 'Fh', 'Date']
# NOTE(review): the division by 10 presumably converts per-10-share figures
# to per-share values -- confirm against the page.
df[['Sg', 'Zg', 'Fh']] = df[['Sg', 'Zg', 'Fh']].astype(float).fillna(0)/10
# Unparseable dates become NaT rather than raising.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['Ps'] = df['Sg'] + df['Zg']
df[['Date', 'Fh', 'Ps']].set_index('Date').sort_index()

200


Unnamed: 0_level_0,Fh,Ps
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2002-06-06,0.15,0.0
2003-06-05,0.2,0.0
2004-06-10,0.0,0.2
2004-06-10,0.25,0.0
2005-06-22,0.2,0.0
2006-06-26,0.1,0.0
2007-06-06,0.0,0.5
2007-06-06,0.05,0.0
2008-05-14,0.0,0.5
2008-05-14,0.05,0.0


# <font color='red'>Checking data integrity</font>

## check TDX qfq and Yahoo

In [232]:
from pandas_datareader import data as web
from dao import fromDB

In [294]:
# Re-import dao.fromDB so that edits to the module take effect in this kernel.
reload(fromDB)

<module 'dao.fromDB' from '/Users/yugan/Dropbox/personalTrader/resources/../dao/fromDB.py'>

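### fromDB data is not clean
1. Rows must be filtered with `Volume != 0`

### Yahoo data is not clean either
1. Rows must be filtered with `Volume != 0`
2. Rows must be filtered with `Volume != Volume.shift()` (drop rows whose volume repeats the previous row's)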

### After checking
TDX is missing the date `2001-08-16`, while Yahoo is missing the date `2001-11-19` for stock `600589`.

In [276]:
# Compare the TDX 'qfq' close against Yahoo's 'Adj Close' for one stock.
code = '600589'

# TDX side: keep only rows with non-zero volume.
tdx_qfq = fromDB.get_stock_ori(code, 'qfq')
tdx_qfq = tdx_qfq.loc[tdx_qfq['Volume'].ne(0)]
start_date, end_date = tdx_qfq.index[0], tdx_qfq.index[-1]

# Yahoo side: same date span; drop zero-volume rows first, then rows whose
# volume equals that of the row immediately before them.
yahoo_name = fromDB.from_code_get_yahoo_name(code)
yahoo_df = web.DataReader(yahoo_name, 'yahoo', start=start_date, end=end_date)
yahoo_df = yahoo_df.loc[yahoo_df['Volume'].ne(0)]
yahoo_df = yahoo_df.loc[yahoo_df['Volume'].ne(yahoo_df['Volume'].shift())]

# Relative absolute difference, aligned on the date index; a date present in
# only one of the two sources comes out as NaN.
tdx_qfq_close = tdx_qfq['Close']
yahoo_adj_close = yahoo_df['Adj Close']
diff = tdx_qfq_close.sub(yahoo_adj_close).div(tdx_qfq_close).abs()

# Show the dates for which no comparison could be made.
diff.loc[diff.isnull()]

Date
2001-08-16   NaN
2001-11-19   NaN
dtype: float64

In [338]:
# Look up the single day 2001-08-16 in the Yahoo frame (label slice with the
# same start and end date).
yahoo_df['2001-08-16':'2001-08-16']

Unnamed: 0_level_0,Open,High,Close,Low,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2001-08-16,27.26001,27.50004,26.81,26.80001,1119700


## check TDX bfq and Yahoo

In [284]:
# Compare the TDX 'bfq' OHLCV columns against Yahoo's for one stock.
code = '600589'

# TDX side: keep only rows with non-zero volume.
tdx_bfq = fromDB.get_stock_ori(code, 'bfq')
tdx_bfq = tdx_bfq.loc[tdx_bfq['Volume'].ne(0)]
start_date, end_date = tdx_bfq.index[0], tdx_bfq.index[-1]

# Yahoo side: same date span; drop zero-volume rows first, then rows whose
# volume equals that of the row immediately before them.
yahoo_name = fromDB.from_code_get_yahoo_name(code)
yahoo_df = web.DataReader(yahoo_name, 'yahoo', start=start_date, end=end_date)
yahoo_df = yahoo_df.loc[yahoo_df['Volume'].ne(0)]
yahoo_df = yahoo_df.loc[yahoo_df['Volume'].ne(yahoo_df['Volume'].shift())]

# Restrict both frames to the same five columns in the same order.
tdx_bfq = tdx_bfq[['Open', 'High', 'Close', 'Low', 'Volume']]
yahoo_df = yahoo_df[['Open', 'High', 'Close', 'Low', 'Volume']]

# Element-wise relative absolute difference, aligned on the date index.
diff = tdx_bfq.sub(yahoo_df).div(tdx_bfq).abs()

# Date with the largest relative difference in Open.
diff['Open'].sort_values(ascending=False).head(1)

Date
2010-10-29    0.025271
Name: Open, dtype: float64

In [286]:
# Inspect Yahoo's row for 2010-10-29, the date reported above as having the
# largest relative Open difference.
yahoo_df['2010-10-29':'2010-10-29']

Unnamed: 0_level_0,Open,High,Close,Low,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-10-29,8.1,8.75,8.58,7.98,37386800


In [287]:
# Inspect the TDX row for the same date, 2010-10-29, for a side-by-side comparison.
tdx_bfq['2010-10-29':'2010-10-29']

Unnamed: 0_level_0,Open,High,Close,Low,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-10-29,8.31,8.75,8.58,8.17,37386802


## check TDX bfq and Xueqiu

### xueqiu data is not clean:
1. `Volume !=0 `

In [289]:
from dao import fromXueQiu

In [298]:
# Re-import dao.fromXueQiu so that edits to the module take effect in this kernel.
reload(fromXueQiu)

<module 'dao.fromXueQiu' from '/Users/yugan/Dropbox/personalTrader/resources/../dao/fromXueQiu.py'>

In [316]:
# Pull the Xueqiu history for the current code and date span, keep the five
# OHLCV columns, then drop rows with zero volume.
xueqiu_df = fromXueQiu.get_stock(code, start_date, end_date)[['Open', 'High', 'Close', 'Low', 'Volume']]
xueqiu_df = xueqiu_df.loc[xueqiu_df['Volume'].ne(0)]

# Element-wise relative absolute difference against TDX, aligned on the index.
diff = tdx_bfq.sub(xueqiu_df).div(tdx_bfq).abs()

# Worst-case relative difference per column.
diff.max()

Open      0.000000
High      0.000000
Close     0.010091
Low       0.011122
Volume    1.737411
dtype: float64

In [324]:
# Count how many columns of diff contain at least one NaN.
diff.isnull().any(axis=0).sum()

5

In [333]:
# List the index labels (dates) of the rows where any column of diff is NaN.
diff[diff.isnull().any(axis=1)].index

DatetimeIndex(['2001-08-16', '2014-08-06'], dtype='datetime64[ns]', name=u'Date', freq=None)

In [336]:
# Inspect the TDX row for 2014-08-06, one of the dates with NaN in diff.
tdx_bfq['2014-08-06':'2014-08-06']

Unnamed: 0_level_0,Open,High,Close,Low,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-08-06,5.27,5.35,5.34,5.24,11860272


In [339]:
# Inspect the Xueqiu row for 2001-08-16, the other date with NaN in diff.
xueqiu_df['2001-08-16':'2001-08-16']

Unnamed: 0_level_0,Open,High,Close,Low,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2001-08-16,27.26,27.5,26.81,26.8,414727


# Planning the data cleaning

Currently we have the following data APIs available:

for historical daily data:
1. tushare get_k_data
2. sina
3. yahoo
4. sohu
5. tdx

for fhps data:  
1. xueqiu (slow)
2. tencent (best)
3. sohu
4. sina
5. yahoo (not good, don't use, because it lacks zg data)

