In [2]:
# all imports and constant definition

import pandas as pd
import numpy as np

import re
import glob
import os
import os.path
import sys

DAILY_DATA_FOLDER = 'E:/analytics/stock/hist-D-2016-10-10-1224'
DAILY_SAMPLE_PATH = 'E:/analytics/stock/hist-D-2016-10-10-1224/002415.csv'

WEEKLY_DATA_FOLDER = 'E:/analytics/stock/hist-w-2016-12-1-0314'
WEEKLY_SAMPLE_PATH = 'E:/analytics/stock/hist-w-2016-12-1-0212/002643.csv'

DATE_FOR_PAUSE_CHECK = '2017-03-10'

pd.set_option('max_rows', 20)

In [26]:
df = pd.read_csv(DAILY_SAMPLE_PATH, index_col=0, parse_dates=True, \
                 usecols=['date', 'close', 'p_change', 'ma5', 'ma10', 'ma20'], \
                 error_bad_lines=False)
df

Unnamed: 0_level_0,close,p_change,ma5,ma10,ma20
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-12-23,23.26,1.53,23.168,23.580,24.391
2016-12-22,22.91,-1.08,23.274,23.769,24.459
2016-12-21,23.16,0.13,23.486,23.968,24.530
2016-12-20,23.13,-1.07,23.638,24.170,24.589
2016-12-19,23.38,-1.72,23.842,24.403,24.642
2016-12-16,23.79,-0.75,23.992,24.607,24.692
2016-12-15,23.97,0.21,24.264,24.745,24.721
2016-12-14,23.92,-0.95,24.450,24.888,24.741
2016-12-13,24.15,0.08,24.702,24.996,24.773
2016-12-12,24.13,-4.06,24.964,25.117,24.783


In [5]:
r = df.resample('W')
# daily average price change percent on weekly basis
r['p_change'].mean().dropna().sort_values()

date
2015-07-12   -5.413333
2015-06-21   -3.862000
2013-12-08   -2.440000
2015-03-08   -2.382500
2014-03-23   -2.336000
                ...   
2015-04-05    2.442000
2013-06-16    2.645000
2014-12-07    2.842000
2015-01-11    3.418000
2015-05-24    6.086000
Name: p_change, dtype: float64

In [27]:
df = pd.read_csv(WEEKLY_SAMPLE_PATH, index_col=0, parse_dates=True, \
                 usecols=['date', 'close', 'p_change', 'ma5', 'ma10', 'ma20'], \
                 error_bad_lines=False)
df

Unnamed: 0_level_0,close,p_change,ma5,ma10,ma20
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-02-10,33.8,-1.43,33.984,35.437,37.753
2017-02-03,34.29,-0.03,34.588,35.897,38.086
2017-01-26,34.3,3.16,34.99,36.506,38.522
2017-01-20,33.25,-3.0,35.366,37.216,38.985
2017-01-13,34.28,-6.9,36.082,37.795,39.505
2017-01-06,36.82,1.43,36.89,38.348,40.101
2016-12-30,36.3,0.33,37.206,38.541,40.62
2016-12-23,36.18,-1.76,38.022,39.096,41.101
2016-12-16,36.83,-3.89,39.066,39.606,41.542


In [14]:
df.index.get_loc(df[df['p_change']<0].index[0])
df.index[0].dayofweek

0

In [7]:
r_df = df.sort_index()
r_df.rolling(window=5)['close'].mean()

date
2016-01-08       NaN
2016-01-15       NaN
2016-01-22       NaN
2016-01-29       NaN
2016-02-05    37.140
2016-02-19    35.772
2016-02-26    35.592
2016-03-04    34.584
2016-03-11    33.980
2016-03-18    32.910
               ...  
2016-04-29    38.478
2016-05-06    41.772
2016-05-13    44.952
2016-05-20    46.850
2016-05-27    47.968
2016-06-03    47.926
2016-06-08    48.250
2016-06-17    49.130
2016-06-24    50.660
2016-06-27    52.744
Name: close, dtype: float64

In [3]:
''' detect increase trend by use of MA, applicable for both daily (kind: 'd') and weekly data (kind: 'w')
'''
def find_increase_trend(df, trend_threshold=3):
    
    if len(df) == 0:
        return None
    
    s = df['ma5']
    # the position from which (exclusive) MA starts to rise until latest
    ma_change_point = -1
    for i in range(0, len(s)-1):
        if s[i] < s[i+1]:
            ma_change_point = i 
            break
    else:
        ma_change_point = len(s) - 1
    
    # the position from which (exclusive) closing price starts to rise until latest
    # 0 means it has been falling in recent period, may rise 'tomorrow'
    if not (df['p_change']<0).any():
        starting_rise_point = len(df)-1
    else:
        starting_rise_point = df.index.get_loc(df[df['p_change']<0].index[0])
    
    trend_start = starting_rise_point if starting_rise_point > ma_change_point else ma_change_point
    # a trend is established at least lasting for 3 occurrences
    if trend_start < trend_threshold:
        return None
    else:
        startdate = df.index[trend_start]
        # use close price of two ends to calc total increase percent
        total_increase = round(df.iat[0, 0] / df.iat[trend_start, 0] - 1, 4) * 100
        # the observation at change point doesn't count as increase, but only later ones
        trend_length = trend_start
        # mean week-over-week increase percent over this period
        mean_increase = round(df[:trend_start]['p_change'].mean(), 2)
        first_above_mean_position = df.index.get_loc(df[df['p_change'] >= mean_increase].index[0])
        # num of recent consecutive obserations whose WoW increase is lower than mean
        recent_below_mean_count = first_above_mean_position
        std = round(df[:trend_start]['p_change'].std(), 2)
        return startdate, total_increase, trend_length, mean_increase, std, recent_below_mean_count

In [11]:
find_increase_trend(WEEKLY_DATA_FOLDER + '/002800.csv')

(Timestamp('2016-06-03 00:00:00'),
 527.96000000000004,
 5,
 45.84,
 22.04,
 True,
 1)

In [5]:
pd.Timestamp('20161001') > pd.Timestamp('20000101')

True

In [4]:
def analyze_trend(folder, kind='w', trend_threshold=3, increase_threshold=10, max_recent_slowdown=1):
    reg = re.compile(r'(\d{6}).csv')
    stocks = {t[1].group(1):t[0] for t in ((x, reg.search(x)) for x in glob.glob(folder + '/*.csv')) if t[1]}
    # [os.path.isfile(x) for x in list(stocks.values())[:5]]
    resultmap = {}
    latest = pd.Timestamp('20000101')
    for code in stocks:
        try: 
#             print('processing ', code)
            df = pd.read_csv(stocks[code], index_col=0, parse_dates=True, \
                             usecols=['date', 'close', 'p_change', 'ma5', 'ma10', 'ma20'], \
                             error_bad_lines=False)
            if len(df) == 0:
                continue
            
            # delete first entry if it doesn't stand for weekly data (whose timestamp should be Fri)
            # usually daily data for date at retrieval is also collected
            if kind == 'w' and df.index[0].dayofweek != 4:
                df = df[1:]
                
            # latest date available in input stock data,
            # absence of it indicates the stock's tranding is paused at that time
            if df.index[0] > latest:
                latest = df.index[0]
            elif df.index[0] < latest:
                continue
                
            res = find_increase_trend(df, trend_threshold)
            ''' take as valid entry when following conditions are met:
                    1. increasing trend lasts longer than 3 observations 
                    2. actual increase percent over the period is above 10% (MA trails behind actual varation)
                    3. increasing trend didn't considerably slowdown lately
            '''
            if res and res[1] > increase_threshold and res[-1] <= max_recent_slowdown:
                resultmap[code] = res
        except Exception as ex:
            print('error occurred in processing %s: %s' % (code, ex))
    df = pd.DataFrame.from_dict(resultmap, orient='index')
    df.columns = ['startdate', 'increase', 'length', 'mean', 'std', 'RSL']
    print(df.head())
    return df

In [5]:
df = analyze_trend(WEEKLY_DATA_FOLDER, kind='w', increase_threshold=2, max_recent_slowdown=0)
print('done analyzing trend analysis, found %d records' % len(df))

error occurred in processing 603630: index 0 is out of bounds for axis 0 with size 0
error occurred in processing 600242: index 0 is out of bounds for axis 0 with size 0
error occurred in processing 600645: index 0 is out of bounds for axis 0 with size 0
error occurred in processing 600212: index 0 is out of bounds for axis 0 with size 0
error occurred in processing 600346: index 0 is out of bounds for axis 0 with size 0
error occurred in processing 002664: index 0 is out of bounds for axis 0 with size 0
error occurred in processing 603986: index 0 is out of bounds for axis 0 with size 0
error occurred in processing 603960: index 0 is out of bounds for axis 0 with size 0
error occurred in processing 000711: index 0 is out of bounds for axis 0 with size 0
error occurred in processing 600768: index 0 is out of bounds for axis 0 with size 0
error occurred in processing 600654: index 0 is out of bounds for axis 0 with size 0
error occurred in processing 002335: index 0 is out of bounds for

In [6]:
TREND_ANALYSIS_OUTPUT_PATH = 'E:/analytics/stock/analysis_weekly_0314.csv'
df.to_csv(TREND_ANALYSIS_OUTPUT_PATH)

In [7]:
pd.set_option('max_rows', 20)

BASIC_DATA_PATH = r'E:\analytics\stock\basics\basic.csv'
basic_df = pd.read_csv(BASIC_DATA_PATH, index_col=False, dtype={'code':np.str}, \
                       usecols=['code', 'pe', 'pb', 'outstanding', 'totals', 'esp', 'timeToMarket'], \
                       error_bad_lines=False)
basic_df.set_index('code', inplace=True)
basic_df['timeToMarket'] = pd.to_datetime(basic_df['timeToMarket'], errors='coerce', format='%Y%m%d')
basic_df = basic_df[basic_df['timeToMarket'].notnull()]
basic_df

Unnamed: 0_level_0,pe,outstanding,totals,esp,pb,timeToMarket
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
300577,21.98,0.17,0.67,0.962,6.48,2016-12-21
603416,30.06,0.25,1.00,0.641,5.06,2016-12-21
600115,11.18,84.81,144.68,0.490,1.98,1997-11-05
000010,221.06,4.08,8.20,0.027,2.94,1995-10-27
603823,25.21,0.45,2.25,0.500,4.29,2016-12-20
300576,28.41,0.20,0.80,0.345,3.39,2016-12-20
002040,190.65,2.46,2.46,0.086,7.84,2005-03-25
603878,43.97,0.51,2.02,0.442,3.26,2016-12-19
600523,48.81,2.89,2.89,0.350,3.10,2001-12-27
000710,0.00,1.51,1.51,-0.045,40.03,1997-04-22


In [8]:
REPORT_DATA_FOLDER = r'E:/analytics/stock/report'

reg = re.compile(r'(\d{4}-\d).csv')
reports = {t[1].group(1):t[0] for t in ((x, reg.search(x)) for x in glob.glob(REPORT_DATA_FOLDER + '/*.csv')) if t[1]}
report_dfs = []
report_terms = []
for term in reports:
    rdf = pd.read_csv(reports[term], index_col=False, dtype={'code':np.str}, \
                           usecols=['code', 'roe', 'profits_yoy'], \
                           error_bad_lines=False)
    rdf.set_index('code', inplace=True)
    d = rdf.index.duplicated()
#     print('duplicates in %s: %d (%s)' % (term, len(d[d==True]), rdf.index[d==True][:5]))
    rdf.drop_duplicates(inplace=True)
    report_dfs.append(rdf)
    report_terms.append(term)
# print(report_dfs)
all_report_df = pd.concat(report_dfs, keys=report_terms, axis=1, join='outer')
all_report_df

Unnamed: 0_level_0,2016-2,2016-2,2016-3,2016-3,2016-1,2016-1,2015-4,2015-4
Unnamed: 0_level_1,roe,profits_yoy,roe,profits_yoy,roe,profits_yoy,roe,profits_yoy
000001,7.35,6.10,11.02,5.52,3.70,8.12,14.94,10.42
000002,5.20,10.42,8.13,20.54,0.83,28.14,19.14,15.08
000004,3.60,146.30,19.25,518.44,,,1.54,-67.19
000005,10.48,-647.59,11.00,-548.35,,,-8.07,-227.98
000006,2.30,-60.44,2.60,-58.64,1.11,-75.79,,
000007,-7.49,258.30,1.77,71.68,2.05,-257.50,3.87,-146.35
000008,,,2.99,126.37,,,7.60,2290.28
000009,1.61,-38.77,4.67,-69.06,0.67,-45.77,19.70,160.12
000010,1.20,1519.15,,,,,4.05,-144.86
000011,-0.25,-116.84,-0.35,-115.50,0.03,61.92,7.59,-62.44


In [9]:
CONSOLIDATED_DATA_PATH = r'E:/analytics/stock/consolidated-W-0314.csv'
consolidated = df.join(basic_df).join(all_report_df)
consolidated.to_csv(CONSOLIDATED_DATA_PATH)