In [1]:
# Input files read by the cells below:
#   matching_fname - tab-separated article/company matching records (with a score)
#   pr_info_fname  - tab-separated press-release metadata (article id -> release type)
#   kabuka_fname   - per-company stock price series
#   market_fname   - market index (TOPIX) series
matching_fname, pr_info_fname = "all.out.matching", "pressrelease_info.csv"
kabuka_fname, market_fname = "kabuka_tse1", "market_tse1"

In [2]:
# Map each press-release article id to its release type.
# Every line must hold exactly four tab-separated fields:
# article id, date, company name, release type.
pr_info = {}
with open(pr_info_fname, 'r') as f:
    for fields in (raw.strip().split('\t') for raw in f):
        article_id, date, comp_name, prtype = fields
        pr_info[article_id] = prtype

In [3]:
# Sanity check: release type of one known article, then the number of articles loaded.
for shown in (pr_info["NIKPRLRSP037981_06012003"], len(pr_info)):
    print(shown)

06: Order
336271


In [4]:
# Collect (article id, date, release type, company code) for every matching
# record whose article is known in pr_info and whose score is at least 3.
# A set is used, so repeated records for the same tuple collapse into one.
matching = set()
with open(matching_fname, 'r') as f:
    for row in f:
        (article_id, date, sentence, comp_code,
         comp_name, address_pr, address_lc, score) = row.strip().split('\t')
        # The score is only parsed for articles present in pr_info.
        if article_id in pr_info and int(score) >= 3:
            matching.add((article_id, date, pr_info[article_id], comp_code))

In [5]:
# Sanity check: three arbitrary entries (set order is not defined), then the total count.
for shown in (list(matching)[:3], len(matching)):
    print(shown)

[('NIKPRLRSP117970_09122005', '20051209', '05: PR', '1812'), ('NIKPRLRSP253032_03062010', '20100603', '06: Capacity', '9622'), ('NIKPRLRSP097432_04042005', '20050404', '01: Product', '8585')]
63802


In [6]:
# Load the TOPIX (market index) data.
# NOTE: this cell is also what brings `pd` into the notebook namespace for later cells.
import pandas as pd

topix = []
with open(market_fname, 'r') as f:
    for row in f:
        # The first comma-separated field is skipped; every remaining field
        # is a "date:value" pair.
        for entry in row.strip().split(',')[1:]:
            date, value = entry.split(':')
            topix.append((date, value))

dates = [pair[0] for pair in topix]
values = [float(pair[1]) for pair in topix]

# Single-column frame ("value") indexed by the parsed dates.
market = pd.DataFrame({"value": values}, index=pd.to_datetime(dates))

In [7]:
# Load the stock price data. Open question from the author: how should
# companies with missing values be handled?

from collections import defaultdict

# company code -> list of (date string, value string) in file order
kabuka = defaultdict(list)
with open(kabuka_fname, 'r') as f:
    for row in f:
        fields = row.strip().split(',')
        comp_code = fields.pop(0)
        # Remaining fields are "date:value" pairs. The dict entry is only
        # touched inside this loop, so a company without any pair never
        # gets a key.
        for entry in fields:
            date, value = entry.split(':')
            kabuka[comp_code].append((date, value))

In [8]:
def determine_interval(data, market, date, er_start=-247, er_end=-30,
                       ar_start=-2, ar_end=1):
    """Extract the estimation-window and event-window prices around an event date.

    Days on which the stock price is not strictly positive are treated as
    missing and dropped first; all offsets below are then counted in the
    remaining (valid) trading days, with 0 being the first valid day on or
    after ``date``.

    Args:
        data: DataFrame whose first column holds the company's stock prices,
            indexed by date.
        market: DataFrame whose first column holds the market index values,
            indexed by date.
        date: event date (a timestamp comparable with ``data.index``).
        er_start, er_end: inclusive offsets of the estimation window. The
            defaults yield 218 prices (offsets -247..-30), i.e. 217 returns;
            the extra leading day exists only so the first return can be formed.
        ar_start, ar_end: inclusive offsets of the event window. The defaults
            yield the prices at offsets -2, -1, 0, +1, i.e. returns for -1, 0, +1.

    Returns:
        ``(er_data, er_market, ar_data, ar_market)`` as numpy arrays in
        chronological order, or ``None`` when there is no valid price at all,
        ``date`` lies outside the span of valid prices, or there are not
        enough valid days before/after the event to fill both windows.

    Raises:
        KeyError: if ``market`` has no entry for one of the selected days.
    """
    import numpy as np

    data_val = data[data.columns[0]]
    market_val = market[market.columns[0]]

    # Drop missing (zero) prices once, instead of stepping over them one
    # calendar day at a time; this also makes an all-zero or empty series a
    # plain "no data" case rather than an endless scan.
    valid = data_val[data_val > 0.].sort_index()
    if len(valid) == 0:
        return None

    # The event date must fall inside the span covered by valid prices.
    if date < valid.index[0] or date > valid.index[-1]:
        return None

    # Position of the first valid day on or after the event date.
    origin = int(valid.index.searchsorted(date, side="left"))

    # Both windows must fit entirely inside the valid data.
    first = origin + min(er_start, ar_start)
    last = origin + max(er_end, ar_end)
    if first < 0 or last >= len(valid):
        return None

    def window(start, end):
        """Prices and matching market values for offsets start..end (inclusive)."""
        chunk = valid.iloc[origin + start:origin + end + 1]
        stock_prices = np.array(chunk.values)
        # Look the market up by date so both series cover exactly the same days.
        index_values = np.array([market_val[day] for day in chunk.index])
        return stock_prices, index_values

    er_data, er_market = window(er_start, er_end)
    ar_data, ar_market = window(ar_start, ar_end)
    return (er_data, er_market, ar_data, ar_market)

In [9]:
def market_return(er_data, er_market, ar_data, ar_market):
    """Cumulative abnormal return (CAR) of a stock under a market model.

    The market model is fitted by ordinary least squares on the estimation
    window (stock log return regressed on market log return) and then used to
    predict the expected return on the event window.

    The price sequences are used as given: missing (zero) prices must be
    removed by the caller, from the stock and the market series alike. In this
    notebook the four inputs are the tuple returned by ``determine_interval``.

    Args:
        er_data: stock prices over the estimation window (array or list).
        er_market: market index values on the same days as ``er_data``.
        ar_data: stock prices over the event window; the first price is only
            used to form the first return.
        ar_market: market index values on the same days as ``ar_data``.

    Returns:
        ``(car, r_value, p_value)``: the cumulative abnormal return at the end
        of the event window, and the correlation coefficient and p-value
        reported by ``scipy.stats.linregress`` for the estimation-window fit.

    Raises:
        IndexError: if the event window holds fewer than two prices, so that
            no return can be formed.
    """
    import numpy as np
    from scipy import stats

    def calculate_returns(prices):
        """Log returns between consecutive prices (one element shorter than the input)."""
        prices = np.asarray(prices, dtype=float)
        # Return i compares price i+1 with price i, hence the [:-1] denominator.
        return np.log(prices[1:] / prices[:-1])

    # 1. Linear regression on the estimation period.
    er_stock_ret = calculate_returns(er_data)
    er_market_ret = calculate_returns(er_market)
    slope, intercept, r_value, p_value, std_error = stats.linregress(er_market_ret, er_stock_ret)

    # 2. Analysis on the event window.
    ar_stock_ret = calculate_returns(ar_data)
    ar_market_ret = calculate_returns(ar_market)
    # Expected return predicted by the fitted market model.
    expected = ar_market_ret * slope + intercept
    # Abnormal return: realised return minus expected return.
    abnormal = ar_stock_ret - expected
    # Cumulative abnormal return; only its final value is reported.
    car = abnormal.cumsum()
    return (car[-1], r_value, p_value)

In [10]:
def load_kabuka(code):
    """Build the price DataFrame for one company code.

    Reads the (date, value) string pairs stored under ``code`` in the
    notebook-level ``kabuka`` mapping and returns a single-column frame
    ("value", as floats) indexed by the parsed dates.
    """
    import pandas as pd
    records = kabuka[code]
    timestamps = pd.to_datetime([rec[0] for rec in records])
    return pd.DataFrame({"value": [float(rec[1]) for rec in records]}, index=timestamps)

In [16]:
# Sequential trial run: compute the CAR for the first few matches only.
# `matching` is a set, so which matches come first is arbitrary.
count = 0
car = {}
for articleid, date, prtype, code in matching:
    print(articleid)

    # Stop after 10 matches (the id of the 11th is still printed above).
    count += 1
    if count > 10:
        break

    # Skip companies for which no stock price series was loaded.
    if code not in kabuka:
        continue
    kabuka_data = load_kabuka(code)
    ret = determine_interval(kabuka_data, market, pd.to_datetime(date))
    # None means the price data does not cover the required windows.
    if ret is None:
        continue
    ed, em, ad, am = ret
    print(ret)
    car[(articleid, prtype, code)] = market_return(ed, em, ad, am)
    print(car[(articleid, prtype, code)])

NIKPRLRSP117970_09122005
(array([ 417.,  420.,  408.,  405.,  408.,  410.,  416.,  412.,  420.,
        425.,  425.,  429.,  434.,  435.,  436.,  436.,  441.,  444.,
        439.,  442.,  441.,  447.,  443.,  435.,  439.,  443.,  438.,
        438.,  431.,  432.,  455.,  464.,  456.,  455.,  458.,  457.,
        452.,  449.,  445.,  442.,  446.,  447.,  439.,  441.,  447.,
        446.,  430.,  430.,  434.,  430.,  425.,  425.,  422.,  426.,
        431.,  435.,  442.,  442.,  442.,  445.,  441.,  443.,  446.,
        459.,  463.,  461.,  459.,  458.,  463.,  466.,  456.,  457.,
        453.,  463.,  451.,  440.,  443.,  443.,  437.,  435.,  440.,
        436.,  438.,  432.,  432.,  427.,  419.,  412.,  391.,  403.,
        397.,  393.,  394.,  400.,  399.,  391.,  399.,  400.,  413.,
        410.,  402.,  398.,  397.,  391.,  378.,  369.,  368.,  378.,
        374.,  373.,  374.,  370.,  365.,  371.,  370.,  372.,  378.,
        382.,  386.,  385.,  381.,  386.,  379.,  385.,  382.,  

In [15]:
# Write the results to "car.all", one tab-separated line per entry:
# article id, release type, company code, CAR, r-value, p-value.
# NOTE: `car_p` is built by the multiprocessing cell, which appears further
# down in this file; that cell has to be run before this one.
with open("car.all", 'w') as f:
    # The CAR is unpacked into `car_value` so that the notebook-level `car`
    # dict from the sequential run is not overwritten.
    for (articleid, prtype, code), (car_value, rval, pval) in car_p.items():
        f.write('\t'.join([articleid, prtype, code, str(car_value), str(rval), str(pval)]))
        f.write('\n')

In [14]:
# Number of (article id, release type, company code) entries for which a CAR was computed.
len(car_p)

27840

In [12]:
## Parallelised version
def calculate_car(matching_data):
    """Compute the CAR for one matching record; used as the worker function.

    Args:
        matching_data: ``(articleid, date, prtype, code)`` tuple, as stored
            in ``matching``.

    Returns:
        ``(articleid, prtype, code, result)`` where ``result`` is the tuple
        returned by ``market_return``, or ``None`` when the company has no
        price series in ``kabuka`` or ``determine_interval`` could not build
        the windows.
    """
    articleid, date, prtype, code = matching_data
    if code not in kabuka:
        return None
    kabuka_data = load_kabuka(code)
    ret = determine_interval(kabuka_data, market, pd.to_datetime(date))
    if ret is None:
        return None
    ed, em, ad, am = ret
    return (articleid, prtype, code, market_return(ed, em, ad, am))

In [13]:
from multiprocessing import Pool

# Compute the CAR of every match in parallel with 16 worker processes.
# Results are keyed by (article id, release type, company code); matches for
# which calculate_car returns None are skipped.
exe_pool = Pool(16)
car_p = {}
try:
    for ret in exe_pool.imap(calculate_car, list(matching)):
        if ret is not None:
            car_p[(ret[0], ret[1], ret[2])] = ret[3]
finally:
    # Always release the worker processes, including when a worker raised.
    # On the normal path every result has already been consumed, so
    # terminating loses no work.
    exe_pool.terminate()
    exe_pool.join()