In [1]:
data_dir = "data/"
lc_fname = "listed_company"
#pr_fname = "test.csv"
#pr_fname = "pressrelease_all.csv"
matching_fname = "matchings.new"
kabuka_fname = "kabuka_tse1"
market_fname = "market_tse1"

In [2]:
lc_code = {}
with open(data_dir + lc_fname, 'r') as f:
    for line in f:
        data = line.strip().split('\t')
        lc_code[data[0]] = data[2]
        
#print(lc_code)

In [3]:
matching = {}
with open(matching_fname, 'r') as f:
    for line in f:
        data = line.strip().split('\t')
        comp = data[3].strip().split(' ')
        comp_list = []
        comp_code_list = []
        for x in comp:
            if x in lc_code:
                comp_list.append(x)
                comp_code_list.append(lc_code[x])
        if len(comp_list) > 0:
            matching[data[0]] = (data[1], data[2], comp_code_list, comp_list)
#print(matching)
# {articleid : (prtype, date, [code], [name])}

In [4]:
print(list(matching.keys())[:3])
print(list(matching.values())[:3])

['NIKPRLRSP037981_06012003', 'NIKPRLRSP037996_06012003', 'NIKPRLRSP038000_06012003']
[('06: Order', '20030105', ['8226'], ['理経']), ('99_Others', '20030106', ['8379'], ['広島銀行']), ('01: Product', '20030106', ['4829'], ['日本エンタープライズ'])]


In [5]:
# TOPIXデータの読み込み

topix = []
with open(data_dir + market_fname, 'r') as f:
    for line in f:
        data = line.strip().split(',')
        #comp_code = data[0]
        for x in data[1:]:
            date, value = x.split(':')
            topix.append((date, value))
            
import pandas as pd
dates = [x[0] for x in topix]
values = [float(x[1]) for x in topix]

market = pd.DataFrame({"value": values})
market.index = pd.to_datetime(dates)

In [6]:
# 株価データの読み込み、欠損値がある企業はどうする？

from collections import defaultdict

kabuka = defaultdict(list)
with open(data_dir + kabuka_fname, 'r') as f:
    for line in f:
        data = line.strip().split(',')
        comp_code = data[0]
        for x in data[1:]:
            date, value = x.split(':')
            kabuka[comp_code].append((date, value))

In [7]:
def determine_interval(data, market, date):   # DataFrame, DataFrame, datetime
    er_start= -246 - 1
    er_end = -30 
    #ar_start = -1 - 1
    #ar_end = 1
    
    data_val = data[data.columns[0]]
    market_val = market[market.columns[0]]
    
    ## 指定された日付がデータの範囲に収まっているか判定
    max_date = max(data.index)
    while (max_date not in data_val or data_val[max_date] == 0.):
        max_date = max_date - pd.offsets.Day(1)
    min_date = min(data.index)
    while (min_date not in data_val or data_val[min_date] == 0.):
        min_date = min_date + pd.offsets.Day(1)
    if date < min_date or date > max_date:
        return   # OutOfIndex
    
    ## 与えられたdate以降の日付で、株価データが存在するような最も近い日付を求める
    while (date not in data_val or data_val[date] == 0.):
        date = date + pd.offsets.Day(1) 
        if date > max_date:
            return 
    origin = date
    #print("Origin:", date)
    
    ## 計算に使う範囲の(有効な)株価データを抽出 -> リファクタリング: 先に株価0のエントリーを除去してから連続的に区間を抽出するだけで良い
    er_data = []
    er_market = []
    ar_data = []
    ar_market = []
    
    count = 0
    date = origin
    while (er_end < count):
        date = date - pd.offsets.Day(1)
        if date < min_date:
            return
        if date in data_val and data_val[date] >  0.:
            count -= 1
    #print("Er_end:", date)
    er_data.append(data_val[date])
    er_market.append(market_val[date])
    while (er_start < count):
        date = date - pd.offsets.Day(1)
        if date < min_date:
            return
        if date  in data_val and data_val[date] >  0.:
            er_data.append(data_val[date])
            er_market.append(market_val[date])
            count -= 1
    #print("Er_start:", date)
    er_data.reverse()
    er_market.reverse()
    #print(er_data, er_market)
    
    date = origin - pd.offsets.Day(1)
    while (date not in data_val or data_val[date] == 0.):
        date = date - pd.offsets.Day(1) 
        if date < min_date:
            return
    ar_data.append(data_val[date])    # at -1
    ar_market.append(market_val[date])
    date = date - pd.offsets.Day(1)
    while (date not in data_val or data_val[date] == 0.):
        date = date - pd.offsets.Day(1) 
        if date < min_date:
            return
    ar_data.append(data_val[date])   # at -2
    ar_market.append(market_val[date])
    ar_data.reverse()
    ar_market.reverse()
    
    ar_data.append(data_val[origin])   # at 0
    ar_market.append(market_val[origin])
    date = origin + pd.offsets.Day(1)
    while (date not in data_val or data_val[date] == 0.):
        date = date + pd.offsets.Day(1) 
        if date > max_date:
            return
    ar_data.append(data_val[date])   # at +1
    ar_market.append(market_val[date])
    #print(ar_data, ar_market)
    
    import numpy as np
    return (np.array(er_data), np.array(er_market), np.array(ar_data), np.array(ar_market))

In [23]:
# data: arを計算したい企業の株価時系列データ
# market: 株価指数時系列データ
# start_period [end_period]: 期待値の計算に使う期間の開始[終了]日時
# start_window [end_window]: arを計算したい期間の開始[終了]日時

# 欠損値はこれに渡す前に整形する？dataで0になっているところをdata, market両方から除去とか
# その場合、*_periodの値も調整してから渡す必要がある

def market_return(er_data, er_market, ar_data, ar_market):
    import numpy as np
    import pandas as pd
    from scipy import stats
    from datetime import datetime
    
    #print("DATA:", ar_data)
    #print("MARKET:", ar_market)
        
    def calculate_returns(d):   # np.array of stock values
        dr = np.zeros(shape=d.shape)
        dr[1:] = d[1:] / d[0:-1]   # why -1?
        print(dr[1:])
        dr[1:] = np.log(dr[1:])
        print(dr[1:])
        return dr[1:]
    
    # 1. Linear Regression: On the estimation_period
    er_data = calculate_returns(er_data)
    er_market = calculate_returns(er_market)
    ar_data = calculate_returns(ar_data)
    ar_market = calculate_returns(ar_market)
    
    #print("")
    #print("RETURN(DATA):", ar_data)
    #print("RETURN(MARKET):", ar_market)
    
    #c_name = dr_data.columns[0]
    #x =  dr_market[c_name][start_period:end_period]
    #y = dr_data[c_name][start_period:end_period]
    slope, intercept, r_value, p_value, std_error = stats.linregress(er_market, er_data)
    er_reg = lambda x: x * slope + intercept
    
    #print("")
    #print("REG(DATA):", er_data)
    #print("REG(MARKET):", er_market)
    #print("REG: DATA = " + str(slope) + "MARKET + " + str(intercept))
    #print("")

    # 2. Analysis on the event window
    # Expexted Return:
    er = er_reg(ar_market)
    #er.name = 'Expected return'
    # Abnormal return: Return of the data - expected return
    ar = ar_data - er
    #ar.name = 'Abnormal return'
    # Cumulative abnormal return
    car = ar.cumsum()
    #car.name = 'Cum abnormal return'
    
    #print("ER:", er)
    #print("AR:", ar)
    #print("CAR:", car)
    return (car[-1], r_value ** 2)

In [24]:
ed, em, ad, am = determine_interval(data, market, pd.to_datetime("20000125"))

AttributeError: 'tuple' object has no attribute 'columns'

In [None]:
market_return(ed, em, ad, am)

In [11]:
def load_kabuka(code):
    import pandas as pd
    dates = [x[0] for x in kabuka[code]]
    values = [float(x[1]) for x in kabuka[code]]

    data = pd.DataFrame({"value": values})
    data.index = pd.to_datetime(dates)
    return data

In [25]:
count = 0
car = {}
for articleid, data in matching.items():
    print(articleid)
    count += 1
    if count > 10:
        break
    #print(data)
    prtype, date, codes, names = data
    #print(date)
    for code in codes:
        #print(code)
        if code not in kabuka:
            #print("no code")
            continue
        kabuka_data = load_kabuka(code)
        #print(kabuka_data)
        ret = determine_interval(kabuka_data, market, pd.to_datetime(date))
        if ret == None:
            #print("no kabuka")
            continue
        ed, em, ad, am = ret
        car[(articleid, prtype, code)] = market_return(ed, em, ad, am)   # エイリアスはここで縮約されるか
        #print(car[(articleid, code)])
#print(car)

NIKPRLRSP037981_06012003
NIKPRLRSP037996_06012003
[ 1.00731707  0.98789346  0.99019608  0.99257426  0.99750623  1.          1.
  1.0025      1.01745636  1.00490196  0.99512195  0.98284314  1.0074813
  1.00247525  0.99753086  1.0049505   0.99507389  0.99752475  0.99751861
  1.00248756  1.00248139  0.99752475  0.99751861  1.01243781  0.99017199
  1.01736973  1.00487805  0.99029126  0.99509804  1.00738916  0.99022005
  1.00987654  1.01222494  0.99033816  1.0097561   1.00241546  1.01204819
  0.99761905  0.99761337  1.00478469  0.99047619  1.          1.00961538
  0.98809524  1.02891566  0.97892272  0.98803828  1.00726392  1.00240385
  0.99520384  1.00240964  1.01682692  0.98345154  1.02163462  0.98823529
  0.99761905  0.99522673  0.98321343  1.02195122  0.98329356  1.
  0.99514563  1.01219512  1.          0.97831325  1.0320197   0.97613365
  1.          1.          1.00977995  0.99273608  0.99756098  0.99755501
  1.01470588  0.98550725  1.          0.99754902  0.99017199  1.01240695
  1.00

In [21]:
with open("car.new.rsquare", 'w') as f:
    for key, v in car.items():
        articleid, prtype, code = key
        c, r = v
        f.write('\t'.join([articleid, prtype, code, str(c), str(r)]))
        f.write('\n')

In [22]:
len(car)

47301

In [None]:
## 並列化版
def calculate_car(matching_data):
    articleid, data = matching_data
    prtype, date, codes, names = data
    #print(date)
    for code in codes:
        #print(code)
        if code not in kabuka:
            #print("no code")
            continue
        kabuka_data = load_kabuka(code)
        #print(kabuka_data)
        ret = determine_interval(kabuka_data, market, pd.to_datetime(date))
        if ret == None:
            #print("no kabuka")
            continue
        ed, em, ad, am = ret
        #car[(articleid, prtype, code)] = market_return(ed, em, ad, am)   # エイリアスはここで縮約されるか
        return (articleid, prtype, code, market_return(ed, em, ad, am))

In [None]:
from multiprocessing import Pool
exe_pool = Pool(16)
car_p = {}
for ret in exe_pool.imap(calculate_car, matching.items()):
    car_p[(ret[0], ret[1], ret[2])] = ret[3]