In [None]:
import re
import collections
import math
import xlsxwriter
import numpy as np
import datetime
import codecs
from scipy.sparse import csr_matrix
from scipy.sparse import csc_matrix
from scipy.spatial.distance import cdist

In [None]:
def add_months(s, p):
    """Return the datetime that lies ``p`` months after ``s``.

    Only the year, month and day of ``s`` are used; the result always has a
    time of 00:00. If the resulting day does not exist in the target month
    (e.g. the 31st of a 30-day month), the result rolls over to the 1st of
    the following month.

    Args:
        s: a ``datetime.datetime`` (anything exposing ``year``/``month``/``day``).
        p: integer number of months to add; may be negative.

    Returns:
        A new ``datetime.datetime``.
    """
    # Work on a zero-based month index so the year carry falls out of a single
    # floor division; this behaves the same under Python 2 and Python 3 and
    # never produces a float year.
    year_carry, month_index = divmod(s.month - 1 + p, 12)
    target_year = s.year + year_carry
    target_month = month_index + 1

    try:
        return datetime.datetime(target_year, target_month, s.day)
    except ValueError:
        # s.day is not a valid day of the target month: move to the 1st of
        # the month after it, wrapping the year after December.
        if target_month == 12:
            return datetime.datetime(target_year + 1, 1, 1)
        return datetime.datetime(target_year, target_month + 1, 1)

def next_quater(cur_quater):
    """Return the date three months after ``cur_quater`` (see ``add_months``)."""
    months_per_quarter = 3
    return add_months(cur_quater, months_per_quarter)

class SharesInfo(object):
    """Value holder for one holding record: shares, market value, proportion."""

    def __init__(self, shares, marketValue, proportion):
        # Stored unchanged; the text form below requires them to be numeric.
        self.shares_, self.marketValue_, self.proportion_ = shares, marketValue, proportion

    def _as_text(self):
        """Render the three fields as space-separated fixed-point numbers."""
        fields = (self.shares_, self.marketValue_, self.proportion_)
        return "%f %f %f" % fields

    def __str__(self):
        return SharesInfo._as_text(self)

    def __repr__(self):
        return SharesInfo._as_text(self)


# Quarter boundary dates (the first day of every quarter), in ascending order,
# from 2003-07-01 through 2017-10-01 inclusive:
# [datetime(2003, 7, 1), datetime(2003, 10, 1), datetime(2004, 1, 1),
#  datetime(2004, 4, 1), datetime(2004, 7, 1), ...]
quater_date = [datetime.datetime(2003, 7, 1), datetime.datetime(2003, 10, 1)] \
        + [datetime.datetime(year_, month_, 1)
           for year_ in range(2004, 2018)
           for month_ in (1, 4, 7, 10)]

In [None]:
# Nested store filled by load_data:
# fund code -> stock symbol -> quarter start date -> SharesInfo.
g_raw_data = collections.defaultdict(
    lambda: collections.defaultdict(dict)
)
# Number of rows accepted so far, and the set of every stock symbol seen.
g_record, g_stock_set = 0, set()
def load_data(file_name="Fund_Portfolio_Stock2003-2007.txt"):
    """Load one UTF-16, tab-separated holdings file into the module globals.

    The first line of the file is skipped (presumably a header row), as are
    lines of five characters or fewer. Every other line must split into
    exactly ten tab-separated fields: MasterFundCode, ReportTypeID, Startdate,
    EndDate, Rank, Symbol, StockName, Shares, MarketValue, Proportion.

    A row is stored only when ReportTypeID is 1-4, its dates fall inside the
    range covered by ``quater_date``, and both dates lie within the quarter
    implied by ``Startdate.year`` and ReportTypeID. Accepted rows are written
    to ``g_raw_data[fund][symbol][quarter_start]`` as a ``SharesInfo``, the
    symbol is added to ``g_stock_set`` and ``g_record`` is incremented.

    Rows with a supported report type whose dates do not match the implied
    quarter are reported with an "error at ..." message. Rows that cannot be
    parsed (wrong field count, bad date/number, quarter not in
    ``quater_date``) are echoed together with the file name and line number.

    Args:
        file_name: path of the file to read.
    """
    global g_record
    with codecs.open(file_name, "r", encoding="utf_16") as ifid:
        for line_num, line in enumerate(ifid):
            # Skip the first line and (near-)empty lines.
            if line_num == 0 or len(line) <= 5:
                continue
            try:
                (MasterFundCode, ReportTypeID, Startdate, EndDate, Rank, Symbol,
                 StockName, Shares, MarketValue, Proportion) = line.strip().split('\t')
                Startdate = datetime.datetime.strptime(Startdate, "%Y-%m-%d")
                EndDate = datetime.datetime.strptime(EndDate, "%Y-%m-%d")
                report_type = int(ReportTypeID)

                # Rows with other report types or dates outside the covered
                # range are ignored without a message.
                if report_type not in (1, 2, 3, 4):
                    continue
                if not (Startdate >= quater_date[0] and EndDate < quater_date[-1]):
                    continue

                # Report type k maps to the quarter starting in month 3k-2.
                real_start_date = datetime.datetime(Startdate.year, 3 * report_type - 2, 1)
                # Raises ValueError (handled below) when the quarter is unknown.
                real_idx = quater_date.index(real_start_date)

                # The last entry of quater_date has no following boundary; a
                # row mapped to it cannot be validated and is treated as an
                # inconsistent row instead of raising IndexError.
                has_next_boundary = real_idx + 1 < len(quater_date)
                if has_next_boundary \
                        and quater_date[real_idx] <= Startdate < quater_date[real_idx + 1] \
                        and quater_date[real_idx] < EndDate < quater_date[real_idx + 1] \
                        and Startdate <= EndDate:
                    g_raw_data[MasterFundCode][Symbol][real_start_date] = \
                        SharesInfo(float(Shares), float(MarketValue), float(Proportion))
                    g_stock_set.add(Symbol)
                    g_record += 1
                else:
                    print(u"error at %s linenum %d:%s %s %s"
                          % (file_name, line_num + 1, MasterFundCode, Symbol, Startdate))
            except ValueError:
                # Malformed row: echo it with its location and keep going.
                print(u"%s %s %d" % (line.strip(), file_name, line_num + 1))


                            
# Load every holdings export in chronological order, then print the totals.
for data_file_ in ("Fund_Portfolio_Stock2003-2007.txt",
                   "Fund_Portfolio_Stock2008-2012.txt",
                   "Fund_Portfolio_Stock2013-2017.txt"):
    load_data(data_file_)
print("加载%d只基金%d条数据，共%d只股票" % (len(g_raw_data), g_record, len(g_stock_set)))


In [None]:
# Stock symbol -> column index, assigned in sorted symbol order.
g_stock_map = dict((symbol_, column_) for column_, symbol_ in enumerate(sorted(g_stock_set)))

# fund code -> quarter start date -> vector of shares held, one slot per
# symbol in g_stock_set (zero where no holding was recorded).
g_holding_num_data = collections.defaultdict(
    lambda: collections.defaultdict(lambda: np.zeros(len(g_stock_set))))

for fund_, holdings_by_symbol_ in g_raw_data.items():
    for sym_, info_by_quarter_ in holdings_by_symbol_.items():
        column_ = g_stock_map[sym_]
        for cur_, info_ in info_by_quarter_.items():
            g_holding_num_data[fund_][cur_][column_] = info_.shares_


In [None]:
# quarter start date -> fund code -> change in shares held versus the
# previous quarter (holdings vector of this quarter minus the previous one).
g_trade_vector = collections.defaultdict(dict)
for fund_ in g_holding_num_data:
    fund_holdings_ = g_holding_num_data[fund_]
    # First and last quarter with data for this fund; min/max avoid sorting
    # the keys twice.
    start_date_ = min(fund_holdings_)
    end_date_ = max(fund_holdings_)
    for i in range(quater_date.index(start_date_) + 1, quater_date.index(end_date_) + 1):
        previous_quarter_, current_quarter_ = quater_date[i - 1], quater_date[i]
        # Membership tests do not create entries in the defaultdict, so gaps
        # in a fund's history stay detectable.
        if previous_quarter_ in fund_holdings_ and current_quarter_ in fund_holdings_:
            g_trade_vector[current_quarter_][fund_] = \
                fund_holdings_[current_quarter_] - fund_holdings_[previous_quarter_]
        elif previous_quarter_ not in fund_holdings_:
            print(u"error at %s %s" % (fund_, previous_quarter_))
        else:
            print(u"error at %s %s" % (fund_, current_quarter_))

In [None]:
def _active_funds_matrix(trade_vec, quarter):
    """Return the funds that traded in ``quarter`` and their stacked vectors.

    A fund is kept when its trade vector has at least one non-zero entry.
    Testing the sum instead would also drop funds whose buys and sells cancel
    out, even though their cosine distance is well defined.

    Returns:
        (funds, matrix): fund codes in sorted order and a 2-D array whose
        i-th row is the trade vector of ``funds[i]``.
    """
    # .get avoids inserting empty entries into a defaultdict input.
    by_fund = trade_vec.get(quarter, {})
    funds = sorted(fund for fund in by_fund if np.any(by_fund[fund]))
    matrix = np.array([by_fund[fund] for fund in funds])
    return funds, matrix


def calc_sim(trade_vec):
    """Compute per-fund sums of cosine distances between trade vectors.

    For every quarter from 2004-01-01 up to (not including) 2017-10-01:

    * current: distance from a fund's trade vector to the vectors of the
      other funds in the same quarter;
    * leading: distance from a fund's vector in the previous quarter to the
      vectors of the other funds in the current quarter (keyed by the
      previous quarter);
    * following: distance from a fund's vector in the current quarter to the
      vectors of the other funds in the previous quarter (keyed by the
      current quarter).

    A fund's distance to itself is excluded from the cross-quarter sums
    whenever the fund is present in both quarters. Quarters without any
    active fund are skipped.

    Args:
        trade_vec: mapping quarter start date -> fund code -> 1-D numpy array.

    Returns:
        (leading_sim, current_sim, following_sim); each maps
        fund code -> quarter start date -> (sum of distances, number of
        funds the sum runs over).
    """
    leading_sim_ = collections.defaultdict(dict)
    current_sim_ = collections.defaultdict(dict)
    following_sim_ = collections.defaultdict(dict)

    start_idx_ = quater_date.index(datetime.datetime(2004, 1, 1))
    end_idx_ = quater_date.index(datetime.datetime(2017, 10, 1))

    # Pass 1: distances between funds within the same quarter.
    for i in range(start_idx_, end_idx_):
        current_date = quater_date[i]
        print("calc %s" % (current_date,))
        current_funds, current_matrix = _active_funds_matrix(trade_vec, current_date)
        if not current_funds:
            continue
        current_cos_dis = cdist(current_matrix, current_matrix, 'cosine')
        peer_count = len(current_funds) - 1
        for row_, fund_ in enumerate(current_funds):
            # Subtract the diagonal entry (distance of the fund to itself).
            current_sim_[fund_][current_date] = (
                current_cos_dis[row_].sum() - current_cos_dis[row_][row_], peer_count)

    # Pass 2: distances between the previous quarter (rows) and the current
    # quarter (columns).
    for i in range(start_idx_, end_idx_):
        previous_date = quater_date[i - 1]
        current_date = quater_date[i]

        previous_funds, previous_matrix = _active_funds_matrix(trade_vec, previous_date)
        current_funds, current_matrix = _active_funds_matrix(trade_vec, current_date)
        if not previous_funds or not current_funds:
            # Nothing to compare against on one side of the pair.
            continue

        cross_cos_dis = cdist(previous_matrix, current_matrix, "cosine")

        print("calc cross %s %s, %s %s, %s" % (
            previous_date, current_date,
            previous_matrix.shape, current_matrix.shape, cross_cos_dis.shape))

        # Position lookups replace repeated list scans.
        previous_pos = {fund: idx for idx, fund in enumerate(previous_funds)}
        current_pos = {fund: idx for idx, fund in enumerate(current_funds)}

        for row_, fund_ in enumerate(previous_funds):
            row_total = cross_cos_dis[row_].sum()
            own_col = current_pos.get(fund_)
            if own_col is not None:
                # Drop the distance to the fund's own current-quarter vector.
                leading_sim_[fund_][previous_date] = (
                    row_total - cross_cos_dis[row_][own_col], len(current_funds) - 1)
            else:
                leading_sim_[fund_][previous_date] = (row_total, len(current_funds))

        for col_, fund_ in enumerate(current_funds):
            col_total = cross_cos_dis[:, col_].sum()
            own_row = previous_pos.get(fund_)
            if own_row is not None:
                # Drop the distance to the fund's own previous-quarter vector.
                following_sim_[fund_][current_date] = (
                    col_total - cross_cos_dis[own_row][col_], len(previous_funds) - 1)
            else:
                following_sim_[fund_][current_date] = (col_total, len(previous_funds))

    return leading_sim_, current_sim_, following_sim_

# Run the similarity computation on the quarterly trade vectors.
(g_leading_sim,
 g_current_sim,
 g_following_sim) = calc_sim(g_trade_vector)

In [None]:
def _write_sim_sheet(worksheet, sim):
    """Write a header row plus one row per (fund, quarter) entry of ``sim``.

    Args:
        worksheet: an xlsxwriter worksheet.
        sim: mapping fund code -> quarter start datetime -> (sum_distance, num).

    Columns: fund, quarter label ("<year>Q<n>"), sum_distance, num,
    avg_distance (sum_distance / num) and 1 - avg_distance. The two average
    columns are left blank when ``num`` is 0, because the average is
    undefined and xlsxwriter rejects NaN/inf numbers by default.
    """
    headers = (u"fund", u"quater", u"sum_distance", u"num", u"avg_distance", u"1-avg")
    for col_, title_ in enumerate(headers):
        worksheet.write(0, col_, title_)

    wt_idx = 1
    for fund_ in sim:
        for quater_ in sim[fund_]:
            sum_distance_, num_ = sim[fund_][quater_]
            # Months 1-3 -> Q1, 4-6 -> Q2, ...; floor division keeps the
            # label an integer on both Python 2 and Python 3.
            quarter_no_ = (quater_.month - 1) // 3 + 1
            worksheet.write(wt_idx, 0, fund_)
            worksheet.write(wt_idx, 1, u"%dQ%d" % (quater_.year, quarter_no_))
            worksheet.write(wt_idx, 2, sum_distance_)
            worksheet.write(wt_idx, 3, num_)
            if num_:
                avg_distance_ = sum_distance_ / num_
                worksheet.write(wt_idx, 4, avg_distance_)
                worksheet.write(wt_idx, 5, 1 - avg_distance_)
            wt_idx += 1


# Export the three similarity tables, one worksheet each.
workbook = xlsxwriter.Workbook(u'cosine_distance.xlsx')
try:
    current_ws = workbook.add_worksheet(u"current")
    leading_ws = workbook.add_worksheet(u"leading")
    following_ws = workbook.add_worksheet(u"following")

    _write_sim_sheet(current_ws, g_current_sim)
    _write_sim_sheet(leading_ws, g_leading_sim)
    _write_sim_sheet(following_ws, g_following_sim)
finally:
    # Always close the workbook, even if writing a row fails.
    workbook.close()