In [2]:
import gensim
import codecs
import numpy as np
import pandas as pd
import jieba
import jieba.analyse
from scipy import stats

In [3]:
"""
the tasks for the program:
1. calculation of similarity of content of TCM han and TCM hui
2. extract keywords from textbook of TCM han and TCM hui
3. non-inferiority test for the similarities based on bootstrap approaches
"""

'\nthe tasks for the program:\n1. calculation of similarity of content of TCM han and TCM hui\n2. extract keywords from textbook of TCM han and TCM hui\n3. superiority test for the similarities based on bootstrap approaches\n'

In [4]:
def file2doc(file_name):
    """Tokenize a UTF-8 text file into one flat list of jieba tokens.

    Every line is stripped of surrounding whitespace and segmented with
    ``jieba.cut``; the tokens of all lines are concatenated in file order.

    :param file_name: path of the UTF-8 encoded text file to read
    :return: list with the tokens of the whole file
    """
    # Close the handle deterministically, even if segmentation raises.
    with codecs.open(file_name, 'r', 'utf-8') as f:
        doc = [w for line in f.readlines() for w in jieba.cut(line.strip())]
    return doc

In [5]:
def doc2vec(doc, model, start_alpha=0.01, infer_epoch=1000):
    """Convert a tokenized document into a document vector.

    :param doc: tokenized document (list of tokens)
    :param model: pre-trained sentence/document vector model exposing
        ``infer_vector(doc, alpha=..., steps=...)``
    :param start_alpha: value forwarded as ``alpha``; defaults to the
        previously hard-coded 0.01
    :param infer_epoch: value forwarded as ``steps``; defaults to the
        previously hard-coded 1000
    :return: the value returned by ``model.infer_vector``
    """
    # NOTE(review): newer gensim releases appear to name the ``steps`` keyword
    # ``epochs`` -- confirm against the installed gensim version.
    doc_vec = model.infer_vector(doc, alpha=start_alpha, steps=infer_epoch)
    return doc_vec

In [6]:
def sim_cal_vec(vec1, vec2):
    """Cosine similarity of two vectors.

    :param vec1: first vector; must provide ``dot`` (e.g. a numpy array)
    :param vec2: second vector, same requirement
    :return: cosine of the angle between the vectors, or 0 when either
        vector has zero length
    """
    norm_a = np.sqrt(vec1.dot(vec1))
    norm_b = np.sqrt(vec2.dot(vec2))
    # A zero-length vector has no direction: report 0 rather than divide by 0.
    if norm_b == 0 or norm_a == 0:
        return 0
    return (vec1.dot(vec2)) / (norm_a * norm_b)

In [7]:
def sample_doc(doc):
    """Draw one bootstrap resample of a tokenized document.

    ``len(doc)`` positions are drawn with ``np.random.randint`` (repeats are
    possible) and then sorted, so the selected tokens keep their original
    relative order.

    :param doc: list of tokens
    :return: list with ``len(doc)`` tokens taken from ``doc``
    """
    size = len(doc)
    # Sorting the drawn positions keeps the resample in document order.
    picks = np.sort(np.random.randint(0, size, size=size))
    tokens = np.array(doc)
    return list(tokens[picks])

In [8]:
def bootstrap_sample(doc, doc_base,model,num):
    """Bootstrap the similarity between ``doc`` and a fixed reference document.

    ``doc`` is resampled ``num`` times; every resample is converted to a
    vector and compared with the vector of ``doc_base``. Progress is printed
    along the way.

    :param doc: tokenized document that gets resampled
    :param doc_base: tokenized reference document (vectorized once)
    :param model: model handed to ``doc2vec``
    :param num: number of bootstrap resamples
    :return: list with one similarity value per resample
    """
    print("start...")
    similarities = []
    # The reference vector is inferred once and reused for every resample.
    base_vector = doc2vec(doc_base, model)
    tenth = num / 10
    for idx in range(num):
        resampled_vector = doc2vec(sample_doc(doc), model)
        similarities.append(sim_cal_vec(resampled_vector, base_vector))
        # Progress monitor: report whenever another tenth of the work is done.
        if idx > 0 and idx % tenth == 0:
            print("... %d%s" % (idx / tenth * 10, "%"))
    print("... 100%\ncomplete!")
    return similarities

In [12]:
def bootstrap_results(sample_list,alpha,cur_val,ctr_val,cmp_type):
    """Summarize bootstrap similarity values and test them against a control.

    :param sample_list: bootstrap similarity values; the list is not modified
    :param alpha: tail probability of the percentile interval, which spans the
        ``alpha*100`` and ``(1-alpha)*100`` percentiles
    :param cur_val: similarity observed on the original (non-resampled) data
    :param ctr_val: control value the similarity is compared with
    :param cmp_type: 1 for the superiority test (margin ``0.001*ctr_val``),
        2 for the non-inferiority test (margin ``0.15*ctr_val``)
    :return: tuple ``(cur_val, ctr_val, mean, std_err, conf_inv0, conf_inv1,
        t_value, p_value0, p_value1, cmp_type)`` where ``conf_inv0`` is the
        normal-approximation interval around ``cur_val``, ``conf_inv1`` the
        percentile interval, ``p_value0`` the t-based p-value and ``p_value1``
        the share of bootstrap values at or below the tested threshold
    :raises ValueError: if ``cmp_type`` is not 1 or 2, or ``sample_list`` is empty
    """
    if cmp_type not in (1, 2):
        raise ValueError("cmp_type must be 1 (superiority) or 2 (non-inferiority), got %r" % (cmp_type,))

    # Work on a sorted copy so the caller's list keeps its original order.
    samples = sorted(sample_list)
    sample_len = len(samples)
    if sample_len == 0:
        raise ValueError("sample_list must contain at least one bootstrap value")

    # t0.025=1.96 -- fixed quantile; it is not derived from ``alpha``.
    t_alpha = 1.96
    mean = np.mean(samples)
    std_err = np.std(samples)
    conf_inv0 = [cur_val - t_alpha*std_err, cur_val + t_alpha*std_err]
    conf_inv1 = [np.percentile(samples, alpha*100), np.percentile(samples, (1-alpha)*100)]

    if cmp_type == 1:
        # superiority test, H0: current - control <= delta
        delta = 0.001*ctr_val
        t_value = (cur_val - ctr_val - delta)/std_err
        threshold = ctr_val + delta
    else:
        # non-inferiority test, H0: current - control <= -delta
        delta = 0.15*ctr_val
        t_value = (cur_val - ctr_val + delta)/std_err
        threshold = ctr_val - delta

    # Degrees of freedom follow the number of bootstrap values (999 for the
    # usual 1000 resamples) instead of being hard-coded.
    # NOTE(review): the factor 2 makes this a two-sided p-value although both
    # hypotheses are one-sided -- confirm this is intended.
    p_value0 = 2*(1 - stats.t.cdf(t_value, df=sample_len - 1))
    # Share of bootstrap values that fall on the H0 side of the threshold.
    p_value1 = sum(1 for x in samples if x <= threshold)/sample_len
    return cur_val, ctr_val, mean, std_err, conf_inv0, conf_inv1, t_value, p_value0, p_value1, cmp_type

In [30]:
def bootstrap_results_print(btsp_res):
    """Print a bootstrap result tuple as a readable report.

    :param btsp_res: sequence indexed as (current, control, mean, standard
        error, interval0, interval1, t value, p value0, p value1, comparison
        type); comparison type 1 prints the superiority section, 2 the
        non-inferiority section, any other value prints no test section
    """
    # Summary lines shared by both test types; entry k formats btsp_res[k].
    headline_templates = (
        "current:%s",
        "control:%s",
        "mean:%s",
        "standard error:%s",
        "confendece interval0:%s",
        "confendece interval1:%s",
    )
    for position, template in enumerate(headline_templates):
        print(template % (btsp_res[position]))

    test_kind = btsp_res[9]
    if test_kind == 1:
        # superiority test results
        report = [
            "superiority test:(delta>0)\nH0: current  - control <= %s" % ("delta"),
            "H1: current  - control > %s " % ("delta"),
            "similarity rate(%s) v.s control(%s)" % (btsp_res[0], btsp_res[1]),
            "t value:%s" % (btsp_res[6]),
            "p value0:%s" % (btsp_res[7]),
            "p value1:%s" % (btsp_res[8]),
        ]
    elif test_kind == 2:
        # non-inferiority test results
        report = [
            "non-inferiority test:(delta>0)\nH0: current - control <= %s" % ("-delta"),
            "H1: current - control > %s" % ("-delta"),
            "similarity rate %s v.s control %s" % (btsp_res[0], btsp_res[1]),
            "t value:%s" % (btsp_res[6]),
            "p value:%s" % (btsp_res[7]),
            "p value1:%s" % (btsp_res[8]),
        ]
    else:
        report = []

    for line in report:
        print(line)

    # Blank separator between consecutive reports.
    print("\n")

In [15]:
def keyword_extract(file_name,keyword_num):
    """Extract keywords from a UTF-8 text file with jieba.

    :param file_name: path of the UTF-8 encoded text file
    :param keyword_num: forwarded to ``jieba.analyse.extract_tags`` as ``topK``
    :return: the keyword list returned by ``jieba.analyse.extract_tags``
    """
    # The whole file is handed to jieba as a single string.
    with codecs.open(file_name, 'r', 'utf-8') as handle:
        text = handle.read()
    return jieba.analyse.extract_tags(text, topK=keyword_num)

In [None]:
# Input table, corpus directory and pre-trained model locations.
file_dic="./data/tbl_dic.csv"
path="./data/data_tcm/"
model_path = './model/zhiwiki_news.doc2vec'
dt=pd.read_csv(file_dic)

# Load the pre-trained model once: it does not depend on the row being
# processed, so loading it inside the loop only repeated the disk read.
# NOTE(review): if the model object carries internal random state, sharing one
# instance across rows changes the exact random stream compared with reloading
# per row; no seed is set in the visible code, so runs were not bit-for-bit
# reproducible before either -- confirm this is acceptable.
model = gensim.models.Doc2Vec.load(model_path)

for i in range(len(dt)):
    # Every row pairs one "tcm_hui" file with one "tcm_han" file.
    file1_name=dt.loc[i,"tcm_hui"]
    file1="".join([path,file1_name])
    file2_name=dt.loc[i,"tcm_han"]
    file2="".join([path,file2_name])

    doc1=file2doc(file1)
    doc2=file2doc(file2)

    doc1_vec=doc2vec(doc1,model)
    doc2_vec=doc2vec(doc2,model)

    # Keyword strings are kept for interactive inspection; they are not
    # written to the result file.
    keyword1=keyword_extract(file1,10)
    keyword2=keyword_extract(file2,10)
    str1="/".join(keyword1)
    str2="/".join(keyword2)

    # Similarity of the two complete documents.
    sim_2v=sim_cal_vec(doc1_vec, doc2_vec)
    print("%-4s\t%-32s\t"%(i+1,dt.loc[i,"item"]))
    print("similarity:%s"%sim_2v)

    # Bootstrap: resample doc1 1000 times against the fixed doc2 vector.
    sample_btsp_list=bootstrap_sample(doc1, doc2,model,1000)

    alpha=0.025
    cur_val = sim_2v
    ctr_val = dt.loc[i,"control"]
    # cmp_type 2 selects the non-inferiority test.
    res1=bootstrap_results(sample_btsp_list,alpha,cur_val, ctr_val,2)
    mean=res1[2]
    se=res1[3]
    ci0=str(res1[4])
    ci1=str(res1[5])
    tvalue=res1[6]
    p0=res1[7]
    p1=res1[8]

    bootstrap_results_print(res1)

    # One tab-separated, left-padded record per row of the input table.
    out_list=[
        "%-4s\t"%(i+1),
        "%-32s\t"%(dt.loc[i,"item"]),
        "%-32s\t"%(file1_name),
        "%-32s\t"%(file2_name),
        "%-8s\t"%(sim_2v),
        "%-8s\t"%(mean),
        "%-8s\t"%(se),
        "%-40s\t"%(ci0),
        "%-40s\t"%(ci1),
        "%-8s\t"%(tvalue),
        "%-8s\t"%(p0),
        "%-8s\n"%(p1),
    ]

    # Append mode: results accumulate across rows (and across re-runs).
    with open("./data/res1.out","a") as fout:
        fout.write("".join(out_list))

1   	content outline                 	
similarity:0.541628
start...
... 10%
... 20%
... 30%
... 40%
... 50%
... 60%
... 70%
... 80%
... 90%
... 100%
complete!
current:0.541628
control:0.5
mean:0.551504
standard error:0.0334802
confendece interval0:[0.47600700393319129, 0.60724947914481164]
confendece interval1:[0.48937395140528678, 0.61677252501249313]
non-inferiority test:(delta>0)
H0: current - control <= -delta
H1: current - control > -delta
similarity rate 0.541628 v.s control 0.5
t value:3.48349652882
p value:0.000516367356841
p value1:0.0


2   	basic theory                    	
similarity:0.814209
start...
... 10%
... 20%
... 30%
... 40%
... 50%
... 60%
... 70%
... 80%
... 90%
... 100%
complete!
current:0.814209
control:0.8
mean:0.797524
standard error:0.00958966
confendece interval0:[0.79541349653154614, 0.83300494905561207]
confendece interval1:[0.77874551713466644, 0.81650735139846808]
non-inferiority test:(delta>0)
H0: current - control <= -delta
H1: current - control > -del