# Model Use
#### 用训练好的模型去评估每篇文章的每一个句子，得到每一个句子的积极程度评分，最后取平均得到每篇年报mda部分与audit部分的积极程度分数。
- mda_model_filepath:计算 mda部分句子的分数所用的模型路径。
- audit_model_filepath:计算audit部分句子分数所用的模型路径。
- dict_filepath: data_clean.ipynb 生成的每篇年报的统计信息文件。
- dict_writepath: 得到每篇文章mda部分和audit部分的分数后，将分数列追加在dict_filepath的统计信息之后，写入dict_writepath。
- mda_data_path：data_clean.ipynb 中，生成的每篇年报所有mda句子信息的所在文件。
- mda_write_path：写入分数后的mda句子文件的写出地址。
- audit_data_path：data_clean.ipynb 中，生成的每篇年报所有audit句子信息的所在文件。
- audit_write_path：写入分数后的audit句子文件写出地址

In [2]:
#coding: utf-8
import gensim
import random
from sklearn.externals import joblib
import os
import csv
import matplotlib as mpt
import jieba
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.models import load_model
from extract_feature import BertVector
# Paths of the two trained Keras models (.h5), one per report section.
mda_model_filepath="../../model/bert_model/sentiment_cnn_model.h5"
audit_model_filepath="../../model/word2vec_model/sentiment_cnn_model.h5"
# Per-report statistics CSV (input, read by csv2dict) and the final score CSV (output).
dict_filepath="../../mid_data/number.csv"
dict_writepath="../../score/score.csv"
# Directory of MDA sentence CSVs (input) and directory for the scored copies (output).
mda_data_path="../../mid_data/mda_clean2"
mda_write_path="../../score/mda_score"
# Directory of audit sentence CSVs (input) and directory for the scored copies (output).
audit_data_path="../../mid_data/audit_clean2"
audit_write_path="../../score/audit_score"
# Sentence encoder from the project-local extract_feature module; used by
# bert_rep_sentencevector_2 through bv.encode.
bv = BertVector()
# NOTE(review): rep_sentencevector defines its own local embedding_dim = 250, and no
# other code visible in this notebook reads this module-level value.
embedding_dim=250
# Number of token rows in the deep-mode word2vec matrix built by rep_sentencevector.
seq_len=128
# Passed as the if_deep argument to predict_mda / predict_audit in the scoring cells.
if_deep=True

# Load both classifiers once so every sentence reuses the same in-memory models.
bert_model = load_model(mda_model_filepath)
audit_model= load_model(audit_model_filepath)

# Create both output directories (and any missing parents) before scoring.
# exist_ok=True makes re-running the cell a no-op and avoids the race between
# a separate existence check and the creation call.
os.makedirs(mda_write_path, exist_ok=True)
os.makedirs(audit_write_path, exist_ok=True)

INFO:tensorflow:Using config: {'_device_fn': None, '_save_checkpoints_steps': None, '_model_dir': '../tmp', '_master': '', '_keep_checkpoint_max': 5, '_protocol': None, '_evaluation_master': '', '_session_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
  allow_growth: true
}
graph_options {
  optimizer_options {
    global_jit_level: ON_1
  }
}
, '_save_summary_steps': 100, '_num_worker_replicas': 1, '_service': None, '_task_id': 0, '_is_chief': True, '_global_id_in_cluster': 0, '_num_ps_replicas': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001749A07C1D0>, '_log_step_count_steps': 100, '_task_type': 'worker', '_experimental_distribute': None, '_save_checkpoints_secs': 600, '_tf_random_seed': None, '_train_distribute': None, '_eval_distribute': None, '_keep_checkpoint_every_n_hours': 10000}
INFO:tensorflow:Could not find trained model in model_dir: ../tmp, running initialization to predict.
INFO:tensorflow:Calling model_fn.
INFO:t

In [2]:
'''load the pre-prepared word vector model file'''
# Location of the word-vector model file used for the audit (word2vec) pipeline.
VECTOR_DIR = './embedding/word_vector.bin'  # word-vector model file
# `model` is the module-level lookup used by rep_sentencevector (model[word]).
# NOTE(review): the file carries a .bin extension but is loaded with binary=False
# (the text word2vec format) — confirm the file really is stored as text.
model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=False)

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


# 建立字典
#### 所有文章的统计信息用dataframe表示，增加五列。
- 一列index由stock_id 和year构成，方便后续定位；
- 一列mda_sentence 用来对mda句子计数，一列mda_score 用来统计mda句子的分数和；
- 一列audit_sentence，一列 audit_score 和mda同理。

In [3]:
def csv2dict(file_path):
    """Load the per-report statistics CSV and attach zeroed score accumulators.

    The file is read without a header row; its five columns are named
    ``stock_id``, ``year``, ``mda_id``, ``mda_sentence_num`` and ``file_path``.
    All are read as strings except ``mda_sentence_num`` (int32).

    Args:
        file_path: Path (or file-like object accepted by ``pandas.read_csv``)
            of the header-less statistics CSV.

    Returns:
        pandas.DataFrame indexed by ``"<stock_id>/<year>"`` (index name
        ``"index"``) with four extra columns initialised to zero:
        ``mda_sentence`` and ``audit_sentence`` (integer counters) and
        ``mda_score`` and ``audit_score`` (float sums).
    """
    column_names = ["stock_id", "year", "mda_id", "mda_sentence_num", "file_path"]
    column_types = {
        "stock_id": str,
        "year": str,
        "mda_id": str,
        "mda_sentence_num": np.int32,
        "file_path": str,
    }
    new_dataframe = pd.read_csv(
        file_path, header=None, names=column_names, dtype=column_types
    )

    # Assign scalars so pandas broadcasts them to every row. Sizing a list with
    # mda_id.count() is wrong because count() skips missing values, so a single
    # empty mda_id cell caused a length-mismatch ValueError.
    new_dataframe["mda_sentence"] = 0
    new_dataframe["audit_sentence"] = 0
    new_dataframe["mda_score"] = 0.0
    new_dataframe["audit_score"] = 0.0

    # Composite key used by the scoring cells to locate a report.
    new_dataframe["index"] = new_dataframe.stock_id + "/" + new_dataframe.year
    new_dataframe.set_index(["index"], inplace=True)
    return new_dataframe

def dict2csv(diction,file):
    """Write the DataFrame ``diction`` to ``file`` as CSV with pandas defaults.

    Args:
        diction: Object exposing ``to_csv`` (the score DataFrame in this notebook).
        file: Destination path or writable file-like object.
    """
    diction.to_csv(path_or_buf=file)

''' bert_based embedding'''
def bert_rep_sentencevector_2(sentence):
    """Encode one sentence with the module-level ``bv`` encoder.

    Every space character is removed from ``sentence`` first; the cleaned text
    is then handed to ``bv.encode`` as a one-element list and the result is
    converted to a NumPy array.

    Args:
        sentence: Text of a single sentence.

    Returns:
        numpy.ndarray built from the encoder output.
    """
    without_spaces = "".join(sentence.split(" "))
    encoded = bv.encode([without_spaces])
    return np.array(encoded)

'''word2vec_based embedding'''
def rep_sentencevector(sentence,if_deep=True):
    """Embed a sentence with the word vectors held in the module-level ``model``.

    The sentence is segmented with jieba in full mode (``cut_all=True``) and
    empty tokens are dropped. Words missing from ``model`` are skipped.

    Args:
        sentence: Text to embed.
        if_deep: When true, return a ``(seq_len, 250)`` matrix whose i-th row is
            the vector of the i-th token (zero rows for unknown words and for
            padding); tokens beyond ``seq_len`` are ignored. When false, return
            a 250-element vector: the sum of the known word vectors divided by
            the total number of tokens.

    Returns:
        numpy.ndarray as described above. In the non-deep mode a sentence that
        yields no tokens gives an all-zero vector.
    """
    embedding_dim = 250

    # Segment the sentence and discard empty tokens.
    word_list = [word for word in jieba.lcut(sentence, cut_all=True) if word != '']

    if not if_deep:
        embedding_matrix = np.zeros(embedding_dim)
        if not word_list:
            # Nothing to average; dividing by zero would give an all-NaN vector.
            return embedding_matrix
        for word in word_list:
            try:
                embedding_matrix += model[word]
            except KeyError:
                # Out-of-vocabulary word: contributes nothing to the sum.
                continue
        return embedding_matrix / len(word_list)

    max_words = seq_len
    embedding_matrix = np.zeros((max_words, embedding_dim))
    # Truncate explicitly rather than relying on a swallowed IndexError.
    for index, word in enumerate(word_list[:max_words]):
        try:
            embedding_matrix[index] = model[word]
        except KeyError:
            # Out-of-vocabulary word: leave this row as zeros.
            continue
    return embedding_matrix


'''get a positive index from an array of probabilities'''
def compute_score(score_list):
    """Collapse a model output into one score: element 2 minus element 0.

    Element 0 is weighted -1 and element 2 is weighted +1; element 1 is ignored.
    (Presumably the entries are negative / neutral / positive class
    probabilities — confirm against the training notebook.)

    Args:
        score_list: Indexable sequence with at least three numeric entries.

    Returns:
        The weighted sum ``-1 * score_list[0] + 1 * score_list[2]``.
    """
    first_term = -1 * score_list[0]
    last_term = 1 * score_list[2]
    return first_term + last_term

'''mda model predict mda sentence'''
def predict_mda(sentence_vector,if_deep=True):
    """Score an already-embedded MDA sentence with the module-level ``bert_model``.

    Args:
        sentence_vector: Array fed to ``bert_model.predict``. When ``if_deep``
            equals False it is first averaged over axis 1.
        if_deep: Flag selecting whether the input is used as-is (default) or
            averaged before prediction.

    Returns:
        The value produced by ``compute_score`` on the squeezed prediction.
    """
    # Equality (not truthiness) is kept on purpose so that only values equal
    # to False trigger the averaging, exactly as before.
    model_input = sentence_vector.mean(axis=1) if if_deep == False else sentence_vector  # noqa: E712
    probabilities = bert_model.predict(model_input)
    return compute_score(np.squeeze(probabilities))

'''audit model predict audit sentence'''
def predict_audit(sentence,if_deep=True):
    """Score a raw audit sentence with the module-level ``audit_model``.

    The sentence is embedded by ``rep_sentencevector``, wrapped in a leading
    batch dimension of size one, and passed to ``audit_model.predict``.

    Args:
        sentence: Text of a single audit sentence.
        if_deep: Forwarded unchanged to ``rep_sentencevector``.

    Returns:
        The value produced by ``compute_score`` on the squeezed prediction.
    """
    embedded = rep_sentencevector(sentence, if_deep)
    batch = np.array([embedded])
    probabilities = audit_model.predict(batch)
    return compute_score(np.squeeze(probabilities))


In [4]:

# Per-report table indexed by "<stock_id>/<year>"; the scoring cells below add
# each sentence score and sentence count into it.
score_list=csv2dict(dict_filepath)


# 计算mda_score

In [None]:
# Score every MDA sentence with the BERT-based model, accumulate the per-report
# sum/count in score_list, and write each scored row to mda_write_path.
count=0
# Build the key lookup once; rebuilding list(score_list.index) for every row
# made each membership test a full scan of the index.
known_keys = set(score_list.index)
for parent, _dirnames, filenames in os.walk(mda_data_path):
    for file in filenames:
        # Join with `parent` (not the root) so files inside sub-directories
        # resolve. Output stays flat, so nested files sharing a name would
        # overwrite each other.
        source_path = os.path.join(parent, file)
        target_path = os.path.join(mda_write_path, file)
        # Context managers close both files even if scoring raises.
        with open(source_path,'r',encoding='utf-8-sig') as csv_reader_open, \
                open(target_path,mode="w",encoding="utf-8-sig",newline="") as csv_writer_open:
            csv_reader = csv.reader(csv_reader_open)
            csv_writer = csv.writer(csv_writer_open,dialect='excel')
            for item in csv_reader:
                # Combine the zero-padded stock id and the year into the report key.
                item[0]=item[0].rjust(6,'0')
                key=item[0]+"/"+item[1]
                # Explicit check instead of assert, which is stripped under -O.
                if len(key)!=11:
                    raise ValueError("unexpected report key %r in %s" % (key, source_path))

                if key not in known_keys:
                    continue

                # Column 5 holds the sentence text; the encoder already returns
                # an ndarray, so no extra wrap/squeeze is needed.
                array=bert_rep_sentencevector_2(item[5])

                score=predict_mda(array,if_deep)
                score_list.loc[key,"mda_score"]+=score
                score_list.loc[key,"mda_sentence"]+=1
                count+=1
                print("\r %d, %s:%f                                                                                                               "% (count,item[5],score),end=" ")
                # Column 4 receives the sentence score in the written copy.
                item[4]=score
                csv_writer.writerow(item)
        
               

# 计算audit score

In [None]:
# Score every audit sentence with the word2vec-based model, accumulate the
# per-report sum/count in score_list, and write each scored row to audit_write_path.
count=0
# Build the key lookup once; rebuilding list(score_list.index) for every row
# made each membership test a full scan of the index.
known_keys = set(score_list.index)
for parent, _dirnames, filenames in os.walk(audit_data_path):
    for file in filenames:
        # Join with `parent` (not the root) so files inside sub-directories
        # resolve. Output stays flat, so nested files sharing a name would
        # overwrite each other.
        source_path = os.path.join(parent, file)
        target_path = os.path.join(audit_write_path, file)
        # Context managers close both files even if scoring raises.
        with open(source_path,'r',encoding='utf-8-sig') as csv_reader_open, \
                open(target_path,mode="w",encoding="utf-8-sig",newline="") as csv_writer_open:
            csv_reader = csv.reader(csv_reader_open)
            csv_writer = csv.writer(csv_writer_open,dialect='excel')
            for item in csv_reader:
                # Combine the zero-padded stock id and the year into the report key.
                item[0]=item[0].rjust(6,'0')
                key=item[0]+"/"+item[1]
                # Explicit check instead of assert, which is stripped under -O.
                if len(key)!=11:
                    raise ValueError("unexpected report key %r in %s" % (key, source_path))

                if key not in known_keys:
                    continue

                # Column 5 holds the sentence text.
                score=predict_audit(item[5],if_deep)
                score_list.loc[key,"audit_score"]+=score
                score_list.loc[key,"audit_sentence"]+=1

                count+=1
                print("\r %d, %s %s:%f                                                                                                               "% (count,key,item[5],score),end=" ")
                # Column 4 receives the sentence score in the written copy.
                item[4]=score
                csv_writer.writerow(item)

# Score List 算分

In [None]:

# Rebuild the per-report averages from the scored sentence files on disk, so the
# final table can be produced without re-running either model.
def _accumulate_scores(score_table, score_dir, score_column, count_column):
    """Add the stored score of every known row under score_dir into score_table.

    Column 4 of each CSV row is read as the sentence score and column 5 as the
    sentence text (used only for progress output). Rows whose
    "<stock_id>/<year>" key is not in score_table's index are ignored.

    Args:
        score_table: DataFrame indexed by report key; modified in place.
        score_dir: Directory tree containing the scored sentence CSVs.
        score_column: Column receiving the running score sum.
        count_column: Column receiving the running sentence count.

    Returns:
        Number of rows accumulated.

    Raises:
        ValueError: If a row's report key is not 11 characters long.
    """
    known_keys = set(score_table.index)
    accumulated = 0
    for parent, _dirnames, filenames in os.walk(score_dir):
        for file in filenames:
            print(file)
            source_path = os.path.join(parent, file)
            with open(source_path,'r',encoding='utf-8-sig') as csv_reader_open:
                for item in csv.reader(csv_reader_open):
                    item[0]=item[0].rjust(6,'0')
                    key=item[0]+"/"+item[1]
                    if len(key)!=11:
                        raise ValueError("unexpected report key %r in %s" % (key, source_path))
                    if key not in known_keys:
                        continue
                    # The score is read back from the file. The previous per-row
                    # predict_mda(array, if_deep) call was a leftover: its result
                    # was unused and it needed a stale global from another cell.
                    sentence_score=float(item[4])
                    score_table.loc[key,score_column]+=sentence_score
                    score_table.loc[key,count_column]+=1
                    accumulated+=1
                    print("\r %d, %s:%f                                                                                                               "% (accumulated,item[5],sentence_score),end=" ")
    return accumulated


score_list=csv2dict(dict_filepath)
count=_accumulate_scores(score_list,mda_write_path,"mda_score","mda_sentence")
count=_accumulate_scores(score_list,audit_write_path,"audit_score","audit_sentence")

# Turn sums into means. A report with no scored sentences divides 0.0 by 0,
# which pandas stores as NaN rather than raising.
score_list["mda_score"]=score_list["mda_score"]/score_list["mda_sentence"]
score_list["audit_score"]=score_list["audit_score"]/score_list["audit_sentence"]
dict2csv(score_list,dict_writepath)
