In [14]:
#coding: UTF-8 

import os 
import re
import pandas as pd
import jieba

import numpy as np
from tqdm import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score,precision_recall_fscore_support,roc_curve,auc,roc_auc_score
from sklearn.decomposition import PCA
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

base_info=pd.read_csv('train/base_info.csv') #企业的基本信息
annual_report_info=pd.read_csv('train/annual_report_info.csv') #企业的年报基本信息
tax_info=pd.read_csv('train/tax_info.csv') #企业的纳税信息
change_info=pd.read_csv('train/change_info.csv') #变更信息
news_info=pd.read_csv('train/news_info.csv') #舆情信息
other_info=pd.read_csv('train/other_info.csv') #其它信息
entprise_info=pd.read_csv('train/entprise_info.csv') #企业标注信息{0: 13884, 1: 981}
entprise_evaluate=pd.read_csv('entprise_evaluate.csv') #未标注信息

# ## 1 特征构建 
# ###  tfidi处理经营范围(opscope)特征

# In[356]:


# tfidif 处理经营范围的特征
#cn_stopwords.txt来源于 https://github.com/goto456/stopwords
def stopwordslist():
    stopwords = [line.strip() for line in open('cn_stopwords.txt',encoding='UTF-8').readlines()]
    return stopwords
# 创建一个停用词列表
stopwords = stopwordslist()
stopwords+=['、', '；', '，', '）','（']
#
train_df_scope=base_info.merge(entprise_info)[['id','opscope','label']]
test_df_scope=base_info[base_info['id'].isin(entprise_evaluate['id'].unique().tolist())]
test_df_scope=test_df_scope.reset_index(drop=True)[['id','opscope']]
str_label_0=''
str_label_1=''
for index,name,opscope,label in train_df_scope.itertuples():
    # 结巴分词
    seg_text = jieba.cut(opscope.replace("\t", " ").replace("\n", " "))
    outline = " ".join(seg_text)
    out_str=""
    for per in outline.split():
        if per not in stopwords: 
            out_str += per
            out_str+=" "
    if label==0:
        str_label_0+=out_str
    else:
        str_label_1+=out_str
corpus=[str_label_0,str_label_1]
vectorizer=CountVectorizer()#该类会将文本中的词语转换为词频矩阵，矩阵元素a[i][j] 表示j词在i类文本下的词频
transformer=TfidfTransformer()#该类会统计每个词语的tf-idf权值
tfidf=transformer.fit_transform(vectorizer.fit_transform(corpus))#第一个fit_transform是计算tf-idf，第二个fit_transform是将文本转为词频矩阵
word=vectorizer.get_feature_names()#获取词袋模型中的所有词语总共7175个词语
weight=tfidf.toarray()#将(2, 7175)tf-idf矩阵抽取出来，元素a[i][j]表示j词在i类文本中的tf-idf权重
# for i in range(len(weight)):#打印每类文本的tf-idf词语权重，第一个for遍历所有文本，第二个for便利某一类文本下的词语权重
#     #
#     for j in range(len(word)):
#         print(word[j],weight[i][j])
#下面将会根据tfidi算出来的权重将经营范围的文本特征转换为数值(利用weight[1,:]也即各个词语在第二类(违法类中所占据的权重之和))
illegal_word_weights={}
for i in range(len(word)):
    illegal_word_weights[word[i]]=weight[1][i]
tfidi_opscope=[]
for index,name,opscope in base_info[['id','opscope']].itertuples():
    # 
    seg_text = jieba.cut(opscope.replace("\t", " ").replace("\n", " "))
    outline = " ".join(seg_text)
    tfidi_frt=0
    for per in outline.split():
        if per in illegal_word_weights: 
            tfidi_frt+=illegal_word_weights[per]
    tfidi_opscope.append(tfidi_frt)
base_info['tfidif_opscope']=tfidi_opscope
print('对opscope提取tfidif特征完毕..........')


# ##  change_info、other_info，news_info，annual_report_info,tax表格的简单特征构建

# In[357]:


#change_info
change_info_clean_2 = change_info.drop(columns = ['bgrq', 'bgq', 'bgh'])
change_info_clean_2 = change_info_clean_2.groupby('id',sort=False).agg('mean')
change_info_clean_2 = pd.DataFrame(change_info_clean_2).reset_index()
#other_info
#空值大于0.5的列都删除掉
buf_group = other_info.groupby('id',sort=False).agg('mean')
other_info_clean=pd.DataFrame(buf_group).reset_index()
other_info_clean=other_info_clean.fillna(-1)
other_info_clean = other_info_clean.groupby('id',sort=False).agg('mean')
other_info_clean=pd.DataFrame(other_info_clean).reset_index()
#news_info
news_info_clean_2=news_info.drop(['public_date'],axis=1)
#对object类型进行编码
news_info_clean_2['positive_negtive']=news_info_clean_2['positive_negtive'].fillna("中立")
#
dic={}
cate=news_info_clean_2.positive_negtive.unique()
for i in range(len(cate)):
    dic[cate[i]]=i

news_info_clean_2['positive_negtive']=news_info_clean_2['positive_negtive'].map(dic)
news_info_clean_2 = news_info_clean_2.groupby('id',sort=False).agg('mean')
news_info_clean_2=pd.DataFrame(news_info_clean_2).reset_index()
#处理annual_report_info的数据
#空值大于0.5的列都删除掉
annual_report_info_clean=annual_report_info.dropna(thresh=annual_report_info.shape[0]*0.5,how='all',axis=1)
#对object类型进行编码
annual_report_info_clean['BUSSTNAME']=annual_report_info_clean['BUSSTNAME'].fillna("无")
dic = {'无':-1,'开业':0, '歇业':1, '停业':2, '清算':3}
#
annual_report_info_clean['BUSSTNAME']=annual_report_info_clean['BUSSTNAME'].map(dic)
annual_report_info_clean = annual_report_info_clean.groupby('id',sort=False).agg('mean')
annual_report_info_clean=pd.DataFrame(annual_report_info_clean).reset_index()
#处理tax数据
tax_info_clean=tax_info.copy()
tax_info_clean['START_DATE']=pd.to_datetime(tax_info_clean['START_DATE'])
tax_info_clean['END_DATE']=pd.to_datetime(tax_info_clean['END_DATE'])
tax_info_clean['gap_day']=(tax_info_clean['END_DATE']-tax_info_clean['START_DATE']).dt.total_seconds()//3600//24
tax_info_clean=tax_info_clean.drop(['START_DATE','END_DATE'],axis=1)
tax_info_clean['TAX_CATEGORIES']=tax_info_clean['TAX_CATEGORIES'].fillna("无")#17 unique
tax_info_clean['TAX_ITEMS']=tax_info_clean['TAX_ITEMS'].fillna("无")#275 TAX_ITEMS
#对object类型进行编码
dic={}
cate=tax_info_clean.TAX_CATEGORIES.unique()
for i in range(len(cate)):
    dic[cate[i]]=i
tax_info_clean['TAX_CATEGORIES']=tax_info_clean['TAX_CATEGORIES'].map(dic)
#
dic={}
cate=tax_info_clean.TAX_ITEMS.unique()
for i in range(len(cate)):
    dic[cate[i]]=i
tax_info_clean['TAX_ITEMS']=tax_info_clean['TAX_ITEMS'].map(dic)
tax_info_clean['income']=tax_info_clean['TAX_AMOUNT']/tax_info_clean['TAX_RATE']
#
tax_info_clean = tax_info_clean.groupby('id',sort=False).agg('mean')
tax_info_clean=pd.DataFrame(tax_info_clean).reset_index()
#税额分箱
tax_info_clean['TAX_AMOUNT']=tax_info_clean['TAX_AMOUNT'].fillna(tax_info_clean['TAX_AMOUNT'].median())
tax_info_clean['bucket_TAX_AMOUNT']=pd.qcut(tax_info_clean['TAX_AMOUNT'], 10, labels=False,duplicates='drop')
print('finished .............')


# ## base_info数据较为重要，需要构建诸多交叉特征以及特征分箱

# In[358]:


# #处理base_info数据
base_info['opto']=pd.to_datetime(base_info['opto']).fillna(pd.to_datetime(base_info['opto']).max())
base_info['opfrom']=pd.to_datetime(base_info['opfrom'])
base_info['gap_year']=(base_info['opto']-base_info['opfrom']).dt.total_seconds()//3600//24//365
base_info_clean=base_info.drop(['opscope','opfrom','opto'],axis=1)

#............................对object类型进行编码...............................
base_info_clean['industryphy']=base_info_clean['industryphy'].fillna("无")
base_info_clean['dom']=base_info_clean['dom'].fillna("无")
base_info_clean['opform']=base_info_clean['opform'].fillna("无")
base_info_clean['oploc']=base_info_clean['oploc'].fillna("无")
#
dic={}
cate=base_info_clean.industryphy.unique()
for i in range(len(cate)):
    dic[cate[i]]=i
base_info_clean['industryphy']=base_info_clean['industryphy'].map(dic)
#
dic={}
cate=base_info_clean.dom.unique()
for i in range(len(cate)):
    dic[cate[i]]=i
base_info_clean['dom']=base_info_clean['dom'].map(dic)
#
dic={}
cate=base_info_clean.opform.unique()
for i in range(len(cate)):
    dic[cate[i]]=i
base_info_clean['opform']=base_info_clean['opform'].map(dic)
#
dic={}
cate=base_info_clean.oploc.unique()
for i in range(len(cate)):
    dic[cate[i]]=i
base_info_clean['oploc']=base_info_clean['oploc'].map(dic)
#
base_info_clean=base_info_clean.fillna(-1)
#
print('编码完毕.................')
#........................分箱.................................
def bucket(name,bucket_len):
    gap_list=[base_info_clean[name].quantile(i/bucket_len) for i in range(bucket_len+1)]#以分位数作为分箱标志
    len_data=len(base_info_clean[name])
    new_col=[]
    for i in base_info_clean[name].values:
        for j in range(len(gap_list)):
            if gap_list[j]>=i:
                encode=j
                break
        new_col.append(encode)
    return new_col
#注册资本_实缴资本
base_info_clean['regcap_reccap']=base_info_clean['regcap']-base_info_clean['reccap']
#注册资本分箱
base_info_clean['regcap']=base_info_clean['regcap'].fillna(base_info_clean['regcap'].median())
base_info_clean['bucket_regcap']=pd.qcut(base_info_clean['regcap'], 10, labels=False,duplicates='drop')
#实缴资本分箱
base_info_clean['reccap']=base_info_clean['reccap'].fillna(base_info_clean['reccap'].median())
base_info_clean['bucket_reccap']=pd.qcut(base_info_clean['reccap'], 10, labels=False,duplicates='drop')
#注册资本_实缴资本分箱
base_info_clean['regcap_reccap']=base_info_clean['regcap_reccap'].fillna(base_info_clean['regcap_reccap'].median())
base_info_clean['bucket_regcap_reccap']=pd.qcut(base_info_clean['regcap_reccap'], 10, labels=False,duplicates='drop')
#.............................交叉.........................
#作两个特征的交叉
def cross_two(name_1,name_2):
    new_col=[]
    encode=0
    dic={}
    val_1=base_info_clean[name_1]
    val_2=base_info_clean[name_2]
    for i in tqdm(range(len(val_1))):
        tmp=str(val_1[i])+'_'+str(val_2[i])
        if tmp in dic:
            new_col.append(dic[tmp])
        else:
            dic[tmp]=encode
            new_col.append(encode)
            encode+=1
    return new_col
#作企业类型-小类的交叉特征
base_info_clean['enttypegb']=base_info_clean['enttypegb'].fillna("无")
base_info_clean['enttypeitem']=base_info_clean['enttypeitem'].fillna("无")
new_col=cross_two('enttypegb','enttypeitem')#作企业类型-小类的交叉特征
base_info_clean['enttypegb_enttypeitem']=new_col
#
#行业类别-细类的交叉特征
base_info_clean['industryphy']=base_info_clean['industryphy'].fillna("无")
base_info_clean['industryco']=base_info_clean['industryco'].fillna("无")
new_col=cross_two('industryphy','industryco')#作企业类型-小类的交叉特征
base_info_clean['industryphy_industryco']=new_col
#企业类型-行业类别的交叉特征
new_col=cross_two('enttypegb','industryphy')#作企业类型-小类的交叉特征
base_info_clean['enttypegb_industryphy']=new_col
#行业类别-企业类型小类的交叉特征
new_col=cross_two('industryphy','enttypeitem')#作企业类型-小类的交叉特征
base_info_clean['industryphy_enttypeitem']=new_col
#行业类别细类--企业类型小类的交叉特征
new_col=cross_two('industryco','enttypeitem')#作企业类型-小类的交叉特征
base_info_clean['industryco_enttypeitem']=new_col

#企业类型-小类-行业类别-细类的交叉特征
new_col=cross_two('enttypegb_enttypeitem','industryphy_industryco')#作企业类型-小类的交叉特征
base_info_clean['enttypegb_enttypeitem_industryphy_industryco']=new_col
base_info_clean.shape


# ## category特征单独提取出来

# In[359]:


cat_features=['industryphy','dom','opform','oploc','bucket_regcap',
              'bucket_reccap','bucket_regcap_reccap',
              'enttypegb','enttypeitem','enttypegb_enttypeitem',
              'enttypegb_industryphy','enttypegb_enttypeitem_industryphy_industryco',
              'industryphy','industryco','industryphy_industryco',
              'industryphy_enttypeitem','industryco_enttypeitem',
              'adbusign','townsign','regtype','TAX_CATEGORIES','bucket_TAX_AMOUNT',
              'legal_judgment_num','brand_num','patent_num'
             ]

#处理舆论信息
groups = news_info.groupby('id')

news_info_clean = pd.DataFrame(columns = ['id', 'positive', 'non', 'negtive'])

values = groups['positive_negtive'].value_counts()
print(values)

i = 0
l = 0
pre = ''
tem = ['', 0, 0, 0]
#print(values.index)
for name in values.index:
    if name[0] == pre:
        if name[1] == '积极':
            tem[1] = values[i]
        if name[1] == '中立':
            tem[2] = values[i]
        if name[1] == '消极':
            tem[3] = values[i]
        i = i + 1
        continue   
    if i != 0:
        tem[0] = pre
        news_info_clean.loc[l] = tem
        l = l + 1
    pre = name[0]
    tem = ['', 0, 0, 0]
    if name[1] == '积极':
        tem[1] = values[i]
    if name[1] == '中立':
        tem[2] = values[i]
    if name[1] == '消极':
        tem[3] = values[i]
    i = i + 1
print(news_info_clean)    

#处理变更信息
values = change_info.value_counts('id')
print(values)

change_info_clean = pd.DataFrame(columns = ['id', 'number_changes'])
tem = ['', 0]
i = 0
for name in values.index:
    tem[0] = name
    tem[1] = values[i]
    change_info_clean.loc[i] = tem
    tem = ['', 0]
    i = i + 1
print(change_info_clean)    




对opscope提取tfidif特征完毕..........
finished .............
编码完毕.................

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|████████████████████████████████████████████████████████████████████████| 24865/24865 [00:00<00:00, 138139.94it/s]





100%|████████████████████████████████████████████████████████████████████████| 24865/24865 [00:00<00:00, 142086.14it/s]
100%|████████████████████████████████████████████████████████████████████████| 24865/24865 [00:00<00:00, 138139.94it/s]
100%|████████████████████████████████████████████████████████████████████████| 24865/24865 [00:00<00:00, 143730.22it/s]
100%|████████████████████████████████████████████████████████████████████████| 24865/24865 [00:00<00:00, 148892.80it/s]
100%|████████████████████████████████████████████████████████████████████████| 24865/24865 [00:00<00:00, 138912.17it/s]


id                                                positive_negtive
09912c34159b1720558a419983a989f1dd2e0ed69a044ca3  中立                  6
175ebe5f059ec050afbd65251ecdd3b512bfbe5e62d041b0  积极                  4
                                                  中立                  3
216bd2aaf4d079240c3ac0b76f0ef4aa355d443880ba78db  积极                  2
                                                  中立                  1
                                                                     ..
f000950527a6feb6fe8f4850e9eb04e8ba3fa3b409725ef3  中立                  2
f000950527a6feb6ff749dc50c7bf46b37b74e36ce38d1a4  消极                  1
f000950527a6feb6ff7cdb55f5e64a477c499dd75137ae6b  积极                  2
f000950527a6feb6ff839cdf509ebb7631857e6e363fedd6  中立                  1
                                                  消极                  1
Name: positive_negtive, Length: 1410, dtype: int64
                                                   id positive non negtive
0    09912c3415

In [16]:
all_data=base_info_clean.merge(annual_report_info_clean,how='outer')
all_data=all_data.merge(tax_info_clean,how='outer')
all_data=all_data.merge(change_info_clean,how='outer')
all_data=all_data.merge(news_info_clean,how='outer')
#all_data=all_data.merge(change_info_clean_2,how='outer')
#all_data=all_data.merge(news_info_clean_2,how='outer')
all_data=all_data.merge(other_info_clean,how='outer')
all_data=all_data.fillna(-1)
all_data[cat_features]=all_data[cat_features].astype(int)
all_data.shape#,base_info.shape,annual_report_info.shape,tax_info.shape


# In[361]:


#
train_df=all_data.merge(entprise_info)
train_data=train_df.drop(['id','label'],axis=1)
kind=train_df['label']
test_df=all_data[all_data['id'].isin(entprise_evaluate['id'].unique().tolist())]
test_df=test_df.reset_index(drop=True)
test_data=test_df.drop(['id'],axis=1)
train_data.shape,test_data.shape


#特征筛选
#frt_select=[
#  'industryphy',
#  'enttypegb',
#  'regcap',
#  'townsign',
#  'industryco',
#  'bucket_regcap',
#  'empnum',
#  'bucket_reccap',
#  'enttypeitem',
#  'industryphy_industryco',
#  'reccap',
#  'FORINVESTSIGN',
#  'positive_negtive',
#  'regtype',
#  'STOCKTRANSIGN',
#  'bucket_regcap_reccap',
#  'enttypegb_enttypeitem',
#  'regcap_reccap',
#  'legal_judgment_num',
#  'TAX_CATEGORIES',
#  'TAX_AMOUNT',
#  'bgq_bgh',
#  'TAX_ITEMS',
#  'positive',
#  'negtive',
#  'number_changes' ]
# frt_select=important_frt[:30]
#train_data=train_data[frt_select]
#test_data=test_data[frt_select]
#cat_features=list(set(frt_select).intersection(set(cat_features)))
# cat_features


def eval_score(y_test,y_pre):
    _,_,f_class,_=precision_recall_fscore_support(y_true=y_test,y_pred=y_pre,labels=[0,1],average=None)
    fper_class={'合法':f_class[0],'违法':f_class[1],'f1':f1_score(y_test,y_pre)}
    return fper_class
#
def k_fold_serachParmaters(model,train_val_data,train_val_kind):
    mean_f1=0
    mean_f1Train=0
    n_splits=5
    sk = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2020)
    for train, test in sk.split(train_val_data, train_val_kind):
        x_train = train_val_data.iloc[train]
        y_train = train_val_kind.iloc[train]
        x_test = train_val_data.iloc[test]
        y_test = train_val_kind.iloc[test]

        model.fit(x_train, y_train)
        pred = model.predict(x_test)
        fper_class =  eval_score(y_test,pred)
        mean_f1+=fper_class['f1']/n_splits
        #print(fper_class)
        
        pred_Train = model.predict(x_train)
        fper_class_train =  eval_score(y_train,pred_Train)
        mean_f1Train+=fper_class_train['f1']/n_splits
    #print('mean valf1:',mean_f1)
    #print('mean trainf1:',mean_f1Train)
    return mean_f1


def search_param(n_estimators,max_depth,min_samples_split):
    rf = RandomForestClassifier(oob_score=True, random_state=2020,
                    n_estimators= n_estimators,max_depth=max_depth,min_samples_split=min_samples_split)
    mean_f1=k_fold_serachParmaters(rf,train_data,kind)
    return mean_f1

# #搜索最佳参数
param=[]
best=0
for n_estimators in [60,50,55,65]:
    print('n_estimators:',n_estimators)
    for min_samples_split in [8,10,20,15]:
        for max_depth in [12,11,13,15]:
            mean_f1=search_param(n_estimators,max_depth,min_samples_split)
            if mean_f1>best:
                param=[n_estimators,min_samples_split,max_depth]
                best=mean_f1
                print(param,best)

rf = RandomForestClassifier(oob_score=True, random_state=2020,
            n_estimators= 60,max_depth=13,min_samples_split=10)
k_fold_serachParmaters(rf,train_data,kind)

model=rf#仅用随机森林
details = []
answers = []
mean_f1=0
n_splits=5
sk = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2020)
cnt=0
for train, test in sk.split(train_data, kind):
    x_train = train_data.iloc[train]
    y_train = kind.iloc[train]
    x_test = train_data.iloc[test]
    y_test = kind.iloc[test]

    model.fit(x_train, y_train)
    pred_cab = model.predict(x_test)
    weight_cab =  eval_score(y_test,pred_cab)['f1']

    print('每{}次验证的f1:{}'.format(cnt,weight_cab))
    cnt+=1
    mean_f1+=weight_cab/n_splits
    ans = model.predict_proba(test_data)

    answers.append(ans)
print('mean f1:',mean_f1)

fina=np.sqrt(sum(np.array(answers)**2)/n_splits)#平方平均
fina=fina[:,1]
test_df['score']=fina#可选:fina_persudo是伪标签的预测结果
submit_csv=test_df[['id','score']]
submit_csv.to_csv('submit.csv',index=False)


n_estimators: 60
[60, 8, 12] 0.8290513110411092
[60, 8, 11] 0.830366626918495
[60, 8, 15] 0.8383219649860969
n_estimators: 50
[50, 8, 15] 0.8383901017644865
n_estimators: 55
[55, 8, 15] 0.8397975051424035
n_estimators: 65
每0次验证的f1:0.816
每1次验证的f1:0.8358974358974359
每2次验证的f1:0.8258706467661691
每3次验证的f1:0.8253164556962025
每4次验证的f1:0.8396946564885496
mean f1: 0.8285558389696714
