In [16]:
#-*- coding: UTF-8 -*-
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import f1_score,accuracy_score,roc_auc_score,log_loss, accuracy_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.preprocessing import LabelEncoder
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.cluster import KMeans
pd.options.display.max_columns=50
pd.options.display.max_rows=15


def _zero_pattern_flag(values):
    """Classify one company's yearly values by how zeros show up in them.

    Returns 0 when the values sum to zero, 1 when the sum is non-zero but at
    least one value equals 0, and 2 otherwise.
    """
    if sum(values) == 0:
        return 0
    if 0 in values:
        return 1
    return 2


def _mean_tax_growth(taxes):
    """Mean of the two consecutive tax ratios over a company's first three rows.

    Returns 0 when any value is 0 or fewer than three rows are available, so
    the ratios are never computed with a zero denominator.
    """
    if 0 in taxes or len(taxes) < 3:
        return 0
    return (taxes[1] / taxes[0] + taxes[2] / taxes[1]) / 2


# The four CSV names must be passed in this order; `folder` is the directory that holds them.
def predict(folder, base_test_sum, knowledge_test_sum, money_report_test_sum, year_report_test_sum):
    '''
    Load the test set, rebuild the engineered features and score it with the
    saved LightGBM models.

    Author: 叶文涛
    Date: 2020-7-15

    Parameters
    ----------
    folder : str
        Directory containing the four CSV files; result.csv is written here.
    base_test_sum : str
        File name of test table 1 (merged on 'ID').
    knowledge_test_sum : str
        File name of test table 2 (merged on 'ID').
    money_report_test_sum : str
        File name of test table 3 (merged on 'ID').
    year_report_test_sum : str
        File name of test table 4 (merged on 'ID' and 'year').

    Returns
    -------
    bool
        Always True once result.csv has been written.

    Raises
    ------
    ValueError
        If the merged frame does not have the expected number of columns.

    Notes
    -----
    The models are loaded from the relative paths "lgb_model_0" ..
    "lgb_model_4", i.e. from the current working directory, not from `folder`.
    '''
    # Read the *test* tables named by the arguments. The original body
    # referenced undefined *_train_sum names here.
    data = pd.read_csv(os.path.join(folder, base_test_sum))
    data = pd.merge(data, pd.read_csv(os.path.join(folder, knowledge_test_sum)), how='left', on='ID')
    data = pd.merge(data, pd.read_csv(os.path.join(folder, money_report_test_sum)), how='left', on='ID')
    # Joining on both keys attaches each yearly report to its matching row.
    data = pd.merge(data, pd.read_csv(os.path.join(folder, year_report_test_sum)), how='left', on=['ID', 'year'])

    # Drop the label / controller-ID columns when they happen to be present.
    data = data.drop(columns=['flag', '控制人ID'], errors='ignore')

    column_names = ['ID','Registration Time', 'Registered Capital', 'Industry', 'Region', 'Business Type',
                    'Controlling Type', 'Controlling Shareholding', 'Patent', 'Trademark', 'Copyright',
                    'year', 'Debt financing line', 'Debt financing cost', 'Equity financing line',
                    'Equity financing cost', 'Internal financing and trade financing line',
                    'Internal financing and trade financing cost', 'Project financing and Policy financing quota ',
                    ' Project financing and policy financing costs', 'Number of employees','Total assets',
                    'Total liabilities',' Total operating income ',' Main operating income ',' Total profit' ,
                    'Net Profit', 'Total Taxes', 'Total Owners Equity']
    # The rename is positional, so fail with a readable message on a layout mismatch.
    if len(data.columns) != len(column_names):
        raise ValueError(
            'expected %d columns after merging, got %d: %s'
            % (len(column_names), len(data.columns), list(data.columns))
        )
    data.columns = column_names

    # Label-encode the categorical columns.
    # NOTE(review): the encoders are fitted on the test data itself, so the
    # integer codes only match training if the category sets are identical - confirm.
    for col in ['Industry', 'Region', 'Business Type', 'Controlling Type']:
        data[col] = LabelEncoder().fit_transform(data[col].astype(str))

    # Feature engineering 1: simple transforms of taxes and profit.
    data['Total Taxes Square'] = data['Total Taxes'] ** 2
    # Zero taxes give -inf and negative profit gives NaN; the values are kept
    # unchanged and only the NumPy RuntimeWarning is silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        data['log Total Taxes'] = np.log(data['Total Taxes'])
        data['log Net Profit'] = np.log(data['Net Profit'])
    # qcut is equal-frequency binning (cut would be equal-width).
    tax_bins = pd.qcut(data['Total Taxes'], 6, duplicates='drop')
    data['Total Taxes qcut_branch6'] = LabelEncoder().fit_transform(tax_bins.astype(str))
    profit_bins = pd.qcut(data['Net Profit'], 8, duplicates='drop')
    data['Net Profit qcut_branch8'] = LabelEncoder().fit_transform(profit_bins.astype(str))

    # Feature engineering 2: per-company aggregates. Each value is computed
    # once per ID and mapped back by key, so rows stay aligned whatever the
    # row order of the frame is.
    net_flag_by_id = {}
    tax_flag_by_id = {}
    tax_growth_by_id = {}
    for company_id, group in data.groupby('ID', sort=False):
        taxes = list(group['Total Taxes'])
        nets = list(group['Net Profit'])
        net_flag_by_id[company_id] = _zero_pattern_flag(nets)
        tax_flag_by_id[company_id] = _zero_pattern_flag(taxes)
        tax_growth_by_id[company_id] = _mean_tax_growth(taxes)
    data['Net_profit_has_0'] = data['ID'].map(net_flag_by_id)
    data['Total_Taxes_has_0'] = data['ID'].map(tax_flag_by_id)
    data['Total_Taxes_incre_mean'] = data['ID'].map(tax_growth_by_id)

    # Feature engineering 3: interaction features found by search.
    data['year*Net Profit'] = data['year'] * data['Net Profit']
    data['Industry*Total Taxes'] = data['Industry'] * data['Total Taxes']
    data['Asset liability ratio'] = data['Total assets'] / data['Total liabilities']

    # Clustering: KMeans cannot take NaN/inf, so both are replaced with 0 on a copy.
    cluster_input = data.fillna(0).replace([np.inf, -np.inf], 0)
    del cluster_input['ID']
    # n_jobs is no longer passed: newer scikit-learn releases reject it.
    # NOTE(review): the clusterer is refitted on the test data without a seed,
    # so label ids are arbitrary per run and need not match training - confirm.
    estimator = KMeans(n_clusters=6)
    estimator.fit(cluster_input)
    data['Kmeans_label'] = estimator.labels_

    # Every column except the key (and the label, if any) is a model feature.
    feature_name = [col for col in data.columns if col not in ['ID','flag']]
    print('使用特征：',feature_name)

    # Average the predictions of the saved boosters.
    n_models = 5
    result = np.zeros(len(data))
    for i in range(n_models):
        model = lgb.Booster(model_file = "lgb_model_"+str(i))
        result += np.asarray(model.predict(data[feature_name]))
    result /= n_models

    # One ID has several yearly rows: average their scores, then binarise.
    df = pd.DataFrame({'ID':data['ID'], 'result':result})
    df = df[['ID','result']].groupby('ID').mean()
    threshold = 0.46
    df['result'] = np.where(df['result'] >= threshold, 1, 0)

    # Write the result file next to the inputs.
    df.to_csv(os.path.join(folder, 'result.csv'), index = True)
    return True

'''使用示例：'''
# Score the four test CSVs that sit in the current working directory.
# The folder string keeps its trailing '/' because it is passed to predict()
# as a path prefix.
folder = os.getcwd() + '/'
predict(
    folder,
    base_test_sum='base_test_sum.csv',
    knowledge_test_sum='knowledge_test_sum.csv',
    money_report_test_sum='money_report_test_sum.csv',
    year_report_test_sum='year_report_test_sum.csv',
)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


使用特征： ['Registration Time', 'Registered Capital', 'Industry', 'Region', 'Business Type', 'Controlling Type', 'Controlling Shareholding', 'Patent', 'Trademark', 'Copyright', 'year', 'Debt financing line', 'Debt financing cost', 'Equity financing line', 'Equity financing cost', 'Internal financing and trade financing line', 'Internal financing and trade financing cost', 'Project financing and Policy financing quota ', ' Project financing and policy financing costs', 'Number of employees', 'Total assets', 'Total liabilities', ' Total operating income ', ' Main operating income ', ' Total profit', 'Net Profit', 'Total Taxes', 'Total Owners Equity', 'Total Taxes Square', 'log Total Taxes', 'log Net Profit', 'Total Taxes qcut_branch6', 'Net Profit qcut_branch8', 'Net_profit_has_0', 'Total_Taxes_has_0', 'Total_Taxes_incre_mean', 'year*Net Profit', 'Industry*Total Taxes', 'Asset liability ratio', 'Kmeans_label']


True

In [6]:
# Build the merged test frame: left-join the knowledge and money-report tables
# on 'ID', then the yearly report on both 'ID' and 'year' so each yearly row
# picks up its matching report.
data = (
    pd.read_csv('base_test_sum.csv')
    .merge(pd.read_csv('knowledge_test_sum.csv'), how='left', on='ID')
    .merge(pd.read_csv('money_report_test_sum.csv'), how='left', on='ID')
    .merge(pd.read_csv('year_report_test_sum.csv'), how='left', on=['ID', 'year'])
)

In [7]:
# Display the merged test frame for a visual check of the joined columns.
data

Unnamed: 0,ID,注册时间,注册资本,行业,区域,企业类型,控制人类型,控制人持股比例,专利,商标,著作权,year,债权融资额度,债权融资成本,股权融资额度,股权融资成本,内部融资和贸易融资额度,内部融资和贸易融资成本,项目融资和政策融资额度,项目融资和政策融资成本,从业人数,资产总额,负债总额,营业总收入,主营业务收入,利润总额,净利润,纳税总额,所有者权益合计
0,8010000,2009,2660,服务业,江西,合伙企业,企业法人,0.57,0,1,0,2015,1596,127.68,0.0,0.000,0.0,0.00,0.0,0.000,856,15960.0,26600.0,22344.0,13406.4,4468.8,-2256.744,0.0,-10640.0
1,8010000,2009,2660,服务业,江西,合伙企业,企业法人,0.57,0,1,0,2016,5586,446.88,0.0,0.000,0.0,0.00,0.0,0.000,869,55860.0,53200.0,16758.0,10054.8,8379.0,-1692.558,0.0,2660.0
2,8010000,2009,2660,服务业,江西,合伙企业,企业法人,0.57,0,1,0,2017,2128,170.24,0.0,0.000,0.0,0.00,0.0,0.000,311,21280.0,37240.0,21280.0,17024.0,8512.0,2149.280,8512.0,-15960.0
3,8010006,2006,370,交通运输业,江西,集体所有制企业,自然人,0.80,0,0,0,2015,0,0.00,0.0,0.000,0.0,0.00,333.0,19.980,266,6660.0,6290.0,20646.0,8258.4,4129.2,-2085.246,0.0,370.0
4,8010006,2006,370,交通运输业,江西,集体所有制企业,自然人,0.80,0,0,0,2016,1850,148.00,0.0,0.000,0.0,0.00,0.0,0.000,426,18500.0,9065.0,37000.0,25900.0,7400.0,-3737.000,0.0,9435.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,8024998,2010,9290,社区服务,广西,有限责任公司,自然人,0.62,0,1,0,2016,0,0.00,13377.6,535.104,0.0,0.00,0.0,0.000,882,55740.0,92900.0,222960.0,156072.0,66888.0,22518.960,133776.0,-37160.0
29996,8024998,2010,9290,社区服务,广西,有限责任公司,自然人,0.62,0,1,0,2017,0,0.00,0.0,0.000,0.0,0.00,5852.7,351.162,229,195090.0,278700.0,565761.0,226304.4,226304.4,0.000,113152.2,-83610.0
29997,8024999,2007,4370,零售业,广东,农民专业合作社,自然人,0.75,0,1,1,2015,0,0.00,0.0,0.000,68172.0,4090.32,0.0,0.000,977,174800.0,85215.0,227240.0,181792.0,113620.0,-22951.240,0.0,89585.0
29998,8024999,2007,4370,零售业,广东,农民专业合作社,自然人,0.75,0,1,1,2016,0,0.00,0.0,0.000,0.0,0.00,5899.5,353.970,421,196650.0,192280.0,98325.0,78660.0,39330.0,-9930.825,0.0,4370.0


In [13]:
# Scratch check: repeating a one-element list 30000 times yields 30000 items.
# NOTE(review): looks like leftover debugging for the `[0]*len(...)` pattern - confirm and remove.
len([0]*30000)

30000