In [1]:
#-*- coding: UTF-8 -*-
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import GroupKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import gc
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
pd.options.display.max_columns=50
pd.options.display.max_rows=20

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn import svm

In [7]:
# 训练集读取，45146*30，其中有标签的15215*30
train_data = pd.read_csv('base_train_sum.csv', encoding = 'gbk')
train_data = pd.merge(train_data,pd.read_csv('knowledge_train_sum.csv', encoding = 'gbk'),how='left',on='ID')
train_data = pd.merge(train_data,pd.read_csv('money_report_train_sum.csv', encoding = 'gbk'),how='left',on='ID')
train_data = pd.merge(train_data,pd.read_csv('year_report_train_sum.csv', encoding = 'gbk'),how='left',on=['ID','year']) # 这样就能一起合并
train_data = train_data[~train_data['flag'].isnull()].reset_index(drop=True)

# 验证集读取，96250*31，其中有标签的91732*30（注：比训练集多了一个控制人ID，官方回复测试集中不会加入这个特征，所以选择删除）
valid_data = pd.read_csv('base_verify1.csv', encoding = 'gbk')
valid_data = pd.merge(valid_data,pd.read_csv('paient_information_verify1.csv', encoding = 'gbk'),how='left',on='ID')
valid_data = pd.merge(valid_data,pd.read_csv('money_information_verify1.csv', encoding = 'gbk'),how='left',on='ID')
valid_data = pd.merge(valid_data,pd.read_csv('year_report_verify1.csv', encoding = 'gbk'),how='left',on=['ID','year'])
del valid_data['控制人ID']

# 合并为一个数据集，137796*30
data = pd.concat([train_data,valid_data],axis=0,sort=False,ignore_index=True)
data.columns = ['ID','Registration Time', 'Registered Capital', 'Industry', 'Region', 'Business Type', 'Controlling Type', 'Controlling Shareholding','flag', 'Patent', 'Trademark', 'Copyright', 'year', 'Debt financing line', 'Debt financing cost', 'Equity financing line', 'Equity financing cost', 'Internal financing and trade financing line', 'Internal financing and trade financing cost', 'Project financing and Policy financing quota ',' Project financing and policy financing costs', 'Number of employees','Total assets', 'Total liabilities',' Total operating income ',' Main operating income ',' Total profit ' , 'Net Profit', 'Total Taxes', 'Total Owners Equity']

# 去flag=null的值，现在data:106947*30
data = data[~data['flag'].isnull()].reset_index(drop=True)
# 打乱数据
# data = data.sample(frac=1).reset_index(drop=True)
# 去重
data = data.drop_duplicates(subset=None,keep='first',inplace=False)
# 所有nan值换0,不要使用，降低效果
# data = data.fillna(0)

#labelencode
#['Industry', 'Region', 'Business Type', 'Controlling Type']
#['行业','区域','企业类型','控制人类型']
for col in ['Industry', 'Region', 'Business Type', 'Controlling Type']:
    lbl = LabelEncoder()
    data[col] = lbl.fit_transform(data[col].astype(str))

In [8]:
# 特征工程 ——自身特征
data['Total Taxes Square'] = data['Total Taxes'] **2
#data['log Total Taxes'] = np.log(data['Total Taxes'])
#data['log Net Profit'] = np.log(data['Net Profit'])
                                
# qcut等频分箱，cut为等距分箱
cats = pd.qcut(data['Total Taxes'], 6, duplicates = 'drop')
data['Total Taxes qcut_branch6'] = lbl.fit_transform(cats.astype(str))
cats = pd.qcut(data['Net Profit'], 8, duplicates = 'drop')
data['Net Profit qcut_branch8'] = lbl.fit_transform(cats.astype(str))

#特征工程2
datag = data.groupby('ID')
lst = list(set(data['ID']))
Total_Taxes_incre = []         
Total_Taxes_incre_mean = []
Net_profit_has_0 = []
Total_Taxes_has_0 = []
for id in lst:
    group = datag.get_group(id)
    taxes = list(group['Total Taxes'])
    nets = list(group['Net Profit'])
    
    # 判断三年利润为0/有1~2年为0/都存在值
    if sum(nets)==0:  
        Net_profit_has_0.extend([0]*len(nets))
    elif(0 in nets):
        Net_profit_has_0.extend([1]*len(nets)) 
    else:
        Net_profit_has_0.extend([2]*len(nets)) 
    
    # 判断三年税收为0/有1~2年为0/都存在值
    if sum(taxes)==0:
        Total_Taxes_has_0.extend([0]*len(taxes))
    elif(0 in taxes):
        Total_Taxes_has_0.extend([1]*len(taxes)) 
    else:
        Total_Taxes_has_0.extend([2]*len(taxes)) 
    
    # 计算总税收增长率
    if 0 in taxes or len(taxes)<3:
        Total_Taxes_incre_mean.extend([0]*len(taxes))
    else:
        tax_mean = (taxes[1]/taxes[0] + taxes[2]/taxes[1] )/2
        Total_Taxes_incre_mean.extend([tax_mean]*len(taxes))
        
data['Net_profit_has_0'] = Net_profit_has_0
data['Total_Taxes_has_0'] = Total_Taxes_has_0  
data['Total_Taxes_incre_mean'] = Total_Taxes_incre_mean

# 特征工程3
data['asset < liabilities'] = data['Total assets'] < data['Total liabilities']#资不抵债
data['year*Net Profit'] = data['year'] * data['Net Profit']#搜索特征
data['Industry*Total Taxes'] = data['Industry'] * data['Total Taxes'] # 搜索特征
data['Asset liability ratio'] = data['Total assets']/data['Total liabilities'] #资产负债比率

# 聚类
data1 = data.fillna(0)
data1 = data1.replace([np.inf, -np.inf],0) #替换正负inf为NA
del data1['ID']  #ID和Flag对聚类影响极大，需要先行删除
del data1['flag']
estimator = KMeans(n_clusters=6, n_jobs = 8) #构造聚类器
estimator.fit(data1)
label_pred = estimator.labels_#最终聚类类别  

data['Kmeans_label'] = label_pred  # 聚类结果作为一维特征



In [9]:
data

Unnamed: 0,ID,Registration Time,Registered Capital,Industry,Region,Business Type,Controlling Type,Controlling Shareholding,flag,Patent,Trademark,Copyright,year,Debt financing line,Debt financing cost,Equity financing line,Equity financing cost,Internal financing and trade financing line,Internal financing and trade financing cost,Project financing and Policy financing quota,Project financing and policy financing costs,Number of employees,Total assets,Total liabilities,Total operating income,Main operating income,Total profit,Net Profit,Total Taxes,Total Owners Equity,Total Taxes Square,Total Taxes qcut_branch6,Net Profit qcut_branch8,Net_profit_has_0,Total_Taxes_has_0,Total_Taxes_incre_mean,asset < liabilities,year*Net Profit,Industry*Total Taxes,Asset liability ratio,Kmeans_label
0,28,2007.0,2050.0,1,7,1,1,,1.0,0.0,1.0,1.0,2015.0,0.0,0.00,0.0,0.000,21648.0,1298.880,0.0,0.000,794.0,16400.0,28700.0,72160.0,28864.0,7216.0,-7216.0,0.0,-12300.0,0.000000e+00,0,0,2,0,0.0,True,-14540240.0,0.0,0.571429,0
1,28,2007.0,2050.0,1,7,1,1,,1.0,0.0,1.0,1.0,2016.0,0.0,0.00,34686.0,1387.440,0.0,0.000,0.0,0.000,396.0,73800.0,71750.0,346860.0,173430.0,173430.0,-34686.0,0.0,2050.0,0.000000e+00,0,2,2,0,0.0,False,-69926976.0,0.0,1.028571,0
2,28,2007.0,2050.0,1,7,1,1,,1.0,0.0,1.0,1.0,2017.0,0.0,0.00,3444.0,137.760,0.0,0.000,0.0,0.000,393.0,82000.0,159900.0,172200.0,103320.0,17220.0,-17220.0,0.0,-77900.0,0.000000e+00,0,2,2,0,0.0,True,-34732740.0,0.0,0.512821,0
3,230,2008.0,3360.0,4,2,1,1,1.00,1.0,0.0,0.0,0.0,2015.0,0.0,0.00,0.0,0.000,0.0,0.000,470.4,28.224,485.0,23520.0,10080.0,115248.0,57624.0,57624.0,-11524.8,0.0,13440.0,0.000000e+00,0,0,2,0,0.0,False,-23222472.0,0.0,2.333333,0
4,230,2008.0,3360.0,4,2,1,1,1.00,1.0,0.0,0.0,0.0,2016.0,0.0,0.00,0.0,0.000,46771.2,2806.272,0.0,0.000,365.0,53760.0,50400.0,155904.0,124723.2,46771.2,-15590.4,0.0,3360.0,0.000000e+00,0,0,2,0,0.0,False,-31430246.4,0.0,1.066667,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106942,5999998,2014.0,4510.0,4,6,4,2,0.64,0.0,0.0,0.0,0.0,2017.0,0.0,0.00,0.0,0.000,18265.5,1095.930,0.0,0.000,841.0,67650.0,63140.0,60885.0,30442.5,30442.5,-6088.5,0.0,4510.0,0.000000e+00,0,0,2,0,0.0,False,-12280504.5,0.0,1.071429,0
106943,5999999,2014.0,9130.0,1,7,4,2,0.80,0.0,1.0,1.0,1.0,2015.0,0.0,0.00,6025.8,241.032,0.0,0.000,0.0,0.000,885.0,27390.0,9130.0,60258.0,42180.6,30129.0,6025.8,24103.2,18260.0,5.809643e+08,1,6,2,0,0.0,False,12141987.0,24103.2,3.000000,0
106944,5999999,2014.0,9130.0,1,7,4,2,0.80,0.0,1.0,1.0,1.0,2016.0,7304.0,584.32,0.0,0.000,0.0,0.000,0.0,0.000,933.0,73040.0,95865.0,51128.0,25564.0,5112.8,0.0,10225.6,-22825.0,1.045629e+08,1,3,2,0,0.0,True,0.0,10225.6,0.761905,0
106945,5999999,2014.0,9130.0,1,7,4,2,0.80,0.0,1.0,1.0,1.0,2017.0,0.0,0.00,0.0,0.000,0.0,0.000,821.7,49.302,46.0,82170.0,73040.0,16434.0,11503.8,8217.0,6573.6,9860.4,9130.0,9.722749e+07,1,6,2,0,0.0,False,13258951.2,9860.4,1.125000,0


In [10]:
# 获取特征名
feature_name = [col for col in data.columns if col not in ['ID','flag']]
print(feature_name)

['Registration Time', 'Registered Capital', 'Industry', 'Region', 'Business Type', 'Controlling Type', 'Controlling Shareholding', 'Patent', 'Trademark', 'Copyright', 'year', 'Debt financing line', 'Debt financing cost', 'Equity financing line', 'Equity financing cost', 'Internal financing and trade financing line', 'Internal financing and trade financing cost', 'Project financing and Policy financing quota ', ' Project financing and policy financing costs', 'Number of employees', 'Total assets', 'Total liabilities', ' Total operating income ', ' Main operating income ', ' Total profit ', 'Net Profit', 'Total Taxes', 'Total Owners Equity', 'Total Taxes Square', 'Total Taxes qcut_branch6', 'Net Profit qcut_branch8', 'Net_profit_has_0', 'Total_Taxes_has_0', 'Total_Taxes_incre_mean', 'asset < liabilities', 'year*Net Profit', 'Industry*Total Taxes', 'Asset liability ratio', 'Kmeans_label']


In [12]:
x_train, x_test, y_train, y_test = train_test_split(data[feature_name],data['flag'], random_state=6666, train_size=0.8)

In [19]:
# 加入ID的过拟合

# Logistic回归
lr = LogisticRegression(penalty='l1', solver='liblinear')
lr.fit(x_train.replace([np.inf, -np.inf], np.nan).fillna(-1), y_train.ravel())
y_hat = lr.predict(x_test.replace([np.inf, -np.inf], np.nan).fillna(-1))
print ("##############Logistic回归#############")
print('准确率',accuracy_score(y_test, y_hat))
print('精确率',precision_score(y_test, y_hat))
print('召回率',recall_score(y_test, y_hat))

#随机森林
rnd_clf = RandomForestClassifier(n_estimators=10, max_leaf_nodes=7, n_jobs=1)
rnd_clf.fit(x_train.replace([np.inf, -np.inf], np.nan).fillna(-1), y_train)
y_hat = rnd_clf.predict(x_test.replace([np.inf, -np.inf], np.nan).fillna(-1))
print("############随机森林 ###############")
print('准确率',accuracy_score(y_test, y_hat))
print('精确率',precision_score(y_test, y_hat))
print('召回率',recall_score(y_test, y_hat))

#bagging
tree = DecisionTreeClassifier(criterion='entropy', max_depth=None)
clf = BaggingClassifier(base_estimator=tree, n_estimators=6, max_samples=1.0, max_features=1.0, bootstrap=True,
                        bootstrap_features=False, n_jobs=1, random_state=1)
clf.fit(x_train.replace([np.inf, -np.inf], np.nan).fillna(-1), y_train)
y_hat = clf.predict(x_test.replace([np.inf, -np.inf], np.nan).fillna(-1))
print("##############bagging#############")
print('准确率',accuracy_score(y_test, y_hat))
print('精确率',precision_score(y_test, y_hat))
print('召回率',recall_score(y_test, y_hat))

#adaboost
ada_real = AdaBoostClassifier(base_estimator=tree, learning_rate=0.5,
                              n_estimators=6, algorithm='SAMME.R')  # 相比于ada_discrete只改变了Algorithm参数
ada_real.fit(x_train.replace([np.inf, -np.inf], np.nan).fillna(-1), y_train)
y_hat = clf.predict(x_test.replace([np.inf, -np.inf], np.nan).fillna(-1))
print("##############adaboost#############")
print('准确率',accuracy_score(y_test, y_hat))
print('精确率',precision_score(y_test, y_hat))
print('召回率',recall_score(y_test, y_hat))

# 朴素贝叶斯 高斯算法
model = GaussianNB()
model.fit(x_train.replace([np.inf, -np.inf], np.nan).fillna(-1), y_train)
y_hat = model.predict(x_test.replace([np.inf, -np.inf], np.nan).fillna(-1))
print("##############朴素贝叶斯#############")
print('准确率',accuracy_score(y_test, y_hat))
print('精确率',precision_score(y_test, y_hat))
print('召回率',recall_score(y_test, y_hat))

# 神经网络
clf_class= MLPClassifier(solver='adam', alpha=1e-4,hidden_layer_sizes=(256,3), random_state=2)
clf_class.fit(x_train.replace([np.inf, -np.inf], np.nan).fillna(-1),y_train)
y_hat = clf_class.predict(x_test.replace([np.inf, -np.inf], np.nan).fillna(-1))
print("##############MLPClassifier#############")
print('准确率',accuracy_score(y_test, y_hat))
print('精确率',precision_score(y_test, y_hat))
print('召回率',recall_score(y_test, y_hat))

##############Logistic回归#############
准确率 0.9996259934548855
精确率 1.0
召回率 0.9990446620492
############随机森林 ###############
准确率 1.0
精确率 1.0
召回率 1.0
##############bagging#############
准确率 1.0
精确率 1.0
召回率 1.0
##############adaboost#############
准确率 1.0
精确率 1.0
召回率 1.0
##############朴素贝叶斯#############
准确率 0.8456287985039738
精确率 0.7179443107597112
召回率 0.9976116551229998
##############MLPClassifier#############
准确率 0.8658719027582983
精确率 0.7448189984879481
召回率 1.0


In [63]:
df = pd.DataFrame(columns = ['准确率', '精确度', '召回率'])
df.loc['Lightgbm'] = [0.898323,0.798458,0.991309]
print ("##############lightgbm#############")
print('准确率',0.898323)
print('精确率',0.798458)
print('召回率',0.991309)

# Logistic回归
lr = LogisticRegression(penalty='l1', solver='liblinear')
lr.fit(x_train.replace([np.inf, -np.inf], np.nan).fillna(-1), y_train.ravel())
y_hat = lr.predict(x_test.replace([np.inf, -np.inf], np.nan).fillna(-1))
print ("##############Logistic回归#############")
print('准确率',accuracy_score(y_test, y_hat), '比较提升', (accuracy - accuracy_score(y_test, y_hat))/accuracy_score(y_test, y_hat)*100,'%')
print('精确率',precision_score(y_test, y_hat), '比较提升', (precision - precision_score(y_test, y_hat))/precision_score(y_test, y_hat)*100,'%')
print('召回率',recall_score(y_test, y_hat), '比较提升', (recall - recall_score(y_test, y_hat))/recall_score(y_test, y_hat)*100,'%')
df.loc['LogisticRegression'] = [accuracy_score(y_test, y_hat), precision_score(y_test, y_hat), recall_score(y_test, y_hat)]

#随机森林
rnd_clf = RandomForestClassifier(n_estimators=10, max_leaf_nodes=7, n_jobs=1)
rnd_clf.fit(x_train.replace([np.inf, -np.inf], np.nan).fillna(-1), y_train)
y_hat = rnd_clf.predict(x_test.replace([np.inf, -np.inf], np.nan).fillna(-1))
print("############随机森林 ###############")
print('准确率',accuracy_score(y_test, y_hat), '比较提升', (accuracy - accuracy_score(y_test, y_hat))/accuracy_score(y_test, y_hat)*100,'%')
print('精确率',precision_score(y_test, y_hat), '比较提升', (precision - precision_score(y_test, y_hat))/precision_score(y_test, y_hat)*100,'%')
print('召回率',recall_score(y_test, y_hat), '比较提升', (recall - recall_score(y_test, y_hat))/recall_score(y_test, y_hat)*100,'%')
df.loc['RandomForest'] = [accuracy_score(y_test, y_hat), precision_score(y_test, y_hat), recall_score(y_test, y_hat)]

#bagging
tree = DecisionTreeClassifier(criterion='entropy', max_depth=None)
clf = BaggingClassifier(base_estimator=tree, n_estimators=6, max_samples=1.0, max_features=1.0, bootstrap=True,
                        bootstrap_features=False, n_jobs=1, random_state=1)
clf.fit(x_train.replace([np.inf, -np.inf], np.nan).fillna(-1), y_train)
y_hat = clf.predict(x_test.replace([np.inf, -np.inf], np.nan).fillna(-1))
print("##############bagging#############")
print('准确率',accuracy_score(y_test, y_hat), '比较提升', (accuracy - accuracy_score(y_test, y_hat))/accuracy_score(y_test, y_hat)*100,'%')
print('精确率',precision_score(y_test, y_hat), '比较提升', (precision - precision_score(y_test, y_hat))/precision_score(y_test, y_hat)*100,'%')
print('召回率',recall_score(y_test, y_hat), '比较提升', (recall - recall_score(y_test, y_hat))/recall_score(y_test, y_hat)*100,'%')
df.loc['Bagging'] = [accuracy_score(y_test, y_hat), precision_score(y_test, y_hat), recall_score(y_test, y_hat)]

#adaboost
ada_real = AdaBoostClassifier(base_estimator=tree, learning_rate=0.5,
                              n_estimators=6, algorithm='SAMME.R')  # 相比于ada_discrete只改变了Algorithm参数
ada_real.fit(x_train.replace([np.inf, -np.inf], np.nan).fillna(-1), y_train)
y_hat = clf.predict(x_test.replace([np.inf, -np.inf], np.nan).fillna(-1))
print("##############adaboost#############")
print('准确率',accuracy_score(y_test, y_hat), '比较提升', (accuracy - accuracy_score(y_test, y_hat))/accuracy_score(y_test, y_hat)*100,'%')
print('精确率',precision_score(y_test, y_hat), '比较提升', (precision - precision_score(y_test, y_hat))/precision_score(y_test, y_hat)*100,'%')
print('召回率',recall_score(y_test, y_hat), '比较提升', (recall - recall_score(y_test, y_hat))/recall_score(y_test, y_hat)*100,'%')
df.loc['Adaboost'] = [accuracy_score(y_test, y_hat), precision_score(y_test, y_hat), recall_score(y_test, y_hat)]

# 朴素贝叶斯 高斯算法
model = GaussianNB()
model.fit(x_train.replace([np.inf, -np.inf], np.nan).fillna(-1), y_train)
y_hat = model.predict(x_test.replace([np.inf, -np.inf], np.nan).fillna(-1))
print("##############朴素贝叶斯#############")
print('准确率',accuracy_score(y_test, y_hat), '比较提升', (accuracy - accuracy_score(y_test, y_hat))/accuracy_score(y_test, y_hat)*100,'%')
print('精确率',precision_score(y_test, y_hat), '比较提升', (precision - precision_score(y_test, y_hat))/precision_score(y_test, y_hat)*100,'%')
print('召回率',recall_score(y_test, y_hat), '比较提升', (recall - recall_score(y_test, y_hat))/recall_score(y_test, y_hat)*100,'%')
df.loc['Bayes'] = [accuracy_score(y_test, y_hat), precision_score(y_test, y_hat), recall_score(y_test, y_hat)]

# 神经网络
clf_class= MLPClassifier(solver='adam', alpha=1e-4,hidden_layer_sizes=(128,3), random_state=2)
clf_class.fit(x_train.replace([np.inf, -np.inf], np.nan).fillna(-1),y_train)
y_hat = clf_class.predict(x_test.replace([np.inf, -np.inf], np.nan).fillna(-1))
print("##############MLPClassifier#############")
print('准确率',accuracy_score(y_test, y_hat), '比较提升', (accuracy - accuracy_score(y_test, y_hat))/accuracy_score(y_test, y_hat)*100,'%')
print('精确率',precision_score(y_test, y_hat), '比较提升', (precision - precision_score(y_test, y_hat))/precision_score(y_test, y_hat)*100,'%')
print('召回率',recall_score(y_test, y_hat), '比较提升', (recall - recall_score(y_test, y_hat))/recall_score(y_test, y_hat)*100,'%')
df.loc['MLPClassifier'] = [accuracy_score(y_test, y_hat), precision_score(y_test, y_hat), recall_score(y_test, y_hat)]

##############lightgbm#############
准确率 0.898323
精确率 0.798458
召回率 0.991309
##############Logistic回归#############
准确率 0.878634876110332 比较提升 2.240762849845688 %
精确率 0.7897693079237713 比较提升 1.100155702311405 %
召回率 0.9402913780749941 比较提升 5.425724739649473 %
############随机森林 ###############
准确率 0.8938756428237494 比较提升 0.4975364539748921 %
精确率 0.7913325696830852 比较提升 0.9004343546441449 %
召回率 0.989968951516599 比较提升 0.1353626779252076 %
##############bagging#############
准确率 0.8689107059373539 比较提升 3.384961637791889 %
精确率 0.8196740128558311 比较提升 -2.5883476263828644 %
召回率 0.8527585383329352 比较提升 16.247326228819496 %
##############adaboost#############
准确率 0.8689107059373539 比较提升 3.384961637791889 %
精确率 0.8196740128558311 比较提升 -2.5883476263828644 %
召回率 0.8527585383329352 比较提升 16.247326228819496 %
##############朴素贝叶斯#############
准确率 0.8450210378681627 比较提升 6.3077674688796606 %
精确率 0.7172550030060981 比较提升 11.32135665189797 %
召回率 0.9972534033914497 比较提升 -0.5960775236498645 %
##############MLPCla

In [None]:
clf = svm.SVC(kernel = 'linear')#实例化
clf.fit(x_train.replace([np.inf, -np.inf], np.nan).fillna(-1),y_train)#拟合
y_hat = clf.predict(x_test.replace([np.inf, -np.inf], np.nan).fillna(-1))
print("##############SVM#############")
print('准确率',accuracy_score(y_test, y_hat))
print('精确率',precision_score(y_test, y_hat))
print('召回率',recall_score(y_test, y_hat))