# 导入工具包

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import lightgbm as lgb

# 读取数据

In [2]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
results = pd.read_csv("../data/results.csv")

# 查看训练数据

In [3]:
train.head()

Unnamed: 0,id,level_1,level_2,level_3,level_4,content,label
0,0,工业/危化品类（现场）—2016版,（二）电气安全,6、移动用电产品、电动工具及照明,1、移动使用的用电产品和I类电动工具的绝缘线，必须采用三芯(单相)或四芯(三相)多股铜芯橡套软线。,"使用移动手动电动工具,外接线绝缘皮破损,应停止使用.",0
1,1,工业/危化品类（现场）—2016版,（一）消防检查,1、防火巡查,3、消防设施、器材和消防安全标志是否在位、完整；,一般,1
2,2,工业/危化品类（现场）—2016版,（一）消防检查,2、防火检查,6、重点工种人员以及其他员工消防知识的掌握情况；,消防知识要加强,0
3,3,工业/危化品类（现场）—2016版,（一）消防检查,1、防火巡查,3、消防设施、器材和消防安全标志是否在位、完整；,消防通道有货物摆放 清理不及时,0
4,4,工业/危化品类（现场）—2016版,（一）消防检查,1、防火巡查,4、常闭式防火门是否处于关闭状态，防火卷帘下是否堆放物品影响使用；,防火门打开状态,0


# 查看训练数据条数

In [4]:
train.shape

(12000, 7)

# 查看测试数据

In [5]:
test.head()

Unnamed: 0,id,level_1,level_2,level_3,level_4,content
0,0,交通运输类（现场）—2016版,（一）消防安全,2、防火检查,2、安全疏散通道、疏散指示标志、应急照明和安全出口情况。,RB1洗地机占用堵塞安全通道
1,1,工业/危化品类（选项）—2016版,（二）仓库,1、一般要求,1、库房内储存物品应分类、分堆、限额存放。,未分类堆放
2,2,工业/危化品类（现场）—2016版,（一）消防检查,1、防火巡查,3、消防设施、器材和消防安全标志是否在位、完整；,消防设施、器材和消防安全标志是否在位、完整
3,3,商贸服务教文卫类（现场）—2016版,（二）电气安全,3、电气线路及电源插头插座,3、电源插座、电源插头应按规定正确接线。,插座随意放在电器旁边
4,4,商贸服务教文卫类（现场）—2016版,（一）消防检查,1、防火巡查,6、其他消防安全情况。,检查中发现一瓶灭火器过期


In [6]:
a = set(train['level_1'])
a,len(a)

({'三小场所（基础）—2016版',
  '三小场所（现场）—2016版',
  '交通运输类（基础）—2016版',
  '交通运输类（现场）—2016版',
  '交通运输类（选项）—2016版',
  '商贸服务教文卫类（基础）—2016版',
  '商贸服务教文卫类（现场）—2016版',
  '商贸服务教文卫类（选项）—2016版',
  '工业/危化品 （现场）',
  '工业/危化品类（基础）—2016版',
  '工业/危化品类（现场）—2016版',
  '工业/危化品类（选项）—2016版',
  '建筑施工类（基础）—2016版',
  '建筑施工类（现场）—2016版',
  '建筑施工类（选项）—2016版',
  '纯办公场所（基础）—2016版',
  '纯办公场所（现场）—2016版'},
 17)

In [7]:
train['level_1'].map(lambda x:dict(zip(a,range(len(a))))[x])

0         4
1         4
2         4
3         4
4         4
         ..
11995     3
11996     4
11997     4
11998     4
11999    15
Name: level_1, Length: 12000, dtype: int64

# 训练集去重

In [8]:
train = train.drop_duplicates(['level_1', 'level_2', 'level_3', 'level_4', 'content', 'label'])

# 查看去重训练数据条数

In [9]:
train.shape

(10619, 7)

In [10]:
# !pip install textvec

# 构建特征

In [11]:
# train['text'] = (train['level_1'] + train['level_2'] + train['level_3'] + train['level_4'] + train['content']
#               ).map(lambda x:' '.join(list(str(x))))
# test['text'] = (test['level_1'] + test['level_2'] + test['level_3'] + test['level_4'] + test['content']
#              ).map(lambda x:' '.join(list(str(x))))

In [12]:
train['text'] = ( train['content']
              ).map(lambda x:' '.join(list(str(x))))
test['text'] = (test['content']
             ).map(lambda x:' '.join(list(str(x))))

In [13]:
vectorizer = TfidfVectorizer(analyzer='char')

In [14]:
train_X = vectorizer.fit_transform(train['text']).toarray()
test_X = vectorizer.transform(test['text']).toarray()

In [15]:
train_X

array([[0.        , 0.62734117, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.12762877, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.4413386 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.80122334, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.68095895, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.63086452, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [16]:
train_y = train['label'].astype(int).values

In [17]:
vectorizer.get_feature_names()[-10:]

['！', '（', '）', '，', '．', '：', '；', 'ａ', 'ｂ', '🔌']

In [18]:
train_X.shape

(10619, 1684)

In [19]:
test_X.shape

(18000, 1684)

# 五折交叉验证

In [20]:
params = {
        'task':'train',
        'boosting_type':'gbdt',
        'num_leaves': 31,
        'objective': 'binary', 
        'learning_rate': 0.05, 
        'bagging_freq': 2, 
        'max_bin':256,
        'num_threads': 32,
#         'metric':['binary_logloss','binary_error']
    } 

In [21]:
skf = StratifiedKFold(n_splits=5)

In [22]:
for index,(train_index, test_index) in enumerate(skf.split(train_X, train_y)):
    X_train, X_test = train_X[train_index], train_X[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    
    gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=lgb_eval,
                early_stopping_rounds=10)
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    pred = gbm.predict(test_X, num_iteration=gbm.best_iteration)
    if index == 0:
        pred_y_check, true_y_check = list(y_pred), list(y_test)
        pred_out=pred
    else:
        pred_y_check += list(y_pred)
        true_y_check += list(y_test)
        pred_out += pred
        

[LightGBM] [Info] Number of positive: 611, number of negative: 7884
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 30417
[LightGBM] [Info] Number of data points in the train set: 8495, number of used features: 531
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.071925 -> initscore=-2.557494
[LightGBM] [Info] Start training from score -2.557494
[1]	valid_0's binary_logloss: 0.236181
Training until validation scores don't improve for 10 rounds
[2]	valid_0's binary_logloss: 0.221872
[3]	valid_0's binary_logloss: 0.211844
[4]	valid_0's binary_logloss: 0.203738
[5]	valid_0's binary_logloss: 0.196271
[6]	valid_0's binary_logloss: 0.190022
[7]	valid_0's binary_logloss: 0.185008
[8]	valid_0's binary_logloss: 0.180307
[9]	valid_0's binary_logloss: 0.176008
[10]	valid_0's binary_logloss: 0.171944
[11]	valid_0's binary_logloss: 0.168245
[12]	valid_0's binary_logloss: 0.164442
[13]	valid_0's b

[11]	valid_0's binary_logloss: 0.180909
[12]	valid_0's binary_logloss: 0.177564
[13]	valid_0's binary_logloss: 0.174423
[14]	valid_0's binary_logloss: 0.169853
[15]	valid_0's binary_logloss: 0.166089
[16]	valid_0's binary_logloss: 0.161656
[17]	valid_0's binary_logloss: 0.15706
[18]	valid_0's binary_logloss: 0.153015
[19]	valid_0's binary_logloss: 0.149203
[20]	valid_0's binary_logloss: 0.14548
[21]	valid_0's binary_logloss: 0.142202
[22]	valid_0's binary_logloss: 0.139359
[23]	valid_0's binary_logloss: 0.136612
[24]	valid_0's binary_logloss: 0.133259
[25]	valid_0's binary_logloss: 0.131222
[26]	valid_0's binary_logloss: 0.128579
[27]	valid_0's binary_logloss: 0.126602
[28]	valid_0's binary_logloss: 0.124407
[29]	valid_0's binary_logloss: 0.122789
[30]	valid_0's binary_logloss: 0.121064
[31]	valid_0's binary_logloss: 0.119105
[32]	valid_0's binary_logloss: 0.117109
[33]	valid_0's binary_logloss: 0.115789
[34]	valid_0's binary_logloss: 0.11436
[35]	valid_0's binary_logloss: 0.113143
[36

[22]	valid_0's binary_logloss: 0.138691
[23]	valid_0's binary_logloss: 0.13632
[24]	valid_0's binary_logloss: 0.133986
[25]	valid_0's binary_logloss: 0.131447
[26]	valid_0's binary_logloss: 0.129099
[27]	valid_0's binary_logloss: 0.127214
[28]	valid_0's binary_logloss: 0.124789
[29]	valid_0's binary_logloss: 0.123144
[30]	valid_0's binary_logloss: 0.121376
[31]	valid_0's binary_logloss: 0.119423
[32]	valid_0's binary_logloss: 0.117523
[33]	valid_0's binary_logloss: 0.116179
[34]	valid_0's binary_logloss: 0.114579
[35]	valid_0's binary_logloss: 0.113351
[36]	valid_0's binary_logloss: 0.112021
[37]	valid_0's binary_logloss: 0.11081
[38]	valid_0's binary_logloss: 0.109485
[39]	valid_0's binary_logloss: 0.108587
[40]	valid_0's binary_logloss: 0.107707
[41]	valid_0's binary_logloss: 0.106591
[42]	valid_0's binary_logloss: 0.105493
[43]	valid_0's binary_logloss: 0.104254
[44]	valid_0's binary_logloss: 0.102992
[45]	valid_0's binary_logloss: 0.101514
[46]	valid_0's binary_logloss: 0.100475
[4

[55]	valid_0's binary_logloss: 0.0872165
[56]	valid_0's binary_logloss: 0.086467
[57]	valid_0's binary_logloss: 0.0859334
[58]	valid_0's binary_logloss: 0.0852024
[59]	valid_0's binary_logloss: 0.0847102
[60]	valid_0's binary_logloss: 0.0839278
[61]	valid_0's binary_logloss: 0.0834364
[62]	valid_0's binary_logloss: 0.082924
[63]	valid_0's binary_logloss: 0.0826656
[64]	valid_0's binary_logloss: 0.0824756
[65]	valid_0's binary_logloss: 0.0820546
[66]	valid_0's binary_logloss: 0.081543
[67]	valid_0's binary_logloss: 0.0810301
[68]	valid_0's binary_logloss: 0.0806769
[69]	valid_0's binary_logloss: 0.0806002
[70]	valid_0's binary_logloss: 0.080272
[71]	valid_0's binary_logloss: 0.0800765
[72]	valid_0's binary_logloss: 0.0799577
[73]	valid_0's binary_logloss: 0.079543
[74]	valid_0's binary_logloss: 0.0794337
[75]	valid_0's binary_logloss: 0.079327
[76]	valid_0's binary_logloss: 0.0789351
[77]	valid_0's binary_logloss: 0.0787511
[78]	valid_0's binary_logloss: 0.0783336
[79]	valid_0's binary_

[105]	valid_0's binary_logloss: 0.0780506
[106]	valid_0's binary_logloss: 0.0778654
[107]	valid_0's binary_logloss: 0.0778446
[108]	valid_0's binary_logloss: 0.0778976
[109]	valid_0's binary_logloss: 0.0778792
[110]	valid_0's binary_logloss: 0.0778107
[111]	valid_0's binary_logloss: 0.0778356
[112]	valid_0's binary_logloss: 0.0777782
[113]	valid_0's binary_logloss: 0.0778799
[114]	valid_0's binary_logloss: 0.0779355
[115]	valid_0's binary_logloss: 0.0778015
[116]	valid_0's binary_logloss: 0.0778149
[117]	valid_0's binary_logloss: 0.0777763
[118]	valid_0's binary_logloss: 0.0776748
[119]	valid_0's binary_logloss: 0.0775547
[120]	valid_0's binary_logloss: 0.0774288
[121]	valid_0's binary_logloss: 0.0772332
[122]	valid_0's binary_logloss: 0.0771438
[123]	valid_0's binary_logloss: 0.0770367
[124]	valid_0's binary_logloss: 0.076909
[125]	valid_0's binary_logloss: 0.0768653
[126]	valid_0's binary_logloss: 0.0766346
[127]	valid_0's binary_logloss: 0.0765829
[128]	valid_0's binary_logloss: 0.0

# 线下验证

In [23]:
for i in range(10):
    pred = [int(x) for x in np.where(np.array(pred_y_check) >= i/10.0,1,0)]
    scores = f1_score(true_y_check,pred)
    print(i, scores)

0 0.13423526311165773
1 0.7694915254237288
2 0.7989886219974716
3 0.8128772635814889
4 0.8114525139664804
5 0.7932797662527392
6 0.7805243445692884
7 0.7602792862684251
8 0.7135842880523732
9 0.6159355416293644


In [24]:
for i in range(10):
    pred = [int(x) for x in np.where(np.array(pred_y_check) >= i/10.0,1,0)]
    scores = f1_score(true_y_check,pred)
    print(i/10.0, scores)

0.0 0.13423526311165773
0.1 0.7694915254237288
0.2 0.7989886219974716
0.3 0.8128772635814889
0.4 0.8114525139664804
0.5 0.7932797662527392
0.6 0.7805243445692884
0.7 0.7602792862684251
0.8 0.7135842880523732
0.9 0.6159355416293644


In [25]:
pred_y_check

[0.0032345727761327616,
 0.9884558851957882,
 0.8104581945975711,
 0.0005730862931690259,
 0.03921585515500031,
 0.07008035357071751,
 0.9954434549443654,
 0.0007125613039335475,
 0.9957619646769831,
 0.0005030465697981762,
 0.0006981610000848268,
 0.0005448086213494328,
 0.06899294208695088,
 0.00046021057924613957,
 0.0008252584370914179,
 0.00042735897488710867,
 0.0010917208531155758,
 0.002637456620549911,
 0.006543256702594257,
 0.0012154519257235642,
 0.00045596414307771824,
 0.0013699487805427664,
 0.0057835684396571226,
 0.0009012502597672886,
 0.03380109774391483,
 0.018685107862311447,
 0.5531085931191551,
 0.0014206987649586322,
 0.9773905802537705,
 0.007559113146079031,
 0.00042515859045689824,
 0.0010662321702193606,
 0.9929531080430652,
 0.0005558161885551026,
 0.08165493309672987,
 0.0006899774024536996,
 0.0002744959252014447,
 0.00024006646971945302,
 0.004678873184845266,
 0.3815409100070065,
 0.005047915550335809,
 0.9658926342208741,
 0.0003860178445033244,
 0.000

In [26]:
train['content'].map(lambda x:len(list(str(x))))

0        26
1         2
2         7
3        15
4         7
         ..
11993    19
11995    15
11996    62
11997    16
11998    17
Name: content, Length: 10619, dtype: int64

In [27]:
a = set(train['level_1'])

In [28]:
zip(a, range(len(a)))

<zip at 0x7fb00845ed40>

In [29]:
# 类别 cat 特征
train['level_1'].map(lambda x: dict(zip(a, range(len(a))))[x])

0        4
1        4
2        4
3        4
4        4
        ..
11993    5
11995    3
11996    4
11997    4
11998    4
Name: level_1, Length: 10619, dtype: int64