In [1]:
# 一些常规特征
import pandas as pd
from tqdm.autonotebook import *
from bs4 import BeautifulSoup
import re

tqdm.pandas()

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

data = pd.concat([train, test], axis=0, sort=False).reset_index(drop=True)
data = data.fillna(-1)

  This is separate from the ipykernel package so we can avoid doing imports until


In [2]:
def salary_range_min(row):#得到最低工资
    try:
        result = int(str(row['salary_range']).split('-')[0])
    except Exception:
        result = -1
    return result

def salary_range_max(row):#得到最高工资
    try:
        result = int(str(row['salary_range']).split('-')[1])
    except Exception:
        result = -1
    return result

In [3]:
normal_feature = pd.DataFrame()#存储特征
normal_feature['salary_min'] = data.progress_apply(lambda row:salary_range_min(row), axis=1)
normal_feature['salary_max'] = data.progress_apply(lambda row:salary_range_max(row), axis=1)
normal_feature['salary_median'] = (normal_feature['salary_max'] + normal_feature['salary_min'])/2#工资的中间值
normal_feature['salary_range'] = normal_feature['salary_max'] - normal_feature['salary_min']#工资的最高与最低的差值
normal_feature['telecommuting'] = list(data['telecommuting'])#是否远程办公
normal_feature['has_company_logo'] = list(data['has_company_logo'])#公司logo
normal_feature['has_questions'] = list(data['has_questions'])#
from sklearn.preprocessing import LabelEncoder#将值转化为联续的0-n-1的整数。
labelencoder = LabelEncoder()
normal_feature['employment_type'] = labelencoder.fit_transform(data['employment_type'].astype(str))
normal_feature['required_experience'] = labelencoder.fit_transform(data['required_experience'].astype(str))
normal_feature['required_education'] = labelencoder.fit_transform(data['required_education'].astype(str))
normal_feature['industry'] = labelencoder.fit_transform(data['industry'].astype(str))
normal_feature['function'] = labelencoder.fit_transform(data['function'].astype(str))

data['review'] = data.progress_apply(lambda row:str(row['title']) + ' ' + str(row['location']) + ' ' + str(row['company_profile']) + ' ' + 
                                   str(row['description']) + ' ' + str(row['department']) + ' ' + str(row['requirements']) + ' ' + str(row['benefits']), axis=1)

HBox(children=(IntProgress(value=0, max=17880), HTML(value='')))




HBox(children=(IntProgress(value=0, max=17880), HTML(value='')))




HBox(children=(IntProgress(value=0, max=17880), HTML(value='')))




In [4]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer#调用TF-IDF方法
import jieba#结巴分词
from tqdm import *
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

df_train = data[:len(train)]
df_test = data[len(train):]

df_train['label'] = df_train['fraudulent'].astype(int)
data = pd.concat([df_train, df_test], axis=0, sort=False)
data['review'] = data['review'].apply(lambda row:str(row))

############################ tf-idf ############################
print('开始计算tf-idf特征')
tf = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.95, use_idf=1, smooth_idf=1, sublinear_tf=1)
discuss_tf = tf.fit_transform(data['review']).tocsr()
print('计算结束')

############################ 切分数据集 ##########################
print('开始进行一些前期处理')
train_feature = discuss_tf[:len(df_train)]
score = df_train['label']
test_feature = discuss_tf[len(df_train):]
print('处理完毕')

######################### 模型函数(返回sklean_stacking结果) ########################1
def get_sklearn_classfiy_stacking(clf, train_feature, test_feature, score, model_name, class_number, n_folds, train_num, test_num):
    print('\n****开始跑', model_name, '****')
    stack_train = np.zeros((train_num, class_number))
    stack_test = np.zeros((test_num, class_number))
    score_mean = []
    skf = StratifiedKFold(n_splits=n_folds, random_state=1017)
    tqdm.desc = model_name
    for i, (tr, va) in enumerate(skf.split(train_feature, score)):
        clf.fit(train_feature[tr], score[tr])
        score_va = clf._predict_proba_lr(train_feature[va])
        score_te = clf._predict_proba_lr(test_feature)
        score_single = accuracy_score(score[va], clf.predict(train_feature[va]))
        score_mean.append(np.around(score_single, 5))
        stack_train[va] += score_va
        stack_test += score_te
    stack_test /= n_folds
    stack = np.vstack([stack_train, stack_test])
    df_stack = pd.DataFrame()
    df_stack['tfidf_' + model_name + '_classfiy_{}'.format(1)] = stack[:, 1]
    print(model_name, '处理完毕')
    return df_stack, score_mean

model_list = [
    #['LogisticRegression', LogisticRegression(random_state=1017, C=3)],
    #['SGDClassifier', SGDClassifier(random_state=1017, loss='log')],
    ['PassiveAggressiveClassifier', PassiveAggressiveClassifier(random_state=1017, C=2)],
    #['RidgeClassfiy', RidgeClassifier(random_state=1017)],
    #['LinearSVC', LinearSVC(random_state=1017)]
]

stack_feature = pd.DataFrame()
for i in model_list:
    stack_result, score_mean = get_sklearn_classfiy_stacking(i[1], train_feature, test_feature, score, i[0], 2, 10, len(df_train), len(df_test))
    stack_feature = pd.concat([stack_feature, stack_result], axis=1, sort=False)
    print('五折结果', score_mean)
    print('平均结果', np.mean(score_mean))
normal_feature = pd.concat([stack_feature, normal_feature], axis=1, sort=False)

开始计算tf-idf特征
计算结束
开始进行一些前期处理
处理完毕

****开始跑 PassiveAggressiveClassifier ****
PassiveAggressiveClassifier 处理完毕
五折结果 [0.99039, 0.99039, 0.99209, 0.98869, 0.98699, 0.98869, 0.99321, 0.99377, 0.98981, 0.99095]
平均结果 0.990498


In [5]:
import pandas as pd

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

train_feature = normal_feature[:len(train)]
test_feature = normal_feature[len(train):]

train_feature['label'] = train['fraudulent'].astype(int)
not_cols = ['label']
cols = [col for col in train_feature.columns if col not in not_cols]

In [6]:
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import KFold,StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import os
version = 'lgb_model_normal_feature'

def evaluate_5_fold(train_df, test_df, cols, test=False):
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1017)
    y_test = 0
    oof_train = np.zeros((train_df.shape[0], 1))
    mertics_front = []
    for i, (train_index, val_index) in enumerate(kf.split(train_df[cols],train_df.label)):
        X_train, y_train = train_df.loc[train_index, cols], train_df.label.values[train_index]
        X_val, y_val = train_df.loc[val_index, cols], train_df.label.values[val_index]

        lgb_train = lgb.Dataset(
            X_train, y_train)
        lgb_eval = lgb.Dataset(
            X_val, y_val,
            reference=lgb_train)
        params = {
            'boosting_type': 'gbdt',
            'learning_rate' : 0.01, 
            'verbose': 0,
#             'metrics':{'binary_error'},
#             'num_leaves':32,
            'objective':'binary',
#             'feature_fraction': 0.2,
#             'bagging_fraction':0.7 ,
            'seed': 1024,
            'nthread': 50,
        }
        gbm = lgb.train(params,
                        lgb_train,
                        valid_sets=lgb_eval,
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=50,
                        )
        y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        if test:
            y_test += gbm.predict(test_df[cols], num_iteration=gbm.best_iteration)
        oof_train[val_index] = np.array(y_pred).reshape(len(val_index), 1)
        mertics_front.append(accuracy_score(train_df.label.values[val_index], np.around(y_pred)))
    y_test/= 5
    feature_list = pd.DataFrame()
    feature_list['names'] = cols
    feature_list['imp'] = gbm.feature_importance()
    feature_list = feature_list.sort_values(by='imp', ascending=False)
    print(feature_list)
    print('5 Fold result:', mertics_front)
    print('mean result:', np.mean(mertics_front))
    gc.collect()
    return mertics_front, oof_train, y_test, feature_list
f_score, oof_train, y_test, imp = evaluate_5_fold(train_feature, test_feature, cols, True)

Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.07805
[100]	valid_0's binary_logloss: 0.0540237
[150]	valid_0's binary_logloss: 0.0419406
[200]	valid_0's binary_logloss: 0.0355778
[250]	valid_0's binary_logloss: 0.0322363
[300]	valid_0's binary_logloss: 0.0303885
[350]	valid_0's binary_logloss: 0.029018
[400]	valid_0's binary_logloss: 0.0283069
[450]	valid_0's binary_logloss: 0.0278642
[500]	valid_0's binary_logloss: 0.0276772
[550]	valid_0's binary_logloss: 0.027499
[600]	valid_0's binary_logloss: 0.0276659
[650]	valid_0's binary_logloss: 0.0275198
[700]	valid_0's binary_logloss: 0.0273712
[750]	valid_0's binary_logloss: 0.0273267
[800]	valid_0's binary_logloss: 0.0273567
[850]	valid_0's binary_logloss: 0.027406
[900]	valid_0's binary_logloss: 0.0274855
[950]	valid_0's binary_logloss: 0.0275765
Early stopping, best iteration is:
[753]	valid_0's binary_logloss: 0.027301
Training until validation scores don't improve for 200 rounds
[50]	val

In [9]:
test_change_label = y_test.copy()
print(test_change_label)
test_change_label[test_change_label>=0.04] = 1
test_change_label[test_change_label < 0.04] = 0
#test_change_label[test_change_label > 1] = 0
result = pd.DataFrame()
result['id'] = np.arange(0, len(y_test), 1)
result['result'] = np.around(test_change_label)
result['result'] = result['result'].astype(int)
result.to_csv('result/re2.csv', index=False, header=None)

[9.90713518e-01 9.90686176e-01 8.23804718e-01 3.28835094e-04
 3.27700025e-04 5.32593043e-02 9.88891829e-01 5.17078442e-04
 3.40937965e-04 9.88132639e-01 8.57137531e-01 3.28677832e-04
 3.28345605e-04 3.34130507e-04 3.39366499e-04 3.28526770e-04
 8.88258903e-02 9.91484732e-01 1.30211567e-03 9.91584143e-01
 9.82618251e-01 3.78410813e-04 9.91464963e-01 4.78638602e-04
 9.91580764e-01 4.89562966e-03 9.60460461e-01 9.91400940e-01
 4.78467210e-04 9.91511882e-01 9.81883620e-01 9.91584143e-01
 4.66325840e-04 3.27950076e-04 3.35550626e-04 6.94866113e-04
 9.91583340e-01 3.28531915e-04 7.79609738e-01 3.43190659e-04
 9.91484732e-01 1.30568406e-03 3.27976440e-04 3.28695515e-04
 9.91400940e-01 9.91484732e-01 9.91583633e-01 3.34710228e-01
 9.91147387e-01 3.30305151e-04 3.27815649e-04 3.27784316e-04
 8.17738268e-02 2.07836999e-03 1.06325927e-02 3.28615547e-04
 3.27737419e-04 9.91569502e-01 9.49570432e-01 9.90198308e-01
 9.70305732e-01 3.28531915e-04 3.52176829e-04 9.43871324e-01
 9.68819771e-01 3.499385

In [10]:
result.result.value_counts()

1    101
0     99
Name: result, dtype: int64