In [1]:
import os
for dirname, _, filenames in os.walk('../data/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../data/test.csv
../data/train.csv
../data/ml_submission.csv


In [2]:
# 一些常规特征
import pandas as pd
from tqdm.autonotebook import *
from bs4 import BeautifulSoup
import re

tqdm.pandas()

train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

data = pd.concat([train, test], axis=0, sort=False).reset_index(drop=True)
data = data.fillna(-1)

  This is separate from the ipykernel package so we can avoid doing imports until


In [3]:
def salary_range_min(row):
    try:
        result = int(str(row['salary_range']).split('-')[0])
    except Exception:
        result = -1
    return result

def salary_range_max(row):
    try:
        result = int(str(row['salary_range']).split('-')[1])
    except Exception:
        result = -1
    return result

In [4]:
normal_feature = pd.DataFrame()
normal_feature['salary_min'] = data.progress_apply(lambda row:salary_range_min(row), axis=1)
normal_feature['salary_max'] = data.progress_apply(lambda row:salary_range_max(row), axis=1)
normal_feature['salary_median'] = (normal_feature['salary_max'] + normal_feature['salary_min'])/2
normal_feature['salary_range'] = normal_feature['salary_max'] - normal_feature['salary_min']
normal_feature['telecommuting'] = list(data['telecommuting'])
normal_feature['has_company_logo'] = list(data['has_company_logo'])
normal_feature['has_questions'] = list(data['has_questions'])
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
normal_feature['employment_type'] = labelencoder.fit_transform(data['employment_type'].astype(str))
normal_feature['required_experience'] = labelencoder.fit_transform(data['required_experience'].astype(str))
normal_feature['required_education'] = labelencoder.fit_transform(data['required_education'].astype(str))
normal_feature['industry'] = labelencoder.fit_transform(data['industry'].astype(str))
normal_feature['function'] = labelencoder.fit_transform(data['function'].astype(str))

data['review'] = data.progress_apply(lambda row:str(row['title']) + ' ' + str(row['location']) + ' ' + str(row['company_profile']) + ' ' + 
                                   str(row['description']) + ' ' + str(row['department']) + ' ' + str(row['requirements']) + ' ' + str(row['benefits']), axis=1)

HBox(children=(FloatProgress(value=0.0, max=17880.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17880.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17880.0), HTML(value='')))




In [5]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
from tqdm import *
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score


df_train = data[:len(train)]
df_test = data[len(train):]

df_train['label'] = df_train['fraudulent'].astype(int)
data = pd.concat([df_train, df_test], axis=0, sort=False)
data['review'] = data['review'].apply(lambda row:str(row))

############################ tf-idf ############################
print('开始计算tf-idf特征')
tf = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1)
discuss_tf = tf.fit_transform(data['review']).tocsr()
print('计算结束')

############################ 切分数据集 ##########################
print('开始进行一些前期处理')
train_feature = discuss_tf[:len(df_train)]
score = df_train['label']
test_feature = discuss_tf[len(df_train):]
print('处理完毕')

######################### 模型函数(返回sklean_stacking结果) ########################
def get_sklearn_classfiy_stacking(clf, train_feature, test_feature, score, model_name, class_number, n_folds, train_num, test_num):
    print('\n****开始跑', model_name, '****')
    stack_train = np.zeros((train_num, class_number))
    stack_test = np.zeros((test_num, class_number))
    score_mean = []
    skf = StratifiedKFold(n_splits=n_folds, random_state=42)
    tqdm.desc = model_name
    for i, (tr, va) in enumerate(skf.split(train_feature, score)):
        clf.fit(train_feature[tr], score[tr])
        score_va = clf._predict_proba_lr(train_feature[va])
        score_te = clf._predict_proba_lr(test_feature)
        score_single = accuracy_score(score[va], clf.predict(train_feature[va]))
        score_mean.append(np.around(score_single, 5))
        stack_train[va] += score_va
        stack_test += score_te
    stack_test /= n_folds
    stack = np.vstack([stack_train, stack_test])
    df_stack = pd.DataFrame()
    df_stack['tfidf_' + model_name + '_classfiy_{}'.format(1)] = stack[:, 1]
    print(model_name, '处理完毕')
    return df_stack, score_mean

model_list = [
    ['LogisticRegression', LogisticRegression(random_state=42, C=3)],
    ['SGDClassifier', SGDClassifier(random_state=42, loss='perceptron')],
    ['PassiveAggressiveClassifier', PassiveAggressiveClassifier(random_state=42, C=2)],
    ['RidgeClassfiy', RidgeClassifier(random_state=42)],
    ['LinearSVC', LinearSVC(random_state=42)]
]

stack_feature = pd.DataFrame()
for i in model_list:
    stack_result, score_mean = get_sklearn_classfiy_stacking(i[1], train_feature, test_feature, score, i[0], 2, 5, len(df_train), len(df_test))
    stack_feature = pd.concat([stack_feature, stack_result], axis=1, sort=False)
    print('五折结果', score_mean)
    print('平均结果', np.mean(score_mean))
normal_feature = pd.concat([stack_feature, normal_feature], axis=1, sort=False)

开始计算tf-idf特征
计算结束
开始进行一些前期处理
处理完毕

****开始跑 LogisticRegression ****
LogisticRegression 处理完毕
五折结果 [0.97936, 0.98049, 0.97738, 0.98473, 0.97963]
平均结果 0.9803179999999999

****开始跑 SGDClassifier ****
SGDClassifier 处理完毕
五折结果 [0.98926, 0.99067, 0.98727, 0.99123, 0.98868]
平均结果 0.989422

****开始跑 PassiveAggressiveClassifier ****
PassiveAggressiveClassifier 处理完毕
五折结果 [0.98954, 0.98954, 0.98699, 0.99265, 0.9901]
平均结果 0.9897640000000001

****开始跑 RidgeClassfiy ****
RidgeClassfiy 处理完毕
五折结果 [0.98332, 0.98501, 0.98275, 0.98982, 0.98359]
平均结果 0.984898

****开始跑 LinearSVC ****
LinearSVC 处理完毕
五折结果 [0.98699, 0.98812, 0.98558, 0.99152, 0.9867]
平均结果 0.9877819999999999


In [6]:
import pandas as pd

train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')


train_feature = normal_feature[:len(train)]
test_feature = normal_feature[len(train):]

train_feature['label'] = train['fraudulent'].astype(int)
not_cols = ['label']
cols = [col for col in train_feature.columns if col not in not_cols]

In [7]:
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import KFold,StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import os
version = 'lgb_model_normal_feature'

def evaluate_5_fold(train_df, test_df, cols, test=False):
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1017)
    y_test = 0
    oof_train = np.zeros((train_df.shape[0], 1))
    mertics_front = []
    for i, (train_index, val_index) in enumerate(kf.split(train_df[cols],train_df.label)):
        X_train, y_train = train_df.loc[train_index, cols], train_df.label.values[train_index]
        X_val, y_val = train_df.loc[val_index, cols], train_df.label.values[val_index]

        lgb_train = lgb.Dataset(
            X_train, y_train)
        lgb_eval = lgb.Dataset(
            X_val, y_val,
            reference=lgb_train)
        params = {
            'boosting_type': 'gbdt',
            'learning_rate' : 0.003, 
            'verbose': 0,
            'objective':'binary',
            'seed': 1024,
            'nthread': 50,
            'subsample': 0.85,
            'colsample_bytree': 0.85,
            'reg_alpha': 0.3,
            'reg_lamdba': 0.243,
            'num_leaves': 512,
        }
        gbm = lgb.train(params,
                        lgb_train,
                        valid_sets=lgb_eval,
                        num_boost_round=10000,
                        early_stopping_rounds=500,
                        verbose_eval=500,
                        )
        y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        if test:
            y_test += gbm.predict(test_df[cols], num_iteration=gbm.best_iteration)
        oof_train[val_index] = np.array(y_pred).reshape(len(val_index), 1)
        mertics_front.append(accuracy_score(train_df.label.values[val_index], np.around(y_pred)))
    y_test/= 5
    feature_list = pd.DataFrame()
    feature_list['names'] = cols
    feature_list['imp'] = gbm.feature_importance()
    feature_list = feature_list.sort_values(by='imp', ascending=False)
    print(feature_list)
    print('5 Fold result:', mertics_front)
    print('mean result:', np.mean(mertics_front))
    gc.collect()
    return mertics_front, oof_train, y_test, feature_list
f_score, oof_train, y_test, imp = evaluate_5_fold(train_feature, test_feature, cols, True)

Training until validation scores don't improve for 500 rounds.
[500]	valid_0's binary_logloss: 0.0404973
[1000]	valid_0's binary_logloss: 0.0286302
[1500]	valid_0's binary_logloss: 0.0264742
[2000]	valid_0's binary_logloss: 0.0267933
Early stopping, best iteration is:
[1572]	valid_0's binary_logloss: 0.0263894
Training until validation scores don't improve for 500 rounds.
[500]	valid_0's binary_logloss: 0.0403522
[1000]	valid_0's binary_logloss: 0.0272066
[1500]	valid_0's binary_logloss: 0.0246409
[2000]	valid_0's binary_logloss: 0.0245345
Early stopping, best iteration is:
[1870]	valid_0's binary_logloss: 0.0244055
Training until validation scores don't improve for 500 rounds.
[500]	valid_0's binary_logloss: 0.0459013
[1000]	valid_0's binary_logloss: 0.0339759
[1500]	valid_0's binary_logloss: 0.0328021
Early stopping, best iteration is:
[1446]	valid_0's binary_logloss: 0.0327748
Training until validation scores don't improve for 500 rounds.
[500]	valid_0's binary_logloss: 0.0420373
[1

In [8]:
test_change_label = y_test.copy()
test_change_label[test_change_label >= 0.05] = 1
test_change_label[test_change_label < 0.05] = 0
result = pd.DataFrame()
result['id'] = np.arange(0, len(y_test), 1)
result['result'] = np.around(test_change_label)
result['result'] = result['result'].astype(int)
result.to_csv('lgb_baseline.csv', index=False, header=None)

In [9]:
result.result.value_counts()

0    104
1     96
Name: result, dtype: int64