In [1]:
# 一些常规特征
import pandas as pd
from tqdm.autonotebook import *
from bs4 import BeautifulSoup
import re

tqdm.pandas()

train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

data = pd.concat([train, test], axis=0, sort=False).reset_index(drop=True)
data = data.fillna(-1)

  This is separate from the ipykernel package so we can avoid doing imports until


In [2]:
def salary_range_min(row):
    try:
        result = int(str(row['salary_range']).split('-')[0])
    except Exception:
        result = -1
    return result

def salary_range_max(row):
    try:
        result = int(str(row['salary_range']).split('-')[1])
    except Exception:
        result = -1
    return result

In [3]:
normal_feature = pd.DataFrame()
normal_feature['salary_min'] = data.progress_apply(lambda row:salary_range_min(row), axis=1)
normal_feature['salary_max'] = data.progress_apply(lambda row:salary_range_max(row), axis=1)
normal_feature['salary_median'] = (normal_feature['salary_max'] + normal_feature['salary_min'])/2
normal_feature['salary_range'] = normal_feature['salary_max'] - normal_feature['salary_min']
normal_feature['telecommuting'] = list(data['telecommuting'])
normal_feature['has_company_logo'] = list(data['has_company_logo'])
normal_feature['has_questions'] = list(data['has_questions'])
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
normal_feature['employment_type'] = labelencoder.fit_transform(data['employment_type'].astype(str))
normal_feature['required_experience'] = labelencoder.fit_transform(data['required_experience'].astype(str))
normal_feature['required_education'] = labelencoder.fit_transform(data['required_education'].astype(str))
normal_feature['industry'] = labelencoder.fit_transform(data['industry'].astype(str))
normal_feature['function'] = labelencoder.fit_transform(data['function'].astype(str))

# 对所有字段计算value_counts
# def get_value_counts_feature(cols, data):
#     value_data = data.groupby(cols).size().reset_index()
#     value_data.columns = [cols, cols + '_count']
#     data = pd.merge(data, value_data, on=cols, how='left')
#     return data[cols + '_count']

# for i in data.columns:
#     if i == 'fraudulent':
#         pass
#     else:
#         normal_feature[i + '_count'] = get_value_counts_feature(i, data)

data['review'] = data.progress_apply(lambda row:str(row['title']) + ' ' + str(row['location']) + ' ' + str(row['company_profile']) + ' ' + 
                                   str(row['description']) + ' ' + str(row['department']) + ' ' + str(row['requirements']) + ' ' + str(row['benefits']), axis=1)

normal_feature.head(5)

HBox(children=(FloatProgress(value=0.0, max=17880.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17880.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17880.0), HTML(value='')))




Unnamed: 0,salary_min,salary_max,salary_median,salary_range,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function
0,-1,1,0.0,2,0,1,1,0,0,0,0,0
1,-1,1,0.0,2,0,1,1,2,6,0,23,13
2,0,130000,65000.0,130000,0,0,0,2,0,2,0,0
3,-1,1,0.0,2,0,1,0,2,1,2,23,25
4,-1,1,0.0,2,0,0,1,4,5,8,73,32


In [4]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
from tqdm import *
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

df_train = data[:len(train)]
df_test = data[len(train):]

df_train['label'] = df_train['fraudulent'].astype(int)
data = pd.concat([df_train, df_test], axis=0, sort=False)
data['review'] = data['review'].apply(lambda row:str(row))

############################ tf-idf ############################
print('开始计算tf-idf特征')
tf = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1)
discuss_tf = tf.fit_transform(data['review']).tocsr()
print('计算结束')

############################ 切分数据集 ##########################
print('开始进行一些前期处理')
train_feature = discuss_tf[:len(df_train)]
score = df_train['label']
test_feature = discuss_tf[len(df_train):]
print('处理完毕')

######################### 模型函数(返回sklean_stacking结果) ########################
def get_sklearn_classfiy_stacking(clf, train_feature, test_feature, score, model_name, class_number, n_folds, train_num, test_num):
    print('\n****开始跑', model_name, '****')
    stack_train = np.zeros((train_num, class_number))
    stack_test = np.zeros((test_num, class_number))
    score_mean = []
    skf = StratifiedKFold(n_splits=n_folds, random_state=1017)
    tqdm.desc = model_name
    for i, (tr, va) in enumerate(skf.split(train_feature, score)):
        clf.fit(train_feature[tr], score[tr])
        score_va = clf._predict_proba_lr(train_feature[va])
        score_te = clf._predict_proba_lr(test_feature)
        score_single = accuracy_score(score[va], clf.predict(train_feature[va]))
        score_mean.append(np.around(score_single, 5))
        stack_train[va] += score_va
        stack_test += score_te
    stack_test /= n_folds
    stack = np.vstack([stack_train, stack_test])
    df_stack = pd.DataFrame()
    df_stack['tfidf_' + model_name + '_classfiy_{}'.format(1)] = stack[:, 1]
    print(model_name, '处理完毕')
    return df_stack, score_mean

model_list = [
    ['LogisticRegression', LogisticRegression(random_state=1017, C=3)],
    ['SGDClassifier', SGDClassifier(random_state=1017, loss='log')],
    ['PassiveAggressiveClassifier', PassiveAggressiveClassifier(random_state=1017, C=2)],
    ['RidgeClassfiy', RidgeClassifier(random_state=1017)],
    ['LinearSVC', LinearSVC(random_state=1017)]
]

stack_feature = pd.DataFrame()
for i in model_list:
    stack_result, score_mean = get_sklearn_classfiy_stacking(i[1], train_feature, test_feature, score, i[0], 2, 5, len(df_train), len(df_test))
    stack_feature = pd.concat([stack_feature, stack_result], axis=1, sort=False)
    print('五折结果', score_mean)
    print('平均结果', np.mean(score_mean))
normal_feature = pd.concat([stack_feature, normal_feature], axis=1, sort=False)

开始计算tf-idf特征
计算结束
开始进行一些前期处理
处理完毕

****开始跑 LogisticRegression ****
LogisticRegression 处理完毕
五折结果 [0.97936, 0.98049, 0.97738, 0.98473, 0.97963]
平均结果 0.9803179999999999

****开始跑 SGDClassifier ****
SGDClassifier 处理完毕
五折结果 [0.96777, 0.96748, 0.96861, 0.96974, 0.96747]
平均结果 0.968214

****开始跑 PassiveAggressiveClassifier ****
PassiveAggressiveClassifier 处理完毕
五折结果 [0.98954, 0.98925, 0.98784, 0.99293, 0.98953]
平均结果 0.989818

****开始跑 RidgeClassfiy ****
RidgeClassfiy 处理完毕
五折结果 [0.98332, 0.98501, 0.98275, 0.98982, 0.98359]
平均结果 0.984898

****开始跑 LinearSVC ****
LinearSVC 处理完毕
五折结果 [0.98699, 0.98812, 0.98558, 0.99152, 0.9867]
平均结果 0.9877819999999999


In [5]:
import sys
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import StratifiedKFold
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDClassifier,SGDRegressor
from sklearn.linear_model import PassiveAggressiveClassifier,PassiveAggressiveRegressor
from sklearn.linear_model import Ridge
from wordbatch.models import FTRL,FM_FTRL
from sklearn.svm import LinearSVR

from sklearn.metrics import mean_squared_error



############################ tf-idf ############################
print('开始计算tf-idf特征')
tf = TfidfVectorizer(min_df=3,  max_features=10000,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words = 'english')
df_train = data[:len(train)]
df_test = data[len(train):]
train_data = df_train
test_data = df_test
df_train['label'] = df_train['fraudulent'].astype(int)
data = pd.concat([df_train, df_test], axis=0, sort=False)
data['review'] = data['review'].apply(lambda row:str(row))
discuss_tf=tf.fit_transform(data['review'])
train_feat=tf.transform(df_train['review'])
test_feat=tf.transform(df_test['review'])
print('计算结束')


############################ 切分数据集 ##########################
print('开始进行一些前期处理')
train_feature = train_feat
test_feature = test_feat
# 五则交叉验证
n_folds = 5
print('处理完毕')
df_stack2 = pd.DataFrame()
for label in ["fake"]:
    score = df_train['label'] 
    
   
    ########################### SGD(随机梯度下降) ################################
    print('sgd stacking')
    stack_train = np.zeros((len(train_data),1))
    stack_test = np.zeros((len(test_data),1))
    score_va = 0

    sk = StratifiedKFold( n_splits=5, random_state=1017)
    for i, (tr, va) in enumerate(sk.split(train_feature, score)):
        print('stack:%d/%d' % ((i + 1), n_folds))
        sgd = SGDRegressor(random_state=1017,)
        sgd.fit(train_feature[tr], score[tr])
        score_va = sgd.predict(train_feature[va])
        score_te = sgd.predict(test_feature)
        print('得分' + str(mean_squared_error(score[va], sgd.predict(train_feature[va]))))
        stack_train[va,0] = score_va
        stack_test[:,0]+= score_te
    stack_test /= n_folds
    stack = np.vstack([stack_train, stack_test])
    df_stack2['tfidf_sgd_classfiy_{}'.format(label)] = stack[:,0]


    ########################### pac(PassiveAggressiveClassifier) ################################
    print('PAC stacking')
    stack_train = np.zeros((len(train_data),1))
    stack_test = np.zeros((len(test_data),1))
    score_va = 0

    sk = StratifiedKFold( n_splits=5, random_state=1017)
    for i, (tr, va) in enumerate(sk.split(train_feature, score)):
        print('stack:%d/%d' % ((i + 1), n_folds))
        pac = PassiveAggressiveRegressor(random_state=1017)
        pac.fit(train_feature[tr], score[tr])
        score_va = pac.predict(train_feature[va])
        score_te = pac.predict(test_feature)
      
        print('得分' + str(mean_squared_error(score[va], pac.predict(train_feature[va]))))
        stack_train[va,0] = score_va
        stack_test[:,0] += score_te
    stack_test /= n_folds
    stack = np.vstack([stack_train, stack_test])

    df_stack2['tfidf_pac_classfiy_{}'.format(label)] = stack[:,0]
    

    ########################### FTRL ################################
    print('MultinomialNB stacking')
    stack_train = np.zeros((len(train_data),1))
    stack_test = np.zeros((len(test_data),1))
    score_va = 0

    sk = StratifiedKFold( n_splits=5, random_state=1017)
    for i, (tr, va) in enumerate(sk.split(train_feature, score)):
        print('stack:%d/%d' % ((i + 1), n_folds))
        clf = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=train_feature.shape[1], iters=50, inv_link="identity", threads=1)
        clf.fit(train_feature[tr], score[tr])
        score_va = clf.predict(train_feature[va])
        score_te = clf.predict(test_feature)
      
        print('得分' + str(mean_squared_error(score[va], clf.predict(train_feature[va]))))
        stack_train[va,0] = score_va
        stack_test[:,0] += score_te
    stack_test /= n_folds
    stack = np.vstack([stack_train, stack_test])
    
    df_stack2['tfidf_FTRL_classfiy_{}'.format(label)] = stack[:,0]
    
    ########################### ridge(RidgeClassfiy) ################################
    print('RidgeClassfiy stacking')
    stack_train = np.zeros((len(train_data),1))
    stack_test = np.zeros((len(test_data),1))
    score_va = 0

    sk = StratifiedKFold( n_splits=5, random_state=1017)
    for i, (tr, va) in enumerate(sk.split(train_feature, score)):
        print('stack:%d/%d' % ((i + 1), n_folds))
        ridge = Ridge(solver="sag", fit_intercept=True, random_state=42, alpha=30) 
        ridge.fit(train_feature[tr], score[tr])
        score_va = ridge.predict(train_feature[va])
        score_te = ridge.predict(test_feature)
       
        print('得分' + str(mean_squared_error(score[va], ridge.predict(train_feature[va]))))
        stack_train[va,0] = score_va
        stack_test[:,0] += score_te
    stack_test /= n_folds
    stack = np.vstack([stack_train, stack_test])

    df_stack2['tfidf_ridge_classfiy_{}'.format(label)] = stack[:,0]
    
    ############################ Linersvc(LinerSVC) ################################
    print('LinerSVC stacking')
    stack_train = np.zeros((len(train_data),1))
    stack_test = np.zeros((len(test_data),1))
    score_va = 0

    sk = StratifiedKFold( n_splits=5, random_state=1017)
    for i, (tr, va) in enumerate(sk.split(train_feature, score)):
        print('stack:%d/%d' % ((i + 1), n_folds))
        lsvc = LinearSVR(random_state=1017)
        lsvc.fit(train_feature[tr], score[tr])
        score_va = lsvc.predict(train_feature[va])
        score_te = lsvc.predict(test_feature)
       
        print('得分' + str(mean_squared_error(score[va], lsvc.predict(train_feature[va]))))
        stack_train[va,0] = score_va
        stack_test[:,0] += score_te
    stack_test /= n_folds
    stack = np.vstack([stack_train, stack_test])

    df_stack2['tfidf_lsvc_classfiy_{}'.format(label)] = stack[:,0]
print('tfidf特征已保存\n')

开始计算tf-idf特征
计算结束
开始进行一些前期处理
处理完毕
sgd stacking
stack:1/5
得分0.03786815014158462
stack:2/5
得分0.03765604643608347
stack:3/5
得分0.037879005330643964
stack:4/5
得分0.037434859111442766
stack:5/5
得分0.03789069275120299
PAC stacking
stack:1/5
得分0.022159844972041133
stack:2/5
得分0.02264334208584598
stack:3/5
得分0.02340489084659103
stack:4/5
得分0.020502724184960967
stack:5/5
得分0.022061861536353648
MultinomialNB stacking
stack:1/5
Total e: 1000.0219319380075
Total e: 953.6409274063603
Total e: 925.8174254985047
Total e: 906.302039093391
Total e: 891.2576038655386
Total e: 878.9295238506995
Total e: 868.5842035141704
Total e: 859.64153671364
Total e: 851.858010150978
Total e: 844.8806295717654
Total e: 838.6000326228709
Total e: 832.918221204479
Total e: 827.7796453221669
Total e: 823.0853405250981
Total e: 818.8139980104461
Total e: 814.8671740641404
Total e: 811.2148090141906
Total e: 807.8313253842401
Total e: 804.701592657918
Total e: 801.758986213227
Total e: 798.9923261733502
Total e: 796.39172535

In [6]:
normal_feature = pd.concat([normal_feature, df_stack2], axis=1, sort=False)

In [7]:
normal_feature.to_csv('feature/normal_feature.csv', index=False)