### 业务场景

根据招聘网站岗位描述，预测某些描述的薪资情况

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

### 数据获取

In [16]:
df_data = pd.read_excel('./jobs_4k.xls')
df_data.head()

Unnamed: 0,id,positionName,district,stationname,jobNature,companyLabelList,industryField,salary,companySize,skillLables,createTime,companyFullName,workYear,education,positionAdvantage,url,detail,type
0,3,财务产品经理,海淀区,上地,全职,"['节日礼物', '年底双薪', '定期体检', '七险一金']","物流丨运输,软件开发",10k-15k,500-2000人,"['产品设计', '后台', 'B2B']",2020-05-15 15:10:02,北京福佑多多信息技术有限公司,1-3年,本科,福利待遇好、创业黑马、技术氛围好,https://www.lagou.com/jobs/7149613.html,岗位职责1、负责财务相关业务解决方案架构并主导与产品、技术等交付团队的对接，能够从完整的财务...,产品线
1,4,产品专家,海淀区,中关村,全职,"['五险一金', '弹性工作', '带薪年假', '免费两餐']",企业服务,30k-50k,500-2000人,[],2020-05-16 08:23:06,北京拉勾网络技术有限公司,5-10年,本科,做好产品，给用户用,https://www.lagou.com/jobs/7056930.html,负责具体产品线的工作，完成用户价值。解决具体产品线的问题。产品线有策略、运营、基础产品流程等线,产品线
2,5,产品总监,海淀区,中关村,全职,"['五险一金', '弹性工作', '带薪年假', '免费两餐']",企业服务,40k-75k,500-2000人,[],2020-05-16 08:23:06,北京拉勾网络技术有限公司,5-10年,本科,做好产品，给用户用,https://www.lagou.com/jobs/6982266.html,做出好产品，服务用户，实现用户价值。B端产品、C端产品、商业策略产品、运营产品等岗位，都有总...,产品线
3,6,联盟广告产品运营专家/专员-【商业化】,海淀区,东单,全职,"['股票期权', '弹性工作', '定期体检', '岗位晋升']",文娱丨内容,20k-40k,2000人以上,['产品运营'],2020-05-15 19:35:13,北京达佳互联信息技术有限公司,5-10年,不限,扁平化管理 免费午餐晚餐下午茶,https://www.lagou.com/jobs/7112346.html,职位描述：1、负责快手联盟竞价广告的落地和运营，提升产品使用率和覆盖率； 2、深度分析广告预...,产品线
4,7,产品总监,海淀区,中关村,全职,"['领导好', '五险一金', '领军企业', '脑力密集型']","移动互联网,社交",20k-38k,15-50人,"['电商', '社交电商', '产品策划', '需求分析']",2020-05-16 09:15:16,北京领主科技有限公司,5-10年,本科,发展，创新,https://www.lagou.com/jobs/6864122.html,工作职责:1. 整体负责公司的产品功能规划、优化升级、实施工作； 2. 带领产品团队完成产品...,产品线


In [17]:
df_data['companyLabelList'] = df_data['companyLabelList'].apply(lambda x: np.nan if x =='[]' else x)
df_data['skillLables'] = df_data['skillLables'].apply(lambda x: np.nan if x =='[]' else x)

In [35]:
df_data = df_data.dropna(axis=0)

In [36]:
df_data.index

Int64Index([   0,    3,    4,    6,    7,    8,   10,   11,   12,   14,
            ...
            4494, 4495, 4496, 4497, 4499, 4500, 4503, 4505, 4506, 4510],
           dtype='int64', length=3239)

In [37]:
df_data['salary']

0       10k-15k
3       20k-40k
4       20k-38k
6       40k-60k
7       30k-50k
         ...   
4500      6k-9k
4503    14k-25k
4505    20k-40k
4506    15k-25k
4510    15k-25k
Name: salary, Length: 3239, dtype: object

### 特征构造

#### tfidf特征向量

In [38]:
# tfidf_features = ['positionName', 'industryField', 'skillLables']
tfidf_features = ['positionName', 'skillLables']

def re_string(string):
    string = re.sub('[丨\[\]\',-/【】（）()—; ]', '', string)
    string = re.sub('&nbsp', '',string)
    return string
    
def df_to_str(df_loc):
    string = ''
    for i in range(len(df_loc)):
        string += df_loc.iloc[i]
    return re_string(string)

string_list = []
for i in range(len(df_data[tfidf_features])):
    string_list.append(df_to_str(df_data[tfidf_features].iloc[i,:]))
    
tfidf = TfidfVectorizer(max_features=300)
str_features_vec = tfidf.fit_transform(string_list).toarray()

In [39]:
str_features_vec.shape

(3239, 300)

#### 独热特征构造

In [43]:
one_hot_features = ['district', 'stationname', 'jobNature', 'education']

# district字段处理
df_data['district'] = df_data['district'].apply(
    lambda x: x if x in ['海淀区', '朝阳区', '东城区', '西城区', '昌平区'] else '其他')

# stationname字段处理
save_value_list = df_data['stationname'].value_counts()[df_data['stationname'].value_counts() >= 100].index.tolist()
save_value_list = [x for x in save_value_list if x != 'None']
df_data['stationname'] = df_data['stationname'].apply(
    lambda x: x if x in save_value_list else '其他')

# 合并onehot列
df_one_hot = pd.DataFrame([],index=[x for x in df_data.index.tolist()])
for column in one_hot_features:
    df_column_hot = pd.get_dummies(df_data[column], prefix=column)
    df_one_hot = pd.concat([df_one_hot, df_column_hot], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data['district'] = df_data['district'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data['stationname'] = df_data['stationname'].apply(


In [47]:
df_one_hot

Unnamed: 0,district_东城区,district_其他,district_昌平区,district_朝阳区,district_海淀区,district_西城区,stationname_东湖渠,stationname_中关村,stationname_其他,stationname_国贸,...,stationname_知春路,stationname_知春里,stationname_西二旗,jobNature_全职,jobNature_兼职,jobNature_实习,education_不限,education_大专,education_本科,education_硕士
0,0,0,0,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0
4,0,0,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
6,0,0,0,0,1,0,0,0,0,0,...,1,0,0,1,0,0,0,0,1,0
7,0,0,0,1,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4500,0,0,0,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
4503,0,0,0,0,1,0,0,0,0,0,...,1,0,0,1,0,0,0,1,0,0
4505,0,0,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
4506,0,0,0,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0


#### 合并特征

In [48]:
features = np.concatenate((str_features_vec, df_one_hot.values), axis=1)
s_1 = StandardScaler()
features = s_1.fit_transform(features)
features.shape

(3239, 321)

### 标签处理

In [49]:
def object_to_mean(string):
    result = re.findall('\d{1,3}', string)
    result = np.array([int(x) for x in result])
    return result.mean()

labels = df_data['salary'].apply(object_to_mean).values
labels.shape

(3239,)

### 拆分数据集

In [50]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.3, random_state=42)

### 模型训练

In [51]:
models = {'knn': KNeighborsRegressor(n_neighbors=5), 
          'svr': SVR(),
          'ram': RandomForestRegressor(), 
          'tree': DecisionTreeRegressor(),
         }
for model in models:
    score = cross_val_score(models[model], train_features, train_labels, cv=5, scoring='neg_mean_absolute_error')
    print(model, score.mean())

knn -7.341363013099164
svr -6.943439967903539
ram -6.543298671785109
tree -6.371421206331834


In [53]:
parameter = {
    'splitter': ['best', 'random'],
    'max_depth': [100, 200, 300, None],
    'min_samples_split': [2,3,4,5,6],
    'min_samples_leaf': [1,3,4,5,6]
}
tree = DecisionTreeRegressor()
clf = GridSearchCV(estimator=tree, param_grid=parameter, cv=3, scoring='neg_mean_absolute_error')
clf.fit(train_features, train_labels)
best_score = clf.best_score_
best_params = clf.best_params_ 

In [54]:
best_params

{'max_depth': 300,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'best'}

In [55]:
tree_finall = DecisionTreeRegressor(max_depth=300,min_samples_leaf=1,min_samples_split=2,splitter='random')
tree_finall.fit(train_features, train_labels)
predict = tree_finall.predict(test_features)
mean_absolute_error(test_labels, predict)

5.910821655936333