In [69]:
import numpy as np
import pandas as pd
import tqdm as tqdm
import matplotlib.pyplot as plt
import lightgbm as lgb
import datetime as dt
from pytz import timezone
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
%matplotlib inline

In [70]:
job_data = pd.read_csv('./Job_Views.csv',
                       names=[
                            'applicant_id', 'job_id', 'title', 'position',
                            'company', 'city', 'state_name', 'state_code',
                            'industry', 'view_start', 'view_end',
                            'view_duration', 'created_at', 'updated_at'],
                       sep=",",
                       parse_dates=True)
# データ確認
print(job_data.shape)
print('===============')
print(job_data.info())
print('===============')
print(job_data.head(1))

(12370, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12370 entries, 0 to 12369
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   applicant_id   12370 non-null  int64  
 1   job_id         12370 non-null  int64  
 2   title          12370 non-null  object 
 3   position       12370 non-null  object 
 4   company        11790 non-null  object 
 5   city           12370 non-null  object 
 6   state_name     12348 non-null  object 
 7   state_code     12348 non-null  object 
 8   industry       2199 non-null   object 
 9   view_start     12370 non-null  object 
 10  view_end       10575 non-null  object 
 11  view_duration  10575 non-null  float64
 12  created_at     12370 non-null  object 
 13  updated_at     12370 non-null  object 
dtypes: float64(1), int64(2), object(11)
memory usage: 1.3+ MB
None
   applicant_id  job_id                                  title  \
0         10000   73666  Cashiers & Valet

In [71]:
job_data_null = job_data.copy() 
# null項目の検索
non_null_list = []
for col in job_data.columns:
    non_null = job_data[col].count()
#     if non_null > 0:
    if non_null == 0:
        non_null_list.append(col)
print(non_null_list)

# 対象なしのため以下の処理は実施不要とする
# null項目を除外
# job_data_null.drop('null columns', axis=1)

[]


In [72]:
job_data_out = job_data_null.copy() 
# 考慮対象外項目を削除
job_data_out = job_data_out.drop('industry', axis=1)
job_data_out = job_data_out.drop('created_at', axis=1)
job_data_out = job_data_out.drop('updated_at', axis=1)
job_data_out.columns

Index(['applicant_id', 'job_id', 'title', 'position', 'company', 'city',
       'state_name', 'state_code', 'view_start', 'view_end', 'view_duration'],
      dtype='object')

In [73]:
# 相関性のある項目の確認
print(job_data['state_name'].value_counts().head())
print('===============')
print(job_data['state_code'].value_counts().head())
print('===============')

job_data_relation = job_data_out.copy() 
# 相関性の高い項目の除外
job_data_relation = job_data_relation.drop('state_name', axis=1)
print(job_data_relation.columns.values)

California    2735
New York      1081
Illinois       929
Florida        609
Texas          601
Name: state_name, dtype: int64
CA    2735
NY    1081
IL     929
FL     609
TX     601
Name: state_code, dtype: int64
['applicant_id' 'job_id' 'title' 'position' 'company' 'city' 'state_code'
 'view_start' 'view_end' 'view_duration']


In [74]:
# 統計量の確認
job_data.astype('str').describe()

Unnamed: 0,applicant_id,job_id,title,position,company,city,state_name,state_code,industry,view_start,view_end,view_duration,created_at,updated_at
count,12370,12370,12370,12370,12370,12370,12370,12370,12370.0,12370,12370.0,12370.0,12370,12370
unique,3448,7047,4518,4059,2056,1632,50,50,33.0,12297,10538.0,1176.0,12300,12300
top,601,221890,Package Handler - Part-Time @ UPS,Administrative Assistant,OfficeTeam,Los Angeles,California,CA,,2014-11-06 22:48:34 UTC,,,2014-12-01 19:24:19 UTC,2014-12-01 19:24:19 UTC
freq,75,114,200,330,1039,441,2735,2735,10171.0,4,1795.0,1795.0,4,4


In [75]:
# 対象なしのため以下の処理は実施不要とする
# カラム値が同一の項目の除外
# job_data.drop('non', axis=1)

In [76]:
job_data_mold = job_data_relation.copy()
# 型変換
job_data_mold['company'].replace().astype(str) # str型に変換
job_data_mold['city'].replace().astype(str) # str型に変換
job_data_mold['state_code'].replace().astype(str) # str型に変換
# job_data_mold.info()

0        NJ
1        NY
2        WA
3        SC
4        SC
         ..
12365    CA
12366    CA
12367    CA
12368    CA
12369    CA
Name: state_code, Length: 12370, dtype: object

In [77]:
job_data_time = job_data_mold.copy()
# # タイムゾーン（JST/UTC）変換
job_data_time['view_start'] = pd.to_datetime(job_data_time['view_start'], utc=False)
job_data_time['view_end'] = pd.to_datetime(job_data_time['view_end'], utc=False)
# print('===============')
# print(job_data_time['created_at'].head(1))
# print(job_data_time['updated_at'].head(1))

In [78]:
job_data_lost = job_data_time.copy()
# カテゴリ変数の欠損値処理
# Encoders require their input to be uniformly strings or numbers. Got ['float', 'str']の対処
print(job_data_lost.isnull().sum())
print('===============')
data = ['title', 'position', 'company', 'city', 'state_code']
job_data_lost['company'] = job_data_lost['company'].fillna('other')
job_data_lost.dropna(subset = ['state_code'], inplace = True) # nanの行のみ削除（20/12370*100=0.16(%)）
print(job_data_lost.isnull().sum())
print('===============')

# カテゴリ変数処理
# LightGBMは int, float, boolean のみ
#ラベル・エンコーディング
# title, position, company, city, state_code
le = LabelEncoder()
job_data_lost['title'] = le.fit_transform(job_data_lost['title'].values)
job_data_lost['position'] = le.fit_transform(job_data_lost['position'].values)
job_data_lost['company'] = le.fit_transform(job_data_lost['company'].values)
job_data_lost['city'] = le.fit_transform(job_data_lost['city'].values)
job_data_lost['state_code'] = le.fit_transform(job_data_lost['state_code'].values)
# job_data_lost = job_data_lost.drop('title', axis=1)
# job_data_lost = job_data_lost.drop('position', axis=1)
job_data_lost = job_data_lost.drop('view_start', axis=1)
job_data_lost = job_data_lost.drop('view_end', axis=1)
job_data_lost = job_data_lost.drop('view_duration', axis=1)
print(job_data_lost.isnull().sum())
print('===============')
job_data_lost.head()

applicant_id        0
job_id              0
title               0
position            0
company           580
city                0
state_code         22
view_start          0
view_end         1795
view_duration    1795
dtype: int64
applicant_id        0
job_id              0
title               0
position            0
company             0
city                0
state_code          0
view_start          0
view_end         1794
view_duration    1794
dtype: int64
applicant_id    0
job_id          0
title           0
position        0
company         0
city            0
state_code      0
dtype: int64


Unnamed: 0,applicant_id,job_id,title,position,company,city,state_code
0,10000,73666,651,580,1971,983,30
1,10000,96655,2196,1960,1116,504,33
2,10001,84141,2946,2668,789,107,46
3,10002,77989,1372,1216,95,1334,39
4,10002,69568,427,381,281,553,39


In [79]:
# 型変換
job_data_lost['title'] = job_data_lost['title'].replace().astype(int)
job_data_lost['position'] = job_data_lost['position'].replace().astype(int)
job_data_lost['company'] = job_data_lost['company'].replace().astype(int)
job_data_lost['city'] = job_data_lost['city'].replace().astype(int)
job_data_lost['state_code'] = job_data_lost['state_code'].replace().astype(int)

In [80]:
# 学習データとテストデータに分割
train_set, test_set = train_test_split(job_data_lost, test_size=0.4, random_state=4)
print(train_set.shape)
print(test_set.shape)
print('=======================')

# 説明変数と目的変数に分割
#　学習データを説明変数データと目的変数データに分割
train_X = train_set.drop('job_id', axis=1)
train_y = train_set['job_id']
print(train_X.shape)
print(train_y.shape)
print('=======================')

#　テスト用データを説明変数データと目的変数データに分割
test_X = test_set.drop('job_id', axis=1)
test_y = test_set['job_id']
print(test_X.shape)
print(test_y.shape)

(7408, 7)
(4940, 7)
(7408, 6)
(7408,)
(4940, 6)
(4940,)


In [81]:
# LightGBM用のデータセットに入れる
lgb_train = lgb.Dataset(train_X, train_y)
lgb_test = lgb.Dataset(test_X, test_y)

In [82]:
params = {
    "objective" : "binary",
    "metric" : "binary_logloss"
}

In [83]:
# LightGBMのハイパーパラメータを設定
#     'task': 'train',                          # タスク 訓練
#     'boosting_type': 'gbdt',       # GBDT指定
#     'objective': 'multiclass',        # 多クラス分類を指定
#     'metric': {'multi_logloss'},    # 多クラス分類の損失（誤差）
#     'num_class': 1,                       # クラス数
#     'learning_rate': 0.1,              # 学習率
#     'num_leaves': 1,                    # ノード数
#     'min_data_in_leaf': 1,          # 決定木ノードの最小データ数
#     'num_iteration': 100            # 予測器(決定木)の数:イタレーション

In [84]:
# 学習
model = lgb.train(
                params,
                lgb_train,
                valid_sets=lgb_train,
                num_boost_round=10000,
                early_stopping_rounds=1000,
                verbose_eval=100)

[LightGBM] [Info] Number of positive: 7408, number of negative: 0
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1322
[LightGBM] [Info] Number of data points in the train set: 7408, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=1.000000 -> initscore=34.539576
[LightGBM] [Info] Start training from score 34.539576
[LightGBM] [Info] [binary:BoostFromScore]: pavg=1.000000 -> initscore=34.539576
Training until validation scores don't improve for 1000 rounds
[100]	training's binary_logloss: 0
[200]	training's binary_logloss: 0
[300]	training's binary_logloss: 0


[400]	training's binary_logloss: 0
[500]	training's binary_logloss: 0


[600]	training's binary_logloss: 0
[700]	training's binary_logloss: 0
[800]	training's binary_logloss: 0


[900]	training's binary_logloss: 0
[1000]	training's binary_logloss: 0


[1100]	training's binary_logloss: 0
[1200]	training's binary_logloss: 0
[1300]	training's binary_logloss: 0
[1400]	training's binary_logloss: 0


[1500]	training's binary_logloss: 0
[1600]	training's binary_logloss: 0


[1700]	training's binary_logloss: 0
[1800]	training's binary_logloss: 0


[1900]	training's binary_logloss: 0
[2000]	training's binary_logloss: 0
[2100]	training's binary_logloss: 0


[2200]	training's binary_logloss: 0
[2300]	training's binary_logloss: 0
[2400]	training's binary_logloss: 0


[2500]	training's binary_logloss: 0
[2600]	training's binary_logloss: 0


[2700]	training's binary_logloss: 0
[2800]	training's binary_logloss: 0
[2900]	training's binary_logloss: 0


[3000]	training's binary_logloss: 0
[3100]	training's binary_logloss: 0


[3200]	training's binary_logloss: 0
[3300]	training's binary_logloss: 0
[3400]	training's binary_logloss: 0


[3500]	training's binary_logloss: 0
[3600]	training's binary_logloss: 0


[3700]	training's binary_logloss: 0
[3800]	training's binary_logloss: 0
[3900]	training's binary_logloss: 0
[4000]	training's binary_logloss: 0


[4100]	training's binary_logloss: 0
[4200]	training's binary_logloss: 0
[4300]	training's binary_logloss: 0


[4400]	training's binary_logloss: 0
[4500]	training's binary_logloss: 0


[4600]	training's binary_logloss: 0
[4700]	training's binary_logloss: 0
[4800]	training's binary_logloss: 0


[4900]	training's binary_logloss: 0
[5000]	training's binary_logloss: 0
[5100]	training's binary_logloss: 0


[5200]	training's binary_logloss: 0
[5300]	training's binary_logloss: 0


[5400]	training's binary_logloss: 0
[5500]	training's binary_logloss: 0
[5600]	training's binary_logloss: 0


[5700]	training's binary_logloss: 0
[5800]	training's binary_logloss: 0
[5900]	training's binary_logloss: 0


[6000]	training's binary_logloss: 0
[6100]	training's binary_logloss: 0


[6200]	training's binary_logloss: 0
[6300]	training's binary_logloss: 0


[6400]	training's binary_logloss: 0
[6500]	training's binary_logloss: 0


[6600]	training's binary_logloss: 0
[6700]	training's binary_logloss: 0
[6800]	training's binary_logloss: 0
[6900]	training's binary_logloss: 0


[7000]	training's binary_logloss: 0


[7100]	training's binary_logloss: 0
[7200]	training's binary_logloss: 0
[7300]	training's binary_logloss: 0
[7400]	training's binary_logloss: 0


[7500]	training's binary_logloss: 0


[7600]	training's binary_logloss: 0
[7700]	training's binary_logloss: 0
[7800]	training's binary_logloss: 0
[7900]	training's binary_logloss: 0


[8000]	training's binary_logloss: 0
[8100]	training's binary_logloss: 0
[8200]	training's binary_logloss: 0


[8300]	training's binary_logloss: 0
[8400]	training's binary_logloss: 0


[8500]	training's binary_logloss: 0
[8600]	training's binary_logloss: 0


[8700]	training's binary_logloss: 0
[8800]	training's binary_logloss: 0
[8900]	training's binary_logloss: 0


[9000]	training's binary_logloss: 0
[9100]	training's binary_logloss: 0


[9200]	training's binary_logloss: 0
[9300]	training's binary_logloss: 0


[9400]	training's binary_logloss: 0
[9500]	training's binary_logloss: 0
[9600]	training's binary_logloss: 0


[9700]	training's binary_logloss: 0
[9800]	training's binary_logloss: 0


[9900]	training's binary_logloss: 0
[10000]	training's binary_logloss: 0
Did not meet early stopping. Best iteration is:
[1]	training's binary_logloss: 0


In [85]:
params = {
    "objective" : "regression",
    "metric" : "rmse"
}

In [86]:
# 予測
model = lgb.train(
                params,
                lgb_test,
                valid_sets = lgb_test,
                num_boost_round = 10000,
                early_stopping_rounds = 1000,
                verbose_eval = 100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1318
[LightGBM] [Info] Number of data points in the train set: 4940, number of used features: 6
[LightGBM] [Info] Start training from score 118293.849190
Training until validation scores don't improve for 1000 rounds
[100]	training's rmse: 25459.1
[200]	training's rmse: 20949.6
[300]	training's rmse: 17945.9
[400]	training's rmse: 15778.1
[500]	training's rmse: 14183.5
[600]	training's rmse: 12850.7
[700]	training's rmse: 11689.4
[800]	training's rmse: 10813
[900]	training's rmse: 10067.2
[1000]	training's rmse: 9417.42
[1100]	training's rmse: 8809.15
[1200]	training's rmse: 8247.91
[1300]	training's rmse: 7755.64
[1400]	training's rmse: 7329.61
[1500]	training's rmse: 6968.96
[1600]	training's rmse: 6637.3
[1700]	training's rmse: 6352.74
[1800]	training's rmse: 6112.64
[1900]	training's rmse: 5902.52
[2000]	training's rmse: 5698.09
[2100]	training's rmse: 5518.04
[2200]	training's rmse: 5355.52
[230

In [87]:
# # クエリリスト作成
# query_list = job_data_lost[['applicant_id']].value_counts()

# # user_idをインデックス化
# # train_all = job_data_lost.set_index([['applicant_id']])

# # クエリリストをインデックスでソート
# query_list = query_list.sort_index()

# # 特徴量と目的変数データをインデックスでソート
# job_data_lost = job_data_lost.sort_index()

# # 特徴量抽出
# X_train = job_data_lost[['title', 'position', 'company', 'city', 'state_code']]

# group_val = job_data_lost.groupby(['applicant_id', 'position', 'company', 'city', 'state_code'])

# # 目的変数を抽出
# y_train = job_data_lost['job_id']

# # LGBMRankerのモデルを作成
# model = lgb.LGBMRanker()

# # model.fit(X_train,　y_train,　group　=　query_list, eval_set=[(test_X, test_y)])

# # LGBMRankerを作成して学習
# model.fit(X_train, y_train, group=query_list, eval_set=[(test_X, test_y)], eval_group=[group_val])

In [88]:
model = GaussianNB()
model.fit(train_X, train_y)

pred_job = model.predict(test_X)
print(test_label)

[ 23342 204379  63180 ...   7726 168163  89599]


In [89]:
print(accuracy_score(pred_job, test_y))

0.4200404858299595


In [68]:
print(test_X)
print('=======================')
print(test_X.loc[9739])
print('=======================')
# 機械学習モデルmodelの'applicant_id = 9739'に対する出力結果
pred = model.predict(test_X)

# job_idとの紐づけ
pred_jobs = pd.Series(pred, index = test_X.index)

# 指定のjob_idとの関連の高い順にソート
jobs = pred_jobs.sort_values(ascending=False).index

# 上位10件を出力
output = list(jobs)[:10]
print(output)

       applicant_id  title  position  company  city  state_code
9739           7307   3307      3005     2011   260          14
6527          13396   3657      3298     1116   380          26
3460          11700   3661      3302       50  1549           9
1694          10903   4318      3873     1864    64          10
10861          9146    902       806     1017   767          17
...             ...    ...       ...      ...   ...         ...
6699          13531   3933      3541     1916  1603           4
6692          13526    997       859     1674   615          15
10546          8843   1497      1337      774   899          47
3123          11542    727       650      223  1033           4
5017          12508   1748      1565     1688   812           4

[4940 rows x 6 columns]
applicant_id    7307
title           3307
position        3005
company         2011
city             260
state_code        14
Name: 9739, dtype: int64
[8718, 8720, 8689, 6768, 8674, 8673, 6288, 8490, 8446, 1

In [41]:
print(test_X.shape)
preds = model.predict(test_X)
print(np.sort(preds)[::-1])


# y_pred = []
# for x in preds:
#     y_pred.append(np.argmax(x))


# print(accuracy_score(y_test, y_pred))
# print('=================')
# print(accuracy_score(y_test, y_pred))
# print('=================')
# print(sum(y_test == y_pred_max) / len(y_test))

(4940, 7)


ValueError: Number of features of the model must match the input. Model n_features_ is 6 and input n_features is 7 

In [119]:
# sample = '10000'
# print('[predicted]')
# print(predicted.query(f'applicant_id == "{sample}"')[['job_id', 'Rating']])
# print('+'*20)
# print('[actual]')
# print(y_test[sample].sort_values(ascending=False))

In [None]:
# def get_model_input(t_X, t_y):
#     merged = pd.merge(t_X, t_y, on = ['applicant_id'], how = 'inner')
#     print(t_X)
#     print('=======================')
#     print(t_y)
#     print(t_y.shape)
#     print('=======================')
#     print(merged)
#     print('=======================')
#     # nullの場合は0で補完
#     merged.fillna(0, inplace=True)
#     features_cols = list(merged.drop(columns=['applicant_id', 'job_id']).columns)
#     # クエリ
#     query_list = merged['applicant_id'].value_counts()
#     # applicant_idをインデックス化
#     merged = merged.set_index(['applicant_id'])
#     # クエリリストをインデックスでソート
#     query_list = query_list.sort_index()
#     # 特徴量と目的変数データをインデックスでソート
#     merged.sort_index(inplace=True)
#     # 特徴量
#     df_x = merged[features_cols]
#     print(df_x)
#     print('=======================')
#     # 目的変数
#     df_y = merged['job_id']
    
#     return df_x, df_y, query_list

# X_train, y_train, query_list_train = get_model_input(train_X, train_y)
# X_test, y_test, query_list_test = get_model_input(test_X, test_y)

# X_train = X_train.drop('title_y', axis=1)
# X_train = X_train.drop('position_y', axis=1)
# X_train = X_train.drop('company_y', axis=1)
# X_train = X_train.drop('city_y', axis=1)
# X_train = X_train.drop('state_code_y', axis=1)