In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch import nn
from torch.utils import data
from d2l import torch as d2l

In [2]:
df_map = pd.read_csv('./recruit_folder.csv')
df_per = pd.read_csv('./user_dict',sep='\t')
df_rec = pd.read_csv('./jd_dict',sep='\t')

In [3]:
per_id = {}
for i in df_per.index:
  per_id[df_per.iloc[i,0]] = df_per.iloc[i,1]
rec_id = {}
for i in df_rec.index:
  rec_id[df_rec.iloc[i,0]] = df_rec.iloc[i,1]

In [5]:
len(rec_id)

2104

In [5]:
for i in tqdm(df_map.index):
  raw_pid = df_map.loc[i,'PERSON_ID']
  df_map.loc[i,'PERSON_ID'] = per_id[raw_pid]
  raw_rid = df_map.loc[i,'RECRUIT_ID']
  df_map.loc[i,'RECRUIT_ID'] = rec_id[raw_rid]

100%|██████████| 35291/35291 [00:08<00:00, 4408.07it/s]


In [24]:
df_map.to_csv('./per_job_map.csv',header=['job','person','label'],index=None)

In [7]:
df_map = pd.read_csv('./per_job_map.csv')
df_map

Unnamed: 0,job,person,label
0,1135,40121,0
1,305,4973,0
2,664,9112,0
3,221,13978,0
4,322,41717,0
...,...,...,...
35286,1213,2949,0
35287,284,19247,0
35288,748,39496,0
35289,491,7187,1


In [8]:
# 读取嵌入数据
emb_job = pd.read_csv('./embedding_recruit.txt',sep=' ',header=None,na_values=[])
emb_job.drop(columns=[0,65],inplace=True)
emb_per = pd.read_csv('./embedding_person.txt',sep=' ',header=None,na_values=[])
emb_per.drop(columns=[0,220],inplace=True)
print(emb_job.shape, emb_per.shape)

(2104, 64) (54779, 219)


In [33]:
emb_per.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,210,211,212,213,214,215,216,217,218,219
0,0.0,15.0,3.0,37.0,-0.204654,0.031938,-0.145382,-0.187237,0.119759,0.039633,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,12.0,3.0,35.0,0.278906,0.036311,-0.167667,0.157667,-0.091677,-0.136762,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,10.0,3.0,32.0,0.217372,-0.049497,0.135316,-0.145119,0.057672,-0.024925,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,10.0,1.0,34.0,0.163814,-0.015554,-0.210183,0.006652,0.252146,-0.181748,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,15.0,1.0,34.0,-0.218846,0.036923,-0.255493,-0.105587,-0.081384,0.127615,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [166]:
def get_input(per_id, job_id):
  p_vec = emb_per.loc[per_id-1].values
  j_vec = emb_job.loc[job_id-1].values
  return np.concatenate([p_vec,j_vec],axis=0)

In [10]:
features = []
labels = []
get_input(1,2).shape

(283,)

In [11]:
for i in tqdm(df_map.index):
  row = df_map.iloc[i]
  # print(row['person'], row['job'], row['label'])
  feature = get_input(row['person'],row['job'])
  if row['label']==0:
    label = [1,0.5]
  elif row['label']==1:
    label = [1,1]
  else:
    label = [0,0]
  label = np.array(label)
  features.append(feature)
  labels.append(label)
  

100%|██████████| 35291/35291 [00:07<00:00, 4436.71it/s]


In [12]:
features = np.array(features)
labels = np.array(labels)
features = torch.Tensor(features)
labels = torch.Tensor(labels)
features.shape

torch.Size([35291, 283])

In [13]:
class DS:
  def __init__(self,data_arrays):
    self.data = data_arrays

  def __getitem__(self,index):
    return self.data[0][index],self.data[1][index]
  
  def __len__(self):
    return self.data[0].size()[0]

def load_array(data_arrays, batch_size, is_train=True):
  # Array to Iter
  dataset = data.TensorDataset(*data_arrays)
  # dataset = DS(data_arrays)
  return data.DataLoader(dataset, batch_size, shuffle=is_train)
    

In [110]:
batch_size = 50
train_size = 30000
test_size = 1000

train_iter = load_array((features[:train_size],labels[:train_size]), batch_size)
test_iter = load_array((features[train_size:train_size+test_size],labels[train_size:train_size+test_size]), batch_size, False)

In [70]:
features.shape

torch.Size([35291, 283])

In [71]:
net = nn.Sequential(nn.Linear(283,200),
                    nn.ReLU(),
                    nn.Linear(200,100),
                    nn.ReLU(),
                    nn.Linear(100,2)
                    )

In [72]:
def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)

net.apply(init_weights)

Sequential(
  (0): Linear(in_features=283, out_features=200, bias=True)
  (1): ReLU()
  (2): Linear(in_features=200, out_features=100, bias=True)
  (3): ReLU()
  (4): Linear(in_features=100, out_features=2, bias=True)
)

In [111]:
loss = nn.MSELoss()
# loss = nn.CrossEntropyLoss(reduction='none')
trainer = torch.optim.SGD(net.parameters(), lr=0.01)

In [None]:
num_epochs = 100
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)

In [128]:
num_epochs = 10
looper = tqdm(range(num_epochs))
for epoch in looper:
    for X, y in train_iter:
        # print(X.shape, y.shape)
        l = loss(net(X) ,y)
        trainer.zero_grad()
        l.backward()
        trainer.step()
    l = loss(net(features[train_size:train_size+test_size]), labels[train_size:train_size+test_size])
    # print(f'epoch {epoch + 1}, loss {l:f}')
    looper.set_description(f'epoch {epoch + 1}, loss {l:f}')

epoch 10, loss 0.013473: 100%|██████████| 10/10 [00:36<00:00,  3.69s/it]


In [165]:
print(net(torch.Tensor(get_input(51121,2044))))
print(net(torch.Tensor(get_input(54525,1075))))
# net.state_dict()

tensor([0.9993, 0.6959], grad_fn=<AddBackward0>)
tensor([1.0001, 0.6275], grad_fn=<AddBackward0>)


In [122]:
torch.save(net,'MLP.pt')