In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import os,sys,inspect
import gc
from tqdm import tqdm
import random

import warnings
warnings.filterwarnings('ignore')

In [2]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import optimizers, callbacks, layers, losses
from tensorflow.keras.layers import Dense, Concatenate, Activation, Add, BatchNormalization, Dropout, Input, Embedding, Flatten, Multiply
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical

SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)
gpus = tf.config.experimental.list_physical_devices('GPU')

if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        # 프로그램 시작시에 메모리 증가가 설정되어야만 합니다
        print(e)
        
def mish(x):
    return x*tf.math.tanh(tf.math.softplus(x))

def leakyrelu(x, factor=0.2):
    return tf.maximum(x, factor*x)

In [3]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test_job.csv')
sub = pd.read_csv('./data/sample_output_job.csv')
train

Unnamed: 0,userID,jobID,applied
0,fe292163d06253b716e9a0099b42031d,15de21c670ae7c3f6f3f1f37029303c9,0
1,6377fa90618fae77571e8dc90d98d409,55b37c5c270e5d84c793e486d798c01d,0
2,8ec0888a5b04139be0dfe942c7eb4199,0fcbc61acd0479dc77e3cccc0f5ffca7,1
3,f862b39f767d3a1991bdeb2ea1401c9c,3b5dca501ee1e6d8cd7b905f4e1bf723,0
4,cac14930c65d72c16efac2c51a6b7f71,287e03db1d99e0ec2edb90d079e142f3,0
...,...,...,...
5995,68cb94b97d00979f4e8127915885b641,b9228e0962a78b84f3d5d92f4faa000b,0
5996,c0b199d73bdf390c2f4c3150b6ee1574,e3796ae838835da0b6f6ea37bcf8bcb7,0
5997,3ab88dd28f749fe4ec90c0b6f9896eb5,e2a2dcc36a08a345332c751b2f2e476c,0
5998,75b4af0dacbc119eadf4eeb096738405,3b712de48137572f3849aabd5666a4e3,0


In [4]:
from sklearn.preprocessing import LabelEncoder

user_lbe = LabelEncoder()

user_lbe.fit(train['userID'])

LabelEncoder()

In [5]:
train['userID'] = user_lbe.transform(train['userID'])
test['userID'] = user_lbe.transform(test['userID'])

tag = pd.read_csv('./data/tags.csv')
tag_lbe = LabelEncoder()
tag['tagID'] = tag_lbe.fit_transform(tag['tagID'])

job_tag = pd.read_csv('./data/job_tags.csv')
job_lbe = LabelEncoder()

job_tag['jobID'] = job_lbe.fit_transform(job_tag['jobID'])
job_tag['tagID'] = tag_lbe.transform(job_tag['tagID'])
train['jobID'] = job_lbe.transform(train['jobID'])
test['jobID'] = job_lbe.transform(test['jobID'])

user_tag = pd.read_csv('./data/user_tags.csv')
user_tag['userID'] = user_lbe.transform(user_tag['userID'])
user_tag['tagID'] = tag_lbe.transform(user_tag['tagID'])

job_comp = pd.read_csv('./data/job_companies.csv')
comp_lbe = LabelEncoder()
comp_lbe.fit(job_comp['companyID'])
job_comp['companyID'] = comp_lbe.transform(job_comp['companyID'])
job_comp['jobID'] = job_lbe.transform(job_comp['jobID'])

train

Unnamed: 0,userID,jobID,applied
0,195,52,0
1,64,245,0
2,111,35,1
3,190,177,0
4,159,117,0
...,...,...,...
5995,71,526,0
5996,154,650,0
5997,35,648,0
5998,83,178,0


In [6]:
def jaccard(true, pred):
    union = set(true).union(set(pred))
    inter = set(true).intersection(set(pred))
    return len(inter)/len(union)

In [7]:
jac = []
for i in range(len(train)):
    u = train.values[i][0]
    j = train.values[i][1]
    score = jaccard(np.unique(user_tag[user_tag['userID']==u].values), job_tag[job_tag['jobID']==j]['tagID'].values)
    jac.append(score)
    
train['jac'] = jac

In [8]:
jac = []
for i in range(len(test)):
    u = test.values[i][0]
    j = test.values[i][1]
    score = jaccard(np.unique(user_tag[user_tag['userID']==u].values), job_tag[job_tag['jobID']==j]['tagID'].values)
    jac.append(score)
    
test['jac'] = jac

In [9]:
jobs = []
tags = []

jid= job_tag['jobID'].unique()

for j in jid:
    one_j = job_tag[job_tag['jobID']==j]
    if len(one_j) > 1:
        jobs.append(j)
        tags.append(list(one_j['tagID'].values))
#     break

In [10]:
import gensim

In [11]:
Word2Vec = gensim.models.Word2Vec

In [12]:
w2v = Word2Vec(
      np.array(tags),
      vector_size = 32,
      window=10,
      min_count=1,
      sg=1,
      negative=20)

In [13]:
w2v.build_vocab(np.array(tags))

In [14]:
w2v.train(np.array(tags),
         total_examples = w2v.corpus_count,
         epochs=100,
         compute_loss=True)

(141572, 344600)

In [15]:
embedding_matrix = w2v.wv[w2v.wv.key_to_index.keys()]
embedding_matrix.shape

(240, 32)

In [16]:
embedding_matrix

array([[-0.5029302 , -0.46807966,  0.36893418, ..., -0.4141846 ,
        -1.0150032 ,  0.09512324],
       [-0.33069316, -0.19277354,  0.36678764, ..., -0.2940389 ,
        -0.55227745,  0.16863441],
       [-0.33309913, -0.91207916, -0.03739739, ..., -0.05271877,
        -1.0511299 ,  0.25219604],
       ...,
       [-0.13066268, -0.16798045,  0.26332527, ..., -0.0334536 ,
        -0.68267393,  0.37820774],
       [-0.32021567, -0.48293287,  1.0351163 , ..., -0.5029538 ,
        -0.51693475, -0.45450923],
       [-0.09783003, -0.23087727,  0.20971908, ..., -0.02187978,
        -0.36080122,  0.23990817]], dtype=float32)

In [17]:
uid = train['userID'].unique()
tid = job_tag['tagID'].unique()

tr_j_tag =  []
for j in train['jobID'].values:
    arr = [0]*887
    tags = job_tag[job_tag['jobID']==j]['tagID'].values
    for t in tags:
        arr[t] = 1
    tr_j_tag.append(arr)
tr_j_tag = np.array(tr_j_tag)
# tr_ohe = np.hstack([to_categorical(train['userID'], len(uid)), j_tag])

te_j_tag =  []
for j in test['jobID'].values:
    arr = [0]*887
    tags = job_tag[job_tag['jobID']==j]['tagID'].values
    for t in tags:
        arr[t] = 1
    te_j_tag.append(arr)
te_j_tag = np.array(te_j_tag)
# te_ohe = np.hstack([to_categorical(test['userID'], len(uid)), j_tag])

# tr_ohe = to_categorical(train['userID'], len(uid))
# te_ohe = to_categorical(test['userID'], len(uid))

In [18]:
tr_ohe = np.hstack([to_categorical(train['userID'], len(uid)), 
#                     to_categorical(pd.merge(train, job_comp, on='jobID', right_index=True).sort_index()['companyID'], 733),
                    pd.get_dummies(pd.merge(train, job_comp, on='jobID', right_index=True).sort_index()['companySize']).values])

te_ohe = np.hstack([to_categorical(test['userID'], len(uid)), 
#                     to_categorical(pd.merge(test, job_comp, on='jobID', right_index=True).sort_index()['companyID'], 733),
                    pd.get_dummies(pd.merge(test, job_comp, on='jobID', right_index=True).sort_index()['companySize']).values])


In [19]:
from sklearn.metrics.pairwise import cosine_similarity

def average_emb(tags, emb=embedding_matrix, w2v=w2v):
    avg_emb = np.zeros((32, ))
    cnt = 0
    for tag in tags:
        if tag in w2v.wv.key_to_index.keys():
            avg_emb += embedding_matrix[w2v.wv.key_to_index[tag]] 
            cnt += 1
    return avg_emb

In [20]:
user_tag_emb = []
job_tag_emb = []
for i in range(len(train)):
    u = train.values[i][0]
    j = train.values[i][1]
    
    u_tags = user_tag[user_tag['userID']==u]['tagID'].unique()
    user_tag_emb.append(list(average_emb(u_tags)))
    
    j_tags = job_tag[job_tag['jobID']==j]['tagID'].unique()
    job_tag_emb.append(list(average_emb(j_tags)))
tr_dense = np.hstack([user_tag_emb, job_tag_emb, np.diag(cosine_similarity(user_tag_emb, job_tag_emb)).reshape(-1, 1), train['jac'].values.reshape(-1, 1)])


In [21]:
user_tag_emb = []
job_tag_emb = []
for i in range(len(test)):
    u = test.values[i][0]
    j = test.values[i][1]
    
    u_tags = user_tag[user_tag['userID']==u]['tagID'].unique()
    user_tag_emb.append(list(average_emb(u_tags)))
    
    j_tags = job_tag[job_tag['jobID']==j]['tagID'].unique()
    job_tag_emb.append(list(average_emb(j_tags)))
te_dense = np.hstack([user_tag_emb, job_tag_emb, np.diag(cosine_similarity(user_tag_emb, job_tag_emb)).reshape(-1, 1), test['jac'].values.reshape(-1, 1)])


In [22]:
class FM_layer(keras.Model):
    def __init__(self, latent_dim, w_reg=1e-4, v_reg=1e-4):
        super(FM_layer, self).__init__()
        self.latent_dim = latent_dim
        
        self.w_reg = w_reg
        self.v_reg = v_reg

    def build(self, input_shape):
        self.w_0 = self.add_weight(shape=(1, ),
                                  initializer=tf.zeros_initializer(),
                                  trainable=True)
        
        self.w = self.add_weight(shape=(input_shape[-1], 1), 
                             initializer=tf.random_normal_initializer(),
                                trainable=True,
                                regularizer=l2(self.w_reg))
        
        self.V = self.add_weight(shape=(input_shape[-1], self.latent_dim), 
                             initializer=tf.random_normal_initializer(),
                                trainable=True,
                                regularizer=l2(self.v_reg))

    def call(self, inputs):
        linear_terms = tf.reduce_sum(tf.matmul(inputs, self.w), axis=1)

        interactions = 0.5 * tf.reduce_sum(
            tf.pow(tf.matmul(inputs, self.V), 2)
            - tf.matmul(tf.pow(inputs, 2), tf.pow(self.V, 2)),
            1,
            keepdims=False
        )

        y_hat = (self.w_0 + linear_terms + interactions)

        return y_hat

In [23]:
class DeepFM(tf.keras.Model):
    def __init__(self, sparse_dims, latent_dim, l2_emb=1e-4, w2v=w2v, embedding_matrix=embedding_matrix):
        super().__init__()
        
        self.sparse_dims = sparse_dims
        self.latent_dim = latent_dim
        
        self.w2v = w2v
        self.embedding_matrix = embedding_matrix

        self.embed_layers = {
            'embed_' + str(i): Embedding(input_dim=2, # ohe
                                         input_length=1,
                                         output_dim=latent_dim,
                                         embeddings_initializer='random_uniform',
                                         embeddings_regularizer=l2(l2_emb))
            for i, feat in enumerate(range(sparse_dims))
        }
        self.fm_layer = FM_layer(latent_dim)
        self.dnn_layers = self.build_dnn()
        self.flatten =  Flatten()
        
        self.linear = Dense(1)

    def build_dnn(self):
        model = Sequential()
        model.add(Dense(128, activation='relu'))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(1))
        
        return model
        

    def call(self, inputs):        
        dense_inputs, sparse_inputs, job_inputs = inputs
        wide_inputs = tf.concat([sparse_inputs, job_inputs], axis=1)
        wide = self.linear(wide_inputs)
        
        sparse_embed = tf.concat([self.embed_layers[f'embed_{i}'](sparse_inputs[:, i]) for i in range(self.sparse_dims)], axis=-1)
        
        stack = tf.concat([dense_inputs, sparse_embed], axis=-1)

        fm_out = self.fm_layer(stack)

        deep_out = self.dnn_layers(stack)

        outputs = fm_out + tf.squeeze(deep_out) + tf.squeeze(wide)
        
        return outputs

In [24]:
dfm = DeepFM(tr_ohe.shape[1], 32)

In [25]:
es = callbacks.EarlyStopping(patience=5, restore_best_weights=True)
dfm.compile(loss=losses.BinaryCrossentropy(from_logits=True), 
            optimizer=optimizers.Adam(2e-4))

dfm.fit([tr_dense, tr_ohe, tr_j_tag.astype(np.float32)], 
       train['applied'].values,
      epochs=20,
      shuffle=True,
      validation_split=0.1,
       callbacks=[es])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


<tensorflow.python.keras.callbacks.History at 0x2c5c5552708>

In [26]:
pred = dfm.predict([tr_dense, tr_ohe, tr_j_tag.astype(np.float32)])
pred_ = np.where(pred>0, 1, 0) 
sum(pred_==train['applied'].values) / len(pred_)

0.8571666666666666

In [27]:
best_score = 0
thres = 0
for i in np.arange(-100, 1000)/1000:
    pred_ = np.where(pred>i, 1, 0) 
    score = sum(pred_==train['applied'].values) / len(pred_)
    if score > best_score:
        thres = i
        best_score = score
        
print(best_score, thres)

0.8645 0.562


In [30]:
pred = dfm.predict([te_dense, te_ohe, te_j_tag.astype(np.float32)], batch_size=1024)
pred_ = np.where(pred>i, 1, 0)

sub = pd.DataFrame(pred_, columns=['applied'])
sub.to_csv('./sub/dfm_emb2.csv', index=False)

In [34]:
tag.head(20)

Unnamed: 0,tagID,keyword
0,327,Amazon Web Services(AWS)
1,787,Tensorflow
2,558,Docker
3,462,Git
4,224,Python
5,377,Go
6,111,Deep Learning
7,878,Machine Learning
8,230,JSP
9,513,Framework7


In [36]:
# tensorflow: 787
# python: 224
# go: 377
# deeplearning: 111
# ml: 878
# c++: 649
# c: 804
# java: 836

# tensorflow vs python
cosine_similarity(
    embedding_matrix[w2v.wv.key_to_index[787]].reshape(1, -1) ,
    embedding_matrix[w2v.wv.key_to_index[224]].reshape(1, -1) 
               )

array([[0.57687306]], dtype=float32)

In [37]:
# tensorflow vs deeplearning
cosine_similarity(
    embedding_matrix[w2v.wv.key_to_index[787]].reshape(1, -1) ,
    embedding_matrix[w2v.wv.key_to_index[111]].reshape(1, -1) 
               )

array([[0.7766644]], dtype=float32)

In [38]:
# c++ vs c
cosine_similarity(
    embedding_matrix[w2v.wv.key_to_index[649]].reshape(1, -1) ,
    embedding_matrix[w2v.wv.key_to_index[804]].reshape(1, -1) 
               )

array([[0.8242682]], dtype=float32)

In [42]:
# c++ vs java
cosine_similarity(
    embedding_matrix[w2v.wv.key_to_index[649]].reshape(1, -1) ,
    embedding_matrix[w2v.wv.key_to_index[836]].reshape(1, -1) 
               )

array([[0.43744987]], dtype=float32)

In [43]:
# c vs java
cosine_similarity(
    embedding_matrix[w2v.wv.key_to_index[804]].reshape(1, -1) ,
    embedding_matrix[w2v.wv.key_to_index[836]].reshape(1, -1) 
               )

array([[0.47135445]], dtype=float32)

In [44]:
# python vs java
cosine_similarity(
    embedding_matrix[w2v.wv.key_to_index[224]].reshape(1, -1) ,
    embedding_matrix[w2v.wv.key_to_index[836]].reshape(1, -1) 
               )

array([[0.5147978]], dtype=float32)