In [78]:
import numpy as np
import pandas as pd
import gensim
from tqdm import tqdm_notebook as tqdm

from keras.models import Model
from keras.layers import Input, Embedding, Reshape, Dot, Dense
from keras.preprocessing.sequence import skipgrams, make_sampling_table, pad_sequences
from keras.preprocessing.text import Tokenizer

In [2]:
# Load the internal-link table: one row per (entry, linked) pair of integer IDs.
# NOTE(review): presumably `entry` is the page that contains the link and
# `linked` is the page it points to — confirm against the data source.
internal_link_df = pd.read_csv("../data/internal_CID_link_in_entries.csv")
# Sanity-check that both columns were parsed as integers.
print(internal_link_df.dtypes)
internal_link_df.head()

entry     int64
linked    int64
dtype: object


Unnamed: 0,entry,linked
0,25352,712
1,25352,7416
2,25352,10979
3,25352,25352
4,25352,5462310


In [3]:
class LinkTokenizer(object):
    """Map link identifiers to dense, 1-based integer indices.

    ``fit`` never assigns index 0; ``link_to_index`` returns 0 for every
    identifier that was not present when the tokenizer was fitted.

    Attributes:
        num_links: Number of distinct identifiers seen by the last ``fit``.
        link_index: Dict mapping identifier -> index in ``1..num_links``.
    """

    def __init__(self):
        self.num_links = 0
        self.link_index = {}

    def fit(self, links: list):
        """Build the identifier -> index mapping from ``links``.

        Duplicates are collapsed and the distinct identifiers are numbered in
        ascending sorted order, starting at 1. Calling ``fit`` again replaces
        any previous mapping.

        Args:
            links: Sequence (list or 1-D array) of link identifiers.
        """
        # np.unique both de-duplicates and sorts, replacing the manual
        # set -> array -> sort round trip.
        unique_links = np.unique(np.asarray(links))
        self.link_index = {link: i for i, link in enumerate(unique_links, 1)}
        self.num_links = len(self.link_index)

    def link_to_index(self, links: list):
        """Translate identifiers to their indices.

        Args:
            links: Sequence (list or 1-D array) of link identifiers.

        Returns:
            1-D integer numpy array with one index per input element;
            identifiers unknown to the tokenizer map to 0.
        """
        lookup = self.link_index.get
        # A single lookup with a default replaces the former double ``get``.
        # The explicit dtype keeps the result integer-typed even for empty
        # input (an untyped empty array would default to float64).
        return np.array([lookup(link, 0) for link in links], dtype=np.int_)

In [4]:
# Fit on every ID that appears in either column, so that IDs from `entry`
# and from `linked` are numbered in one shared index space.
tokenizer = LinkTokenizer()
tokenizer.fit(
    np.concatenate([internal_link_df["entry"].values, internal_link_df["linked"].values])
)

In [5]:
# Embedding vocabulary size: fitted indices run 1..num_links, and the extra
# slot covers index 0, which LinkTokenizer.link_to_index returns for unknown links.
LINK_SIZE = tokenizer.num_links + 1

In [6]:
# Length of each embedding vector (passed as the output dimension of the Embedding layer).
EMBED_DIM = 100

In [101]:
# Two inputs, each carrying a single int32 link index per sample.
input_target_entry = Input(shape=(1,), dtype='int32', name='input1')
input_linked_entry = Input(shape=(1,), dtype='int32', name='input2')

# One Embedding layer is applied to both inputs, so both sides of a pair
# share the same LINK_SIZE x EMBED_DIM weight matrix.
embedding = Embedding(LINK_SIZE, EMBED_DIM, name='embedding1')
target_entry = embedding(input_target_entry)
linked_entry = embedding(input_linked_entry)

# Dot product of the two embedded vectors along the embedding axis
# -> (None, 1, 1), then flattened to one scalar per sample.
dot = Dot(axes=2, name='dot1')([target_entry, linked_entry])
dot = Reshape((1,), name='reshape1')(dot)
# A single sigmoid unit turns the dot product into a value in (0, 1),
# trained against the 1/0 labels produced for each pair.
output = Dense(1, activation='sigmoid', name='dense1')(dot)

model = Model(inputs=[input_target_entry, input_linked_entry], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input1 (InputLayer)             (None, 1)            0                                            
__________________________________________________________________________________________________
input2 (InputLayer)             (None, 1)            0                                            
__________________________________________________________________________________________________
embedding1 (Embedding)          (None, 1, 100)       1843600     input1[0][0]                     
                                                                 input2[0][0]                     
__________________________________________________________________________________________________
dot1 (Dot)                      (None, 1, 1)         0           embedding1[0][0]                 
          

In [75]:
# Same link pairs as internal_link_df, with raw IDs replaced by tokenizer indices.
index_df = pd.DataFrame({
    'entry': tokenizer.link_to_index(internal_link_df.entry.values),
    'linked': tokenizer.link_to_index(internal_link_df.linked.values),
})

In [120]:
def neg(entry, backlink, negative_samples=1.0, candidates=None):
    """Build one batch of positive and randomly sampled negative pairs.

    Every element of ``backlink`` is paired with ``entry`` and labelled 1.
    Negatives are drawn uniformly, with replacement, from the candidates
    that do not occur in ``backlink`` and are labelled 0.

    Args:
        entry: Index that is repeated on the first side of every pair.
        backlink: 1-D sequence of indices forming the positive pairs.
        negative_samples: Number of negatives per positive; the count is
            ``int(len(backlink) * negative_samples)``.
        candidates: Pool of indices to sample negatives from. When omitted,
            the notebook-level ``all_links`` array is used, so existing
            three-argument calls behave as before.

    Returns:
        Tuple ``(e, links, label)``: ``e`` is a list repeating ``entry``,
        ``links`` is an array of the positives followed by the negatives,
        and ``label`` is an array of 1s followed by 0s, all of equal length.
    """
    # Resolve the global lazily so an explicit pool never touches it.
    pool = all_links if candidates is None else candidates
    negative_pool = np.setdiff1d(pool, backlink)

    if negative_pool.size == 0:
        # Every candidate is already a positive: np.random.choice would raise
        # on an empty pool, so return a positives-only batch instead.
        neg_samples = negative_pool
    else:
        neg_samples = np.random.choice(
            negative_pool, size=int(len(backlink) * negative_samples)
        )

    e = [entry] * (len(backlink) + len(neg_samples))
    links = np.append(backlink, neg_samples)
    label = np.array([1] * len(backlink) + [0] * len(neg_samples))

    return e, links, label

In [22]:
# Number of full passes the training loop makes over all `linked` groups.
EPOCHS = 1

In [121]:
%%time

# Every index assigned by the tokenizer; neg() reads this module-level name
# as the pool from which negative samples are drawn.
all_links = np.array(list(tokenizer.link_index.values()))

for i in range(EPOCHS):
    loss = 0.0
    # One batch per distinct `linked` index: the group key is that index and
    # the group's `entry` column holds every entry paired with it.
    for entry, link in tqdm(index_df.groupby('linked')):
        backlinks = link.entry.values
        # NOTE: `entry` and `link` are rebound here from (group key, group frame)
        # to the batch inputs returned by neg(); five negatives per positive.
        entry, link, label = neg(entry, backlinks, negative_samples=5.0)
        # `loss` accumulates the per-batch losses, so the printed value is a
        # sum over all batches of the epoch, not a mean.
        loss += model.train_on_batch([entry, link], label)

    print(f"Epoch {i + 1}/{EPOCHS}\tloss: {loss}")

HBox(children=(IntProgress(value=0, max=17214), HTML(value='')))


Epoch 1/1	loss: 1713.3947564815753
CPU times: user 15min 21s, sys: 4min 55s, total: 20min 17s
Wall time: 9min 28s
