# Keras-LSTM
## Dependency

In [1]:
import pandas as pd
import numpy as np
import string
from py2neo import Graph, NodeMatcher, Relationship, Node, Record
from pattern.en import singularize

import tensorflow
Sequential = tensorflow.keras.models.Sequential
Dense = tensorflow.keras.layers.Dense
LSTM = tensorflow.keras.layers.LSTM
TimeDistributed = tensorflow.keras.layers.TimeDistributed
RepeatVector = tensorflow.keras.layers.RepeatVector
TensorBoard = tensorflow.keras.callbacks.TensorBoard

import matplotlib.pyplot as plt
%matplotlib inline

## Import Word Vector

In [2]:
with open('glove.6B.50d.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

word_vectors = dict()
for line in lines:
    line_split = line[:-1].split()
    word_vectors[line_split[0]] = [float(number) for number in line_split[1:]]

## Import Graph

In [3]:
graph = Graph("bolt://localhost:7687", password = "1012zzpHh")

## Processing Titles

In [4]:
paper_nodes = graph.run("MATCH (paper:paper) RETURN paper.id, paper.title").data()

# Remove Punctuations
for node in paper_nodes:
    for ch in string.punctuation:
        if ch is '-':
            node['paper.title'] = node['paper.title'].replace(ch, ' ')
        else:
            node['paper.title'] = node['paper.title'].replace(ch, '')
        

# Lowercase Words
for node in paper_nodes:
    node['paper.title'] = [word.lower() for word in node['paper.title'].split()]

In [5]:
paper_nodes[15]['paper.title']

['online', 'generation', 'of', 'profile', 'association', 'rules']

## Creating Title Vectors

In [6]:
none_words = list()
for node in paper_nodes:
    title_vector = list()
    for word in node['paper.title']:
        try:
            title_vector.append(word_vectors[word])
        except KeyError:
            try:
                title_vector.append(word_vectors[singularize(word)])
            except KeyError:
                none_words.append(word)
    node['title_vector'] = title_vector

len(set(none_words))

273

## Create Word Vector for None_words

In [7]:
np.random.seed(5)
none_words_matrix = np.random.rand(273,50)*2 - 1
none_word_vectors = dict()
for i,word in enumerate(set(none_words)):
    none_word_vectors[word] = none_words_matrix[i,:]

In [8]:
for node in paper_nodes:
    for word in node['paper.title']:
        if word in set(none_words):
            node['title_vector'].append(none_word_vectors[word])

## Create Title Matrix & PCA

In [9]:
for node in paper_nodes:
    node['title_vector'] = np.array(node['title_vector'])

In [10]:
min(map(lambda x:x['title_vector'].shape[0],paper_nodes))

3

In [11]:
def pca(data_mat):
    #centralize
    data_mat = data_mat.T - np.mean(data_mat.T,axis = 0).reshape((1,data_mat.T.shape[1]))
    # XX.T
    cov_mat = np.cov(data_mat,rowvar = 0)
    #eig
    eigvals,eigvects = np.linalg.eig(np.mat(cov_mat))
    eigvals_id = np.argsort(-eigvals)[:3]
    eigvects_selected = eigvects[:,eigvals_id]
    #reduce dims
    reduced_data_mat = data_mat * eigvects_selected
    return np.real(reduced_data_mat).T

In [12]:
for node in paper_nodes:
    node['title_vector'] = pca(node['title_vector'])
paper_nodes[1]['title_vector'].shape

(3, 50)

## Build Training Data

In [13]:
with open("citation_table.csv","r", encoding="utf-8") as file:
    lines = file.readlines()
    lines = [line for line in lines if line is not '\n']
    citation_table = [[int(num) for num in line[0:-1].split(',')] for line in lines]
citation_table = pd.DataFrame(citation_table,columns = ["source",'target'])
citation_table

Unnamed: 0,source,target
0,1987,1341
1,1987,1412
2,1987,2394
3,316,313
4,316,318
...,...,...
5962,1914,1748
5963,1169,2088
5964,1169,1982
5965,1169,1141


In [14]:
paper_table = pd.DataFrame(paper_nodes)
paper_table

Unnamed: 0,paper.id,paper.title,title_vector
0,1987,"[efficient, evaluation, of, aggregates, on, bu...",[[[[[-1.28749409 0.23764203 0.1963248 0.14...
1,316,"[a, non, obtrusive, user, interface, for, incr...",[[[[[ 0.55718042 1.23926539 0.05190135 0.46...
2,358,"[performance, of, dlp, on, random, modal, form...",[[[[[ 5.11703455e-01 3.93235971e-01 -5.772689...
3,2227,"[semcog, a, hybrid, object, based, image, data...",[[[[[ -1.1707025 0.15134707 0.06867833 -...
4,2437,"[the, use, of, information, capacity, in, sche...",[[[[[ 1.21138240e+00 6.64351211e-01 -1.746093...
...,...,...,...
2550,157,"[arguing, about, beliefs, and, actions]",[[[[[ 0.65251859 -0.30215619 -0.77729002 -0.69...
2551,101,"[characterization, of, database, access, skew,...",[[[[[-1.18638654 -0.05430245 0.09434683 -1.40...
2552,395,"[consistency, checking, in, complex, object, d...",[[[[[-5.56413959e-01 -5.35828955e-01 4.325371...
2553,1914,"[queries, on, change, in, an, extended, relati...",[[[[[ 7.93680413e-01 9.39145473e-01 -3.787493...


In [15]:
paper_table[paper_table['paper.id'] == 1]['title_vector'].iloc[0].shape

(3, 50)

In [16]:
train_x = list()
train_y = list()
for source, target in zip(citation_table['source'],citation_table['target']):
    train_x.append(paper_table[paper_table['paper.id'] == source]['title_vector'].iloc[0])
    train_y.append(paper_table[paper_table['paper.id'] == target]['title_vector'].iloc[0])
train_x = np.array(train_x)
train_y = np.array(train_y)
train_x.shape, train_y.shape

((5967, 3, 50), (5967, 3, 50))

## Shuffle Data

In [17]:
def shuffle(X,Y):
    np.random.seed(10)
    randomList = np.arange(X.shape[0])
    np.random.shuffle(randomList)
    return X[randomList], Y[randomList]

In [18]:
train_x, train_y = shuffle(train_x,train_y)
train_x.shape, train_y.shape

((5967, 3, 50), (5967, 3, 50))

## Training Data & Validation Data

In [27]:
def split_data(X,Y,val_rate):
    X_train = X[int(X.shape[0]*(val_rate)):]
    Y_train = Y[int(Y.shape[0]*(val_rate)):]
    X_val = X[:int(X.shape[0]*val_rate)]
    Y_val = Y[:int(Y.shape[0]*val_rate)]
    return X_train, Y_train, X_val, Y_val

In [28]:
x_train, y_train, x_val, y_val= split_data(train_x, train_y,0.1)
x_train.shape, y_train.shape, x_val.shape, y_val.shape

((5371, 3, 50), (5371, 3, 50), (596, 3, 50), (596, 3, 50))

## Define LSTM Model

In [29]:
def build_model():
    model = Sequential() 

    model.add(LSTM(50, input_shape=(3, 50)))

    model.add(RepeatVector(3))
    
    model.add(LSTM(25, return_sequences=True))
    
    model.add(TimeDistributed(Dense(50, activation= 'softmax' ))) 

    model.compile(loss='mse' , optimizer='adam', metrics=['accuracy'])

    print(model.summary())
    
    return model

In [36]:
model = build_model()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 50)                20200     
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 3, 50)             0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 3, 25)             7600      
_________________________________________________________________
time_distributed_2 (TimeDist (None, 3, 50)             1300      
Total params: 29,100
Trainable params: 29,100
Non-trainable params: 0
_________________________________________________________________
None


## Train Model

In [39]:
tbCallBack = TensorBoard(log_dir='logs\\final')

In [40]:
model.fit(x_train,y_train,epochs=300,batch_size=100,validation_data = (x_val,y_val),callbacks=[tbCallBack])

Train on 5371 samples, validate on 596 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300


Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300


Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
Epoch 150/300
Epoch 151/300
Epoch 152/300
Epoch 153/300
Epoch 154/300
Epoch 155/300
Epoch 156/300
Epoch 157/300
Epoch 158/300
Epoch 159/300
Epoch 160/300
Epoch 161/300
Epoch 162/300
Epoch 163/300


Epoch 164/300
Epoch 165/300
Epoch 166/300
Epoch 167/300
Epoch 168/300
Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epoch 209/300
Epoch 210/300
Epoch 211/300
Epoch 212/300
Epoch 213/300
Epoch 214/300
Epoch 215/300
Epoch 216/300
Epoch 217/300


Epoch 218/300
Epoch 219/300
Epoch 220/300
Epoch 221/300
Epoch 222/300
Epoch 223/300
Epoch 224/300
Epoch 225/300
Epoch 226/300
Epoch 227/300
Epoch 228/300
Epoch 229/300
Epoch 230/300
Epoch 231/300
Epoch 232/300
Epoch 233/300
Epoch 234/300
Epoch 235/300
Epoch 236/300
Epoch 237/300
Epoch 238/300
Epoch 239/300
Epoch 240/300
Epoch 241/300
Epoch 242/300
Epoch 243/300
Epoch 244/300
Epoch 245/300
Epoch 246/300
Epoch 247/300
Epoch 248/300
Epoch 249/300
Epoch 250/300
Epoch 251/300
Epoch 252/300
Epoch 253/300
Epoch 254/300
Epoch 255/300
Epoch 256/300
Epoch 257/300
Epoch 258/300
Epoch 259/300
Epoch 260/300
Epoch 261/300
Epoch 262/300
Epoch 263/300
Epoch 264/300
Epoch 265/300
Epoch 266/300
Epoch 267/300
Epoch 268/300
Epoch 269/300
Epoch 270/300
Epoch 271/300
Epoch 272/300


Epoch 273/300
Epoch 274/300
Epoch 275/300
Epoch 276/300
Epoch 277/300
Epoch 278/300
Epoch 279/300
Epoch 280/300
Epoch 281/300
Epoch 282/300
Epoch 283/300
Epoch 284/300
Epoch 285/300
Epoch 286/300
Epoch 287/300
Epoch 288/300
Epoch 289/300
Epoch 290/300
Epoch 291/300
Epoch 292/300
Epoch 293/300
Epoch 294/300
Epoch 295/300
Epoch 296/300
Epoch 297/300
Epoch 298/300
Epoch 299/300
Epoch 300/300


<tensorflow.python.keras.callbacks.History at 0x26d685796c8>

## Make Prediction Vector

In [70]:
predict_matrix = model.predict(np.array(list(paper_table['title_vector'])))

In [72]:
for i, node in enumerate(paper_nodes):
    node['predict_vector'] = predict_matrix[i]

## Construct Distance Matrix

In [79]:
distance_matrix = np.zeros((len(paper_nodes),len(paper_nodes)))
for i, nodeA in enumerate(paper_nodes):
    for j, nodeB in enumerate(paper_nodes):
        if i is not j:
            distance_matrix[i,j] = np.linalg.norm(np.dot(nodeA['title_vector'],nodeB['title_vector'].T))
max_distance = np.max(distance_matrix)
distance_matrix = distance_matrix/max_distance
distance_matrix.shape

(2555, 2555)

In [83]:
distance_table = list()
for i, nodeA in enumerate(paper_nodes):
    for j, nodeB in enumerate(paper_nodes):
        temp = dict()
        temp['paper_id_a'] = nodeA['paper.id']
        temp['paper_id_b'] = nodeB['paper.id']
        temp['distance'] = distance_matrix[i,j]
        distance_table.append(temp)
distance_table = pd.DataFrame(distance_table)
distance_table

Unnamed: 0,paper_id_a,paper_id_b,distance
0,1987,1987,0.000000
1,1987,316,0.261576
2,1987,358,0.146118
3,1987,2227,0.262546
4,1987,2437,0.230848
...,...,...,...
6528020,1169,157,0.169219
6528021,1169,101,0.242356
6528022,1169,395,0.215330
6528023,1169,1914,0.227054


## Distance Filter

In [87]:
def distance_filter(node_id):
    table = distance_table[distance_table["paper_id_a"] == node_id] #select
    table = table.sort_values(by='distance') #sort 
    table = table.head(10) #top 10
    return list(table['paper_id_b'])

## Example Paper 2554
The title of paper `2554` is `The Efficient Retrieval of Partial Documents`

With Chinese meaning `对部分文档的有效检索`

### The 10 articles with the most similar titles

In [88]:
for i in distance_filter(2554):
    print(graph.run("MATCH (node:paper{id: %d}) RETURN node.title" % i).data(),i)

[{'node.title': 'Erweiterbarkeit, Kooperation, Foderation von Datenbanksystemen'}] 2405
[{'node.title': 'Transaktionsunterstutzung fur Workflows'}] 93
[{'node.title': 'Comparing Subsumption Optimizations'}] 340
[{'node.title': 'Explaining ALC Subsumption'}] 345
[{'node.title': 'D-Tree Substitution Grammars'}] 1119
[{'node.title': 'Dynamic Classifier Selection'}] 941
[{'node.title': 'Bidirectional Contextual Resolution'}] 1096
[{'node.title': 'The Generative Lexicon'}] 1087
[{'node.title': 'Classifier Instability and Partitioning'}] 881
[{'node.title': 'Reusing Analogous Components'}] 69


1. Erweiterbarkeit, Kooperation, Foderation von Datenbanksystemen `2405` `可扩充、合作、创造档案`
2. Transaktionsunterstutzung fur Workflows `93` `Transaktionsunterstutzung毛皮工作流程`
3. Comparing Subsumption Optimizations `340` `比较包容优化`
4. Explaining ALC Subsumption `345` `解释ALC包含`
5. D-Tree Substitution Grammars `1119` `D-树替换语法`
6. Dynamic Classifier Selection `941` `动态分类器选择`
7. Bidirectional Contextual Resolution `1096` `双向上下文分辨`
8. The Generative Lexicon `1087` `生成词汇`
9. Classifier Instability and Partitioning `881` `分类器不稳定性和划分`
10. Reusing Analogous Components `69` `重复使用类似组件`

### Paper cited `2554` or paper is cited by `2554` 

In [89]:
citation_table[citation_table['source'] == 2554] #no

Unnamed: 0,source,target


In [90]:
citation_table[citation_table['target'] == 2554] #2554 cited 2522

Unnamed: 0,source,target
4036,2522,2554


In [91]:
print(graph.run("MATCH (node:paper{id: %d}) RETURN node.title" % 2522).data(),2522)

[{'node.title': 'Fast Ranking in Limited Space'}] 2522


1. Fast Ranking in Limited Space  `2522` `有限空间快速排序`