# 数据处理与存储

## 基本说明

脚本根据 [citation-recommendation](https://github.com/whuscity/citation-recommendation) 整理。

本脚本的作用：
* 读取原始的引文数据
* 引文、作者、关键词 ID 化
* 根据不同网络表示学习模型的特点，将数据保存为各模型所需的连边、网络等文件
* 将基础数据存入数据库

## 依赖

图表示部分基于 [@shenweichen](https://github.com/shenweichen) 的 [GraphEmbedding](https://github.com/shenweichen/GraphEmbedding) 中的部分模型，可参考该仓库进行导入。

执行本脚本所需包如下，一并导入。

In [67]:
import numpy as np
import pandas as pd
import json
import networkx as nx
import sys,argparse,random
from gensim.models import Word2Vec
from ge import Graph

## 文件目录、数据库配置限定

In [91]:
# Every artifact of the AAN run lives under one directory and shares the
# "aan" file-name stem, so each path is that prefix plus a suffix.
_aan_prefix = "./data/aan/aan"

dir_input = _aan_prefix + "_full.csv"                # raw citation table (read)
dir_node_type = _aan_prefix + "_node_type.txt"       # "<node id> <type letter>" lines
dir_gatne_edge = _aan_prefix + "_gatne_edge.txt"     # edges prefixed with an edge-type number
dir_normal_edge = _aan_prefix + "_normal_edge.txt"   # the same edges without the type prefix
dir_emb = _aan_prefix + "_node2vec.emb"              # node2vec embedding output
dir_cite_edge = _aan_prefix + "_cite_edge.txt"       # paper -> paper citation edges only
dir_cite_nx = _aan_prefix + "_cite.edgelist"         # citation graph in networkx edgelist form
dir_final_data = _aan_prefix + ".csv"                # input table plus the generated id columns
dir_node_id = _aan_prefix + "_node_id.csv"           # node id / name / type lookup table

# Dataset label written into the node lookup table.
dbname = "aan"

In [56]:
from py2neo import Graph, Node, Relationship, NodeMatcher, RelationshipMatcher

class neoConnection:
    """Holder for a py2neo ``Graph`` handle, exposed as ``self.graph``.

    The connection settings default to the values that used to be hard-coded
    here, so ``neoConnection()`` behaves exactly as before; pass arguments to
    target a different server or account.

    NOTE(review): the default password is committed in source. Consider
    passing credentials in from the environment or a config file instead.

    Args:
        uri: HTTP endpoint of the Neo4j server.
        user: Account name used to authenticate.
        password: Password for ``user``.
    """

    def __init__(self, uri="http://localhost:7474", user="cradmin", password="888888"):
        # `Graph` must be py2neo's Graph, imported directly above this class.
        self.graph = Graph(uri, user=user, password=password)

import pymysql

class mysqlConnection:
    """Open a pymysql connection and a cursor on it.

    The connection settings default to the values that used to be hard-coded
    here, so ``mysqlConnection()`` behaves exactly as before.

    NOTE(review): the default password is committed in source. Consider
    passing credentials in from the environment or a config file instead.

    Args:
        host: MySQL server host name.
        user: Account name used to authenticate.
        passwd: Password for ``user``.
        database: Schema selected after connecting.
    """

    def __init__(self, host="localhost", user="cradmin", passwd="888888", database="cit_rec"):
        self.connection = pymysql.connect(
            host=host,
            user=user,
            passwd=passwd,
            database=database,
        )
        self.cursor = self.connection.cursor()

    def close(self):
        """Release the cursor first, then the connection it belongs to."""
        self.cursor.close()
        self.connection.close()

# 数据预处理

## 数据读取与描述统计

In [57]:
# Load only the columns used by the rest of the pipeline.
df = pd.read_csv(
    dir_input,
    usecols=['id', 'author', 'refs', 'new_new_keywords', 'year', 'title', 'abstract', 'content'],
)

# Distinct raw cell values; each cell is still a ';'-joined string at this
# point and is split into individual names further down.
authors = list(set(df['author']))
keywords = list(set(df['new_new_keywords']))
refs = list(set(df['refs']))

# A paper's position in this list becomes its node id. Iteration order of a
# set of strings changes between interpreter runs (hash randomisation), so
# sort to keep the ids reproducible from run to run.
papers = sorted(set(df['id'].astype(str)))

In [58]:
# Split every ';'-joined cell into individual names and de-duplicate them.
# Float cells (NaN, i.e. a missing value) and empty strings are skipped.
# The results are sorted because a name's position in these lists becomes
# its node id, and set iteration order is not stable between interpreter runs.
aus = sorted({
    name
    for au in authors
    if not isinstance(au, float) and len(au) > 0
    for name in au.split(";")
})
kws = sorted({
    word
    for kw in keywords
    if not isinstance(kw, float) and len(kw) > 0
    for word in kw.split(";")
})

# Entity counts; these also define the contiguous id range of each node type.
paper_count = len(papers)
author_count = len(aus)
keyword_count = len(kws)
print("不重复的论文数为{}".format(paper_count))
print("不重复的作者数为{}".format(author_count))
print("不重复的关键词数为{}".format(keyword_count))

不重复的论文数为16706
不重复的作者数为18861
不重复的关键词数为44502


## 实体 ID 化

统计完论文、作者、关键词的数量之后，就可以对其进行 ID 化：论文 ID 从 0 开始，作者 ID 紧接在论文之后，关键词 ID 紧接在作者之后。
同时创建一个 `node_type` 文件，用于记录每个 ID 对应的节点实体类型（P = 论文，A = 作者，K = 关键词）。

In [59]:
# Node ids are laid out as three back-to-back ranges:
# papers first, then authors, then keywords.
author_start = paper_count
keyword_start = paper_count + author_count
keyword_stop = paper_count + author_count + keyword_count

# One "<id> <type letter>" entry per node.
p = [str(node_id) + " P" for node_id in range(author_start)]
a = [str(node_id) + " A" for node_id in range(author_start, keyword_start)]
k = [str(node_id) + " K" for node_id in range(keyword_start, keyword_stop)]
total = p + a + k

# The context manager closes the file when the block exits.
with open(dir_node_type, "w") as f:
    f.writelines(entry + "\n" for entry in total)

In [60]:
def get_id(x, entity_name_set, offset):
    """Translate a ';'-separated cell of entity names into ';'-separated ids.

    Args:
        x: Cell value. Any float (pandas represents a missing cell as the
            float NaN) and the empty string yield ''. Other non-string values,
            e.g. integer ids, are converted with ``str`` before being split.
        entity_name_set: Either a sequence of names, in which case a name's id
            is its position in the sequence, or a dict mapping each name to
            its position. The dict form avoids a linear scan per name.
        offset: Integer added to every position to obtain the final id.

    Returns:
        The ids joined by ';' in the order the names appear in ``x``, or ''
        when the cell is missing or empty.

    Raises:
        ValueError: A name is absent from a sequence ``entity_name_set``.
        KeyError: A name is absent from a dict ``entity_name_set``.
    """
    # Floats only show up here as NaN placeholders for missing cells.
    if isinstance(x, float):
        return ''
    text = x if isinstance(x, str) else str(x)
    if not text:
        return ''
    if isinstance(entity_name_set, dict):
        position_of = entity_name_set.__getitem__
    else:
        position_of = entity_name_set.index
    return ';'.join(str(position_of(name) + offset) for name in text.split(';'))

# (new column, source column, name list, first id of that entity range)
# Papers start at 0, authors follow the papers, keywords follow the authors.
# References are papers too, so they reuse the paper list and its offset.
for target_col, source_col, name_list, first_id in (
    ('paper_id', 'id', papers, 0),
    ('author_id', 'author', aus, paper_count),
    ('keyword_id', 'new_new_keywords', kws, paper_count + author_count),
    ('refs_id', 'refs', papers, 0),
):
    df[target_col] = df[source_col].apply(get_id, entity_name_set=name_list, offset=first_id)

## 生成连边数据

目前包含三类连边：
1. 【作者】----(撰写)----【论文】
2. 【论文】----(包含)----【关键词】
3. 【论文】----(引用)----【论文】



In [64]:
# Each relation is collected twice: once prefixed with its edge-type number
# (1 = author-paper, 2 = paper-keyword, 3 = paper-paper) and once as a bare
# "<source> <target>" pair.
a_p_edges, normal_a_p_edges = [], []
p_k_edges, normal_p_k_edges = [], []
p_p_edges, normal_p_p_edges = [], []

for _, record in df.iterrows():
    paper = str(record['paper_id'])

    # author -> paper
    if len(record['author_id']) > 0:
        author_ids = record['author_id'].split(';')
        a_p_edges.extend(f"1 {author} {paper}" for author in author_ids)
        normal_a_p_edges.extend(f"{author} {paper}" for author in author_ids)

    # paper -> keyword
    if len(record['keyword_id']) > 0:
        keyword_ids = record['keyword_id'].split(';')
        p_k_edges.extend(f"2 {paper} {keyword}" for keyword in keyword_ids)
        normal_p_k_edges.extend(f"{paper} {keyword}" for keyword in keyword_ids)

    # paper -> cited paper
    if len(record['refs_id']) > 0:
        cited_ids = record['refs_id'].split(';')
        p_p_edges.extend(f"3 {paper} {cited}" for cited in cited_ids)
        normal_p_p_edges.extend(f"{paper} {cited}" for cited in cited_ids)

def write_text(texts,filepath):
    """Write each string in ``texts`` to ``filepath`` as one UTF-8 line.

    The file is created or truncated. A newline is appended to every entry,
    so an empty ``texts`` produces an empty file.
    """
    with open(filepath, "w", encoding="utf-8") as out:
        out.writelines(line + "\n" for line in texts)

# Concatenate the three relations in a fixed order: author-paper,
# paper-keyword, paper-paper.
gatne_edges = [*a_p_edges, *p_k_edges, *p_p_edges]
normal_edges = [*normal_a_p_edges, *normal_p_k_edges, *normal_p_p_edges]

# Typed edges, untyped edges, and the citation-only subset each get a file.
for edge_lines, target_path in (
    (gatne_edges, dir_gatne_edge),
    (normal_edges, dir_normal_edge),
    (normal_p_p_edges, dir_cite_edge),
):
    write_text(edge_lines, target_path)

# 网络表示学习

根据 [@flyingzhy](https://github.com/flyingzhy) 的实验结果，直接采用最优状态下的模型参数。


## 获取 `networkx` 图

In [65]:
def read_txt(filename):
    """Read an edge file and return its edges as ``(int, int)`` tuples.

    Each non-blank line must start with two whitespace-separated integers;
    any further fields on the line are ignored. Blank lines are skipped, so a
    trailing empty line or a hand-edited file does not abort the load.

    Args:
        filename: Path of a UTF-8 text file with one edge per line.

    Returns:
        A list of ``(first, second)`` integer pairs in file order.

    Raises:
        ValueError: One of the first two fields is not an integer.
        IndexError: A non-blank line has fewer than two fields.
    """
    pairs = []
    with open(filename, "r", encoding="utf8") as f:
        # Stream the file line by line instead of loading it whole.
        for line in f:
            fields = line.split()
            if not fields:
                continue
            pairs.append((int(fields[0]), int(fields[1])))
    return pairs
# Build the directed citation graph from the citation edge file.
edges = read_txt(dir_cite_edge)
path = nx.DiGraph()
path.add_edges_from(edges)

# Edges from the file that are absent from the graph. Every edge was just
# added, so this is expected to stay empty; it looks like a sanity check and
# is not read again in the code visible here.
dedges = path.edges()
ans = [pair for pair in edges if pair not in dedges]

# Save the graph in networkx edgelist format for the embedding step.
nx.write_edgelist(path, dir_cite_nx)

## Node2Vec 表示

In [71]:
def parse_args(argv=None):
    '''
    Parses the node2vec arguments.

    Args:
        argv: Optional list of command-line style tokens, e.g.
            ``['--directed', '--p', '1']``. Defaults to an empty list so that
            every option takes its default and the host process's own
            ``sys.argv`` is never consulted.

    Returns:
        argparse.Namespace with ``input``, ``output``, ``dimensions``,
        ``walk_length``, ``num_walks``, ``window_size``, ``iter``,
        ``workers``, ``p``, ``q``, ``weighted`` and ``directed``.
    '''
    parser = argparse.ArgumentParser(description="Run node2vec.")

    parser.add_argument('--input', default=dir_cite_nx,
                        help='Input graph path')

    parser.add_argument('--output', default=dir_emb,
                        help='Embeddings path')

    parser.add_argument('--dimensions', type=int, default=128,
                        help='Number of dimensions. Default is 128.')

    parser.add_argument('--walk-length', type=int, default=20,
                        help='Length of walk per source. Default is 20.')

    parser.add_argument('--num-walks', type=int, default=5,
                        help='Number of walks per source. Default is 5.')

    parser.add_argument('--window-size', type=int, default=8,
                        help='Context size for optimization. Default is 8.')

    parser.add_argument('--iter', default=1, type=int,
                        help='Number of epochs in SGD')

    parser.add_argument('--workers', type=int, default=8,
                        help='Number of parallel workers. Default is 8.')

    parser.add_argument('--p', type=float, default=0.5,
                        help='Return hyperparameter. Default is 0.5.')

    parser.add_argument('--q', type=float, default=0.5,
                        help='Inout hyperparameter. Default is 0.5.')

    # Each on/off pair must write to the same destination; otherwise the
    # negative flag would set an attribute that nothing reads.
    parser.add_argument('--weighted', dest='weighted', action='store_true',
                        help='Boolean specifying (un)weighted. Default is unweighted.')
    parser.add_argument('--unweighted', dest='weighted', action='store_false')
    parser.set_defaults(weighted=False)

    parser.add_argument('--directed', dest='directed', action='store_true',
                        help='Graph is (un)directed. Default is undirected.')
    parser.add_argument('--undirected', dest='directed', action='store_false')
    parser.set_defaults(directed=False)

    return parser.parse_args(args=[] if argv is None else argv)


def read_graph():
    '''
    Load the edgelist at ``args.input`` into a networkx graph.

    The file is always parsed as a directed graph with integer node ids.
    When ``args.weighted`` is false every edge gets weight 1. The result is
    converted to an undirected graph unless ``args.directed`` is set.
    '''
    if args.weighted:
        graph = nx.read_edgelist(args.input, nodetype=int, data=(('weight', float),), create_using=nx.DiGraph())
    else:
        graph = nx.read_edgelist(args.input, nodetype=int, create_using=nx.DiGraph())
        # Unweighted input: give every edge the same unit weight.
        for src, dst in graph.edges():
            graph[src][dst]['weight'] = 1

    return graph if args.directed else graph.to_undirected()


def learn_embeddings(walks):
    '''
    Train a skip-gram Word2Vec model on the walks and save its vectors.

    Node ids are turned into strings so each walk acts as a sentence of
    tokens. The vectors are written to ``args.output`` in word2vec text
    format, and a confirmation line is printed afterwards.
    '''
    sentences = [[str(node) for node in walk] for walk in walks]
    # NOTE(review): `size` and `iter` are the keyword names this call relies
    # on; confirm they match the installed gensim version.
    model = Word2Vec(
        sentences,
        size=args.dimensions,
        window=args.window_size,
        min_count=0,
        sg=1,
        workers=args.workers,
        iter=args.iter,
    )
    model.wv.save_word2vec_format(args.output)
    print("保存完毕")


# Run the node2vec pipeline end to end with the default options.
args = parse_args()
nx_G = read_graph()
# NOTE(review): this file imports `Graph` from both `ge` and `py2neo`, and the
# later import wins. Confirm the name resolves to the random-walk graph class
# here rather than the py2neo client.
G = Graph(nx_G, args.directed, args.p, args.q)
# Presumably precomputes the p/q-biased transition tables the walks use — confirm in `ge`.
G.preprocess_transition_probs()
walks = G.simulate_walks(args.num_walks, args.walk_length)
# Printed after the walks are generated, immediately before training starts.
print("开始执行")
learn_embeddings(walks)

Walk iteration:
1 / 5
2 / 5
3 / 5
4 / 5
5 / 5
开始执行




保存完毕


In [74]:
# Persist the table, including the id columns added earlier in this file,
# without the pandas row index.
df.to_csv(dir_final_data,index=False)

In [1]:
# Build [id, name, type] rows for every node. Each entity type owns one
# contiguous id range: papers, then authors, then keywords.
node_list = []
author_start = paper_count
keyword_start = paper_count + author_count
keyword_stop = paper_count + author_count + keyword_count
for range_start, range_stop, names, type_tag in (
    (0, paper_count, papers, 'P'),
    (author_start, keyword_start, aus, 'A'),
    (keyword_start, keyword_stop, kws, 'K'),
):
    id_array = np.array(range(range_start, range_stop))
    name_array = np.array(names)
    node_list.extend([x, y, type_tag] for x, y in zip(id_array, name_array))


NameError: name 'np' is not defined

In [88]:
# Tabulate the [id, name, type] rows as a node lookup table.
nodes = pd.DataFrame(node_list,columns=['id','node_name','node_type'])

In [89]:
# Tag every row with the dataset label.
nodes['dbname']=dbname

In [92]:
# Save the node lookup table without the pandas row index.
nodes.to_csv(dir_node_id,index=False)