#### Original graph와 reduced graph의 차이
- Original graph는 모든 특허 데이터를 이용해 구축한 IPC 네트워크 그래프.
- Original graph로 모델을 검증하기 위해 훈련셋과 검증셋을 나누어야 한다.
- 이 때 훈련셋으로 학습한 모델은 훈련셋에 없는 IPC는 예측할 수 없다.
- 따라서, 훈련셋에 출현한 IPC만으로 구축한 별도의 그래프 구축이 필요하다.
- 훈련셋에는 출현하지 않고 검증셋에서 새로 등장하는 IPC를 제외하고 네트워크를 구축한 것이 reduced 그래프.
- 간단히 말해, original graph에서 모델 검증을 위해 일부 IPC를 제외했다고 할 수 있다.

In [18]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.feature_extraction.text import CountVectorizer
import pickle as pkl
import networkx as nx
from networkx.readwrite import json_graph
import json
import random

In [42]:
data = pd.read_excel('../data/add_patent.xlsx', sheet_name='분석대상', index_col=None, header=None)
data = data.fillna(' ')
data = data.loc[:, (data != ' ').any(axis=0)]
data = data.iloc[1:,1:-1]
data = data.reset_index(drop=True)
data

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1979,d_5109,H04J,,,,,,,,,,,,,,
1,1980,d_5106,H04L,,,,,,,,,,,,,,
2,1981,d_5107,A01M,,,,,,,,,,,,,,
3,1981,d_5108,C22F,,,,,,,,,,,,,,
4,1983,d_5104,G01R,G01R,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,2020,d_0417,G06F,H04L,,,,,,,,,,,,,
5105,2020,d_0418,G01S,B63G,,,,,,,,,,,,,
5106,2020,d_0488,H04N,H04L,H04N,H04N,,,,,,,,,,,
5107,2020,d_0536,G01S,G01S,,,,,,,,,,,,,


In [43]:
val = data[data[1].isin(['2019', '2020'])]
val = val.reset_index(drop=True)
val = val.iloc[:,2:]; val.iloc[:2]

Unnamed: 0,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,G01R,G01R,G01R,,,,,,,,,,,,
1,B01J,B01J,B01J,B01J,B01J,C06D,,,,,,,,,


In [44]:
train = data[data[1] != '2019']
train = train[train[1] != '2020']
train = train.reset_index(drop=True)

In [45]:
train

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1979,d_5109,H04J,,,,,,,,,,,,,,
1,1980,d_5106,H04L,,,,,,,,,,,,,,
2,1981,d_5107,A01M,,,,,,,,,,,,,,
3,1981,d_5108,C22F,,,,,,,,,,,,,,
4,1983,d_5104,G01R,G01R,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4614,2018,d_1381,B63G,H01F,,,,,,,,,,,,,
4615,2018,d_1409,B64C,,,,,,,,,,,,,,
4616,2018,d_1442,F02K,F01D,F02K,,,,,,,,,,,,
4617,2018,d_1462,H01Q,H01Q,H01Q,,,,,,,,,,,,


In [46]:
train = train.iloc[:,2:]; train

Unnamed: 0,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,H04J,,,,,,,,,,,,,,
1,H04L,,,,,,,,,,,,,,
2,A01M,,,,,,,,,,,,,,
3,C22F,,,,,,,,,,,,,,
4,G01R,G01R,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4614,B63G,H01F,,,,,,,,,,,,,
4615,B64C,,,,,,,,,,,,,,
4616,F02K,F01D,F02K,,,,,,,,,,,,
4617,H01Q,H01Q,H01Q,,,,,,,,,,,,


In [47]:
(len(train) + len(test)) == len(data)

True

In [48]:
train_merged = train[3].map(str)
for i in range(3, len(train.columns)+3):
    train_merged += ' ' + train[i].map(str)
train_merged

0             H04J H04J                            
1             H04L H04L                            
2             A01M A01M                            
3             C22F C22F                            
4          G01R G01R G01R                          
                           ...                     
4614       B63G B63G H01F                          
4615          B64C B64C                            
4616    F02K F02K F01D F02K                        
4617    H01Q H01Q H01Q H01Q                        
4618    H04L H04L H04B H04L                        
Name: 3, Length: 4619, dtype: object

In [49]:
train_idx = train_merged.index
cv = CountVectorizer()
cv.fit(train_merged)
results = cv.transform(train_merged)
train_ipc = cv.get_feature_names()
ipc_bow = pd.DataFrame(results.toarray(), columns=train_ipc, index=train_idx)
ipc_bow

Unnamed: 0,a01k,a01m,a01n,a41d,a42b,a47b,a61b,a61f,a61h,a61k,...,h04m,h04n,h04q,h04r,h04s,h04w,h05b,h05f,h05h,h05k
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4614,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4615,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4616,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4617,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
train_adj = np.matmul(ipc_bow.values.T, ipc_bow.values)
train_adj[train_adj != 0] = 1
np.fill_diagonal(train_adj, 0)
print(train_adj)
print(train_adj.shape)
print(np.count_nonzero(train_adj))

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(322, 322)
2748


In [51]:
train_adj_idx = pd.DataFrame(train_adj, columns=train_ipc, index=train_ipc); train_adj_idx

Unnamed: 0,a01k,a01m,a01n,a41d,a42b,a47b,a61b,a61f,a61h,a61k,...,h04m,h04n,h04q,h04r,h04s,h04w,h05b,h05f,h05h,h05k
a01k,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
a01m,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a01n,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a41d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a42b,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
h04w,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
h05b,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
h05f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
h05h,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
# train_adj_idx.to_csv('add_ipc_train.csv')

In [71]:
val_merged = val[3].map(str)
for i in range(3, len(val.columns)+3):
    val_merged += ' ' + val[i].map(str)
val_merged

0            G01R G01R G01R G01R                        
1      B01J B01J B01J B01J B01J B01J C06D            ...
2                  H04K H04K                            
3         H01M H01M B01F B63G H01M                      
4               G01S G01S G01S                          
                             ...                        
485             G06F G06F H04L                          
486             G01S G01S B63G                          
487       H04N H04N H04L H04N H04N                      
488             G01S G01S G01S                          
489       G01S G01S G01S G02B G02B                      
Name: 3, Length: 490, dtype: object

#### Validation

In [72]:
val_idx = val.index
cv2 = CountVectorizer()
cv2.fit(val_merged)
results2 = cv2.transform(val_merged)
val_ipc = cv2.get_feature_names()
ipc_bow2 = pd.DataFrame(results2.toarray(), columns=val_ipc, index=val_idx)
ipc_bow2

Unnamed: 0,a41d,a61b,a61h,a61k,a61m,a61p,a62b,a62d,a63f,b01d,...,h04b,h04j,h04k,h04l,h04m,h04n,h04w,h05f,h05h,h05k
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
486,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
487,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,4,0,0,0,0
488,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [118]:
val_adj = np.matmul(ipc_bow2.values.T, ipc_bow2.values)
val_adj[val_adj != 0] = 1
np.fill_diagonal(val_adj, 0)
print(val_adj)
print(val_adj.shape)
print(np.count_nonzero(val_adj))

[[0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(153, 153)
814


In [119]:
val_adj_idx = pd.DataFrame(val_adj, columns=val_ipc, index=val_ipc); val_adj_idx.head()

Unnamed: 0,a41d,a61b,a61h,a61k,a61m,a61p,a62b,a62d,a63f,b01d,...,h04b,h04j,h04k,h04l,h04m,h04n,h04w,h05f,h05h,h05k
a41d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a61b,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a61h,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a61k,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a61m,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [120]:
val_G = nx.from_pandas_adjacency(val_adj_idx)

In [121]:
len(val_G.edges())

407

In [130]:
val_adj_modified = train_adj_idx.copy()
val_adj_modified[val_adj_modified != 0] = 0
print(np.count_nonzero(val_adj_modified))
val_adj_modified

0


Unnamed: 0,a01k,a01m,a01n,a41d,a42b,a47b,a61b,a61f,a61h,a61k,...,h04m,h04n,h04q,h04r,h04s,h04w,h05b,h05f,h05h,h05k
a01k,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a01m,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a01n,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a41d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a42b,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
h04w,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
h05b,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
h05f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
h05h,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [131]:
for i, j in val_G.edges():
    try:
        if train_adj_idx.loc[i, j] != 1 and  train_adj_idx.loc[j, i] != 1:
            try:
                val_adj_modified.loc[i, j] += 1
                val_adj_modified.loc[j, i] += 1
            except KeyError:
                continue
    except KeyError:
        continue
print(np.count_nonzero(val_adj_modified))

354


In [132]:
val_adj_modified

Unnamed: 0,a01k,a01m,a01n,a41d,a42b,a47b,a61b,a61f,a61h,a61k,...,h04m,h04n,h04q,h04r,h04s,h04w,h05b,h05f,h05h,h05k
a01k,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a01m,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a01n,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a41d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a42b,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
h04w,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
h05b,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
h05f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
h05h,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [133]:
labeled_graph = train_adj_idx.copy()
np.count_nonzero(labeled_graph.values)

2748

In [134]:
for i, j in val_G.edges():
    if all(x in train_ipc for x in [i, j]) and (train_adj_idx.loc[i, j] == 0) and (train_adj_idx.loc[j, i] == 0):
        labeled_graph.loc[i, j] += 1
        labeled_graph.loc[j, i] += 1

In [135]:
np.count_nonzero(labeled_graph.values)

3102

In [136]:
labeled_graph

Unnamed: 0,a01k,a01m,a01n,a41d,a42b,a47b,a61b,a61f,a61h,a61k,...,h04m,h04n,h04q,h04r,h04s,h04w,h05b,h05f,h05h,h05k
a01k,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
a01m,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a01n,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a41d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a42b,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
h04w,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
h05b,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
h05f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
h05h,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [137]:
original_G = nx.from_pandas_adjacency(labeled_graph)
original_G_json = json_graph.node_link_data(original_G)

In [138]:
with open('../data/reduced.graph', 'w') as f:
    json.dump(original_G_json, f)

In [139]:
train_G = nx.from_pandas_adjacency(train_adj_idx)
train_G_json = json_graph.node_link_data(train_G)

In [140]:
with open('../data/reduced_train.graph', 'w') as f:
    json.dump(train_G_json, f)

In [146]:
val_G = nx.from_pandas_adjacency(val_adj_modified)
val_G_json = json_graph.node_link_data(val_G)

In [153]:
with open('../data/reduced_val.graph', 'w') as f:
    json.dump(val_G_json, f)

#### Check saved data

In [4]:
graphs = ['reduced', 'reduced_train', 'reduced_val']
obj = []
for graph in graphs:
    with open('../data/'+graph +'.graph', 'r') as f:
        data = json.load(f)
    obj.append(data)
reduced_g = json_graph.node_link_graph(obj[0])
train_g = json_graph.node_link_graph(obj[1])
val_g = json_graph.node_link_graph(obj[2])
pd.DataFrame([[len(reduced_g.nodes()),len(reduced_g.edges())],[len(train_g.nodes()),len(train_g.edges())],[len(val_g.nodes()),len(val_g.edges())]],
             index=['Reduced graph', 'Reduced graph - train(1979-2020)', 'Reduced graph - val(2019-2020)'], columns=['nodes', 'edges'])

Unnamed: 0,nodes,edges
Reduced graph,322,1551
Reduced graph - train(1979-2020),322,1374
Reduced graph - val(2019-2020),322,177


In [10]:
def ismember(idx_i, idx_j, edgelist):
    if ((idx_i, idx_j) in edgelist) or ((idx_j, idx_i) in edgelist):
        return True
    else:
        return False

In [11]:
train_adj = nx.adjacency_matrix(train_g, nodelist=train_g.nodes())
reduced_adj = nx.adjacency_matrix(reduced_g, nodelist=train_g.nodes())

In [24]:
idx2nodes = {}
nodes2idx = {}
for idx, node in enumerate(train_g.nodes()):
    idx2nodes[idx] = node
    nodes2idx[node] = idx

In [25]:
val_edges_name = []
for edge in val_g.edges():
    val_edges_name.append(edge)
print(len(val_edges_name))
print(val_edges_name[:5])

177
[('a61b', 'a61h'), ('a61p', 'c07k'), ('a62b', 'd01d'), ('a62b', 'd01f'), ('a62b', 'd03d')]


In [26]:
val_edges = []
for i, j in val_edges_name:
    val_edges.append((nodes2idx[i], nodes2idx[j]))
print(len(val_edges))
print(val_edges[:5])

177
[(6, 8), (13, 123), (14, 148), (14, 149), (14, 152)]


In [27]:
val_non_edges_name = []
while len(val_non_edges_name) < len(val_edges):
    idx_i, idx_j = random.sample(train_g.nodes(), 2)
    if ismember(idx_i, idx_j,  reduced_g.edges()):
        continue
    if ismember(idx_i, idx_j, val_non_edges_name):
        continue
    val_non_edges_name.append((idx_i, idx_j))
print(len(val_non_edges_name))
print(val_non_edges_name[:5])

177
[('h02g', 'b29l'), ('d03d', 'h02k'), ('f16g', 'b60w'), ('f41g', 'b23k'), ('b03b', 'b33y')]


In [31]:
val_non_edges = []
for i, j in val_non_edges_name:
    val_non_edges.append((nodes2idx[i], nodes2idx[j]))
val_non_edges[:5]

[(291, 57), (152, 294), (192, 77), (219, 44), (25, 60)]

In [34]:
# with open('idx2nodes.pkl', 'wb') as f:
#     pkl.dump(idx2nodes, f)
# with open('nodes2idx.pkl', 'wb') as f:
#     pkl.dump(nodes2idx, f)
# with open('val_edges_name.pkl', 'wb') as f:
#     pkl.dump(val_edges_name, f)
# with open('val_non_edges_name.pkl', 'wb') as f:
#     pkl.dump(val_non_edges_name, f)
# with open('val_edges.pkl', 'wb') as f:
#     pkl.dump(val_edges, f)
# with open('val_non_edges.pkl', 'wb') as f:
#     pkl.dump(val_non_edges, f)

In [None]:
# with open('idx2nodes.pkl', 'rb') as f:
#     idx2nodes = pkl.load(f)
# with open('nodes2idx.pkl', 'rb') as f:
#     nodes2idx = pkl.load(f)
# with open('val_edges_name.pkl', 'rb') as f:
#     val_edges_name = pkl.load(f)
# with open('val_non_edges_name.pkl', 'rb') as f:
#     val_non_edges_name = pkl.load(f)
# with open('val_edges.pkl', 'rb') as f:
#     val_edges = pkl.load(f)
# with open('val_non_edges.pkl', 'rb') as f:
#     val_non_edges = pkl.load(f)