<a href="https://colab.research.google.com/github/vdslab/bearinforest_graph_analysis/blob/main/swem/swem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
# Sanity check of the raw dump: read the JSON, show the full article list,
# then print every article title on its own line.
with open("/content/2021.json") as f:
    raw = json.load(f)

# The file handle is no longer needed once the JSON has been parsed.
data = raw["articles"]
print(data)

for record in data:
    print(record["title"])


AttributeError: ignored

In [None]:
from gensim.models import KeyedVectors
import tensorflow_datasets as tfds
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import json


class SWEM():
    """
    Simple Word-Embedding-based Models (SWEM)
    https://arxiv.org/abs/1805.09843v1

    Builds a fixed-size text vector by pooling the word vectors of the text's
    tokens. Tokens missing from the word-vector vocabulary (OOV) get small
    random vectors drawn uniformly from ``oov_initialize_range``.
    """

    def __init__(self, w2v, tokenizer, oov_initialize_range=(-0.01, 0.01)):
        """
        Args:
            w2v: word-vector lookup supporting ``w2v[word]`` and exposing
                ``vector_size`` plus either ``key_to_index`` (gensim >= 4)
                or ``vocab`` (gensim < 4).
            tokenizer: object whose ``tokenize(text)`` returns the tokens.
            oov_initialize_range: ``(low, high)`` bounds of the uniform
                distribution used for OOV tokens.

        Raises:
            ValueError: if ``low`` is greater than ``high``.
        """
        if oov_initialize_range[0] > oov_initialize_range[1]:
            raise ValueError("Specify valid initialize range: "
                             f"[{oov_initialize_range[0]}, {oov_initialize_range[1]}]")

        self.w2v = w2v
        self.tokenizer = tokenizer

        # gensim 4 removed ``KeyedVectors.vocab`` in favour of ``key_to_index``;
        # fall back to ``vocab`` so older gensim versions keep working.
        key_to_index = getattr(w2v, "key_to_index", None)
        if key_to_index is None:
            key_to_index = w2v.vocab
        self.vocab = set(key_to_index)

        self.embedding_dim = self.w2v.vector_size
        self.oov_initialize_range = oov_initialize_range

    def get_word_embeddings(self, text):
        """
        Return one embedding row per token of ``text``.

        In-vocabulary tokens use their word vector. OOV tokens get a random
        vector from a generator seeded by a checksum of ``text``, so the same
        text always yields the same rows, in any process.

        Returns:
            Array of shape ``(num_tokens, embedding_dim)``; ``(0, embedding_dim)``
            when the text has no tokens.
        """
        import zlib

        rng = None  # created lazily: only texts with OOV tokens need it
        vectors = []
        for word in self.tokenizer.tokenize(text):
            if word in self.vocab:
                vectors.append(self.w2v[word])
                continue
            if rng is None:
                # The built-in hash() of a str is salted per process, so it cannot
                # give reproducible seeds; CRC32 of the text is stable. A private
                # RandomState also leaves the global NumPy RNG state untouched.
                seed = zlib.crc32(text.encode("utf-8", "surrogatepass"))
                rng = np.random.RandomState(seed)
            vectors.append(rng.uniform(self.oov_initialize_range[0],
                                       self.oov_initialize_range[1],
                                       self.embedding_dim))

        if not vectors:
            # Keep the result two-dimensional even without tokens.
            return np.empty((0, self.embedding_dim))
        return np.array(vectors)

    def _embeddings_for_pooling(self, text):
        """Return the embeddings of ``text``; raise ValueError if it has no tokens."""
        word_embeddings = self.get_word_embeddings(text)
        if word_embeddings.shape[0] == 0:
            raise ValueError("text contains no tokens; cannot pool an empty sequence")
        return word_embeddings

    def average_pooling(self, text):
        """
        Return the element-wise mean of the token embeddings.

        Raises:
            ValueError: if ``text`` has no tokens.
        """
        word_embeddings = self._embeddings_for_pooling(text)
        return np.mean(word_embeddings, axis=0)

    def max_pooling(self, text):
        """
        Return the element-wise maximum of the token embeddings.

        Raises:
            ValueError: if ``text`` has no tokens.
        """
        word_embeddings = self._embeddings_for_pooling(text)
        return np.max(word_embeddings, axis=0)

    def concat_average_max_pooling(self, text):
        """
        Return the average-pooled vector followed by the max-pooled vector.

        The result has ``2 * embedding_dim`` entries.

        Raises:
            ValueError: if ``text`` has no tokens.
        """
        word_embeddings = self._embeddings_for_pooling(text)
        return np.r_[np.mean(word_embeddings, axis=0), np.max(word_embeddings, axis=0)]

    def hierarchical_pooling(self, text, n):
        """
        Average every window of ``n`` consecutive tokens, then max-pool the windows.

        Args:
            text: text to embed.
            n: window size in tokens; must satisfy ``1 <= n <= number of tokens``.

        Raises:
            ValueError: if ``n`` is smaller than 1 or larger than the token count.
        """
        word_embeddings = self.get_word_embeddings(text)

        text_len = word_embeddings.shape[0]
        if n < 1:
            # A non-positive window would average empty slices and produce NaNs.
            raise ValueError(f"window size must be a positive integer / window_size:{n}")
        if n > text_len:
            raise ValueError(f"window size must be less than text length / window_size:{n} text_length:{text_len}")
        window_average_pooling_vec = [np.mean(word_embeddings[i:i + n], axis=0) for i in range(text_len - n + 1)]

        return np.max(window_average_pooling_vec, axis=0)

# The numbered prints are progress markers around the slow set-up steps.
print(0)
tokenizer = tfds.deprecated.text.Tokenizer()
# Uses Google's pre-trained word2vec model.
w2v_path = "/content/drive/MyDrive/GoogleNews-vectors-negative300.bin"
print(1)
w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)
print(2)
swem = SWEM(w2v, tokenizer)
print(3)

# One result list per pooling strategy. vector_hier_data stays empty because
# hierarchical pooling is not computed in this cell.
vector_ave_data, vector_max_data, vector_cat_data, vector_hier_data = [], [], [], []

# (pooling function, list receiving its results), applied in this order per article.
pooling_targets = (
    (swem.average_pooling, vector_ave_data),
    (swem.max_pooling, vector_max_data),
    (swem.concat_average_max_pooling, vector_cat_data),
)

with open("/content/drive/MyDrive/2021.json") as f:
    raw = json.load(f)
    data = raw["articles"]

    # Embed every article abstract with each pooling strategy, keyed by title.
    for record in data:
        for pool, target in pooling_targets:
            target.append({
                'title': record['title'],
                'vector': pool(record['abstract']).tolist(),
            })

# Write each list of {title, vector} entries to its own JSON file.
output_files = (
    ('/content/2021_swem_ave_sim.json', vector_ave_data),
    ('/content/2021_swem_max_sim.json', vector_max_data),
    ('/content/2021_swem_cat_sim.json', vector_cat_data),
)
for out_path, payload in output_files:
    with open(out_path, 'w') as f:
        json.dump(payload, f, indent=4)

    

0
1
2
3


In [None]:
import json
import numpy as np
import csv
import random


# Build a similarity graph over a random sample of articles: one CSV listing
# the sampled nodes and one CSV listing the cosine similarity of every ordered
# pair of distinct sampled nodes.
data = []

# NOTE(review): the input file name says "cat" while the CSV names below say
# "max" - confirm which pooling variant this graph is meant to use.
path = '/content/drive/MyDrive/2021_swem_cat_sim.json'
with open(path, encoding='utf-8') as f:
    raw = json.load(f)

for record in raw:
    data.append({"title": record['title'], "vector": np.array(record['vector'])})

# Cap the sample at the number of available articles: random.sample raises
# ValueError when asked for more items than the population holds.
length = min(100, len(data))
# Sentinels outside the cosine range [-1, 1]; they are printed unchanged when
# fewer than two articles are sampled (no pair to compare).
minv = 2.0
maxv = -2.0

# Randomly sample `length` distinct article indices from data.
sid = random.sample(range(len(data)), length)

# The csv module requires files opened with newline='' so the writer alone
# controls line endings.
with open('/content/node_2021_swem_max.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Id', 'Label'])
    writer.writerows([idx, data[idx]['title']] for idx in sid)

# Compute each sampled vector's norm once instead of once per pair.
vectors = [data[idx]["vector"] for idx in sid]
norms = [np.linalg.norm(vec) for vec in vectors]

with open('/content/edge_2021_swem_max.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["Source", "Target", "Cosine_similarity"])

    # Every ordered pair is written, so each pair appears in both directions.
    for i in range(length):
        for j in range(length):
            if i == j:
                continue
            a = vectors[i]
            b = vectors[j]
            cosim = a @ b / (norms[i] * norms[j])
            writer.writerow([sid[i], sid[j], cosim])

            minv = min(minv, cosim)
            maxv = max(maxv, cosim)

# Smallest and largest similarity seen among the sampled pairs.
print(minv)
print(maxv)

0.6640051241624468
1.0
