In [1]:
import json

In [None]:
import os

In [2]:
import gzip

In [None]:
import pickle

In [None]:
import math

In [3]:
from tqdm.notebook import tqdm

In [None]:
from scipy.sparse import dok_matrix

In [None]:
from sklearn.preprocessing import binarize, normalize

In [None]:
from scipy.sparse import vstack

In [None]:
from multiprocessing.dummy import Pool as ThreadPool

### Subsumption class (to avoid pickle errors)

In [None]:
import logging

In [None]:
import sys

In [None]:
import io

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
import scipy.sparse as sp

In [None]:
class Subsumption:
    """Compute topic count, overlap and subsumption-weight matrices for a corpus.

    The class is declared inside the notebook so that pickled instances can be
    loaded here (the section heading says: "to avoid pickle errors").

    The attributes each method reads imply this call order:
    ``load_data`` -> ``load_topics`` -> ``make_counts`` -> ``make_matrices``.
    """

    def __init__(self, data, topics) -> None:
        """Remember the input locations; nothing is read from disk here.

        Args:
            data: Path of the preprocessed corpus (a ``.txt`` file, or any
                other file, which is then read as a pickle).
            topics: Path of a topics file (one topic per line), or a name that
                is resolved to ``/calcul/datasets/nasa/topics-<name>.txt``.
        """
        self.data_path = data
        self.topics_path = topics
        # Set to False by load_topics() when `topics` is not an existing file
        # and is resolved as a topic-set name instead.
        self.is_topic_path = True
        # Optional label appended to dump() filenames when topics is a file.
        self.topics_label = ""
        self.overlaps = None
        self.weights = None
        self.features = None
        self.ifeatures = None
        self.lengths = None

    def load_data(self):
        """Load the corpus into ``self.data``.

        A ``.txt`` path is opened and kept as an open file object; any other
        path is unpickled. Exits the interpreter (``sys.exit``) when the path
        does not exist.
        """
        if not os.path.exists(self.data_path):
            logging.error("preprocessed data doesn't exist")
            sys.exit()
        logging.info('loading preprocessed data from %s', self.data_path)
        if self.data_path.endswith(".txt"):
            # Deliberately left open: make_counts() iterates over the handle
            # and closes it afterwards.
            self.data = open(self.data_path, "r")
        else:
            with open(self.data_path, 'rb') as fin:
                self.data = pickle.load(fin)

    def load_topics(self):
        """Read the topic list into ``self.topics`` (split on newlines).

        If ``self.topics_path`` is not an existing file it is treated as a
        topic-set name. Exits the interpreter when neither form exists.
        """
        fname = self.topics_path
        if not os.path.exists(fname):
            self.is_topic_path = False
            fname = '/calcul/datasets/nasa/topics-%s.txt' % self.topics_path
            if not os.path.exists(fname):
                logging.error("not a filename or a valid topic name")
                sys.exit()
        logging.info('loading topics from %s', fname)
        with open(fname, 'r') as f_in:
            # NOTE(review): a trailing newline in the file yields an empty
            # last topic; kept as-is so feature indices do not shift.
            self.topics = f_in.read().split('\n')
        logging.info('loaded %d topics', len(self.topics))

    @staticmethod
    def _feature_names(vectorizer):
        """Return the vectorizer's feature names as a list.

        Prefers ``get_feature_names_out`` and falls back to the older
        ``get_feature_names`` when the former is not available.
        """
        getter = getattr(vectorizer, "get_feature_names_out", None)
        if getter is None:
            return vectorizer.get_feature_names()
        return list(getter())

    def make_counts(self):
        """Count topic occurrences in the corpus.

        Fills ``self.counts`` (documents x topics), ``self.features`` (topic
        strings) and ``self.ifeatures`` (topic -> column index), then releases
        ``self.data``.
        """
        logging.info("getting topics counts")
        # Tokens are runs of word characters and hyphens.
        pattern = "(?u)\\b[\\w-]+\\b"

        # The vocabulary is fixed to the topic list; n-grams of 1 to 3 tokens
        # are considered.
        self.vectorizer = CountVectorizer(
            vocabulary=set(self.topics),
            token_pattern=pattern,
            ngram_range=(1, 3))
        self.counts = self.vectorizer.transform(self.data)
        if isinstance(self.data, io.IOBase):
            self.data.close()
        del self.data
        self.features = self._feature_names(self.vectorizer)
        self.ifeatures = {k: v for v, k in enumerate(self.features)}

    def make_matrices(self):
        """Derive ``self.overlaps``, ``self.lengths`` and ``self.weights``
        from ``self.counts``."""
        logging.info("getting the overlap and weight matrices")
        self.counts = binarize(self.counts)
        # Entry (i, j): number of rows in which topics i and j both occur.
        self.overlaps = self.counts.T.dot(self.counts)
        # Zero every stored value that is not greater than 1, then drop the
        # explicit zeros this creates.
        self.overlaps.data *= self.overlaps.data > 1
        self.overlaps.eliminate_zeros()
        self.lengths = self.overlaps.diagonal()
        # Left-multiplying by diag(1/length) scales row i by 1/lengths[i];
        # rows whose diagonal is not positive are scaled by 0.
        diagonal = sp.diags([1./x if x > 0 else 0 for x in self.lengths])
        self.overlaps = diagonal.dot(self.overlaps)

        # Element-wise minimum of the matrix and its transpose.
        self.weights = self.overlaps.minimum(self.overlaps.T)
        # Non-zero only where entry (i, j) is larger than entry (j, i);
        # those positions are turned into a 0/1 mask.
        dotp_sub = self.overlaps - self.weights
        dotp_sub.eliminate_zeros()
        dotp_sub.data[dotp_sub.data > 0] = 1
        # Keep the weights only where the mask has an entry, then negate them.
        self.weights = self.weights.minimum(dotp_sub)
        self.weights.data *= -1

    def dump(self, obj, prefix, suffix):
        """Pickle ``obj`` to ``<prefix>/<data stem>[-<label or topic name>]<suffix>``.

        The stem is the data file name up to its first dot. When the topics
        came from a file, ``self.topics_label`` is appended if it is
        non-empty; otherwise the topic-set name is appended.
        """
        filename = prefix + "/" + self.data_path.split("/")[-1].split(".")[0]
        if self.is_topic_path:
            if self.topics_label:
                filename += "-" + self.topics_label
        else:
            filename += "-" + self.topics_path
        filename += suffix
        with open(filename, "wb") as fout:
            pickle.dump(obj, fout)

### Extract S2ORC data

In [4]:
def normalize_text(t):
    """Lower-case ``t`` and delete the characters ``. , : ( ) ; ! [ ] /``."""
    # One translation table removes all ten punctuation characters in a
    # single pass over the lower-cased string.
    strip_table = str.maketrans("", "", ".,:();![]/")
    return str.lower(t).translate(strip_table)

In [None]:
# Walk every gzipped metadata shard under `root` and, for each paper that has
# an abstract, append one line to each of two output files:
#   * title_abstract_processed.txt - normalised title and abstract
#   * metadata_processed.txt       - a JSON record with selected fields
# The two files stay line-aligned as long as titles and abstracts contain no
# newline characters (normalize_text does not remove them).
root = "/calcul/datasets/s2orc/20200705v1/full/metadata/"
with open(root + "title_abstract_processed.txt", "w") as text_out, \
        open(root + "metadata_processed.txt", "w") as meta_out:
    for shard in os.listdir(root):
        if not shard.endswith(".gz"):
            continue
        with gzip.open(root + shard, 'rb') as shard_in:
            for raw_line in shard_in:
                paper = json.loads(raw_line.decode('utf-8'))
                if paper["abstract"] is None:
                    continue
                text = " ".join((normalize_text(paper['title']),
                                 normalize_text(paper['abstract'])))
                text_out.write(text + "\n")
                record = {
                    "paper_id": paper["paper_id"],
                    "authors": paper["authors"],
                    "venue": paper["venue"],
                    "journal": paper["journal"],
                    "mag_id": paper["mag_id"],
                    "mag_field": paper["mag_field_of_study"],
                }
                meta_out.write(json.dumps(record) + "\n")

### Topic similarities

/!\ Can only be run after the generation of the subsumption pickle through `knowledge/subsumption.py`

In [None]:
# Load the pickled Subsumption object; the context manager closes the file
# handle once it has been read. The path looks like a placeholder to adapt.
with open("your _subsumption.pickle", "rb") as fin:
    subsumption = pickle.load(fin)

In [None]:
# Normalise each row of the overlap matrix, transpose the result (via CSC),
# then normalise each row of that transpose (sklearn `normalize` defaults).
n_subsumption = normalize(normalize(subsumption.overlaps, axis=1).tocsc().T, axis=1)

In [None]:
# Dot product between every pair of rows of n_subsumption (cosine similarity
# if `normalize` applied its default L2 norm - TODO confirm).
topic_similarities = n_subsumption.dot(n_subsumption.T)

In [None]:
# Persist the similarity matrix; the context manager closes the file as soon
# as the dump completes.
with open("topic_similarities.pickle", "wb") as fout:
    pickle.dump(topic_similarities, fout)

### Authors from S2ORC

In [None]:
class Author:
    """Hashable author identity built from the four name parts of a record.

    Two authors are equal when their first, middle, last and suffix values
    are all equal.

    NOTE(review): ``hash`` / ``str_hash`` are derived from the built-in
    ``hash`` of strings, which is presumably randomised per interpreter run,
    so the values may not be comparable across sessions - confirm before
    relying on pickled instances.
    """

    def __init__(self, d) -> None:
        """Build the author from a dict with keys first, middle, last, suffix.

        ``d["middle"]`` is converted to a tuple so the identity is hashable.
        """
        self.first = d["first"]
        self.middle = tuple(d["middle"])
        self.last = d["last"]
        self.suffix = d["suffix"]
        self.hash = self.compute_hash()
        self.str_hash = "a" + str(self.hash)

    def _key(self):
        """Return the tuple of name parts that defines this author."""
        return (self.first, self.middle, self.last, self.suffix)

    def compute_hash(self) -> int:
        """Return the hash of the name-part tuple."""
        return hash(self._key())

    def __hash__(self) -> int:
        return self.hash

    def __eq__(self, other) -> bool:
        if not isinstance(other, Author):
            return False
        # Compare the actual name parts: comparing hashes alone would treat
        # two different authors as equal whenever their hashes collide.
        return self._key() == other._key()

In [None]:
# Give every distinct author a consecutive integer id, in order of first
# appearance in metadata_processed.txt; `count` ends up as the number of ids.
authors = {}
count = 0
with open(root + "metadata_processed.txt", "r") as fin:
    for line in tqdm(fin, total=76556428):
        paper = json.loads(line)
        for author in paper["authors"]:
            key = Author(author)
            if key in authors:
                continue
            authors[key] = count
            count += 1

In [None]:
# Empty sparse (DOK) integer matrix with one column per distinct author.
# The row count is hard-coded; it matches the tqdm total used when reading
# metadata_processed.txt, so presumably it is that file's line count.
author_matrix = dok_matrix((76556428,count), dtype=int)

In [None]:
# Second pass over the metadata: set entry (row, column) to 1 when the paper
# on that line lists the author holding that column id.
with open(root + "metadata_processed.txt", "r") as fin:
    for row, line in enumerate(tqdm(fin, total=76556428)):
        paper = json.loads(line)
        for author in paper["authors"]:
            column = authors[Author(author)]
            author_matrix[row, column] = 1

In [None]:
# Persist the author -> column id mapping; the context manager closes the
# file as soon as the dump completes.
with open("author_dict.pickle", "wb") as fout:
    pickle.dump(authors, fout)

In [None]:
# Persist the DOK author matrix; the context manager closes the file as soon
# as the dump completes.
with open("dok_author_matrix.pickle", "wb") as fout:
    pickle.dump(author_matrix, fout)

In [None]:
# Convert the DOK matrix to CSR format for the matrix product computed below.
csr_author_matrix = author_matrix.tocsr()

In [None]:
# Persist the CSR author matrix; the context manager closes the file as soon
# as the dump completes.
with open("csr_author_matrix", "wb") as fout:
    pickle.dump(csr_author_matrix, fout)

In [None]:
# Load the pickled Subsumption object; the context manager closes the file
# handle once it has been read. The path looks like a placeholder to adapt.
with open("your _subsumption.pickle", "rb") as fin:
    subsumption = pickle.load(fin)

In [None]:
# Transpose the author matrix (authors become rows) and multiply it by the
# subsumption counts: entry (a, t) sums counts[p, t] over the rows p flagged
# for author a. Assumes subsumption.counts has one row per line of
# metadata_processed.txt, in the same order - TODO confirm.
author_topics = csr_author_matrix.T.dot(subsumption.counts)

In [None]:
# Persist the author-by-topic matrix. The file is opened in binary write mode
# (the default read mode cannot be written to), and the object saved is
# `author_topics`, which is what later cells load from this file.
with open("author_topics.pickle", "wb") as fout:
    pickle.dump(author_topics, fout)

### Topic by author subsumption

In [None]:
# Load the author-by-topic matrix; the context manager closes the file handle
# once it has been read. The path looks like a placeholder to adapt.
with open("your author_topics.pickle", "rb") as fin:
    author_topics = pickle.load(fin)

In [None]:
# Binarise the author-by-topic matrix loaded above (`author_topics`), not the
# leftover loop variable `author`.
author_overlaps = binarize(author_topics)

In [None]:
# Co-occurrence matrix: on a binarised input, entry (i, j) counts the rows in
# which columns i and j are both non-zero.
author_overlaps = author_overlaps.T.dot(author_overlaps)

In [None]:
# Zero every stored value that is not greater than 1 (multiplication by the
# boolean mask).
author_overlaps.data *= author_overlaps.data > 1

In [None]:
# Drop the explicit zeros left by the masking step from the sparse structure.
author_overlaps.eliminate_zeros()

In [None]:
# Diagonal of the overlap matrix: entry i is column i's overlap with itself.
lengths = author_overlaps.diagonal()

In [None]:
# Diagonal matrix of reciprocal lengths; 0 is used where the length is not
# positive, which avoids a division by zero.
diagonal = sp.diags([1./x if x > 0 else 0 for x in lengths])

In [None]:
# Left-multiplying by the diagonal matrix scales row i by 1/lengths[i].
author_overlaps = diagonal.dot(author_overlaps)

In [None]:
# Element-wise minimum of the scaled overlap matrix and its transpose.
weights = author_overlaps.minimum(author_overlaps.T)

In [None]:
# Difference between each entry and the symmetric minimum: zero wherever
# entry (i, j) <= entry (j, i), positive otherwise.
dotp_sub = author_overlaps - weights

In [None]:
# Remove the explicit zeros so only the positive differences stay stored.
dotp_sub.eliminate_zeros()

In [None]:
# Set every positive stored value to 1, turning dotp_sub into a 0/1 mask.
dotp_sub.data[dotp_sub.data > 0] = 1

In [None]:
# Element-wise minimum of the weights and the 0/1 mask, which zeroes the
# (non-negative) weights wherever the mask has no entry.
weights = weights.minimum(dotp_sub)

In [None]:
# Negate all stored weights in place.
weights.data *= -1

In [None]:
# Persist the author-based subsumption weights; the context manager closes
# the file as soon as the dump completes.
with open("topic_from_author_subsumptions.pickle", "wb") as fout:
    pickle.dump(weights, fout)

### Topic by author similarities

In [None]:
# Load the author-by-topic matrix; the context manager closes the file handle
# once it has been read.
with open("author_topics.pickle", "rb") as fin:
    author_topics = pickle.load(fin)

In [None]:
# Normalise each row of author_topics (sklearn `normalize` defaults).
author_topics = normalize(author_topics, axis=1)

In [None]:
# Transpose (via CSC) and normalise each row of the transpose, so the rows of
# author_topics now correspond to the former columns.
author_topics = normalize(author_topics.tocsc().T, axis=1)

Please verify that the `chunk_size`, `threshold` and `threads` parameters below fit your setup.

In [None]:
# Number of rows of author_topics multiplied per call to chunk_linear_dot.
chunk_size = 1000
# Products that are not strictly greater than this value are zeroed.
threshold = 0.1 
# Number of worker threads in the ThreadPool that maps chunk_linear_dot.
threads = 1

In [None]:
def chunk_linear_dot(i):
    """Return chunk ``i`` of ``author_topics * author_topics.T``, thresholded.

    Chunk ``i`` covers rows ``i * chunk_size`` up to (but excluding)
    ``(i + 1) * chunk_size``, clipped to the number of rows. Products that
    are not strictly greater than ``threshold`` are set to zero. Reads the
    module-level ``author_topics``, ``chunk_size`` and ``threshold``.
    """
    first_row = chunk_size * i
    last_row = int(min(first_row + chunk_size, float(author_topics.shape[0])))
    block = author_topics[first_row:last_row] * author_topics.T
    keep = block > threshold
    return block.multiply(keep)

In [None]:
# Compute the thresholded products chunk by chunk and stack the chunks back
# into one sparse matrix. The pool is used as a context manager so its worker
# threads are released once `map` (which blocks until every chunk is done)
# has returned.
n_chunks = int(math.ceil(float(author_topics.shape[0]) / chunk_size))
with ThreadPool(threads) as pool:
    chunks = pool.map(chunk_linear_dot, range(n_chunks))
similarities = vstack(chunks)

In [None]:
# Persist the author-based similarities; the context manager closes the file
# as soon as the dump completes. The path looks like a placeholder to adapt.
with open("your _author_similarities.pickle", "wb") as fout:
    pickle.dump(similarities, fout)

### Fields of study from S2ORC

In [None]:
# Give every distinct field of study a consecutive integer id, in order of
# first appearance; papers whose "mag_field" is None are skipped.
fields = {}
index = 0
with open("/calcul/datasets/s2orc/20200705v1/full/metadata/metadata_processed.txt", "r") as fin:
    for line in fin:
        paper = json.loads(line)
        if paper["mag_field"] is None:
            continue
        for field in paper["mag_field"]:
            if field in fields:
                continue
            fields[field] = index
            index += 1

In [None]:
# Empty sparse (DOK) integer matrix with one column per distinct field.
# The row count is hard-coded; it matches the tqdm total used when reading
# metadata_processed.txt, so presumably it is that file's line count.
field_matrix = dok_matrix((76556428,len(fields)), dtype=int)

In [None]:
# Second pass over the metadata: set entry (row, column) to 1 when the paper
# on that line lists the field holding that column id.
with open("/calcul/datasets/s2orc/20200705v1/full/metadata/metadata_processed.txt", "r") as fin:
    for row, line in enumerate(tqdm(fin, total=76556428)):
        paper = json.loads(line)
        if paper["mag_field"] is None:
            continue
        for field in paper["mag_field"]:
            column = fields[field]
            field_matrix[row, column] = 1

In [None]:
# Convert the DOK matrix to CSR format for the matrix product computed below.
csr_field_matrix = field_matrix.tocsr()

In [None]:
# Persist the CSR field matrix; the context manager closes the file as soon
# as the dump completes.
with open("field_matrix.pickle", "wb") as fout:
    pickle.dump(csr_field_matrix, fout)

In [None]:
# Transpose the field matrix (fields become rows) and multiply it by the
# subsumption counts. Relies on the `subsumption` object loaded in an earlier
# cell, and assumes its counts have one row per metadata line - TODO confirm.
field_topics = csr_field_matrix.T.dot(subsumption.counts)

In [None]:
# Persist the field-by-topic matrix; the context manager closes the file as
# soon as the dump completes.
with open("field_topics.pickle", "wb") as fout:
    pickle.dump(field_topics, fout)

### Topic by field similarities

In [None]:
# Load the field-by-topic matrix; the context manager closes the file handle
# once it has been read.
with open("field_topics.pickle", "rb") as fin:
    field_topics = pickle.load(fin)

In [None]:
# Normalise each row of field_topics, transpose the result (via CSC), then
# normalise each row of that transpose (sklearn `normalize` defaults).
n_fields = normalize(normalize(field_topics, axis=1).tocsc().T, axis=1)

In [None]:
# Dot product between every pair of rows of n_fields. This rebinds
# `similarities`, which earlier held the author-based similarities.
similarities = n_fields.dot(n_fields.T)

In [None]:
# Persist the field-based similarities; the context manager closes the file
# as soon as the dump completes.
with open("field_similarities.pickle", "wb") as fout:
    pickle.dump(similarities, fout)

### Topic by field subsumption

In [None]:
# Binarise the field-by-topic matrix (sklearn `binarize`, default threshold).
field_overlaps = binarize(field_topics)

In [None]:
# Co-occurrence matrix: on a binarised input, entry (i, j) counts the rows in
# which columns i and j are both non-zero.
field_overlaps = field_overlaps.T.dot(field_overlaps)

In [None]:
# Zero every stored value that is not greater than 1 (multiplication by the
# boolean mask).
field_overlaps.data *= field_overlaps.data > 1

In [None]:
# Drop the explicit zeros left by the masking step from the sparse structure.
field_overlaps.eliminate_zeros()

In [None]:
# Diagonal of the overlap matrix: entry i is column i's overlap with itself.
lengths = field_overlaps.diagonal()

In [None]:
# Diagonal matrix of reciprocal lengths; 0 is used where the length is not
# positive, which avoids a division by zero.
diagonal = sp.diags([1./x if x > 0 else 0 for x in lengths])

In [None]:
# Left-multiplying by the diagonal matrix scales row i by 1/lengths[i].
field_overlaps = diagonal.dot(field_overlaps)

In [None]:
# Element-wise minimum of the scaled overlap matrix and its transpose.
weights = field_overlaps.minimum(field_overlaps.T)

In [None]:
# Difference between each entry and the symmetric minimum: zero wherever
# entry (i, j) <= entry (j, i), positive otherwise.
dotp_sub = field_overlaps - weights

In [None]:
# Remove the explicit zeros so only the positive differences stay stored.
dotp_sub.eliminate_zeros()

In [None]:
# Set every positive stored value to 1, turning dotp_sub into a 0/1 mask.
dotp_sub.data[dotp_sub.data > 0] = 1

In [None]:
# Element-wise minimum of the weights and the 0/1 mask, which zeroes the
# (non-negative) weights wherever the mask has no entry.
weights = weights.minimum(dotp_sub)

In [None]:
# Negate all stored weights in place.
weights.data *= -1

In [None]:
# Persist the field-based subsumption weights; the context manager closes the
# file as soon as the dump completes.
with open("field_subsumptions.pickle", "wb") as fout:
    pickle.dump(weights, fout)