In [46]:
import pickle

In [49]:
import networkx as nx

In [50]:
from scipy.sparse.csgraph import minimum_spanning_tree

In [51]:
import optuna

In [52]:
import json

In [None]:
import os

### Subsumption class (defined in the notebook so that pickled `Subsumption` objects load without pickle errors)

In [None]:
import logging

In [None]:
import sys

In [None]:
import io

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
import scipy.sparse as sp

In [None]:
from sklearn.preprocessing import binarize

In [None]:
class Subsumption:
    """Builds topic co-occurrence ("overlap") and subsumption-weight matrices.

    Per the notebook heading, the class is defined inline to avoid pickle
    errors when loading pickled instances.

    Call order implied by the attributes each method reads:
    ``load_data`` -> ``load_topics`` -> ``make_counts`` -> ``make_matrices``.
    """

    def __init__(self, data, topics) -> None:
        """Record the input locations; nothing is read from disk here.

        Args:
            data: path to the preprocessed corpus (a ``.txt`` file or a pickle).
            topics: path to a topics file, or a topic-set name that
                ``load_topics`` resolves to a file.
        """
        self.data_path = data
        self.topics_path = topics
        # Set to False by load_topics when `topics` is a name, not a file path.
        self.is_topic_path = True
        # Optional label inserted in dump() file names when topics is a path.
        self.topics_label = ""
        self.overlaps = None
        self.weights = None
        self.features = None
        self.ifeatures = None
        self.lengths = None

    def load_data(self):
        """Load the corpus from ``self.data_path`` into ``self.data``.

        A ``.txt`` path is kept as an open text file object (it is closed by
        ``make_counts`` once consumed); any other path is unpickled.

        Raises:
            SystemExit: with status 1 when the path does not exist.
        """
        if not os.path.exists(self.data_path):
            logging.error("preprocessed data doesn't exist")
            # Non-zero status: this is a failure, not a normal exit.
            sys.exit(1)

        logging.info('loading preprocessed data from %s' % self.data_path)
        if self.data_path.endswith(".txt"):
            # Deliberately left open: make_counts iterates over it and closes it.
            self.data = open(self.data_path, "r")
        else:
            # NOTE: pickle.load can execute arbitrary code; only load trusted files.
            with open(self.data_path, 'rb') as fin:
                self.data = pickle.load(fin)

    def load_topics(self):
        """Read the topic list (one topic per line) into ``self.topics``.

        ``self.topics_path`` is first tried as a file path; if it does not
        exist it is treated as a topic-set name and looked up under
        ``/calcul/datasets/nasa/``.

        Raises:
            SystemExit: with status 1 when neither location exists.
        """
        fname = self.topics_path
        if not os.path.exists(fname):
            self.is_topic_path = False
            fname = '/calcul/datasets/nasa/topics-%s.txt' % self.topics_path
            if not os.path.exists(fname):
                logging.error("not a filename or a valid topic name")
                sys.exit(1)
        logging.info('loading topics from %s' % fname)
        with open(fname, 'r') as f_in:
            self.topics = f_in.read()
        # A trailing newline in the file yields a final empty-string entry.
        self.topics = self.topics.split('\n')
        logging.info('loaded %d topics' % len(self.topics))

    def make_counts(self):
        """Count topic occurrences in the corpus.

        Sets ``self.vectorizer``, ``self.counts``, ``self.features`` (list of
        vocabulary terms) and ``self.ifeatures`` (term -> index), then
        releases ``self.data``.
        """
        logging.info("getting topics counts")
        pattern = "(?u)\\b[\\w-]+\\b"

        self.vectorizer = CountVectorizer(vocabulary=set(
            self.topics), token_pattern=pattern, ngram_range=(1, 3))
        self.counts = self.vectorizer.transform(self.data)
        if isinstance(self.data, io.IOBase):
            self.data.close()
        del self.data

        # get_feature_names() was removed in scikit-learn 1.2; use its
        # replacement when available and fall back on older versions.
        # list() keeps the attribute a plain list, as it was before.
        names_getter = getattr(self.vectorizer, "get_feature_names_out", None)
        if names_getter is None:
            names_getter = self.vectorizer.get_feature_names
        self.features = list(names_getter())
        self.ifeatures = {k: v for v, k in enumerate(self.features)}

    def make_matrices(self):
        """Derive ``self.overlaps``, ``self.lengths`` and ``self.weights``.

        Requires ``self.counts`` from ``make_counts``.
        """
        logging.info("getting the overlap and weight matrices")
        # Presence/absence of each term per row of counts.
        self.counts = binarize(self.counts)
        # overlaps[i, j]: number of rows in which terms i and j both occur.
        self.overlaps = self.counts.T.dot(self.counts)
        # Discard co-occurrences seen only once (entries equal to 1 become 0).
        self.overlaps.data *= self.overlaps.data > 1
        self.overlaps.eliminate_zeros()
        # Diagonal = per-term occurrence count (0 if it was thresholded away).
        self.lengths = self.overlaps.diagonal()
        # Row-normalise: overlaps[i, j] becomes overlap(i, j) / length(i).
        diagonal = sp.diags([1./x if x > 0 else 0 for x in self.lengths])
        self.overlaps = diagonal.dot(self.overlaps)

        # Symmetric part: element-wise min of the matrix and its transpose.
        self.weights = self.overlaps.minimum(self.overlaps.T)
        # Positive exactly where overlaps[i, j] > overlaps[j, i].
        dotp_sub = self.overlaps - self.weights
        dotp_sub.eliminate_zeros()
        dotp_sub.data[dotp_sub.data > 0] = 1
        # Keep a weight only on the asymmetric side; everything else becomes 0.
        self.weights = self.weights.minimum(dotp_sub)
        # Negated, so larger overlaps give smaller (more negative) weights.
        self.weights.data *= -1

    def dump(self, obj, prefix, suffix):
        """Pickle ``obj`` under ``prefix``.

        The file name is the data file's base name (text before the first
        dot), followed by ``-<topics_label>`` (when topics came from a path
        and a label is set) or ``-<topics name>`` (when topics were given by
        name), followed by ``suffix``.
        """
        filename = prefix + "/" + \
            self.data_path.split("/")[-1].split(".")[0]
        if self.is_topic_path:
            if self.topics_label:
                filename += "-" + self.topics_label
        else:
            filename += "-" + self.topics_path
        filename += suffix
        with open(filename, "wb") as fout:
            pickle.dump(obj, fout)

## ACM

Adapt the file paths below so that they point to your own pickles.

In [118]:
# Pickled subsumption object for the ACM corpus; its .weights, .overlaps and
# .features are used below. pickle.load can run arbitrary code: trusted files only.
with open("../pickles/taxonomies/whole-acm_processed_with_counts/title_abstract_processed_subsumption.pickle", "rb") as fin:
    subsumption = pickle.load(fin)

In [119]:
# Topic-similarity term of the objectives; presumably a matrix with the same
# shape as subsumption.weights -- confirm.
with open("../pickles/topic_subsumptions/whole_acm/topic_from_topic_similarities.pickle", "rb") as fin:
    topic_similarities = pickle.load(fin)

In [120]:
# Author-similarity term of the objectives.
# NOTE(review): the stored object may be a bare matrix or a wrapper exposing
# `.similarities` -- confirm which form this pickle has.
with open("../pickles/authors/whole_acm_processed_author_similarities.pickle", "rb") as fin:
    author_similarities = pickle.load(fin)

In [121]:
# Author-subsumption term of the objectives (added to the combined matrix).
with open("../pickles/topic_subsumptions/whole_acm/topic_from_author_subsumptions.pickle", "rb") as fin:
    author_subsumption = pickle.load(fin)

In [122]:
# Field-similarity term of the objectives (subtracted from the combined matrix).
with open("../pickles/topic_subsumptions/whole_acm/topic_from_field_similarities.pickle", "rb") as fin:
    field_similarities = pickle.load(fin)

In [123]:
# Field-subsumption term of the objectives (added to the combined matrix).
with open("../pickles/topic_subsumptions/whole_acm/topic_from_field_subsumptions.pickle", "rb") as fin:
    field_subsumption = pickle.load(fin)

In [124]:
# Reference graph that generated trees are compared against; the objectives
# call .subgraph(...) and .edges on it.
with open("ACM_processed_networkx.pickle", "rb") as fin:
    standard = pickle.load(fin)

In [95]:
# Pick as root the feature whose row of the overlap matrix has the largest sum.
# `root` is read as a global by directed_edge_objective.
row_sum = subsumption.overlaps.sum(axis=1)
root = subsumption.features[row_sum.argmax()]

In [61]:
def common_edge_objective(params):
    """Score one weighting by the edges its spanning tree shares with ``standard``.

    The six matrices held in notebook globals are combined linearly
    (subsumption terms added, similarity terms subtracted), a minimum
    spanning tree of the result is computed, and the tree is compared with
    the reference graph ``standard``.

    Args:
        params: mapping with the keys ``topic_subsumption_weight``,
            ``topic_similarity_weight``, ``author_subsumption_weight``,
            ``author_similarity_weight``, ``field_subsumption_weight`` and
            ``field_similarity_weight``.

    Returns:
        ``(common_edges, author_nx)``: the number of edges of ``standard``
        (restricted to the tree's nodes) that are also tree edges, and the
        tree as an undirected ``networkx.Graph`` keyed by feature name.

    Raises:
        KeyError: if a weight is missing from ``params``.
    """
    topic_subsumption_weight = params['topic_subsumption_weight']
    topic_similarity_weight = params['topic_similarity_weight']
    author_subsumption_weight = params['author_subsumption_weight']
    author_similarity_weight = params['author_similarity_weight']
    field_subsumption_weight = params['field_subsumption_weight']
    field_similarity_weight = params['field_similarity_weight']

    # The author-similarity pickle is used both as a bare matrix and through a
    # `.similarities` attribute elsewhere in this notebook; accept either form.
    author_similarity_matrix = getattr(
        author_similarities, "similarities", author_similarities)

    # `weights * scalar` builds a new matrix, so the in-place updates below do
    # not modify subsumption.weights.
    final_matrix = subsumption.weights * topic_subsumption_weight
    final_matrix -= topic_similarities * topic_similarity_weight
    final_matrix += author_subsumption * author_subsumption_weight
    final_matrix -= author_similarity_matrix * author_similarity_weight
    final_matrix += field_subsumption * field_subsumption_weight
    final_matrix -= field_similarities * field_similarity_weight

    tree = minimum_spanning_tree(final_matrix)

    # add_edge creates missing endpoints itself, so no explicit add_node.
    author_nx = nx.Graph()
    rows, cols = tree.nonzero()
    for row, col in zip(rows, cols):
        author_nx.add_edge(subsumption.features[row],
                           subsumption.features[col], weight=1, group=1)

    # Every node of the subgraph is a node of author_nx, so has_edge cannot
    # fail here and no exception handling is needed.
    subgraph = standard.subgraph(author_nx.nodes)
    common_edges = sum(
        1 for source, target in subgraph.edges
        if author_nx.has_edge(source, target))

    return common_edges, author_nx

In [62]:
def optuna_common_edge_objective(trial):
    """Optuna objective: number of tree edges shared with the reference graph.

    Each of the six weights is sampled in ``[-100, 100]``, then
    ``common_edge_objective`` is evaluated and its first return value (the
    common-edge count) is returned.

    Args:
        trial: the ``optuna`` trial used to sample the weights.
    """
    # Named `bound` rather than `range` so the builtin is not shadowed.
    bound = 100.0
    weight_names = (
        'topic_subsumption_weight',
        'topic_similarity_weight',
        'author_subsumption_weight',
        'author_similarity_weight',
        'field_subsumption_weight',
        'field_similarity_weight',
    )
    # suggest_float replaces suggest_uniform, which Optuna has deprecated.
    params = {name: trial.suggest_float(name, -bound, bound)
              for name in weight_names}
    return common_edge_objective(params)[0]

In [63]:
def oriented_graph_from_root(graph, root):
    """Return a directed copy of ``graph`` with edges pointing away from ``root``.

    The graph is explored depth-first from ``root`` with an explicit stack.
    A node gets its single incoming edge from the node that first reaches it,
    so only nodes reachable from ``root`` appear in the result.

    Args:
        graph: graph to orient; its ``edges(node)`` view is used.
        root: starting node; always present in the result.

    Returns:
        A new ``networkx.DiGraph``.
    """
    oriented = nx.DiGraph()
    pending = [root]
    while pending:
        parent = pending.pop()
        oriented.add_node(parent)
        for _, neighbour in graph.edges(parent):
            if neighbour in oriented:
                # Already reached through another node.
                continue
            # add_edge also inserts `neighbour` as a node.
            oriented.add_edge(parent, neighbour)
            pending.append(neighbour)
    return oriented

In [64]:
def directed_edge_objective(params):
    """Score one weighting by how many reference edges its rooted tree respects.

    The six matrices held in notebook globals are combined linearly
    (subsumption terms added, similarity terms subtracted), a minimum
    spanning tree is computed and oriented away from the global ``root``.
    An edge ``(source, target)`` of ``standard`` counts when ``target`` is
    reachable from ``source`` in the oriented tree, so any ancestor-to-
    descendant pair counts, not only direct parent-child pairs.

    Args:
        params: mapping with the keys ``topic_subsumption_weight``,
            ``topic_similarity_weight``, ``author_subsumption_weight``,
            ``author_similarity_weight``, ``field_subsumption_weight`` and
            ``field_similarity_weight``.

    Returns:
        ``(parent_child_edges, directed_author_nx)``: the count described
        above and the oriented tree as a ``networkx.DiGraph``.

    Raises:
        KeyError: if a weight is missing from ``params``.
    """
    topic_subsumption_weight = params['topic_subsumption_weight']
    topic_similarity_weight = params['topic_similarity_weight']
    author_subsumption_weight = params['author_subsumption_weight']
    author_similarity_weight = params['author_similarity_weight']
    field_subsumption_weight = params['field_subsumption_weight']
    field_similarity_weight = params['field_similarity_weight']

    # The author-similarity pickle is used both as a bare matrix and through a
    # `.similarities` attribute elsewhere in this notebook; accept either form.
    author_similarity_matrix = getattr(
        author_similarities, "similarities", author_similarities)

    # `weights * scalar` builds a new matrix, so the in-place updates below do
    # not modify subsumption.weights.
    final_matrix = subsumption.weights * topic_subsumption_weight
    final_matrix -= topic_similarities * topic_similarity_weight
    final_matrix += author_subsumption * author_subsumption_weight
    final_matrix -= author_similarity_matrix * author_similarity_weight
    final_matrix += field_subsumption * field_subsumption_weight
    final_matrix -= field_similarities * field_similarity_weight

    tree = minimum_spanning_tree(final_matrix)

    # add_edge creates missing endpoints itself, so no explicit add_node.
    author_nx = nx.Graph()
    rows, cols = tree.nonzero()
    for row, col in zip(rows, cols):
        author_nx.add_edge(subsumption.features[row],
                           subsumption.features[col], weight=1, group=1)

    directed_author_nx = oriented_graph_from_root(author_nx, root)
    subgraph = standard.subgraph(author_nx.nodes)
    parent_child_edges = 0
    for source, target in subgraph.edges:
        # Nodes not reachable from the root are absent from the oriented
        # tree; skip them explicitly instead of swallowing every exception.
        if source not in directed_author_nx or target not in directed_author_nx:
            continue
        if nx.has_path(directed_author_nx, source, target):
            parent_child_edges += 1

    return parent_child_edges, directed_author_nx

In [65]:
def optuna_directed_edge_objective(trial):
    """Optuna objective: reference edges respected by the rooted spanning tree.

    Each of the six weights is sampled in ``[-100, 100]``, then
    ``directed_edge_objective`` is evaluated and its first return value (the
    edge count) is returned.

    Args:
        trial: the ``optuna`` trial used to sample the weights.
    """
    # Named `bound` rather than `range` so the builtin is not shadowed.
    bound = 100.0
    weight_names = (
        'topic_subsumption_weight',
        'topic_similarity_weight',
        'author_subsumption_weight',
        'author_similarity_weight',
        'field_subsumption_weight',
        'field_similarity_weight',
    )
    # suggest_float replaces suggest_uniform, which Optuna has deprecated.
    params = {name: trial.suggest_float(name, -bound, bound)
              for name in weight_names}
    return directed_edge_objective(params)[0]

In [66]:
# New Optuna study that maximises the objective. No storage and no sampler
# seed are passed here.
study = optuna.create_study(direction='maximize')

[32m[I 2021-11-16 14:45:08,753][0m A new study created in memory with name: no-name-712d1ecd-ef7d-42c9-83c7-63183f5fbe09[0m


In [None]:
# Search the six weights for the highest common-edge count (10000 trials).
study.optimize(optuna_common_edge_objective, n_trials=10000)

[32m[I 2021-11-16 14:45:10,526][0m Trial 0 finished with value: 16.0 and parameters: {'topic_subsumption_weight': 64.66088994509681, 'topic_similarity_weight': 87.89033701227359, 'author_subsumption_weight': 72.38471950163756, 'author_similarity_weight': 50.93022001706916, 'field_subsumption_weight': -62.673755445133274, 'field_similarity_weight': -58.061276229594206}. Best is trial 0 with value: 16.0.[0m
[32m[I 2021-11-16 14:45:11,032][0m Trial 1 finished with value: 1.0 and parameters: {'topic_subsumption_weight': -76.35422623450381, 'topic_similarity_weight': -35.62103593303094, 'author_subsumption_weight': 84.26326714625264, 'author_similarity_weight': 74.29231195969294, 'field_subsumption_weight': -77.54554200686535, 'field_similarity_weight': -37.10020648680143}. Best is trial 0 with value: 16.0.[0m
[32m[I 2021-11-16 14:45:11,545][0m Trial 2 finished with value: 2.0 and parameters: {'topic_subsumption_weight': -47.876052261388224, 'topic_similarity_weight': -29.9188627067

In [None]:
# Save every trial of the common-edge study as a pickled DataFrame. The
# frame is built before the file is opened, and `with` guarantees that the
# file is flushed and closed.
common_acm_trials_df = study.trials_dataframe()
with open("optuna_10000_r100_common_acm_df.pickle", "wb") as fout:
    pickle.dump(common_acm_trials_df, fout)

In [42]:
# Reload the saved trials table. pickle.load can run arbitrary code: trusted
# files only.
with open("optuna_10000_r100_common_acm_df.pickle", "rb") as fin:
    df = pickle.load(fin)

In [45]:
# Display the trials from highest to lowest objective value.
df.sort_values(by="value", ascending=False)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_author_similarity_weight,params_author_subsumption_weight,params_field_similarity_weight,params_field_subsumption_weight,params_topic_similarity_weight,params_topic_subsumption_weight,state
4167,4167,195.0,2021-11-11 23:23:49.631910,2021-11-11 23:23:50.362467,0 days 00:00:00.730557,16.244915,3.632684,2.583082,0.216847,10.475058,18.704079,COMPLETE
3012,3012,195.0,2021-11-11 23:10:36.378861,2021-11-11 23:10:37.073422,0 days 00:00:00.694561,16.985061,2.651506,2.783380,0.317390,10.728233,19.309572,COMPLETE
4569,4569,195.0,2021-11-11 23:28:44.350725,2021-11-11 23:28:45.001061,0 days 00:00:00.650336,14.957283,2.299479,2.644191,0.233236,9.902176,18.678397,COMPLETE
8327,8327,195.0,2021-11-12 00:19:55.511562,2021-11-12 00:19:56.401001,0 days 00:00:00.889439,16.844619,2.562870,2.860502,0.241242,11.139824,19.278762,COMPLETE
8320,8320,195.0,2021-11-12 00:19:49.099468,2021-11-12 00:19:49.980924,0 days 00:00:00.881456,17.173507,2.660564,2.653955,0.251182,10.853437,19.993983,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...
68,68,1.0,2021-11-11 22:41:05.593515,2021-11-11 22:41:06.184358,0 days 00:00:00.590843,12.823081,-13.487265,-5.050290,-4.626281,-10.648937,19.950850,COMPLETE
2372,2372,1.0,2021-11-11 23:03:43.714513,2021-11-11 23:03:44.396596,0 days 00:00:00.682083,-19.104913,2.727860,-0.595007,-0.866472,-19.956411,17.041223,COMPLETE
2,2,1.0,2021-11-11 22:40:26.957967,2021-11-11 22:40:27.513839,0 days 00:00:00.555872,12.774292,15.525977,-17.888805,-17.177214,-13.464553,-16.418123,COMPLETE
1909,1909,1.0,2021-11-11 22:58:58.212833,2021-11-11 22:58:58.811218,0 days 00:00:00.598385,14.822769,3.058480,-12.348253,-0.060744,-7.307141,-2.369038,COMPLETE


In [96]:
# Separate Optuna study for the directed objective, also maximised. No
# storage and no sampler seed are passed here.
study_directed = optuna.create_study(direction='maximize')

[32m[I 2021-11-17 10:40:40,174][0m A new study created in memory with name: no-name-089d8bbf-88cf-4210-8d75-b5a650c9241d[0m


In [None]:
# Search the six weights for the highest directed-edge count (10000 trials).
study_directed.optimize(optuna_directed_edge_objective, n_trials=10000)

[32m[I 2021-11-17 10:40:41,301][0m Trial 0 finished with value: 248.0 and parameters: {'topic_subsumption_weight': 57.882351574716125, 'topic_similarity_weight': 78.66613971189261, 'author_subsumption_weight': -52.204593711290734, 'author_similarity_weight': 21.385979833117986, 'field_subsumption_weight': -25.37278835389938, 'field_similarity_weight': 75.66311426083331}. Best is trial 0 with value: 248.0.[0m
[32m[I 2021-11-17 10:40:41,915][0m Trial 1 finished with value: 118.0 and parameters: {'topic_subsumption_weight': 3.580714967547351, 'topic_similarity_weight': 58.6752945423103, 'author_subsumption_weight': -85.26107083776145, 'author_similarity_weight': -64.0437064596941, 'field_subsumption_weight': -25.048802576816414, 'field_similarity_weight': -59.005777743484835}. Best is trial 0 with value: 248.0.[0m
[32m[I 2021-11-17 10:40:42,504][0m Trial 2 finished with value: 118.0 and parameters: {'topic_subsumption_weight': 14.689292687385375, 'topic_similarity_weight': -3.7446

In [None]:
# Save every trial of the directed study as a pickled DataFrame. The frame is
# built before the file is opened, and `with` guarantees that the file is
# flushed and closed.
directed_acm_trials_df = study_directed.trials_dataframe()
with open("optuna_10000_r100_directed_acm_df.pickle", "wb") as fout:
    pickle.dump(directed_acm_trials_df, fout)

In [None]:
# Build the tree for one fixed set of weights and pickle the resulting graph
# (second element returned by common_edge_objective). The graph is computed
# before the file is opened, and `with` guarantees the file is closed.
random_acm_taxonomy = common_edge_objective({
    'topic_subsumption_weight': -4.799253123446798,
    'topic_similarity_weight': 18.11759567768348,
    'author_subsumption_weight': -14.397780657138293,
    'author_similarity_weight': 13.105006482462592,
    'field_subsumption_weight': 0.42624572407324207,
    'field_similarity_weight': 0.24594713480867564,
})[1]
with open("random_acm_tax.pickle", "wb") as fout:
    pickle.dump(random_acm_taxonomy, fout)

## Europa

In [110]:
# Pickled subsumption object for the Europa corpus; replaces the ACM globals
# used by the objectives. pickle.load can run arbitrary code: trusted files only.
with open("../pickles/taxonomies/whole-europa_participants_processed_with_counts/title_abstract_processed_subsumption.pickle", "rb") as fin:
    subsumption = pickle.load(fin)

In [111]:
# Topic-similarity term of the objectives; presumably a matrix with the same
# shape as subsumption.weights -- confirm.
with open("../pickles/topic_subsumptions/whole_europa/topic_from_topic_similarities.pickle", "rb") as fin:
    topic_similarities = pickle.load(fin)

In [112]:
# Author-similarity term of the objectives.
# NOTE(review): the stored object may be a bare matrix or a wrapper exposing
# `.similarities` -- confirm which form this pickle has.
with open("../pickles/authors/whole_europa_processed_author_similarities.pickle", "rb") as fin:
    author_similarities = pickle.load(fin)

In [113]:
# Author-subsumption term of the objectives (added to the combined matrix).
with open("../pickles/topic_subsumptions/whole_europa/topic_from_author_subsumptions.pickle", "rb") as fin:
    author_subsumption = pickle.load(fin)

In [114]:
# Field-similarity term of the objectives (subtracted from the combined matrix).
with open("../pickles/topic_subsumptions/whole_europa/topic_from_field_similarities.pickle", "rb") as fin:
    field_similarities = pickle.load(fin)

In [115]:
# Field-subsumption term of the objectives (added to the combined matrix).
with open("../pickles/topic_subsumptions/whole_europa/topic_from_field_subsumptions.pickle", "rb") as fin:
    field_subsumption = pickle.load(fin)

In [116]:
# Reference graph that generated trees are compared against; the objectives
# call .subgraph(...) and .edges on it.
with open("data/europa_participants_processed_networkx.pickle", "rb") as fin:
    standard = pickle.load(fin)

In [None]:
# Pick as root the feature whose row of the overlap matrix has the largest sum.
# `root` is read as a global by directed_edge_objective.
row_sum = subsumption.overlaps.sum(axis=1)
root = subsumption.features[row_sum.argmax()]

In [None]:
# New Optuna study that maximises the objective; rebinds the name `study`.
# No storage and no sampler seed are passed here.
study = optuna.create_study(direction='maximize')

In [None]:
# Search the six weights for the highest common-edge count (10000 trials).
study.optimize(optuna_common_edge_objective, n_trials=10000)

In [None]:
# Save every trial of the common-edge study as a pickled DataFrame. The
# frame is built before the file is opened, and `with` guarantees that the
# file is flushed and closed.
common_europa_trials_df = study.trials_dataframe()
with open("optuna_10000_r100_common_europa_df.pickle", "wb") as fout:
    pickle.dump(common_europa_trials_df, fout)

In [117]:
# Build the tree for one fixed set of weights and pickle the resulting graph
# (second element returned by common_edge_objective). The graph is computed
# before the file is opened, and `with` guarantees the file is closed.
random_europa_taxonomy = common_edge_objective({
    'topic_subsumption_weight': 16.488236695036736,
    'topic_similarity_weight': 9.955506956175146,
    'author_subsumption_weight': 16.7845806867976,
    'author_similarity_weight': -1.775345713318275,
    'field_subsumption_weight': -0.7555590585991485,
    'field_similarity_weight': 2.4692364626382792,
})[1]
with open("random_europa_tax.pickle", "wb") as fout:
    pickle.dump(random_europa_taxonomy, fout)

In [None]:
# Separate Optuna study for the directed objective, also maximised; rebinds
# the name `study_directed`. No storage and no sampler seed are passed here.
study_directed = optuna.create_study(direction='maximize')

In [None]:
# Search the six weights for the highest directed-edge count (10000 trials).
study_directed.optimize(optuna_directed_edge_objective, n_trials=10000)

In [87]:
# Save every trial of the directed study as a pickled DataFrame. The frame is
# built before the file is opened, and `with` guarantees that the file is
# flushed and closed.
directed_europa_trials_df = study_directed.trials_dataframe()
with open("optuna_10000_r100_directed_europa_df.pickle", "wb") as fout:
    pickle.dump(directed_europa_trials_df, fout)