In [1]:
import pandas as pd
import neo4j as neo
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 10]
import seaborn as sns
from graphdatascience import GraphDataScience
from sklearn.preprocessing import StandardScaler

  from .autonotebook import tqdm as notebook_tqdm


## CONNECTION WITH NEO4J DATABASE

In [2]:
host = "bolt://44.204.150.95:7687"
user = "neo4j"
password= "cakes-resident-pans"


gds = GraphDataScience(host, auth=(user, password))

print(gds.version())

2.2.2


#  Variables to initialise

In [4]:
field_computer_science = ["Information Systems","Computer Software","Data Format",
"Artificial Intelligence and Image Processing","Computation Theory and Mathematics","Other Information and Computing Sciences",
"Library and Information Studies","Computer Hardware"]
field_computer_science

['Information Systems',
 'Computer Software',
 'Data Format',
 'Artificial Intelligence and Image Processing',
 'Computation Theory and Mathematics',
 'Other Information and Computing Sciences',
 'Library and Information Studies',
 'Computer Hardware']

In [5]:
researcher_1="Jean-Henry Morin"
researcher_2="Gilles Falquet"

In [12]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

### METADATA OF THE GRAPH

In [None]:
gds.run_cypher("CALL db.schema.nodeTypeProperties()")

In [None]:
#Some metadata
gds.run_cypher("CALL db.schema.relTypeProperties()")

### GET THE SCIENTISTS WITH THE MOST PUBLICATIONS IN THE DB

In [None]:
#Get the scientits in unige with the most publications

result_total_publications= gds.run_cypher("MATCH (r:Researcher)-[w:WORKS_IN]-(f:Field) RETURN r.full_name as name, r.researcher_id as researcher_id, r.total_number_of_publications as total, \
collect (f.domain) ORDER BY r.total_number_of_publications DESC LIMIT 80")
result_total_publications.head(20)

In [None]:
result_total_publications_map = gds.run_cypher("MATCH (r:Researcher)-[w:WORKS_IN]-(f:Field) RETURN r.full_name, \
r.total_number_of_publications as total, collect({number:w.number_of_publication,field:f.domain}) as field_number ORDER BY total DESC LIMIT 10")
result_total_publications_map.head()

In [None]:
g = sns.barplot(x=result_total_publications["name"], y=result_total_publications["total"])
g = g.set_xticklabels(g.get_xticklabels(), rotation=90)

## WHO IS K.CENZUAL????
When the data is weird like this, normally you investigate.

In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [None]:
CENZUAL = gds.run_cypher("MATCH (n:Researcher{researcher_id:'ur.014174005714.85'})-[:WORKS_IN]-(f:Field) return n.full_name, n.last_name, n.total_number_of_publications, collect(f.domain)")
CENZUAL

### Number of publication in an domain of research

In [None]:
#Get the field which has the most publications
result_domain_count = gds.run_cypher("MATCH (r:Researcher)-[w:WORKS_IN]-(f:Field) RETURN f.domain as domain, count(*) as number_time ORDER BY  number_time DESC LIMIT 40")
result_domain_count.head()

In [None]:
g = sns.barplot(x=result_domain_count["domain"], y=result_domain_count["number_time"])
g =g.set_xticklabels(g.get_xticklabels(), rotation=90)

### Who published in the most area of research

In [None]:
#Get the researcher that works in several research fields.
result_field = gds.run_cypher("MATCH (r:Researcher)-[w:WORKS_IN]-(f:Field) RETURN r.full_name as name, r.researcher_id as researcher_id, count(*) as fields ORDER BY  fields DESC LIMIT 30")
result_field.head(10)

In [None]:
g = sns.barplot(x=result_field["name"], y=result_field["fields"])
g = g.set_xticklabels(g.get_xticklabels(), rotation=90)

In [None]:
choppard = gds.run_cypher("MATCH (r:Researcher{researcher_id:'ur.016275067565.80'})-[w:WORKS_IN]-(f:Field) RETURN r.full_name as full_name, count(*) as total_fields, collect(f.domain) as domain")

print(choppard.domain)

## Computer science

## Scientists with the most publications in Computer Science (areas)

In [None]:
#Get the researcher with the most publications in Computer Science 

query_number_publications_in_computer_science_person =f"MATCH(r:Researcher)-[w:WORKS_IN]-(f:Field) WHERE f.domain IN {field_computer_science} \
    and w.weight_field > 1 return distinct r.full_name as name,r.researcher_id as researcher_id, r.total_number_of_publications as total_publication ORDER by r.total_number_of_publications DESC"

result_number_publication_person_computer_science = gds.run_cypher(query_number_publications_in_computer_science_person)
result_number_publication_person_computer_science.head(20)

In [None]:
query_informatique_fields = (f"MATCH(n:Researcher)-[w:WORKS_IN]-(f:Field) WHERE f.domain \
     IN {field_computer_science}  RETURN DISTINCT n.full_name, n.researcher_id as researcher, f.domain, w.weight_field ORDER by w.weight_field DESC")

result_informatique_field = gds.run_cypher(query_informatique_fields)
result_informatique_field.head(10)

In [None]:
#Get the researcher that works in several research fields.
result_field = gds.run_cypher(f"MATCH (r:Researcher)-[w:WORKS_IN]-(f:Field) WHERE f.domain \
IN {field_computer_science} and w.weight_field > 5 with r,f,w MATCH (r)-[ww:WORKS_IN]-(ff:Field)  \
RETURN r.full_name,r.researcher_id as researcher_id, r.total_number_of_publications as total_publication,max(w.weight_field) as max_weight_field, \
collect( DISTINCT f.domain) as domain_computer, \
count(ff.domain) as count_all_domain, collect(ff.domain) as all_domain  \
ORDER BY total_publication DESC ")
result_field.head(20)

## Start of the Recommender 

## Start for the graph algorithms to skip if these relationships are already in the DB

In [None]:
graph_name = "co_authors_graph"

In [None]:
result = gds.run_cypher("""CALL gds.graph.drop($params,false) YIELD graphName""",params={"params":graph_name}) # drop the graph and do not raise the error if the graph is not found.
result = gds.run_cypher("""CALL gds.graph.project($params, 'Researcher', 'CO_AUTHORS')""",params={"params":graph_name})
result

## Centrality
Utiliser les algorithmes de centralités pour calculer les chercheurs qui ont plus de connection (plus important) que d'autres. Ceci pourrait être utiliser plus tard pour classifier les reviewers.

### Calculate the Degree Centrality

In [None]:
#Stream the result for the degree centrality 

result = gds.run_cypher(
f" CALL gds.degree.stream($params) YIELD nodeId, score RETURN gds.util.asNode(nodeId).full_name AS name, score ORDER BY score DESC, name ASC", params={"params":graph_name}
)
result.head()

In [None]:
#write the data in the graph
result = gds.run_cypher(f"CALL gds.degree.write($params_graph, {{writeProperty: 'degree_centrality'}})",params={"params_graph":graph_name})
result


### Difference between centrality and betweeeness measures

Degree centrality measures the number of incoming or outgoing (or both) relationships from a node, depending on the orientation of a relationship projection.

Betweenness centrality is a way of detecting the amount of influence a node has over the flow of information in a graph. It is often used to find nodes that serve as a bridge from one part of a graph to another.

In [None]:
# #difficult to run on AURO DS
# result_betweeness = gds.run_cypher(
# f" CALL gds.betweenness.write($params_graph, {{writeProperty: 'betweeness'}})",params={"params_graph":graph_name})
# result_betweeness


### Calculate PageRank
The PageRank algorithm measures the importance of each node within the graph, based on the number 
incoming relationships and the importance of the corresponding source nodes. 
The underlying assumption roughly speaking is that a page is only as important as the pages that link to it.


In [None]:
result_page_rank_stream = gds.run_cypher(
f" CALL gds.pageRank.stream($params)\
YIELD nodeId, score \
RETURN gds.util.asNode(nodeId).full_name AS name, score \
ORDER BY score DESC, name ASC" , params={"params":graph_name}
)
result_page_rank_stream.head(10)

In [None]:
result_page_rank_write = gds.run_cypher(
f" CALL gds.pageRank.write($params_graph, {{writeProperty: 'pageRank',scaler:'MinMax'}})",params={"params_graph":graph_name})
result_page_rank_write

## Query the database for pageRank and Degree centrality

In [None]:
betweeness_page_rank_df = gds.run_cypher(f"MATCH (n:Researcher)-[w:WORKS_IN]-(f:Field) WHERE f.domain IN {field_computer_science} and \
    w.weight_field>1 return DISTINCT n.full_name, n.researcher_id, n.pageRank as pageRank,n.degree_centrality as centrality_score ORDER by pageRank DESC")
betweeness_page_rank_df

#centrality_df = gds.run_cypher(f"MATCH (n:Researcher)-[w:WORKS_IN]-(f:Field) WHERE f.domain IN {field_computer_science} and w.weight_field>3 return DISTINCT n.full_name,  n.researcher_id, n.degree_centrality as centrality_score ORDER by centrality_score  DESC")

In [None]:
scaler = StandardScaler()
betweeness_page_rank_df.iloc[:,2:] = scaler.fit_transform(betweeness_page_rank_df.iloc[:,2:].to_numpy())
betweeness_page_rank_df

In [None]:
#betweeness_page_rank_df.iloc[:,2:]= betweeness_page_rank_df.iloc[:,2:].transform(lambda x: x / abs(x).max())

In [None]:
betweeness_page_rank_df.fillna(0,inplace=True)
betweeness_page_rank_df.set_index("n.full_name",inplace=True)

In [None]:
betweeness_page_rank_df.sort_values(by="centrality_score",ascending=False,inplace=True)

In [None]:
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(16,8))
betweeness_page_rank_df[["centrality_score"]].head(30).plot(kind="bar", ax=ax1)
betweeness_page_rank_df[["pageRank"]].head(30).plot(kind="bar",ax=ax2)
a = ax1.set_xticklabels(ax1.get_xticklabels(), rotation=90)

The Degree Centrality algorithm counts the number of incoming and outgoing relationships from a node. It is used to find popular nodes in a graph.

PageRank is a variant of the Eigenvector Centrality algorithm which measures the transitive (or directional) influence of nodes. Relationships to high-scoring nodes contribute more to the score of a node than connections to low-scoring nodes. A high score means that a node is connected to other nodes that have high scores.

On peut remarquer que les mesures ne donnent pas les mêmes résultats. La question est de savoir quel algorithmes choisir, et cela se fait en fonction de ce que l'on veut faire, et des hypotheses de départ.
Par exemple, si on recherche un scientifique qui est un hub on va choisir Betweenness Centrality comme algorithme.
Entre ces deux exemples, PageRank et centrality, on voit que certains scientifiques on un score PageRank plus haut que leur score de centrality, ou plus bas. Cela pourrait être du au fait que les personnes avec qui ces scientifiques collaborent sont des personnes qui sont "importante" ou peu "importante". Un·e scientifique qui collabore surtout avec des étudiants en Doctorat va avoir un PageRank score plus bas que quelqu'un qui collabore surtout avec des scientifiques de renoms.

In [None]:
betweeness_page_rank_df[betweeness_page_rank_df.index.str.startswith("Jean-Henry")]


In [None]:
betweeness_page_rank_df[betweeness_page_rank_df.index.str.startswith("Hélène")]

In [None]:
betweeness_page_rank_df[betweeness_page_rank_df.index.str.startswith("Nadia Magnenat-Thalmann")]

## COMMUNITY

The Label Propagation algorithm (LPA) is a fast algorithm for finding communities in a graph. It detects these communities using network structure alone as its guide, and doesn’t require a pre-defined objective function or prior information about the communities.
LPA works by propagating labels throughout the network and forming communities based on this process of label propagation.
The intuition behind the algorithm is that a single label can quickly become dominant in a densely connected group of nodes, but will have trouble crossing a sparsely connected region. Labels will get trapped inside a densely connected group of nodes, and those nodes that end up with the same label when the algorithms finish can be considered part of the same community.

The Louvain method is an algorithm to detect communities in large networks. It maximizes a modularity score for each community, where the modularity quantifies the quality of an assignment of nodes to communities. This means evaluating how much more densely connected the nodes within a community are, compared to how connected they would be in a random network. (source neo4j)

In [None]:
graph_name_2 = "co_authors_graph_weight"
result = gds.run_cypher("""CALL gds.graph.drop($params,false) YIELD graphName""",params={"params":graph_name_2}) # drop the graph and do not raise the error if the graph is not found.
result = gds.run_cypher("""CALL gds.graph.project('co_authors_graph_weight', 'Researcher', {CO_AUTHORS:{properties:"number_of_time"}})""",params={"params":graph_name_2})
result

In [None]:
#LabelPropation with weight
gds.run_cypher("""CALL gds.labelPropagation.stats($graph_name,{maxIterations:22})
YIELD communityCount, ranIterations, didConverge""",params={"graph_name":graph_name_2})

In [None]:
#labelPropagation without weight
gds.run_cypher("""CALL gds.labelPropagation.stats($graph_name,{maxIterations:22})
YIELD communityCount, ranIterations, didConverge""",params={"graph_name":graph_name})


In [None]:
gds.run_cypher("""CALL gds.labelPropagation.stream($graph_name,{relationshipWeightProperty:"number_of_time",maxIterations:20})
YIELD nodeId, communityId AS Community
RETURN gds.util.asNode(nodeId).full_name AS Name, Community
ORDER BY Community, Name""",params={"graph_name":graph_name_2})

In [None]:
#Louvain with weight
gds.run_cypher("""
CALL gds.louvain.stats($graph_name,{relationshipWeightProperty:"number_of_time",maxIterations:22})
YIELD communityCount""",params={"graph_name":graph_name_2})

In [None]:
#Louvain without weight
gds.run_cypher("""
CALL gds.louvain.stats($graph_name)
YIELD communityCount""",params={"graph_name":graph_name_2})

In [None]:
# Label propagation
gds.run_cypher("""CALL gds.labelPropagation.stream($graph_name,{relationshipWeightProperty:"number_of_time",maxIterations:20})
YIELD nodeId, communityId AS Community
RETURN gds.util.asNode(nodeId).full_name AS Name, Community
ORDER BY Community, Name""",params={"graph_name":graph_name_2})

In [None]:
#LOUVAIN

gds.run_cypher("""
CALL gds.louvain.stream($graph_name)
YIELD nodeId, communityId, intermediateCommunityIds
RETURN gds.util.asNode(nodeId).full_name AS full_name, communityId, intermediateCommunityIds
ORDER BY full_name ASC""",params={"graph_name":graph_name_2})



In [None]:
result_write_louvain = gds.run_cypher("""
CALL gds.louvain.write($graph_name,{relationshipWeightProperty:"number_of_time", writeProperty:"louvain"})
YIELD communityCount, modularity, modularities""",params={"graph_name":graph_name_2})
result_write_louvain.head()


In [9]:
graph_name_3 = "field_weight"
result = gds.run_cypher("""CALL gds.graph.drop($params,false) YIELD graphName""",params={"params":graph_name_3}) # drop the graph and do not raise the error if the graph is not found.
result = gds.run_cypher("""CALL gds.graph.project($params, 'Researcher','WORKS_IN_SIMILAR_AREA')""",params={"params":graph_name_3})
result

Unnamed: 0,nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
0,"{'Researcher': {'label': 'Researcher', 'proper...",{'WORKS_IN_SIMILAR_AREA': {'orientation': 'NAT...,field_weight,64849,1212027,600


In [None]:
gds.run_cypher("""
CALL gds.louvain.stream($graph_name)
YIELD nodeId, communityId, intermediateCommunityIds
RETURN gds.util.asNode(nodeId).full_name AS full_name, communityId, intermediateCommunityIds
ORDER BY full_name ASC""",params={"graph_name":graph_name_3})

In [None]:
result_write_louvain_field = gds.run_cypher("""
CALL gds.louvain.write($graph_name,{ writeProperty:"louvain_field"})
YIELD communityCount, modularity, modularities""",params={"graph_name":graph_name_3})
result_write_louvain_field.head()

# Begining of the query for the result

In [13]:
louvain_df = gds.run_cypher(f"MATCH (n:Researcher)-[w:WORKS_IN]-(f:Field) WHERE f.domain IN {field_computer_science} and \
w.weight_field>2 return DISTINCT n.full_name, n.louvain as louvain ORDER by louvain DESC")
louvain_df_group = (
        louvain_df.groupby("louvain")
        .agg(co_authors_list=("n.full_name", "unique"))
        .reset_index()
    )
louvain_df_group

Unnamed: 0,louvain,co_authors_list
0,1876,[Michel Lauria]
1,2249,[Cedric Crettaz]
2,4125,"[Nicolas Zufferey, Jean-Marc Seigneur, C. Bryce, Dimitri Konstantas, Jean-Henry Morin, Chrislain Razafimahefa, Naoufel Cheikhrouhou, Carlos Ballester Lafuente, Michel Deriaz, Verena Kantere, Katarzyna Wac, Matteo Ciman, Simon Thevenin]"
3,4855,[Yves Wiaux]
4,6833,[Gregg Vanderheiden]
5,8038,"[Stéphane Marchand-Maillet, Edgar Roman-Rangel, Hisham Mohamed, Eric Bruno]"
6,14970,[Yin Wu]
7,19829,"[Flavia Donno, Erwin Laure, Ian Willers, Matteo Risoldi, Vladimir Loncar]"
8,24562,[Ahmad Din]
9,26571,"[Giovanna Di Marzo Serugendo, Didier Buchs, Luis Pedro, Dimitri Racordon, Abdelaziz Khadraoui, Nicolas Ray, Pierre Lacroix, Gilles Falquet, Gregory Giuliani, Jose Luis Fernandez-Marquez, Jolita Ralyté, Francesco Luca De Angelis, Michel Leonard, J. Hulaas, L. Moccozet, Alexis Marechal, Gilbert Ritschard]"


In [14]:
#return the community of two scientists
louvain_2_scientists = gds.run_cypher(f"MATCH (n:Researcher)-[w:WORKS_IN]-(f:Field) \
WHERE n.full_name=$researcher_1 return DISTINCT n.full_name, n.louvain as louvain \
UNION \
MATCH (n:Researcher)-[w:WORKS_IN]-(f:Field) \
WHERE n.full_name=$researcher_2 return DISTINCT n.full_name, n.louvain as louvain",params={"researcher_1":researcher_1,"researcher_2":researcher_2})
louvain_2_scientists

Unnamed: 0,n.full_name,louvain
0,Jean-Henry Morin,4125
1,Gilles Falquet,26571


In [15]:
#return the researcher that are in the same community than both scientists
louvain_other = gds.run_cypher(f"MATCH (n:Researcher)-[w:WORKS_IN]-(f:Field) WHERE n.louvain IN {louvain_2_scientists.louvain.tolist()} \
RETURN DISTINCT n.full_name as full_name, n.researcher_id as researcher_id, n.louvain")
louvain_other.head(20)

Unnamed: 0,full_name,researcher_id,n.louvain
0,Paul R. Hunter,ur.01255356036.46,26571
1,Michel Leonard,ur.01066422007.20,26571
2,Simona Zipursky,ur.01145320545.18,26571
3,Alejandro Ramirez Gonzalez,ur.013065015327.56,26571
4,Emily Wootton,ur.010031000127.92,26571
5,Omar Benkacem,ur.012406460341.25,26571
6,Suzanne Scheele,ur.01057413767.46,26571
7,E Allanson,ur.016340266524.30,26571
8,Ö Tunçalp,ur.01106661663.19,26571
9,RAMACHANDRA PARARAJASEGARAM,ur.01153035546.55,26571


In [16]:
louvain_2_scientists_field = gds.run_cypher(f"MATCH (n:Researcher)-[w:WORKS_IN]-(f:Field) \
WHERE n.full_name=$researcher_1 return DISTINCT n.full_name, n.louvain_field as louvain_field \
UNION \
MATCH (n:Researcher)-[w:WORKS_IN]-(f:Field) \
WHERE n.full_name=$researcher_2 return DISTINCT n.full_name, n.louvain_field as louvain_field",params={"researcher_1":researcher_1,"researcher_2":researcher_2})
louvain_2_scientists_field

Unnamed: 0,n.full_name,louvain_field
0,Jean-Henry Morin,31670
1,Gilles Falquet,31670


In [17]:
#return the researcher that are in the same community than both scientists
louvain_other_field = gds.run_cypher(f"MATCH (n:Researcher)-[w:WORKS_IN]-(f:Field) WHERE n.louvain_field IN {louvain_2_scientists_field.louvain_field.tolist()} \
RETURN DISTINCT n.full_name as full_name, n.researcher_id as researcher_id, n.louvain_field")
print(louvain_other_field.shape)
louvain_other_field.head(20)

(1500, 3)


Unnamed: 0,full_name,researcher_id,n.louvain_field
0,Emmanouil T. Dermitzakis,ur.01204470014.51,31670
1,Sébastien Ziegler,ur.013523127674.95,31670
2,K H Krause,ur.01132215437.58,31670
3,C. Bonadonna,ur.07732455104.21,31670
4,Donald Glowinski,ur.07567302507.39,31670
5,Michel Leonard,ur.01066422007.20,31670
6,Manfred Gilli,ur.010700105653.02,31670
7,Elvezio Ronchetti,ur.012507417265.19,31670
8,T. O. Niinikoski,ur.010674443035.14,31670
9,A. Rijllart,ur.01000325115.53,31670


## Inference avec la règle
∀x ∀y ∀z ∀f,
co_authors(x,y) ∧
co_authors(y,z) ∧
works_in(x,f) ∧
works_in(z,f) ∧
(weight(z,f)-n) <=
(weight(x,f)<=
(weight(z,f)+n) ∧
(works_in(x,f)>n) ->
possible_reviewers(𝑥,𝑧)


In [18]:
#Paper written by 2 scientists
difference_weight_field=10
minimum_weight_field=1.0

result_inference = gds.run_cypher(f"UNWIND $full_name as full_name\
        MATCH (res_1:Researcher{{full_name:full_name}}) CALL apoc.neighbors.athop(res_1, 'CO_AUTHORS', 2) \
        yield node as res_2  \
        MATCH (res_1)-[w_1:WORKS_IN]->(f:Field) \
        MATCH (res_2)-[w_2:WORKS_IN]->(f:Field) \
        WHERE (w_2.weight_field-$difference_weight_field)<= w_1.weight_field <=(w_2.weight_field+$difference_weight_field) \
        AND w_1.weight_field > $minimum_weight_field \
        and res_1 <> res_2 \
        RETURN DISTINCT res_1.full_name as full_name_author, res_2.full_name as reviewer_full_name,res_2.researcher_id as reviewer_id, round((w_2.weight_field+w_1.weight_field)/2,3) as score \
        ",params={"full_name":[researcher_1,researcher_2],"difference_weight_field":difference_weight_field,"minimum_weight_field":minimum_weight_field})
result_tmp = result_inference.groupby(["reviewer_id"]).sum("score")
result_merged = result_tmp.merge(result_inference,on="reviewer_id").drop(columns="score_y").rename(columns={"score_x":"score"}).drop_duplicates()
result_merged.sort_values(by="score",ascending=False,inplace=True)
print(result_merged.shape)
result_merged.head(10)

(216, 4)


Unnamed: 0,reviewer_id,score,full_name_author,reviewer_full_name
259,ur.0704711431.38,14.352,Jean-Henry Morin,Nadia Magnenat-Thalmann
261,ur.0704711431.38,14.352,Gilles Falquet,Nadia Magnenat-Thalmann
287,ur.07552063233.67,11.447,Gilles Falquet,Henning Müller
155,ur.013317560667.45,11.164,Jean-Henry Morin,Nadia Magnenat Thalmann
157,ur.013317560667.45,11.164,Gilles Falquet,Nadia Magnenat Thalmann
284,ur.07527316741.46,9.674,Gilles Falquet,P. Volino
283,ur.07527316741.46,9.674,Jean-Henry Morin,P. Volino
136,ur.012722422737.80,9.051,Jean-Henry Morin,I. Pandzic
138,ur.012722422737.80,9.051,Gilles Falquet,I. Pandzic
223,ur.016176475704.89,8.784,Gilles Falquet,Patrick Ruch


In [19]:
result_merged_louvain_inference = result_inference.merge(louvain_other, left_on="reviewer_id",right_on="researcher_id").drop(columns=["researcher_id","full_name"]).sort_values(by="n.louvain")
print(result_merged_louvain_inference.shape)
result_merged_louvain_inference


(78, 5)


Unnamed: 0,full_name_author,reviewer_full_name,reviewer_id,score,n.louvain
16,Jean-Henry Morin,Panagiotis Kostopoulos,ur.016265667045.20,2.900,4125
19,Jean-Henry Morin,Naoufel Cheikhrouhou,ur.016055002233.13,0.865,4125
18,Jean-Henry Morin,Naoufel Cheikhrouhou,ur.016055002233.13,3.096,4125
17,Jean-Henry Morin,Athanasios I. Kyritsis,ur.010360176245.76,2.636,4125
32,Jean-Henry Morin,Shervin Zakeri,ur.014425412436.05,2.261,4125
...,...,...,...,...,...
43,Gilles Falquet,Eric Wehrli,ur.01242625577.43,2.474,26571
44,Gilles Falquet,Violeta Seretan,ur.015071453432.09,2.135,26571
45,Gilles Falquet,Anastasiya Yurchyshyna,ur.013646161477.93,3.183,26571
49,Gilles Falquet,Isabelle Bolon,ur.010341730503.86,1.757,26571


In [20]:
def query_co_authors_by_name(names: list[str]):
        query = """
        UNWIND $researchers as name
        MATCH (r:Researcher{full_name: name})-[c:CO_AUTHORS]-(p:Researcher) 
        WHERE r<>p
        RETURN DISTINCT p.researcher_id,p.full_name
        """
        return gds.run_cypher(
            query,
            params={"researchers": names},
        )

In [21]:
co_authors = query_co_authors_by_name(result_merged_louvain_inference.full_name_author.drop_duplicates().tolist())
co_authors.head(10)

Unnamed: 0,p.researcher_id,p.full_name
0,ur.0643632165.06,Dimitri Konstantas
1,ur.013104610033.47,Katerina Stamou
2,ur.011260202163.33,Verena Kantere
3,ur.014140374103.20,Vassilis Prevelakis
4,ur.07350022537.69,C. Bryce
5,ur.015572001267.73,Antonella Longo
6,ur.014177040267.27,Mario Bochicchio
7,ur.015372375037.57,L. Moccozet
8,ur.010456151764.35,Jörn Erbguth
9,ur.015271360627.89,Matteo Risoldi


In [22]:
result_merged_louvain_inference_field = result_inference.merge(louvain_other_field, left_on="reviewer_id",right_on="researcher_id").drop(columns=["researcher_id","full_name"]).sort_values(by="n.louvain_field")
print(result_merged_louvain_inference_field.shape)
result_merged_louvain_inference_field.sort_values(by="score", ascending=False)

(177, 5)


Unnamed: 0,full_name_author,reviewer_full_name,reviewer_id,score,n.louvain_field
11,Gilles Falquet,Nadia Magnenat-Thalmann,ur.0704711431.38,6.998,31670
1,Jean-Henry Morin,Michel Leonard,ur.01066422007.20,5.602,31670
67,Gilles Falquet,Christian Lovis,ur.01133331655.52,4.553,31670
8,Gilles Falquet,P. Volino,ur.07527316741.46,4.473,31670
50,Gilles Falquet,M. Hilario,ur.010323216605.25,4.454,31670
...,...,...,...,...,...
0,Jean-Henry Morin,Michel Leonard,ur.01066422007.20,0.990,31670
16,Jean-Henry Morin,Katarzyna Wac,ur.010234164203.55,0.985,31670
31,Jean-Henry Morin,Naoufel Cheikhrouhou,ur.016055002233.13,0.865,31670
32,Jean-Henry Morin,I. Pandzic,ur.012722422737.80,0.849,31670


In [23]:
# FINAL result for possible reviewer and if p.researcher_id
final_result = result_merged_louvain_inference.merge(co_authors,how="left",left_on="reviewer_id",right_on="p.researcher_id").fillna("0").sort_values(by="p.researcher_id",ascending=False)
final_result = final_result.drop(final_result[(final_result["p.full_name"]!="0")].index)

final_result.sort_values(by="score",ascending=False)


Unnamed: 0,full_name_author,reviewer_full_name,reviewer_id,score,n.louvain,p.researcher_id,p.full_name
56,Jean-Henry Morin,Gilles Falquet,ur.016167755437.37,4.318,26571,0,0
21,Gilles Falquet,Jean-Henry Morin,ur.0631021307.20,4.318,4125,0,0
64,Gilles Falquet,Jolita Ralyté,ur.012265620367.64,4.246,26571,0,0
70,Gilles Falquet,Jose Luis Fernandez-Marquez,ur.013262732566.50,4.108,26571,0,0
49,Gilles Falquet,Giovanna Di Marzo Serugendo,ur.07662606251.28,4.079,26571,0,0
...,...,...,...,...,...,...,...
63,Gilles Falquet,Jolita Ralyté,ur.012265620367.64,1.436,26571,0,0
16,Jean-Henry Morin,Chrislain Razafimahefa,ur.016103437533.00,1.078,4125,0,0
5,Jean-Henry Morin,D. Tsichritzis,ur.013534320626.68,1.039,4125,0,0
11,Jean-Henry Morin,Katarzyna Wac,ur.010234164203.55,0.985,4125,0,0
