In [21]:
import pandas as pd
import neo4j as neo
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 10]
import seaborn as sns
from graphdatascience import GraphDataScience

#source = https://towardsdatascience.com/create-a-graph-database-in-neo4j-using-python-4172d40f89c4
class Neo4jConnection:
    """Small wrapper around a Neo4j driver for running Cypher queries.

    The driver is created in ``__init__``. If creation fails, the error is
    printed and the driver stays ``None``; ``query`` then fails its assertion.
    """

    def __init__(self, uri, user, pwd):
        """Store the connection settings and try to create the driver.

        Args:
            uri: URI of the Neo4j instance.
            user: User name used for authentication.
            pwd: Password used for authentication.
        """
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None

        try:
            self.__driver = neo.GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)

    def close(self):
        """Close the driver if one was created."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, parameters=None, db=None):
        """Run a Cypher query and return its records as a list.

        Args:
            query: Cypher text to execute.
            parameters: Optional mapping of query parameters.
            db: Optional database name; the driver default is used when ``None``.

        Returns:
            A list of result records, or ``None`` when the query raised
            (the error is printed, not re-raised).
        """
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try:
            session = self.__driver.session(database=db) if db is not None else self.__driver.session()
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            # Always release the session, even when the query failed.
            if session is not None:
                session.close()
        return response

    def insert_data(self, query, rows, batch_size=10000):
        """Run ``query`` once per batch of ``rows`` and sum the reported totals.

        Each batch is passed to the query as the ``rows`` parameter (a list of
        records), and the first record of every response must expose a
        ``total`` field.

        Args:
            query: Cypher text that consumes ``$rows`` and returns ``total``.
            rows: Sliceable table offering ``to_dict('records')`` on a slice
                (e.g. a pandas DataFrame).
            batch_size: Maximum number of rows sent per query.

        Returns:
            ``{"total": ..., "batches": ...}`` after the last batch, or
            ``None`` when ``rows`` is empty.

        Raises:
            ValueError: If ``batch_size`` is not positive.
            RuntimeError: If a batch query failed or returned no record.
        """
        if batch_size <= 0:
            # A non-positive size would never advance through the rows.
            raise ValueError("batch_size must be a positive integer")

        total = 0
        batch = 0
        result = None

        while batch * batch_size < len(rows):
            chunk = rows[batch * batch_size:(batch + 1) * batch_size]
            # Use this instance rather than a global `conn` that may not exist.
            res = self.query(query, parameters={'rows': chunk.to_dict('records')})
            if not res:
                # `query` swallows errors and returns None; stop instead of
                # failing later with an opaque TypeError/IndexError.
                raise RuntimeError(f"Batch {batch} returned no result; aborting insert.")
            total += res[0]['total']
            batch += 1
            result = {"total": total,
                      "batches": batch}
            print(result)

        return result



## CONNECTION WITH NEO4J DATABASE

In [22]:
import getpass
import os

# Connection settings are read from the environment so that credentials are
# not stored in the notebook. Set NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD
# before starting the kernel; if the password is missing it is prompted for.
host = os.environ.get("NEO4J_URI", "bolt://44.204.150.95:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")

# conn = Neo4jConnection(uri=host, 
#                        user=user,              
#                        pwd=password)

# All later cells talk to the database through this GraphDataScience client.
gds = GraphDataScience(host, auth=(user, password))

print(gds.version())

### METADATA OF THE GRAPH

In [None]:
# Graph metadata: call the built-in schema procedure for node types and their properties.
gds.run_cypher("CALL db.schema.nodeTypeProperties()")

In [None]:
# Graph metadata: call the built-in schema procedure for relationship types and their properties.
gds.run_cypher("CALL db.schema.relTypeProperties()")

### GET THE SCIENTISTS WITH THE MOST PUBLICATIONS IN THE DB

In [None]:
# Get the UNIGE scientists with the most publications (top 80), together with
# the list of domains each one is linked to through WORKS_IN.

result_total_publications= gds.run_cypher("MATCH (r:Researcher)-[w:WORKS_IN]-(f:Field) RETURN r.full_name as name, r.researcher_id as researcher_id, r.total_number_of_publications as total, \
collect (f.domain) ORDER BY r.total_number_of_publications DESC LIMIT 80")
result_total_publications.head(10)

In [None]:
# Top 10 researchers by total publications; for each one, collect a list of
# {number, field} maps taken from the WORKS_IN relationship and the Field node.
result_total_publications_map= gds.run_cypher("MATCH (r:Researcher)-[w:WORKS_IN]-(f:Field) RETURN r.full_name, \
r.total_number_of_publications as total, collect({number:w.number_of_publication,field:f.domain}) as field_number ORDER BY total DESC LIMIT 10")
result_total_publications_map.head()

In [None]:
# Bar chart of the total publication count of each researcher in
# `result_total_publications`.
g = sns.barplot(x=result_total_publications["name"], y=result_total_publications["total"])
# Rotate the researcher names so they do not overlap; `g` stays bound to the
# Axes instead of being overwritten by a list of tick labels.
g.tick_params(axis="x", labelrotation=90)
g.set(title="Total publications per researcher", xlabel="Researcher", ylabel="Total publications");

## WHO IS K.CENZUAL????
When the data is weird like this, normally you investigate.

### Number of publications per research domain

In [None]:
# Count, per domain, how many Researcher-WORKS_IN-Field matches exist (top 40).
# Note: this counts matched relationships per domain, not publications.
result_domain_count = gds.run_cypher("MATCH (r:Researcher)-[w:WORKS_IN]-(f:Field) RETURN f.domain as domain, count(*) as number_time ORDER BY  number_time DESC LIMIT 40")
result_domain_count.head()

In [None]:
# Bar chart of how many researcher links each domain has.
g = sns.barplot(x=result_domain_count["domain"], y=result_domain_count["number_time"])
# Rotate the domain names so they do not overlap; `g` stays bound to the Axes
# instead of being overwritten by a list of tick labels.
g.tick_params(axis="x", labelrotation=90)
g.set(title="Researcher links per domain", xlabel="Domain", ylabel="Number of WORKS_IN links");

### Who published in the most research areas

In [None]:
# Count the number of fields each researcher is linked to through WORKS_IN
# and keep the 30 researchers with the most fields.
result_field = gds.run_cypher("MATCH (r:Researcher)-[w:WORKS_IN]-(f:Field) RETURN r.full_name as name, r.researcher_id as researcher_id, count(*) as fields ORDER BY  fields DESC LIMIT 30")
result_field.head()

In [None]:
# Bar chart of the number of fields per researcher.
g = sns.barplot(x=result_field["name"], y=result_field["fields"])
# Rotate the researcher names so they do not overlap; `g` stays bound to the
# Axes instead of being overwritten by a list of tick labels.
g.tick_params(axis="x", labelrotation=90)
g.set(title="Number of research fields per researcher", xlabel="Researcher", ylabel="Number of fields");

In [None]:
# Look at a single researcher (selected by researcher_id): how many fields
# they are linked to and which domains those are.
choppard = gds.run_cypher("MATCH (r:Researcher{researcher_id:'ur.016275067565.80'})-[w:WORKS_IN]-(f:Field) RETURN r.full_name as full_name, count(*) as total_fields, collect(f.domain) as domain")
# Lift pandas' display limits so the full domain list is not truncated.
# These options are global and stay in effect for all later cells.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
print(choppard.domain)

## Computer science

In [None]:
# Domain names treated as "computer science" in the queries below; they are
# compared against Field.domain.
field_computer_science = ["Information Systems","Computer Software","Data Format",
"Artificial Intelligence and Image Processing","Computation Theory and Mathematics","Other Information and Computing Sciences",
"Library and Information Studies","Computer Hardware"]
field_computer_science

In [None]:
# Researchers linked to at least one computer-science domain with
# weight_field > 1, ordered by their TOTAL number of publications
# (the total covers all fields, not only computer science).
#
# The domain list is passed as a query parameter instead of being formatted
# into the Cypher text, so quoting is handled by the driver.
query_number_publications_in_computer_science_person = """
MATCH (r:Researcher)-[w:WORKS_IN]-(f:Field)
WHERE f.domain IN $domains AND w.weight_field > 1
RETURN DISTINCT r.full_name AS name,
       r.researcher_id AS researcher_id,
       r.total_number_of_publications AS total_publication
ORDER BY total_publication DESC
"""

result_number_publication_person_computer_science = gds.run_cypher(
    query_number_publications_in_computer_science_person,
    params={"domains": field_computer_science},
)
result_number_publication_person_computer_science.head(20)

In [None]:
# One row per (researcher, computer-science domain) link, strongest
# weight_field first.
#
# The domain list is passed as a query parameter instead of being formatted
# into the Cypher text, so quoting is handled by the driver.
query_informatique_fields = """
MATCH (n:Researcher)-[w:WORKS_IN]-(f:Field)
WHERE f.domain IN $domains
RETURN DISTINCT n.full_name, n.researcher_id AS researcher, f.domain, w.weight_field
ORDER BY w.weight_field DESC
"""

result_informatique_field = gds.run_cypher(
    query_informatique_fields,
    params={"domains": field_computer_science},
)
result_informatique_field.head(10)

In [None]:
# For researchers with a strong link (weight_field > 5) to a computer-science
# domain, list their computer-science domains next to ALL domains they work in.
#
# - The domain list is passed as a query parameter rather than formatted into
#   the Cypher text.
# - The second MATCH repeats every field once per matching computer-science
#   field, so the "all domain" aggregates use DISTINCT to avoid inflated
#   counts and duplicated list entries.
result_field = gds.run_cypher(
    """
    MATCH (r:Researcher)-[w:WORKS_IN]-(f:Field)
    WHERE f.domain IN $domains AND w.weight_field > 5
    WITH r, f, w
    MATCH (r)-[ww:WORKS_IN]-(ff:Field)
    RETURN r.full_name,
           r.researcher_id AS researcher_id,
           r.total_number_of_publications AS total_publication,
           max(w.weight_field) AS max_weight_field,
           collect(DISTINCT f.domain) AS domain_computer,
           count(DISTINCT ff.domain) AS count_all_domain,
           collect(DISTINCT ff.domain) AS all_domain
    ORDER BY total_publication DESC
    """,
    params={"domains": field_computer_science},
)
result_field.head(20)

In [None]:
# Name of the in-memory GDS graph projection used by the centrality cells below.
graph_name = "co_authors_graph"

In [None]:
# Re-create the projection from scratch so this cell can be re-run safely.
result = gds.run_cypher("""CALL gds.graph.drop($params,false) YIELD graphName""",params={"params":graph_name}) # drop the graph and do not raise the error if the graph is not found.
# Project Researcher nodes and CO_AUTHORS relationships under `graph_name`.
result = gds.run_cypher("""CALL gds.graph.project($params, 'Researcher', 'CO_AUTHORS')""",params={"params":graph_name})
result

## Start of the Recommender 

### Difference between degree centrality and betweenness centrality

Degree centrality measures the number of incoming or outgoing (or both) relationships from a node, depending on the orientation of a relationship projection.

Betweenness centrality is a way of detecting the amount of influence a node has over the flow of information in a graph. It is often used to find nodes that serve as a bridge from one part of a graph to another.

## Calculate the Degree Centrality

In [None]:
# Stream the degree centrality scores of the projected graph (nothing is
# written back) and show the highest-scoring researchers first.

result = gds.run_cypher(
f" CALL gds.degree.stream($params) YIELD nodeId, score RETURN gds.util.asNode(nodeId).full_name AS name, score ORDER BY score DESC, name ASC", params={"params":graph_name}
)
result.head()

In [None]:
# Write the degree centrality back to the database as the node property
# 'degree_centrality' (read again later as n.degree_centrality).
result = gds.run_cypher(f"CALL gds.degree.write($params_graph, {{writeProperty: 'degree_centrality'}})",params={"params_graph":graph_name})
result


In [None]:
# # Betweenness centrality is difficult to run on AuraDS, so this cell is left disabled.
# result_betweeness = gds.run_cypher(
# f" CALL gds.betweenness.write($params_graph, {{writeProperty: 'betweeness'}})",params={"params_graph":graph_name})
# result_betweeness


### Calculate PageRank
The PageRank algorithm measures the importance of each node within the graph, based on the number of 
incoming relationships and the importance of the corresponding source nodes. 
The underlying assumption, roughly speaking, is that a page is only as important as the pages that link to it.


In [None]:
# Stream the PageRank scores of the projected graph (nothing is written back)
# and show the ten highest-ranked researchers.
result_page_rank = gds.run_cypher(
f" CALL gds.pageRank.stream($params)\
YIELD nodeId, score \
RETURN gds.util.asNode(nodeId).full_name AS name, score \
ORDER BY score DESC, name ASC" , params={"params":graph_name}
)
result_page_rank.head(10)

In [None]:
# Write PageRank back to the database as the node property 'pageRank'.
# Note: this reuses the name `result_page_rank`, so the streamed scores from
# the previous cell are replaced by the result of the write call.
result_page_rank = gds.run_cypher(
f" CALL gds.pageRank.write($params_graph, {{writeProperty: 'pageRank'}})",params={"params_graph":graph_name})
result_page_rank

## Query the database for pageRank and Degree centrality

In [None]:
# Read back the written PageRank and degree-centrality properties for
# researchers linked to a computer-science domain with weight_field > 1.
# Despite its name, this frame holds degree centrality, not betweenness.
#
# The domain list is passed as a query parameter instead of being formatted
# into the Cypher text. Column order matters: later cells scale the columns
# from position 2 onwards (pageRank, centrality_score).
betweeness_page_rank_df = gds.run_cypher(
    """
    MATCH (n:Researcher)-[w:WORKS_IN]-(f:Field)
    WHERE f.domain IN $domains AND w.weight_field > 1
    RETURN DISTINCT n.full_name,
           n.researcher_id,
           n.pageRank AS pageRank,
           n.degree_centrality AS centrality_score
    ORDER BY pageRank DESC
    """,
    params={"domains": field_computer_science},
)
betweeness_page_rank_df

#centrality_df = gds.run_cypher(f"MATCH (n:Researcher)-[w:WORKS_IN]-(f:Field) WHERE f.domain IN {field_computer_science} and w.weight_field>3 return DISTINCT n.full_name,  n.researcher_id, n.degree_centrality as centrality_score ORDER by centrality_score  DESC")

In [None]:
# Scaler applied to the real centrality frame in a later cell.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()


# Small toy frame (two numeric columns and one label column) used to try out
# the normalisation formulas before applying one to the real data.
df = pd.DataFrame({
               'A':[1,2,3,5,10],
               'B':[100,200,300,500,50],
               'C':list('abcde')
})
df


In [None]:
# Z-score standardisation of the toy frame: (x - mean) / std per column.
# Every column except the last (the label column) is numeric.
numeric_cols = df.columns[:-1].tolist()
# Assign by column label so the integer columns are replaced by float ones;
# writing floats through `.iloc[...] =` into int64 columns is a deprecated
# upcast in recent pandas versions.
df[numeric_cols] = df[numeric_cols].apply(lambda col: (col - col.mean()) / col.std(), axis=0)
df

In [None]:
# Max-abs scaling of the toy frame: divide every numeric column by its
# largest absolute value (the last column holds labels and is left alone).
toy_features = df.iloc[:, :-1]
df.iloc[:, :-1] = toy_features.div(toy_features.abs().max(), axis="columns")
df

In [None]:
# Standardise every column from position 2 onwards with the StandardScaler
# created earlier and write the scaled values back in place.
# NOTE(review): assumes columns 0-1 are the name/id columns and the rest are
# numeric scores — confirm against the query that builds this frame.
betweeness_page_rank_df.iloc[:,2:] = scaler.fit_transform(betweeness_page_rank_df.iloc[:,2:].to_numpy())
betweeness_page_rank_df

In [None]:
# Min-max scaling of the toy frame: (x - min) / (max - min) per numeric column
# (the last column holds labels and is left alone).
toy_features = df.iloc[:, :-1]
toy_min = toy_features.min()
toy_range = toy_features.max() - toy_features.min()
df.iloc[:, :-1] = (toy_features - toy_min) / toy_range
df

In [None]:
#betweeness_page_rank_df.iloc[:,2:]= betweeness_page_rank_df.iloc[:,2:].transform(lambda x: x / abs(x).max())

In [None]:
#betweeness_page_rank_df.iloc[:,2:]= betweeness_page_rank_df.iloc[:,2:].transform(lambda x: x / abs(x).max())
#betweeness_page_rank_df=(df-df.min())/(df.max()-df.min())

In [None]:

# Replace missing values with 0 and index the frame by researcher name so the
# plot below is labelled with names.
betweeness_page_rank_df = betweeness_page_rank_df.fillna(0)
# Only move the name column into the index when it is still a column, so that
# re-running this cell does not fail with a KeyError.
if "n.full_name" in betweeness_page_rank_df.columns:
    betweeness_page_rank_df = betweeness_page_rank_df.set_index("n.full_name")
betweeness_page_rank_df

In [None]:
# Stacked bar chart of the numeric columns for the 40 researchers with the
# highest centrality_score.
p = betweeness_page_rank_df.sort_values(by="centrality_score",ascending=False).head(40).plot.bar(stacked=True)
# Rotate the researcher names so they stay readable; `p` stays bound to the
# Axes instead of being overwritten by a list of tick labels.
p.tick_params(axis="x", labelrotation=90)
p.set(title="Top 40 researchers by centrality_score", xlabel="Researcher", ylabel="Score");


In [None]:
# Full names of the two researchers used as input for the inference and
# Louvain queries below.
researcher_1="Jean-Henry Morin"
researcher_2="Katarzyna Wac"

In [None]:
# Candidate reviewers for the two selected researchers.
#
# For each input researcher the query takes the nodes returned by
# apoc.neighbors.athop(res_1, 'CO_AUTHORS', 2), keeps those that share a Field
# with the author where both weight_field values lie within
# $difference_weight_field of each other and the author's weight exceeds
# $minimum_weight_field, and scores each match with the mean of the two weights.

result_inference = gds.run_cypher(f"UNWIND $full_name as full_name\
        MATCH (res_1:Researcher{{full_name:full_name}}) CALL apoc.neighbors.athop(res_1, 'CO_AUTHORS', 2) \
        yield node as res_2  \
        MATCH (res_1)-[w_1:WORKS_IN]->(f:Field) \
        MATCH (res_2)-[w_2:WORKS_IN]->(f:Field) \
        WHERE (w_2.weight_field-$difference_weight_field)<= w_1.weight_field <=(w_2.weight_field+$difference_weight_field) \
        AND w_1.weight_field > $minimum_weight_field \
        and res_1 <> res_2 \
        RETURN DISTINCT res_1.full_name as full_name_author, res_2.full_name as reviewer_full_name,res_2.researcher_id as reviewer_id, round((w_2.weight_field+w_1.weight_field)/2,3) as score \
        ",params={"full_name":[researcher_1,researcher_2],"difference_weight_field":0.8,"minimum_weight_field":0.8})
# Total score per reviewer. Select the column explicitly: `.sum("score")`
# would pass "score" as the `numeric_only` flag rather than as a column name.
result_tmp = result_inference.groupby(["reviewer_id"])[["score"]].sum()
# Attach the summed score to every (author, reviewer) row and drop the
# per-match score, which would otherwise keep duplicate rows apart.
result_merged = result_tmp.merge(result_inference,on="reviewer_id").drop(columns="score_y").rename(columns={"score_x":"score"}).drop_duplicates()
result_merged.sort_values(by="score",ascending=False)


In [None]:
# Project the co-author graph with the relationship property `number_of_time`.
# Drop any existing projection of the same name first (failIfMissing = false)
# so the cell can be re-run; projecting an existing graph name is an error.
gds.run_cypher("CALL gds.graph.drop('co_authors_graph_weight', false) YIELD graphName")
graph_result = gds.run_cypher("""
CALL gds.graph.project('co_authors_graph_weight', 'Researcher', {CO_AUTHORS:{properties:"number_of_time"}})
""")

### LOUVAIN COMMUNITY

The Louvain method is an algorithm to detect communities in large networks. It maximizes a modularity score for each community, where the modularity quantifies the quality of an assignment of nodes to communities. This means evaluating how much more densely connected the nodes within a community are, compared to how connected they would be in a random network. (source neo4j)

In [None]:
# Louvain community detection, streamed (nothing is written back): one row per
# researcher with the community id assigned to them.
# NOTE(review): this call passes no relationshipWeightProperty, while the
# write call in the next cell uses "number_of_time" — confirm whether the
# streamed and written communities are meant to differ.

result_stream_louvain = gds.run_cypher("""
CALL gds.louvain.stream('co_authors_graph_weight')
YIELD nodeId, communityId, intermediateCommunityIds
RETURN gds.util.asNode(nodeId).full_name AS full_name, communityId, intermediateCommunityIds
ORDER BY full_name ASC""")
result_stream_louvain.head()


In [None]:
# Run Louvain weighted by `number_of_time` and write each node's community id
# to the node property `louvain`.
# Note: this reuses the name `result_stream_louvain`, replacing the streamed
# result from the previous cell with the summary of the write call.
result_stream_louvain = gds.run_cypher("""
CALL gds.louvain.write('co_authors_graph_weight',{relationshipWeightProperty:"number_of_time", writeProperty:"louvain"})
YIELD communityCount, modularity, modularities""")
result_stream_louvain.head()


In [None]:
# Louvain community id of researchers strongly linked (weight_field > 3) to a
# computer-science domain.
#
# The value returned is n.louvain, so the column is named `louvain` rather
# than `pageRank`. The domain list is passed as a query parameter instead of
# being formatted into the Cypher text.
louvain_df = gds.run_cypher(
    """
    MATCH (n:Researcher)-[w:WORKS_IN]-(f:Field)
    WHERE f.domain IN $domains AND w.weight_field > 3
    RETURN DISTINCT n.full_name, n.louvain AS louvain
    ORDER BY louvain DESC
    LIMIT 20
    """,
    params={"domains": field_computer_science},
)
louvain_df

In [None]:

# Louvain community id of each of the two selected researchers. The two
# branches of the UNION differ only in the name parameter they filter on.
louvain_2_scientists = gds.run_cypher(f"MATCH (n:Researcher)-[w:WORKS_IN]-(f:Field) \
WHERE n.full_name=$researcher_1 return DISTINCT n.full_name, n.louvain as louvain \
UNION \
MATCH (n:Researcher)-[w:WORKS_IN]-(f:Field) \
WHERE n.full_name=$researcher_2 return DISTINCT n.full_name, n.louvain as louvain",params={"researcher_1":researcher_1,"researcher_2":researcher_2})
louvain_2_scientists

In [None]:
# Return the researchers that are in the same Louvain community as either of
# the two selected researchers (researcher_1 / researcher_2).
#
# The community ids are passed as a query parameter instead of being
# formatted into the Cypher text.
louvain_other = gds.run_cypher(
    """
    MATCH (n:Researcher)-[w:WORKS_IN]-(f:Field)
    WHERE n.louvain IN $communities
    RETURN DISTINCT n.full_name AS full_name, n.researcher_id AS researcher_id, n.louvain
    """,
    params={"communities": louvain_2_scientists.louvain.tolist()},
)
louvain_other.head(10)

In [None]:
# Attach the Louvain community (when known) to every candidate reviewer.
# The left join keeps reviewers that are in neither selected community.
reviewers_with_community = result_inference.merge(
    louvain_other,
    how="left",
    left_on="reviewer_id",
    right_on="researcher_id",
)
# The join key and name from the right-hand frame are redundant after the merge.
result_merged_louvain_inference = (
    reviewers_with_community
    .drop(columns=["researcher_id", "full_name"])
    .sort_values(by="n.louvain")
)
result_merged_louvain_inference

In [None]:
def query_co_authors_by_name(names: list[str]):
        """Return the distinct co-authors of the given researchers.

        For every name in ``names`` the query matches the Researcher with that
        ``full_name`` and follows CO_AUTHORS relationships in either direction,
        excluding the researcher itself.

        Args:
            names: Full names of the researchers whose co-authors are wanted.

        Returns:
            The result of ``gds.run_cypher`` with the columns
            ``p.researcher_id`` and ``p.full_name``.
        """
        query = """
        UNWIND $researchers as name
        MATCH (r:Researcher{full_name: name})-[c:CO_AUTHORS]-(p:Researcher) 
        WHERE r<>p
        RETURN DISTINCT p.researcher_id,p.full_name
        """
        return gds.run_cypher(
            query,
            params={"researchers": names},
        )

In [None]:
# Co-authors of every candidate reviewer found by the inference query.
co_authors = query_co_authors_by_name(result_inference.reviewer_full_name.tolist())
co_authors

In [None]:
# Left-join the co-author list onto the reviewer table by researcher id.
# Missing values are filled with the string "0" before sorting.
result_merged_louvain_inference.merge(co_authors,how="left",left_on="reviewer_id",right_on="p.researcher_id").fillna("0").sort_values(by="p.researcher_id",ascending=True)

In [None]:
#Node similarity

# The same graph name is projected in an earlier cell, and projecting an
# existing name is an error. Drop any existing projection first
# (failIfMissing = false) so this cell works in a full run and on re-runs.
gds.run_cypher("CALL gds.graph.drop('co_authors_graph_weight', false) YIELD graphName")
graph_result = gds.run_cypher("""
CALL gds.graph.project('co_authors_graph_weight', 'Researcher', {CO_AUTHORS:{properties:"number_of_time"}})
""")

In [None]:
# graph_result = gds.run_cypher("""
# CALL gds.graph.project('co_authors_graph_weight', 'Researcher', {CO_AUTHORS:{properties:"number_of_time"}})
# """)
# graph_result
# f, axs = plt.subplots(1,2,figsize=(16,5))

# g = sns.barplot(ax=axs[0], x=merged["n.full_name"], y=merged["betweeness_score"])
# b = sns.barplot(ax=axs[1],x=merged["n.full_name"], y=merged["centrality_score"])

# g.set_xticklabels(g.get_xticklabels(), rotation=90)
# b.set_xticklabels(g.get_xticklabels(), rotation=90)

# plt.show()

