In [1]:
import pandas as pd
import neo4j as neo
from collections import Counter


In [2]:
def transform_dataframe_researcher(df):
    total_number_of_publications_df = (
        df.groupby(["researcher_id"], dropna=True)
        .agg(total_number_of_publications=("publication_id", "count"))
        .astype({"total_number_of_publications": "int16"})
        .reset_index()
    )
    researcher = pd.merge(df, total_number_of_publications_df, on=["researcher_id"])
    return researcher

In [3]:
def transform_dataframe_field_resesarcher(df):
    field_researcher_df = df.groupby([
        "researcher_id",
        "second_level_field_of_research",
        "total_number_of_publications",
    ]).agg(number_of_publication=("publication_id",'nunique')).reset_index()
    total_field_research = (
        field_researcher_df.groupby(["researcher_id"])
        .agg(total_number_of_fields=("second_level_field_of_research", "count"))
        .reset_index()
    )
    field_researcher_transformed_df = pd.merge(
        total_field_research,
        field_researcher_df,
        on=["researcher_id"],
    )
    field_researcher_transformed_df["weight"] = (field_researcher_transformed_df["total_number_of_fields"]*
    field_researcher_transformed_df["number_of_publication"])/field_researcher_transformed_df["total_number_of_publications"]
    return field_researcher_transformed_df


In [4]:
def transform_co_authors(df):
    co_authors_df = (df.groupby(["publication_id", "year_publication"])
            .agg(co_authors_list=("researcher_id", "unique"))
            .reset_index()
        )
    co_authors_df = co_authors_df.astype(
            {
                "year_publication": "int16",
            }
        )
    co_authors_df["co_authors_list"] = co_authors_df["co_authors_list"].apply(list)
    return co_authors_df

In [None]:
#source = https://towardsdatascience.com/create-a-graph-database-in-neo4j-using-python-4172d40f89c4
class Neo4jConnection:
    
    def __init__(self, uri, user, pwd):
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None

        try:
            self.__driver = neo.GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)
        
    def close(self):
        if self.__driver is not None:
            self.__driver.close()
        
    def query(self, query, parameters=None, db=None):
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try: 
            session = self.__driver.session(database=db) if db is not None else self.__driver.session() 
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally: 
            if session is not None:
                session.close()
        return response
    
    def insert_data(self,query, rows, batch_size = 1000):
        # Function to handle the updating the Neo4j database in batch mode.
        
        total = 0
        batch = 0
        result = None
        
        while batch * batch_size < len(rows):
            #print(type(rows[batch*batch_size:(batch+1)*batch_size].to_dict('records')))
            #print(rows[batch*batch_size:(batch+1)*batch_size].to_dict('records'))
            #print("\n")
            res = conn.query(query, 
                            parameters = {'rows': rows[batch*batch_size:(batch+1)*batch_size].to_dict('records')})
            #print(f"EKHDJKFHKDJSHFKDSHFJK {res}")
            total += res[0]['total']
            batch += 1
            result = {"total":total, 
                    "batches":batch,}
            print(result)
            
        return result

# conn = Neo4jConnection(
#   "bolt://44.203.23.150",
#  "neo4j", "tanks-syntax-streak")

conn = Neo4jConnection(
  "bolt://localhost:7689",
 "neo4j", "123456789")


In [6]:
def add_researcher(researcher_df): 
   query="""
            UNWIND $rows as row
            MERGE(c:Researcher {researcher_id:row.researcher_id})
            ON CREATE SET c.full_name = row.first_name + " " + row.last_name
            ON CREATE SET c.first_name= row.first_name
            ON CREATE SET c.last_name = row.last_name
            ON CREATE SET c.total_number_of_publications = row.total_number_of_publications
            return count(c) as total   
    """
   return conn.query(query,parameters={"rows":researcher_df.to_dict("records")})

In [7]:
def add_field(field_df):
    query=""" 
    UNWIND $rows as row
    MERGE (c:Field {domain:row.second_level_field_of_research})
    return count(c) as total
    """
    return conn.query(query,parameters={"rows":field_df.to_dict("records")})

In [8]:
def add_field_researcher(field_researcher):
    query =""" 
    UNWIND $rows as row
    MATCH (d:Field {domain:row.second_level_field_of_research})
    MATCH (p:Researcher {researcher_id:row.researcher_id})
    MERGE (p)-[r:WORKS_IN]-(d)
    ON CREATE SET r.number_of_publication = row.number_of_publication
    ON CREATE SET r.weight_field  = row.weight
    RETURN count(r) as total
    """
    #return conn.query(query,parameters={"rows":field_researcher.to_dict("records")})
    return conn.insert_data(query,field_researcher)


In [9]:
def add_co_authors(co_authors,batch_size):
   query="""
       UNWIND $rows as row
        UNWIND apoc.coll.combinations(row.co_authors_list, 2) as pair
        MATCH (p1:Researcher{researcher_id:pair[0]})
        MATCH (p2:Researcher{researcher_id:pair[1]})
        WHERE elementId(p1) <> elementId(p2)
        MERGE (p1)-[r:CO_AUTHORS]-(p2)
        ON CREATE SET r.number_of_time = 1
        ON MATCH SET r.number_of_time = r.number_of_time + 1
        ON CREATE SET r.last_year_collaboration = row.year_publication
        ON MATCH SET r.last_year_collaboration = case WHEN
        r.last_year_collaboration<row.year then row.year_publication
        else r.last_year_collaboration 
        end
        RETURN count(r) as total
        """
   
   return conn.insert_data(query,co_authors,batch_size=batch_size)


     

In [10]:
# Transform the dataframes

#Read the CSV and drop the NA researcher_id and drop duplicates
df = pd.read_csv("cui_unige_sample.csv")
df.dropna(subset=["researcher_id"], inplace=True)
df.drop_duplicates(inplace=True)

full_dataframe_researcher = transform_dataframe_researcher(df)
field_researcher_df = transform_dataframe_field_resesarcher(full_dataframe_researcher)
co_authors_df= transform_co_authors(full_dataframe_researcher)



In [12]:
# Uplaod the data to NEO4J

#Create some constraints
query = """
CREATE CONSTRAINT researcher_id_is_unique IF NOT EXISTS FOR (node:Researcher) REQUIRE node.researcher_id IS UNIQUE;
"""
conn.query(query)

query ="""
CREATE CONSTRAINT field_unique IF NOT EXISTS FOR (node:Field) REQUIRE node.domain IS UNIQUE
"""
conn.query(query)


query ="""
CREATE INDEX researcher_full_name_index FOR (n:Researcher) ON (n.full_name);
"""
conn.query(query)

query ="""
CREATE INDEX rel_co_authors FOR ()-[r:CO_AUTHORS]-() ON (r.number_of_time);
"""
conn.query(query)

query ="""
CREATE INDEX rel_fields_res FOR ()-[r:WORKS_IN]-() ON (r.number_of_publication);
"""
conn.query(query)



# Delete the duplicates and take only the field that are important for the graph
researcher_df = full_dataframe_researcher[["researcher_id","first_name","last_name","total_number_of_publications"]]
researcher_df = researcher_df.drop_duplicates(subset="researcher_id")
# Add the researcher in the graph
number_nodes = add_researcher(researcher_df)
print(f"Number of Researcher nodes {number_nodes}")

# Delete the duplicate for the field
field_research = full_dataframe_researcher[["second_level_field_of_research"]].drop_duplicates().dropna()
# add the field to the graph
number_nodes_field = add_field(field_research)
print(f"Number of field nodes {number_nodes_field}")

#Select the field and researcher in df for the creation of the relationships between the field and the researcher
field_researcher = field_researcher_df[["second_level_field_of_research","researcher_id","number_of_publication","weight"]]
# Add the field researcher relationships to the graph
number_relationships= add_field_researcher(field_researcher)
print(f"Number of relationships created between field and researcher {number_relationships}")

# Add the co_authors
number_relationships_co_authors= add_co_authors(co_authors_df,50)
print(f" Number of relationships added for co-authorships {number_relationships_co_authors}")


Number of Researcher nodes [<Record total=64849>]
Number of field nodes [<Record total=152>]
{'total': 1000, 'batches': 1}
{'total': 2000, 'batches': 2}
{'total': 3000, 'batches': 3}
{'total': 4000, 'batches': 4}
{'total': 5000, 'batches': 5}
{'total': 6000, 'batches': 6}
{'total': 7000, 'batches': 7}
{'total': 8000, 'batches': 8}
{'total': 9000, 'batches': 9}
{'total': 10000, 'batches': 10}
{'total': 11000, 'batches': 11}
{'total': 12000, 'batches': 12}
{'total': 13000, 'batches': 13}
{'total': 14000, 'batches': 14}
{'total': 15000, 'batches': 15}
{'total': 16000, 'batches': 16}
{'total': 17000, 'batches': 17}
{'total': 18000, 'batches': 18}
{'total': 19000, 'batches': 19}
{'total': 20000, 'batches': 20}
{'total': 21000, 'batches': 21}
{'total': 22000, 'batches': 22}
{'total': 23000, 'batches': 23}
{'total': 24000, 'batches': 24}
{'total': 25000, 'batches': 25}
{'total': 26000, 'batches': 26}
{'total': 27000, 'batches': 27}
{'total': 28000, 'batches': 28}
{'total': 29000, 'batches': 2

TypeError: 'NoneType' object is not subscriptable