# Part C

## Setup

In [1]:
from py2neo import Graph, ClientError

In [2]:
#set connection variables
PORT = "7687" #database running on this port for bolt connections
USER = "neo4j" #standard user
PASSWORD = "publication-graph" #db password

In [3]:
#connect to database
try:
    graph = Graph('bolt://localhost:'+PORT, auth=(USER, PASSWORD))
    print('SUCCESS: Connected to the Neo4j Database.')
except Exception as e:
    print('ERROR: Could not connect to the Neo4j Database. See console for details.')
    raise SystemExit(e)

SUCCESS: Connected to the Neo4j Database.


In [4]:
# query helper function
def run_query(query:str):
    try:
        return graph.run(query)
    except ClientError as e:
        print(e.message)

## Step 1
The first thing to do is to find/define the research communities. A community is defined by a set of keywords. Assume that the database community is defined through the following keywords: data management, indexing, data modeling, big data, data processing, data storage and data querying.

In [6]:
DATABASE_COMMUNITY_KEYWORDS = ["data management", "indexing", "data modeling", "big data", "data processing", "data storage","data querying"]

In [7]:
community_constraint = """
CREATE CONSTRAINT for (c:Community) require c.community_name is unique;
"""

In [8]:
run_query(community_constraint)

An equivalent constraint already exists, 'Constraint( id=20, name='constraint_8cb1b3fe', type='UNIQUENESS', schema=(:Community {community_name}), ownedIndex=19 )'.


In [11]:
link_kw_to_community = """
Merge (com:Community {community_name: "Database Community"})

with com, ["data management", "indexing", "data modeling", "big data", "data processing", "data storage","data querying"] as db_com_keywords
unwind db_com_keywords as kw
Match (k:Keyword {keyword: kw})
Merge (com)-[:DEFINED_BY]->(k)
"""

In [12]:
run_query(link_kw_to_community)

## Step 2

Next, we need to find the conferences and journals related to the database community (i.e., are specific to the field of databases). Assume that if 90% of the papers published in a conference/journal contain one of the keywords of the database community we consider that conference/journal as related to that community.

In [13]:
determine_community = """
Match (paper:Paper)-[:PUBLISHED_IN]->()<-[:HOLDS|ISSUES]-(pub)
with pub, count(paper) as no_publications

Match (com:Community {community_name: "Database Community"})-[:DEFINED_BY]-(kw:Keyword)
Match (kw)<-[:MAIN_TOPIC]-(paper:Paper)-[:PUBLISHED_IN]->()<-[:HOLDS|ISSUES]-(pub)

with pub, count(distinct paper) as no_in_community, no_publications, com
with pub, no_in_community, no_publications, toFloat(no_in_community) / no_publications as community_participation, com

where community_participation >= 0.9

Merge (com)<-[rt:RELATED_TO]-(pub)
set rt.participation_rate = community_participation

"""

In [14]:
run_query(determine_community)

## Step 3
Next, we want to identify the top papers of these conferences/journals. We need to find the papers with the highest page rank provided the number of citations from the papers of the same community (papers in the conferences/journals of the database community). As a result we would obtain (highlight), say, the top-100 papers of the conferences of the database community.

In [15]:
identify_community_papers = """

Match (com:Community {community_name: "Database Community"})
Match (paper:Paper)-[cite:CITES]->(cited_paper:Paper)
Match (paper)-[:PUBLISHED_IN]->()<-[:HOLDS|ISSUES]-(pub)-[:RELATED_TO]->(com)
Match (cited_paper)-[:PUBLISHED_IN]->()<-[:HOLDS|ISSUES]-(pub)-[:RELATED_TO]->(com)

with cited_paper, count(cite) as no_citations, com

Merge (com)-[cp:COMMUNITY_PAPER]->(cited_paper)
set cp.community_citations = no_citations

"""

In [None]:
run_query(identify_community_papers)

In [16]:
get_highlight_papers = """
Match (com:Community {community_name: "Database Community"})-[cp:COMMUNITY_PAPER]->(paper:Paper)
with cp, cp.community_citations as no_citations
order by no_citations desc

with collect(cp) as papers
with papers[0..100] as top100papers
unwind top100papers as highlight
set highlight.top100 = True

"""

In [17]:
run_query(get_highlight_papers)

In [18]:
# TODO: Explain why in 2 steps

## Step 4

Finally, an author of any of these top-100 papers is automatically considered a potential good match to review database papers. In addition, we want to identify gurus, i.e., very reputated authors that would be able to review for top conferences. We identify gurus as those authors that are authors of, at least, two papers among the top-100 identified.

In [20]:
potential_reviewer_and_guru = """
Match (com:Community {community_name: "Database Community"})-[cp:COMMUNITY_PAPER]->(paper:Paper)<-[:AUTHOR_OF]-(author:Researcher)
WHERE cp.top100 = True

Merge (com)-[:POTENTIAL_REVIEWER]->(author)

with author, count(distinct paper) as no_publications, com
where no_publications >= 2

Merge (com)-[:GURU]->(author)
"""

In [21]:
run_query(potential_reviewer_and_guru)