In [8]:
# CQL to create states as nodes
# CQL to query the difference in population from a given state to all the other states.
# Create relationships between the states as defined above (i.e. region and the binary classifications).
# Use the above states_df to read the data (i.e. please don't repeatedly write out state and/or field names).


# import the neo4j driver for Python
from neo4j import GraphDatabase
from scipy import stats
import pandas as pd
import numpy as np

# Database Credentials
uri = "bolt://192.168.3.224:7687"
userName = "neo4j"
password = "senator-index-nebula-silver-zero-9434"

# Connect to the neo4j database server
graphDB_Driver = GraphDatabase.driver(uri, auth=(userName, password))

regions = {'Maine': 'Northeast', 'Massachusetts': 'Northeast', 'Rhode Island': 'Northeast', 'Connecticut': 'Northeast', 'New Hampshire': 'Northeast', 'Vermont': 'Northeast', 'New York': 'Northeast', 'Pennsylvania': 'Northeast', 'New Jersey': 'Northeast', 'Delaware': 'Northeast', 'Maryland': 'Northeast', 'West Virginia': 'Southeast', 'Virginia': 'Southeast', 'Kentucky': 'Southeast', 'Tennessee': 'Southeast', 'North Carolina': 'Southeast', 'South Carolina': 'Southeast', 'Georgia': 'Southeast', 'Alabama': 'Southeast', 'Mississippi': 'Southeast', 'Arkansas': 'Southeast', 'Louisiana': 'Southeast', 'Florida': 'Southeast', 'Ohio': 'Midwest', 'Indiana': 'Midwest', 'Michigan': 'Midwest', 'Illinois': 'Midwest', 'Missouri': 'Midwest', 'Wisconsin': 'Midwest', 'Minnesota': 'Midwest', 'Iowa': 'Midwest', 'Kansas': 'Midwest', 'Nebraska': 'Midwest', 'South Dakota': 'Midwest', 'North Dakota': 'Midwest', 'Texas': 'Southwest', 'Oklahoma': 'Southwest', 'New Mexico': 'Southwest', 'Arizona': 'Southwest', 'Colorado': 'West', 'Wyoming': 'West', 'Montana': 'West', 'Idaho': 'West', 'Washington': 'West', 'Oregon': 'West', 'Utah': 'West', 'Nevada': 'West', 'California': 'West', 'Alaska': 'West', 'Hawaii': 'West'}

# Function to create nodes
def create_nodes(states_df):
    queries = []
    #delete everything first
    cqlCreate = "MATCH (n) DETACH DELETE n"
    with graphDB_Driver.session() as graphDB_Session:
            graphDB_Session.run(cqlCreate)
    # Create states as nodes
    for i, row in states_df.iterrows():
        state = row['States']
        poverty = row['Poverty']
        infant_mort = row['Infant Mort']
        white = row['White']
        crime = row['Crime']
        doctors = row['Doctors']
        traf_deaths = row['Traf Deaths']
        university = row['University']
        unemployed = row['Unemployed']
        income = row['Income']
        population = row['Population']
        region = row['region']
  
        cqlCreate = "CREATE (" + state.replace(" ", "") + ":state { name: '" + state + f"', poverty: {poverty}" + f", infant_mort: {infant_mort}" + f", white: {white}" + f", crime: {crime}" +\
        f", doctors: {doctors}" + f", traf_deaths: {traf_deaths}"  + f", university: {university}" + f", unemployed: {unemployed}" + f", income: {income}" + f", population: {population}" + f", region: '{region}'" +\
        "})"
        
        queries.append(cqlCreate)
                
        # Execute the CQL query
        with graphDB_Driver.session() as graphDB_Session:
            graphDB_Session.run(cqlCreate)
            
    # Create region nodes
    #{'Midwest', 'Northeast', 'Southeast', 'Southwest', 'West'}
    cqlCreate = "CREATE (Midwest:region {name: 'Midwest'}), (Northeast:region {name: 'Northeast'}), (Southeast:region {name: 'Southeast'}), (Southwest:region {name: 'Southwest'}), (West:region {name: 'West'})"
    with graphDB_Driver.session() as graphDB_Session:
        graphDB_Session.run(cqlCreate)
        
    # Create nodes for factors
    cqlCreate = "CREATE (poverty:factor {name: 'Poverty'}), (infant_mort:factor {name: 'Infant Mort'}), (white:factor {name: 'White'}), (crime:factor {name: 'Crime'}), (doctors:factor {name: 'Doctors'}), (traf_deaths:factor {name: 'Traf Deaths'}), (university:factor {name: 'University'}), (unemployed:factor {name: 'Unemployed'}), (income:factor {name: 'Income'}), (population:factor {name: 'Population'})"
    with graphDB_Driver.session() as graphDB_Session:
        graphDB_Session.run(cqlCreate)
                
# Function to create relationships between the states, factors, region
def create_relationships(states_df):
    cqlCreate = ""
    # Create state and region relationships
    for i, row in states_df.iterrows():
        state = row['States']
        region = row['region']
        cqlCreate = f"MATCH (x:state {{name: '{state}'}}), (y:region {{name: '{region}'}}) CREATE (x)-[:part_of]->(y)"
        with graphDB_Driver.session() as graphDB_Session:
            graphDB_Session.run(cqlCreate)
    
    cqlCreate = ""
    # Create state and factor relationships
    for i, row in states_df.iterrows():
        state = row['States']
        for column in states_df.columns:
            if column not in ["States", "region"]:
                cqlCreate = f"MATCH (x:state {{name: '{state}'}}), (y:factor {{name: '{column}'}}) CREATE (x)-[:has {{value: {row[column]}}}]->(y)"
                with graphDB_Driver.session() as graphDB_Session:
                    graphDB_Session.run(cqlCreate)
                    
# Rewritten findPageRank function
# Rewritten findPageRank function
def findPageRank(query, pages):
    # Execute the CQL query
    with graphDB_Driver.session() as graphDB_Session:
        results = graphDB_Session.run(query)

        # Create link matrix
        factor_nodes = []
        region_nodes = []
        for result in results:
            nodes = result.values()
            for i in range(len(nodes)):
                if nodes[i]['name'] in pages:
                    region_nodes.append(nodes[i]['name'])
                else:
                    factor_nodes.append(nodes[i]['name'])

        # Create link matrix with appropriate number of rows and columns
        linkmatrix = np.zeros((len(region_nodes), len(factor_nodes)))

        # Populate linkmatrix with values from query results
        for result in results:
            nodes = result.values()
            for i in range(len(nodes)-1):
                if nodes[i]['name'] in region_nodes and nodes[i+1]['name'] in factor_nodes:
                    linkmatrix[region_nodes.index(nodes[i]['name'])][factor_nodes.index(nodes[i+1]['name'])] = 1

        eigval, eigvector= np.linalg.eig(linkmatrix)
        dominant_eigval = np.abs(eigval).max()
        PageRank= np.where(eigval == dominant_eigval)
        print("The most important node is %s"% str(region_nodes[PageRank[0][0]]))


# Main
if __name__ == '__main__':
    # Read the data
    states_df = pd.read_csv('https://raw.githubusercontent.com/thistleknot/python-ml/master/data/raw/states.csv')
    
    mads = stats.median_abs_deviation(states_df.iloc[:,1:11])
    stds = np.std(states_df.iloc[:,1:11])
    
    medians_df = pd.DataFrame(states_df.iloc[:,1:11].median()).T
    mads_df = pd.DataFrame([mads],columns=states_df.columns[1:11])
    means_df = pd.DataFrame(states_df.iloc[:,1:11].mean()).T
    stds_df = pd.DataFrame([stds],columns=states_df.columns[1:11])
   
    states_df['region'] = [regions[s] for s in states_df['States'].values]   
    
    # Create states as nodes
    create_nodes(states_df)

    create_relationships(states_df)
    


Failed to write data to connection IPv4Address(('192.168.3.224', 7687)) (IPv4Address(('192.168.3.224', 7687)))


In [9]:
#states_df[['region']]

In [311]:

query = """call {MATCH (s:state)-[:has]->(f:factor) WITH avg(s.poverty) as poverty return poverty}
match (s:state)-[:has]->(f:factor),(s:state)-[:part_of]->(r:region)
where s.poverty > poverty
return s, f, r"""
pages = list(regions.keys())
findPageRank(query, pages)


LinAlgError: Last 2 dimensions of the array must be square

In [None]:
(means_df['White'].values[0] + stds_df['White'].values[0])

In [None]:
states_df['White']

In [None]:
(states_df['White'] >= (means_df['White'].values[0] + stds_df['White'].values[0]))

In [None]:
(states_df['University'] >= (states_df['University'].median()+mads_df['University'].values[0])) 

In [None]:
states_df.describe()