<a href="https://colab.research.google.com/github/tur103/Machine-Learning-With-Graph-Database/blob/master/Crime_Investigation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Crime Investigation

Set up the environment

In [0]:
!pip install py2neo==4.1.3 pandas matplotlib sklearn

In [0]:
from py2neo import Graph
import pandas as pd
import statistics

import matplotlib 
import matplotlib.pyplot as plt

# Use matplotlib's 'fivethirtyeight' style sheet for the figures drawn below.
plt.style.use('fivethirtyeight')
# Show floats in pandas output with three decimal places.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import pandas as pd
from collections import Counter
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

This notebook uses the Neo4j Sandbox "Crime Investigation" dataset.

In [0]:
import os

# Connection settings are read from the environment (NEO4J_URI, NEO4J_USER,
# NEO4J_PASSWORD) so that credentials for another instance never have to be
# written into the notebook. The fallbacks are the Neo4j sandbox values that
# were originally hard-coded here.
graph = Graph(
    os.environ.get("NEO4J_URI", "bolt://34.239.207.167:33768"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "display-suns-grids"),
    ),
)

## Find Vulnerable People At High Risk Of Being Part Of A Crime

Machine learning on a graph database to predict who is likely to become a crime victim in the near future

### Setup Data Model

Create crime association relationships

In [0]:
# For every person who knows (in either direction) someone that is PARTY_TO a
# crime, make sure a directed KNOWS_CRIME relationship to that acquaintance exists.
crime_relationships = """
MATCH (person:Person)-[:KNOWS]-(other_person:Person)-[:PARTY_TO]->(:Crime)
MERGE (person)-[:KNOWS_CRIME]->(other_person)
"""

crime_link_result = graph.run(crime_relationships)
crime_link_result.stats()

Divide the graph into `train` and `test` sub-graphs

In [0]:
# Integer suffix (the part after the last "-") of every person's NHS number,
# returned in ascending order.
query = """
MATCH (person:Person)
RETURN toInteger(split(person.nhs_no, "-")[-1]) AS nhs_number
ORDER BY nhs_number
"""
nhs_frame = graph.run(query).to_data_frame()
nhs_numbers = nhs_frame["nhs_number"]

# Print the median suffix of the people in the graph.
median = statistics.median(nhs_numbers)
print(median)

In [0]:
query = """
MATCH (person:Person)
where toInteger(split(person.nhs_no, "-")[-1]) < 5383
SET person:PersonBefore
"""

graph.run(query).stats()

In [0]:
query = """
MATCH (person:Person)
where toInteger(split(person.nhs_no, "-")[-1]) >= 5383
SET person:PersonAfter
"""

graph.run(query).stats()

### Deep Learning

Use *Graph Data Science* to execute *Graph Algorithms* for gathering insights

`Centrality Algorithms`

In [0]:
# PageRank over the KNOWS_CRIME relationships of the PersonBefore nodes; the
# score is written back to each node as `pagerankTrain`.
pagerank_train_query = """
CALL algo.pageRank('PersonBefore', 'KNOWS_CRIME',{
  write: true, writeProperty:"pagerankTrain"
})
YIELD nodes, iterations, loadMillis, computeMillis, writeMillis, dampingFactor, write, writeProperty
"""
graph.run(pagerank_train_query).stats()

In [0]:
# PageRank over the KNOWS_CRIME relationships of the PersonAfter nodes; the
# score is written back to each node as `pagerankTest`.
pagerank_test_query = """
CALL algo.pageRank('PersonAfter', 'KNOWS_CRIME',{
  write: true, writeProperty:"pagerankTest"
})
YIELD nodes, iterations, loadMillis, computeMillis, writeMillis, dampingFactor, write, writeProperty
"""
graph.run(pagerank_test_query).stats()

`Community Detection Algorithms`

In [0]:
# Step 1: union-find over FAMILY_REL (both directions) among PersonBefore nodes;
# the component id is stored on each node as `familyCommunityTrain`.
family_community_train_query = """
CALL algo.unionFind("PersonBefore", "FAMILY_REL",
{partitionProperty: "familyCommunityTrain", direction: "BOTH"});
"""
graph.run(family_community_train_query).stats()

# Step 2: for each person, count the PARTY_TO matches of *other* people in the
# same family community and store the count as `familyDangerTrain`.
family_danger_train_query = """
MATCH (person:PersonBefore)
OPTIONAL MATCH (other_person:PersonBefore {familyCommunityTrain: person.familyCommunityTrain})-[:PARTY_TO]->(:Crime)
WHERE NOT ID(person) = ID(other_person)
WITH person, COUNT(other_person) AS danger
SET person.familyDangerTrain = danger
"""
graph.run(family_danger_train_query).stats()

In [0]:
# Step 1: union-find over FAMILY_REL (both directions) among PersonAfter nodes;
# the component id is stored on each node as `familyCommunityTest`.
family_community_test_query = """
CALL algo.unionFind("PersonAfter", "FAMILY_REL",
{partitionProperty: "familyCommunityTest", direction: "BOTH"});
"""
graph.run(family_community_test_query).stats()

# Step 2: for each person, count the PARTY_TO matches of *other* people in the
# same family community and store the count as `familyDangerTest`.
family_danger_test_query = """
MATCH (person:PersonAfter)
OPTIONAL MATCH (other_person:PersonAfter {familyCommunityTest: person.familyCommunityTest})-[:PARTY_TO]->(:Crime)
WHERE NOT ID(person) = ID(other_person)
WITH person, COUNT(other_person) AS danger
SET person.familyDangerTest = danger
"""
graph.run(family_danger_test_query).stats()

In [0]:
# Step 1: union-find over KNOWS_LW (both directions) among PersonBefore nodes;
# the component id is stored on each node as `livingCommunityTrain`.
living_community_train_query = """
CALL algo.unionFind("PersonBefore", "KNOWS_LW",
{partitionProperty: "livingCommunityTrain", direction: "BOTH"});
"""
graph.run(living_community_train_query).stats()

# Step 2: for each person, count the PARTY_TO matches of *other* people in the
# same living community and store the count as `livingDangerTrain`.
living_danger_train_query = """
MATCH (person:PersonBefore)
OPTIONAL MATCH (other_person:PersonBefore {livingCommunityTrain: person.livingCommunityTrain})-[:PARTY_TO]->(:Crime)
WHERE NOT ID(person) = ID(other_person)
WITH person, COUNT(other_person) AS danger
SET person.livingDangerTrain = danger
"""
graph.run(living_danger_train_query).stats()

In [0]:
# Step 1: union-find over KNOWS_LW (both directions) among PersonAfter nodes;
# the component id is stored on each node as `livingCommunityTest`.
living_community_test_query = """
CALL algo.unionFind("PersonAfter", "KNOWS_LW",
{partitionProperty: "livingCommunityTest", direction: "BOTH"});
"""
graph.run(living_community_test_query).stats()

# Step 2: for each person, count the PARTY_TO matches of *other* people in the
# same living community and store the count as `livingDangerTest`.
living_danger_test_query = """
MATCH (person:PersonAfter)
OPTIONAL MATCH (other_person:PersonAfter {livingCommunityTest: person.livingCommunityTest})-[:PARTY_TO]->(:Crime)
WHERE NOT ID(person) = ID(other_person)
WITH person, COUNT(other_person) AS danger
SET person.livingDangerTest = danger
"""
graph.run(living_danger_test_query).stats()

In [0]:
# Step 1: Louvain over KNOWS_PHONE (both directions) among PersonBefore nodes;
# each streamed community id is stored on its node as `chatCommunityTrain`.
chat_community_train_query = """
CALL algo.louvain.stream("PersonBefore", "KNOWS_PHONE", {direction: "BOTH"})
YIELD nodeId, community, communities
WITH algo.getNodeById(nodeId) AS person, community AS community_id
SET person.chatCommunityTrain = community_id
"""
graph.run(chat_community_train_query).stats()

# Step 2: for each person, count the PARTY_TO matches of *other* people in the
# same chat community and store the count as `chatDangerTrain`.
chat_danger_train_query = """
MATCH (person:PersonBefore)
OPTIONAL MATCH (other_person:PersonBefore {chatCommunityTrain: person.chatCommunityTrain})-[:PARTY_TO]->(:Crime)
WHERE NOT ID(person) = ID(other_person)
WITH person, COUNT(other_person) AS danger
SET person.chatDangerTrain = danger
"""
graph.run(chat_danger_train_query).stats()

In [0]:
# Step 1: Louvain over KNOWS_PHONE (both directions) among PersonAfter nodes;
# each streamed community id is stored on its node as `chatCommunityTest`.
chat_community_test_query = """
CALL algo.louvain.stream("PersonAfter", "KNOWS_PHONE", {direction: "BOTH"})
YIELD nodeId, community, communities
WITH algo.getNodeById(nodeId) AS person, community AS community_id
SET person.chatCommunityTest = community_id
"""
graph.run(chat_community_test_query).stats()

# Step 2: for each person, count the PARTY_TO matches of *other* people in the
# same chat community and store the count as `chatDangerTest`.
chat_danger_test_query = """
MATCH (person:PersonAfter)
OPTIONAL MATCH (other_person:PersonAfter {chatCommunityTest: person.chatCommunityTest})-[:PARTY_TO]->(:Crime)
WHERE NOT ID(person) = ID(other_person)
WITH person, COUNT(other_person) AS danger
SET person.chatDangerTest = danger
"""
graph.run(chat_danger_test_query).stats()

In [0]:
# Step 1: Louvain over KNOWS_SN (both directions) among PersonBefore nodes; each
# streamed community id is stored on its node as `socialNetworkCommunityTrain`.
social_community_train_query = """
CALL algo.louvain.stream("PersonBefore", "KNOWS_SN", {direction: "BOTH"})
YIELD nodeId, community, communities
WITH algo.getNodeById(nodeId) AS person, community AS community_id
SET person.socialNetworkCommunityTrain = community_id
"""
graph.run(social_community_train_query).stats()

# Step 2: for each person, count the PARTY_TO matches of *other* people in the
# same social-network community and store it as `socialNetworkDangerTrain`.
social_danger_train_query = """
MATCH (person:PersonBefore)
OPTIONAL MATCH (other_person:PersonBefore {socialNetworkCommunityTrain: person.socialNetworkCommunityTrain})-[:PARTY_TO]->(:Crime)
WHERE NOT ID(person) = ID(other_person)
WITH person, COUNT(other_person) AS danger
SET person.socialNetworkDangerTrain = danger
"""
graph.run(social_danger_train_query).stats()

In [0]:
# Step 1: Louvain over KNOWS_SN (both directions) among PersonAfter nodes; each
# streamed community id is stored on its node as `socialNetworkCommunityTest`.
social_community_test_query = """
CALL algo.louvain.stream("PersonAfter", "KNOWS_SN", {direction: "BOTH"})
YIELD nodeId, community, communities
WITH algo.getNodeById(nodeId) AS person, community AS community_id
SET person.socialNetworkCommunityTest = community_id
"""
graph.run(social_community_test_query).stats()

# Step 2: for each person, count the PARTY_TO matches of *other* people in the
# same social-network community and store it as `socialNetworkDangerTest`.
social_danger_test_query = """
MATCH (person:PersonAfter)
OPTIONAL MATCH (other_person:PersonAfter {socialNetworkCommunityTest: person.socialNetworkCommunityTest})-[:PARTY_TO]->(:Crime)
WHERE NOT ID(person) = ID(other_person)
WITH person, COUNT(other_person) AS danger
SET person.socialNetworkDangerTest = danger
"""
graph.run(social_danger_test_query).stats()

In [0]:
# Step 1: Louvain over KNOWS_CRIME (both directions) among PersonBefore nodes;
# each streamed community id is stored on its node as `crimeCommunityTrain`.
crime_community_train_query = """
CALL algo.louvain.stream("PersonBefore", "KNOWS_CRIME", {direction: "BOTH"})
YIELD nodeId, community, communities
WITH algo.getNodeById(nodeId) AS person, community AS community_id
SET person.crimeCommunityTrain = community_id
"""
graph.run(crime_community_train_query).stats()

# Step 2: for each person, count the PARTY_TO matches of *other* people in the
# same crime community and store the count as `crimeDangerTrain`.
crime_danger_train_query = """
MATCH (person:PersonBefore)
OPTIONAL MATCH (other_person:PersonBefore {crimeCommunityTrain: person.crimeCommunityTrain})-[:PARTY_TO]->(:Crime)
WHERE NOT ID(person) = ID(other_person)
WITH person, COUNT(other_person) AS danger
SET person.crimeDangerTrain = danger
"""
graph.run(crime_danger_train_query).stats()

In [0]:
# Step 1: Louvain over KNOWS_CRIME (both directions) among PersonAfter nodes;
# each streamed community id is stored on its node as `crimeCommunityTest`.
crime_community_test_query = """
CALL algo.louvain.stream("PersonAfter", "KNOWS_CRIME", {direction: "BOTH"})
YIELD nodeId, community, communities
WITH algo.getNodeById(nodeId) AS person, community AS community_id
SET person.crimeCommunityTest = community_id
"""
graph.run(crime_community_test_query).stats()

# Step 2: for each person, count the PARTY_TO matches of *other* people in the
# same crime community and store the count as `crimeDangerTest`.
crime_danger_test_query = """
MATCH (person:PersonAfter)
OPTIONAL MATCH (other_person:PersonAfter {crimeCommunityTest: person.crimeCommunityTest})-[:PARTY_TO]->(:Crime)
WHERE NOT ID(person) = ID(other_person)
WITH person, COUNT(other_person) AS danger
SET person.crimeDangerTest = danger
"""
graph.run(crime_danger_test_query).stats()

### Machine Learning

In [0]:
# One random forest is reused (re-fitted) for every feature set below; the
# fixed random_state seeds the forest itself.
classifier = RandomForestClassifier(
    n_estimators=30,
    max_depth=10,
    random_state=0,
)

In [0]:
def evaluate_model(predictions, actual):
    """Score predicted labels against the true labels.

    Args:
        predictions: Labels produced by the classifier.
        actual: Ground-truth labels, aligned with ``predictions``.

    Returns:
        DataFrame with a "Measure" column (Accuracy, Precision, Recall, in
        that order) and a "Score" column holding the matching metric value.
    """
    metrics = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
    )
    measures = [label for label, _ in metrics]
    scores = [scorer(actual, predictions) for _, scorer in metrics]
    return pd.DataFrame({"Measure": measures, "Score": scores})

def feature_importance(columns, classifier):
    """Display a bar chart of the fitted classifier's feature importances.

    Args:
        columns: Feature names, in the order the classifier was fitted with.
        classifier: Fitted estimator exposing ``feature_importances_``.
    """
    display("Feature Importance")
    importances = pd.DataFrame({
        "Feature": columns,
        "Importance": classifier.feature_importances_,
    })
    # Most important feature first, so the bars read left to right.
    ranked = importances.sort_values("Importance", ascending=False)
    ax = ranked.plot.bar(x='Feature', y='Importance', legend=None)
    ax.xaxis.set_label_text("")
    plt.tight_layout()
    plt.show()

In [0]:
def down_sample(df, random_state=1):
    """Balance the classes by dropping surplus negative rows, then shuffle.

    Args:
        df: DataFrame with a ``vulnerable`` column holding 0/1 labels. It is
            not modified; the work is done on a copy.
        random_state: Seed used both for choosing the negative rows to drop
            and for the final shuffle, so repeated runs give the same frame.

    Returns:
        A shuffled copy of ``df`` in which the number of ``vulnerable == 0``
        rows has been reduced to the number of ``vulnerable == 1`` rows. When
        negatives are not the majority nothing is dropped.
    """
    balanced = df.copy()
    negatives = balanced[balanced.vulnerable == 0]
    positive_count = int((balanced.vulnerable == 1).sum())
    # Clamp at zero: sample(n=<negative>) raises when positives outnumber negatives.
    surplus = max(len(negatives) - positive_count, 0)
    to_drop = negatives.sample(n=surplus, random_state=random_state).index
    balanced = balanced.drop(to_drop)
    # Seeded shuffle; row order would otherwise differ on every run.
    return balanced.sample(frac=1, random_state=random_state)

Extract the train and test datasets

In [0]:
# Positive examples: PersonBefore nodes that are PARTY_TO at least one crime.
train_positive_query = """
MATCH (person:PersonBefore)
WHERE (person)-[:PARTY_TO]->(:Crime)
RETURN id(person) AS node, 1 AS vulnerable
"""
train_existing_links = graph.run(train_positive_query).to_data_frame()

# Negative examples: PersonBefore nodes with no PARTY_TO relationship to a crime.
train_negative_query = """
MATCH (person:PersonBefore)
WHERE NOT (person)-[:PARTY_TO]->(:Crime)
RETURN id(person) AS node, 0 AS vulnerable
"""
train_missing_links = graph.run(train_negative_query).to_data_frame().drop_duplicates()

In [0]:
# Stack negatives and positives into one frame. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
training_df = pd.concat([train_missing_links, train_existing_links], ignore_index=True)
training_df['vulnerable'] = training_df['vulnerable'].astype('category')
# Balance the two classes before fitting.
training_df = down_sample(training_df)

In [0]:
# Positive examples: PersonAfter nodes that are PARTY_TO at least one crime.
test_positive_query = """
MATCH (person:PersonAfter)
WHERE (person)-[:PARTY_TO]->(:Crime)
RETURN id(person) AS node, 1 AS vulnerable
"""
test_existing_links = graph.run(test_positive_query).to_data_frame()

# Negative examples: PersonAfter nodes with no PARTY_TO relationship to a crime.
test_negative_query = """
MATCH (person:PersonAfter)
WHERE NOT (person)-[:PARTY_TO]->(:Crime)
RETURN id(person) AS node, 0 AS vulnerable
"""
test_missing_links = graph.run(test_negative_query).to_data_frame().drop_duplicates()

In [0]:
# Stack negatives and positives into one frame. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
test_df = pd.concat([test_missing_links, test_existing_links], ignore_index=True)
test_df['vulnerable'] = test_df['vulnerable'].astype('category')
# The test frame is balanced the same way as the training frame.
test_df = down_sample(test_df)

`Apply Centrality Features`

In [0]:
def apply_centrality_features(data, pagerank_prop):
    """Add a ``pagerank`` column to ``data`` from a node property in the graph.

    Args:
        data: DataFrame with a ``node`` column of Neo4j internal node ids.
        pagerank_prop: Name of the node property to read, e.g. "pagerankTrain".

    Returns:
        A new DataFrame: ``data`` joined on ``node`` (pandas' default inner
        join) with the fetched ``pagerank`` values. A ``pagerank`` column
        already present in ``data`` is replaced rather than duplicated, so the
        calling cell can safely be re-run.
    """
    query = """
    UNWIND $nodes AS node
    MATCH (person) WHERE id(person) = node[0]
    RETURN node[0] AS node, person[$pagerankProp] AS pagerank
    """
    # One single-element list per row; the query reads each id back as node[0].
    nodes = data[["node"]].values.tolist()
    params = {
        "nodes": nodes,
        "pagerankProp": pagerank_prop,
    }
    features = graph.run(query, params).to_data_frame()
    # Drop feature columns left over from an earlier call: merging them again
    # would yield pagerank_x / pagerank_y and break later `df[columns]` lookups.
    fetched = [col for col in features.columns if col != "node"]
    kept = [col for col in data.columns if col not in fetched]
    return pd.merge(data[kept], features, on=["node"])

In [0]:
# Each frame reads the PageRank property written for its own half of the graph;
# both end up with the same `pagerank` column name.
training_df = apply_centrality_features(training_df, pagerank_prop="pagerankTrain")
test_df = apply_centrality_features(test_df, pagerank_prop="pagerankTest")

In [0]:
# First model: PageRank is the only feature.
columns = ["pagerank"]  # centrality feature

X, y = training_df[columns], training_df["vulnerable"]
classifier.fit(X, y)

# Score on the held-out test frame.
y_test = test_df["vulnerable"]
predictions = classifier.predict(test_df[columns])

display(evaluate_model(predictions, y_test))
feature_importance(columns, classifier)

`Apply Community Detection Features`

In [0]:
def apply_community_detection_features(data, family_danger_prop, living_danger_prop,
                                       chat_danger_prop, social_network_danger_prop,
                                       crime_danger_prop):
    """Add the five community "danger" columns to ``data`` from node properties.

    Args:
        data: DataFrame with a ``node`` column of Neo4j internal node ids.
        family_danger_prop: Node property read into ``familyDanger``.
        living_danger_prop: Node property read into ``livingDanger``.
        chat_danger_prop: Node property read into ``chatDanger``.
        social_network_danger_prop: Node property read into ``socialNetworkDanger``.
        crime_danger_prop: Node property read into ``crimeDanger``.

    Returns:
        A new DataFrame: ``data`` joined on ``node`` (pandas' default inner
        join) with the five fetched columns. Danger columns already present in
        ``data`` are replaced rather than duplicated, so the calling cell can
        safely be re-run.
    """
    query = """
    UNWIND $nodes AS node
    MATCH (person) WHERE id(person) = node[0]
    RETURN node[0] AS node,
    person[$familyDangerProp] AS familyDanger,
    person[$livingDangerProp] AS livingDanger,
    person[$chatDangerProp] AS chatDanger,
    person[$socialNetworkDangerProp] AS socialNetworkDanger,
    person[$crimeDangerProp] AS crimeDanger
    """
    # One single-element list per row; the query reads each id back as node[0].
    nodes = data[["node"]].values.tolist()
    params = {
        "nodes": nodes,
        "familyDangerProp": family_danger_prop,
        "livingDangerProp": living_danger_prop,
        "chatDangerProp": chat_danger_prop,
        "socialNetworkDangerProp": social_network_danger_prop,
        "crimeDangerProp": crime_danger_prop,
    }
    features = graph.run(query, params).to_data_frame()
    # Drop feature columns left over from an earlier call: merging them again
    # would yield *_x / *_y duplicates and break later `df[columns]` lookups.
    fetched = [col for col in features.columns if col != "node"]
    kept = [col for col in data.columns if col not in fetched]
    return pd.merge(data[kept], features, on=["node"])

In [0]:
# Each frame reads the danger properties written for its own half of the graph;
# both end up with the same five column names.
training_df = apply_community_detection_features(
    training_df,
    family_danger_prop="familyDangerTrain",
    living_danger_prop="livingDangerTrain",
    chat_danger_prop="chatDangerTrain",
    social_network_danger_prop="socialNetworkDangerTrain",
    crime_danger_prop="crimeDangerTrain",
)

test_df = apply_community_detection_features(
    test_df,
    family_danger_prop="familyDangerTest",
    living_danger_prop="livingDangerTest",
    chat_danger_prop="chatDangerTest",
    social_network_danger_prop="socialNetworkDangerTest",
    crime_danger_prop="crimeDangerTest",
)

In [0]:
# Second model: PageRank plus all five community "danger" counts.
centrality_columns = ["pagerank"]
community_columns = [
    "familyDanger", "livingDanger", "chatDanger", "socialNetworkDanger", "crimeDanger",
]  # community detection features
columns = centrality_columns + community_columns

X, y = training_df[columns], training_df["vulnerable"]
classifier.fit(X, y)

# Score on the held-out test frame.
y_test = test_df["vulnerable"]
predictions = classifier.predict(test_df[columns])

display(evaluate_model(predictions, y_test))
feature_importance(columns, classifier)

Removing unimportant features

In [0]:
# Final model: PageRank plus the two community features that are kept.
centrality_columns = ["pagerank"]
community_columns = ["socialNetworkDanger", "crimeDanger"]  # community detection features
columns = centrality_columns + community_columns

X, y = training_df[columns], training_df["vulnerable"]
classifier.fit(X, y)

# Score on the held-out test frame.
y_test = test_df["vulnerable"]
predictions = classifier.predict(test_df[columns])

display(evaluate_model(predictions, y_test))
feature_importance(columns, classifier)

# Summary

Accuracy: 95.8%

Precision: 100%

Recall: 91.7%


Features by importance: PageRank, Social Network Danger, Crime Danger