# CLASSIFICATION OF CRIME SOLVED

In [1]:
# initialize a new Spark Context to use for the execution of the script
from pyspark import SparkContext
sc = SparkContext(appName="MY-APP-NAME", master="local[*]")
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)

# importing all the necessary libraries for building and evaluating the implemented models
import numpy as np
import pandas as pd
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, Bucketizer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.util import MLUtils
from pyspark.ml import Pipeline

In [2]:
df = sqlCtx.read.load("Homicide_Undersampled.csv", format="csv", sep=",", inferSchema="true", header="true")

In [3]:
for attribute in ['Agency_Type', 'State', 'Month', 'Victim_Sex', 'Victim_Race','Weapon','Victim_Age']:
    indexer = StringIndexer(inputCol=attribute, outputCol=attribute+"_index")
    df = indexer.fit(df).transform(df)

In [4]:
df = df.drop('Agency_Type', 'State', 'Month', 'Victim_Sex', 'Victim_Race', 'Victim_Ethnicity','Weapon','Agency_Code', 'Perpetrator_Age','Perpetrator_Sex','Perpetrator_Race','Perpetrator_Ethnicity','Relationship','Record_Source','City_State', 'Record_ID','Victim_Age')

In [5]:
indexer = StringIndexer(inputCol="Crime_Solved", outputCol="label")
x = indexer.fit(df)
df = x.transform(df)

In [6]:
df.columns

['Year',
 'Crime_Solved',
 'Agency_Type_index',
 'State_index',
 'Month_index',
 'Victim_Sex_index',
 'Victim_Race_index',
 'Weapon_index',
 'Victim_Age_index',
 'label']

# Logistic Regression

In [7]:
df_log = df.drop('Crime_Solved')

In [8]:
data = df_log.rdd

In [9]:
labeled_data = data.map(lambda x: LabeledPoint(x[8], x[0:8])) 

In [10]:
# defining a function for calculating evaluations metrics
def evaluation(predictions):
    print('Accuracy:', MulticlassMetrics(predictions).accuracy)
    print('Precision 1.0:', MulticlassMetrics(predictions).precision('1.0'))
    print('Precision 0.0:', MulticlassMetrics(predictions).precision('0.0'))
    print('Recall 1.0:', MulticlassMetrics(predictions).recall('1.0'))
    print('Recall 0.0:', MulticlassMetrics(predictions).recall('0.0'))
    print('F1:', MulticlassMetrics(predictions).fMeasure(1.0))
    print('Area under PR:', BinaryClassificationMetrics(predictions).areaUnderPR)
    print('Area under ROC:', BinaryClassificationMetrics(predictions).areaUnderROC)

In [11]:
# Split data into training (70%) and test (30%)
training, test = labeled_data.randomSplit([0.7, 0.3], seed=11)
training.cache()

# Run training algorithm to build the model
model = LogisticRegressionWithLBFGS.train(training)

# Compute raw scores on the test set
predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

In [12]:
evaluation(predictionAndLabels)

Accuracy: 0.5807803757904905
Precision 1.0: 0.5691256268687412
Precision 0.0: 0.5966130106810513
Recall 1.0: 0.6571376381404694
Recall 0.0: 0.5047729876175064
F1: 0.6099732103496219
Area under PR: 0.5570785107449165
Area under ROC: 0.5809553128789879


In [13]:
trainErr = predictionAndLabels.filter(lambda lp: lp[1] != lp[0]).count() / float(data.count()) 
print("Training Error = " + str(trainErr))

Training Error = 0.12558039236024862


# MLP

In [14]:
assembler = VectorAssembler(
    inputCols=['Year','Agency_Type_index','State_index', 'Month_index',
    'Victim_Sex_index','Victim_Race_index','Weapon_index','Victim_Age_index'],
    outputCol="features")
df_mlp = assembler.transform(df)

In [15]:
df_mlp = df_mlp.drop('Year',
'Victim_Age_index',
 'Agency_Type_index',
 'State_index',
 'Month_index',
 'Victim_Sex_index',
 'Victim_Race_index',
 'Weapon_index', 'Crime_solved')

In [16]:
df_mlp.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|[1980.0,0.0,37.0,...|
|  0.0|[1980.0,0.0,37.0,...|
|  0.0|[1980.0,0.0,37.0,...|
|  0.0|[1980.0,0.0,37.0,...|
|  0.0|(8,[0,2,3,7],[198...|
|  0.0|[1980.0,3.0,37.0,...|
|  0.0|[1980.0,3.0,37.0,...|
|  0.0|[1980.0,3.0,37.0,...|
|  0.0|[1980.0,3.0,37.0,...|
|  0.0|[1980.0,3.0,37.0,...|
|  0.0|[1980.0,3.0,37.0,...|
|  0.0|[1980.0,1.0,18.0,...|
|  0.0|[1980.0,1.0,18.0,...|
|  0.0|(8,[0,1,2],[1980....|
|  0.0|[1980.0,1.0,18.0,...|
|  0.0|[1980.0,1.0,18.0,...|
|  0.0|[1980.0,1.0,18.0,...|
|  0.0|[1980.0,1.0,18.0,...|
|  0.0|(8,[0,2,3,7],[198...|
|  0.0|(8,[0,2,3,6],[198...|
+-----+--------------------+
only showing top 20 rows



In [17]:
# Split the data into train and test
(training, test) = df_mlp.randomSplit([0.7, 0.3],seed=0)

# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [8, 16, 12, 6, 4, 2]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=1000, layers=layers, blockSize=256, seed=0)

# train the model
model = trainer.fit(training)

In [18]:
# Create new evaluators since the others give an error due to the model classifying everything identically
evaluatorMulti = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="prediction", metricName='areaUnderROC')

# Make predicitons
predictionAndTarget = model.transform(test).select("label", "prediction")

In [19]:
# Get metrics
acc = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "accuracy"})
f1 = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "f1"})
weightedPrecision = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "weightedPrecision"})
weightedRecall = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "weightedRecall"})
auc = evaluator.evaluate(predictionAndTarget)

In [20]:
print("Accuracy = " + str(acc))
print("F1 score = " + str(f1))
print("Weighted Precision = " + str(weightedPrecision))
print("Weighted Recall = " + str(weightedRecall))
print("AUC = " + str(auc))

Accuracy = 0.5003602206921709
F1 score = 0.33373365542258593
Weighted Precision = 0.250360350451118
Weighted Recall = 0.5003602206921709
AUC = 0.5


In [21]:
predictionAndTarget.select('prediction').distinct().show()
# as suspected, MLP classifies every record as solved, resulting in performances not better than the random
# classifier; therefore this model shouldn't be exploited for further analysis

+----------+
|prediction|
+----------+
|       0.0|
+----------+

