# CLASSIFICATION OF CRIME SOLVED ON SUBSAMPLES OF THE DATASET

In [1]:
# initialize a new Spark Context to use for the execution of the script
from pyspark import SparkContext
sc = SparkContext(appName="MY-APP-NAME", master="local[*]")
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)

# importing all the necessary libraries for building and evaluating the implemented models
import numpy as np
import pandas as pd
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel 
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, Bucketizer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
from pyspark.mllib.util import MLUtils

## California

In [2]:
# uploading the already subsampled dataset containing only homicides commited in California
df_cali = sqlCtx.read.load("Homicide_California.csv", format="csv", sep=",", inferSchema="true", header="true")

In [3]:
# creating index columns
for attribute in ['Agency_Type', 'State', 'Month', 'Victim_Sex', 'Victim_Race','Weapon','Victim_Age','Crime_Solved']:
    indexer = StringIndexer(inputCol=attribute, outputCol=attribute+"_index")
    df_cali = indexer.fit(df_cali).transform(df_cali)

In [4]:
df_cali.count()

130845

In [5]:
df_cali = df_cali.drop('Agency_Type', 'State', 'Month', 'Victim_Sex', 'Victim_Race', 'Victim_Ethnicity','Weapon','Agency_Code', 'Perpetrator_Age','Perpetrator_Sex','Perpetrator_Race','Perpetrator_Ethnicity','Relationship','Record_Source','City_State', 'Record_ID','Victim_Age','Crime_Solved')

In [6]:
data_cali = df_cali.rdd #transforming the data into an rdd 

In [7]:
# transform each record to a labeled point in order to feed it to the model
labeled_data = data_cali.map(lambda x: LabeledPoint(x[8], x[0:8]))

In [8]:
# Split data into training (70%) and test (30%)
training, test = labeled_data.randomSplit([0.7, 0.3], seed=11)
training.cache()

# Run training algorithm to build the model
model = LogisticRegressionWithLBFGS.train(training)

# Compute raw scores on the test set
predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

In [9]:
# defining a function for calculating evaluations metrics
def evaluation(predictions):
    print('Accuracy:', MulticlassMetrics(predictions).accuracy)
    print('Precision 1.0:', MulticlassMetrics(predictions).precision('1.0'))
    print('Precision 0.0:', MulticlassMetrics(predictions).precision('0.0'))
    print('Recall 1.0:', MulticlassMetrics(predictions).recall('1.0'))
    print('Recall 0.0:', MulticlassMetrics(predictions).recall('0.0'))
    print('F1:', MulticlassMetrics(predictions).fMeasure(1.0))
    print('Area under PR:', BinaryClassificationMetrics(predictions).areaUnderPR)
    print('Area under ROC:', BinaryClassificationMetrics(predictions).areaUnderROC)

In [10]:
evaluation(predictionAndLabels)

Accuracy: 0.5654921829021775
Precision 1.0: 0.5637057670483855
Precision 0.0: 0.5660813937661631
Recall 1.0: 0.29995608256477824
Recall 0.0: 0.7973160795590702
F1: 0.3915582786914616
Area under PR: 0.5295447032752114
Area under ROC: 0.5486360810619243


In [11]:
# computing training error
trainErr = predictionAndLabels.filter(lambda lp: lp[1] != lp[0]).count() / float(data_cali.count()) 
print("Training Error = " + str(trainErr))

Training Error = 0.12977951010737895


## New York
##### Repeting all the previous steps on NY's homicides

In [12]:
df_ny = sqlCtx.read.load("Homicide_NY.csv", format="csv", sep=",", inferSchema="true", header="true")

In [13]:
for attribute in ['Agency_Type', 'State', 'Month', 'Victim_Sex', 'Victim_Race','Weapon','Victim_Age','Crime_Solved']:
    indexer = StringIndexer(inputCol=attribute, outputCol=attribute+"_index")
    df_ny = indexer.fit(df_ny).transform(df_ny)

In [14]:
df_ny.count()

46586

In [15]:
df_ny = df_ny.drop('Agency_Type', 'State', 'Month', 'Victim_Sex', 'Victim_Race', 'Victim_Ethnicity','Weapon','Agency_Code', 'Perpetrator_Age','Perpetrator_Sex','Perpetrator_Race','Perpetrator_Ethnicity','Relationship','Record_Source','City_State', 'Record_ID','Victim_Age','Crime_Solved')

In [16]:
data_ny = df_ny.rdd

In [17]:
labeled_data = data_ny.map(lambda x: LabeledPoint(x[8], x[0:8])) 

In [18]:
# Split data into training (70%) and test (30%)
training, test = labeled_data.randomSplit([0.7, 0.3], seed=11)
training.cache()

# Run training algorithm to build the model
model = LogisticRegressionWithLBFGS.train(training)

# Compute raw scores on the test set
predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

In [19]:
evaluation(predictionAndLabels)

Accuracy: 0.5733132573313258
Precision 1.0: 0.542232630757221
Precision 0.0: 0.5997083001856272
Recall 1.0: 0.534966112138016
Recall 0.0: 0.6067069081153588
F1: 0.5385748623710941
Area under PR: 0.524385517377006
Area under ROC: 0.5708365101266875


In [20]:
trainErr = predictionAndLabels.filter(lambda lp: lp[1] != lp[0]).count() / float(data_ny.count()) 
print("Training Error = " + str(trainErr))

Training Error = 0.12774224015798738


## Texas
##### Repeting all the previous steps on Texas' homicides

In [21]:
df_tex = sqlCtx.read.load("Homicide_Texas.csv", format="csv", sep=",", inferSchema="true", header="true")

In [22]:
for attribute in ['Agency_Type', 'State', 'Month', 'Victim_Sex', 'Victim_Race','Weapon','Victim_Age','Crime_Solved']:
    indexer = StringIndexer(inputCol=attribute, outputCol=attribute+"_index")
    df_tex = indexer.fit(df_tex).transform(df_tex)

In [23]:
df_tex.count()

87506

In [24]:
df_tex = df_tex.drop('Agency_Type', 'State', 'Month', 'Victim_Sex', 'Victim_Race', 'Victim_Ethnicity','Weapon','Agency_Code', 'Perpetrator_Age','Perpetrator_Sex','Perpetrator_Race','Perpetrator_Ethnicity','Relationship','Record_Source','City_State', 'Record_ID','Victim_Age','Crime_Solved')

In [25]:
data_tex = df_tex.rdd

In [26]:
labeled_data = data_tex.map(lambda x: LabeledPoint(x[8], x[0:8])) 

In [27]:
# Split data into training (70%) and test (30%)
training, test = labeled_data.randomSplit([0.7, 0.3], seed=11)
training.cache()

# Run training algorithm to build the model
model = LogisticRegressionWithLBFGS.train(training)

# Compute raw scores on the test set
predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

In [28]:
evaluation(predictionAndLabels)

Accuracy: 0.5219155533437967
Precision 1.0: 0.47255619445896496
Precision 0.0: 0.5303852536215634
Recall 1.0: 0.1472432608518609
Recall 0.0: 0.8542328806703265
F1: 0.22452654455138155
Area under PR: 0.4714857115569134
Area under ROC: 0.5007380707610938


In [29]:
trainErr = predictionAndLabels.filter(lambda lp: lp[1] != lp[0]).count() / float(data_ny.count()) 
print("Training Error = " + str(trainErr))

Training Error = 0.26808483235306746
