In [1]:
from pyspark import SparkContext
sc = SparkContext(appName="MY-APP-NAME", master="local[*]")

from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)

from pyspark.sql.functions import *
import pandas as pd
from pyspark.ml.feature import Bucketizer
from pyspark.sql.types import *

In [2]:
df = sqlCtx.read.load("database_Homicide.csv",
                     format="csv", sep=",", inferSchema="true", header="true")

### Columns analysis

In [3]:
# Add column City_State
df=df.withColumn('City_State', concat(col("City"),lit(","),col("State")))

In [4]:
# Cast of Perpetrator_Age as integer
df=df.withColumn('Perpetrator_Age', col('Perpetrator Age').cast(IntegerType()))
df=df.drop('Perpetrator Age')

In [5]:
df=df.withColumn('Crime_Solved', col('Crime Solved'))

In [6]:
# Drop columns
df=df.drop('Record ID','Incident','Victim Count','Perpetrator Count','Agency Name','City','Crime Type','Crime Solved','Record Source')
df.dtypes

[('Agency Code', 'string'),
 ('Agency Type', 'string'),
 ('State', 'string'),
 ('Year', 'int'),
 ('Month', 'string'),
 ('Victim Sex', 'string'),
 ('Victim Age', 'int'),
 ('Victim Race', 'string'),
 ('Victim Ethnicity', 'string'),
 ('Perpetrator Sex', 'string'),
 ('Perpetrator Race', 'string'),
 ('Perpetrator Ethnicity', 'string'),
 ('Relationship', 'string'),
 ('Weapon', 'string'),
 ('City_State', 'string'),
 ('Perpetrator_Age', 'int'),
 ('Crime_Solved', 'string')]

In [7]:
# Renamed columns
df = df.withColumnRenamed("Agency Code", "Agency_Code")\
.withColumnRenamed("Agency Type", "Agency_Type")\
.withColumnRenamed("Victim Age", "Victim_Age")\
.withColumnRenamed("Victim Sex", "Victim_Sex")\
.withColumnRenamed("Victim Race", "Victim_Race")\
.withColumnRenamed("Victim Ethnicity", "Victim_Ethnicity")\
.withColumnRenamed("Perpetrator Sex", "Perpetrator_Sex")\
.withColumnRenamed("Perpetrator Race", "Perpetrator_Race")\
.withColumnRenamed("Perpetrator Ethnicity", "Perpetrator_Ethnicity")


In [8]:
print("The number of rows and columns: ",df.count(),",",len(df.columns))

The number of row and columns:  638454 , 17


In [9]:
# DataFrame
pandas_df = df.select("*").toPandas()
pandas_df

Unnamed: 0,Agency_Code,Agency_Type,State,Year,Month,Victim_Sex,Victim_Age,Victim_Race,Victim_Ethnicity,Perpetrator_Sex,Perpetrator_Race,Perpetrator_Ethnicity,Relationship,Weapon,City_State,Perpetrator_Age,Crime_Solved
0,AK00101,Municipal Police,Alaska,1980,January,Male,14,Native American/Alaska Native,Unknown,Male,Native American/Alaska Native,Unknown,Acquaintance,Blunt Object,"Anchorage,Alaska",15.0,Yes
1,AK00101,Municipal Police,Alaska,1980,March,Male,43,White,Unknown,Male,White,Unknown,Acquaintance,Strangulation,"Anchorage,Alaska",42.0,Yes
2,AK00101,Municipal Police,Alaska,1980,March,Female,30,Native American/Alaska Native,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,"Anchorage,Alaska",0.0,No
3,AK00101,Municipal Police,Alaska,1980,April,Male,43,White,Unknown,Male,White,Unknown,Acquaintance,Strangulation,"Anchorage,Alaska",42.0,Yes
4,AK00101,Municipal Police,Alaska,1980,April,Female,30,Native American/Alaska Native,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,"Anchorage,Alaska",0.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
638449,WY01500,Sheriff,Wyoming,2014,January,Male,30,White,Hispanic,Unknown,Unknown,Unknown,Unknown,Handgun,"Park,Wyoming",0.0,No
638450,WY01700,Sheriff,Wyoming,2014,June,Male,62,White,Unknown,Male,White,Unknown,Acquaintance,Handgun,"Sheridan,Wyoming",57.0,Yes
638451,WY01701,Municipal Police,Wyoming,2014,September,Female,0,Asian/Pacific Islander,Unknown,Female,Asian/Pacific Islander,Unknown,Daughter,Suffocation,"Sheridan,Wyoming",22.0,Yes
638452,WY01800,Sheriff,Wyoming,2014,December,Male,55,White,Not Hispanic,Male,White,Not Hispanic,Stranger,Knife,"Sublette,Wyoming",31.0,Yes


### Missing values

In [10]:
# Elimination Victim_Sex ="Unknown"
df=df.filter(df.Victim_Sex!="Unknown")
# Elimination Victim_Age =998 e 99
df=df.filter(df.Victim_Age<99)
# Elimination Victim_Race ="Unknown"
df=df.filter(df.Victim_Race!="Unknown")
# Elimination Perp_Age=1
df=df.filter(df.Perpetrator_Age!=1)
# Elimination Perp_Age=99
df=df.filter(df.Perpetrator_Age!=99)

In [11]:
# Elimination Perp_Age<10
df.createOrReplaceTempView("homicide")
df=sqlCtx.sql("SELECT * FROM homicide WHERE Perpetrator_Age==0 OR Perpetrator_Age>=10")

In [12]:
# Create view
df.createOrReplaceTempView("homicide")
# Elimination Perp_Sex = Unknown, Perp_Race="Unknown" e Crime_Solved=YES
df=sqlCtx.sql("SELECT * FROM homicide WHERE Crime_Solved='No' OR (Crime_Solved='Yes' AND Perpetrator_Sex!='Unknown' AND Perpetrator_Race!='Unknown')")

In [13]:
print("The number of rows and columns: ",df.count(),",",len(df.columns))

The number of row and columns:  618161 , 17


### Some other Trasformation

In [14]:
df=df.withColumn('Relationship',regexp_replace('Relationship','Boyfriend','Boyfriend/Girlfriend'))

In [15]:
# Strange cases B/G --> B/B/G
df=df.withColumn('Relationship',regexp_replace('Relationship','Boyfriend/Girlfriend/Girlfriend','Boyfriend/Girlfriend'))

In [16]:
df=df.withColumn('Relationship',regexp_replace('Relationship','Girlfriend','Boyfriend/Girlfriend'))

In [17]:
df=df.withColumn('Relationship',regexp_replace('Relationship','Boyfriend/Boyfriend/Girlfriend','Boyfriend/Girlfriend'))

In [18]:
df=df.withColumn('Relationship',regexp_replace('Relationship','Employer','Employee'))

In [19]:
# Create view
df.createOrReplaceTempView("homicide")

In [20]:
for col in df.columns:
    x=df.select(col).distinct().count()
    print('Column',col, 'has',x,'distinct values')

Column Agency_Code has 11925 distinct values
Column Agency_Type has 7 distinct values
Column State has 51 distinct values
Column Year has 35 distinct values
Column Month has 12 distinct values
Column Victim_Sex has 2 distinct values
Column Victim_Age has 99 distinct values
Column Victim_Race has 4 distinct values
Column Victim_Ethnicity has 3 distinct values
Column Perpetrator_Sex has 3 distinct values
Column Perpetrator_Race has 5 distinct values
Column Perpetrator_Ethnicity has 3 distinct values
Column Relationship has 25 distinct values
Column Weapon has 16 distinct values
Column City_State has 3039 distinct values
Column Perpetrator_Age has 90 distinct values
Column Crime_Solved has 2 distinct values


In [21]:
print("The final number of row and columns: ",df.count(),",",len(df.columns))

The final number of row and columns:  618161 , 17


In [22]:
df.toPandas().to_csv("Homicide_clean_final.csv", index = False)

## Categorization Victim_Age e Perp_Age

In [23]:
bucketizer = Bucketizer(splits=[0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100, float('Inf') ],inputCol="Victim_Age", outputCol="Victim_Age_Disc")
df = bucketizer.setHandleInvalid("keep").transform(df)
df=df.drop('Victim_Age')

In [24]:
df = df.withColumnRenamed("Victim_Age_Disc", "Victim_Age")

In [25]:
t = {0.0:"[0-5]", 1.0: "[6-10]", 2.0:"[11-15]", 3.0: "[16-20]",
    4.0: "[21-25]",
    5.0: "[26-30]",
    6.0: "[31-35]",
    7.0: "[36-40]",
    8.0: "[41-45]",
    9.0: "[46-50]",
    10.0: "[51-55]",
    11.0: "[56-60]",
    12.0: "[61-65]",
    13.0: "[66-70]",
    14.0: "[71-75]",
    15.0: "[76-80]",
    16.0: "[81-85]",
    17.0: "[86-90]",
     18.0: "[91-95]",
     19.0: "[96-100]"
    }
udf_foo = udf(lambda x: t[x], StringType())
df=df.withColumn("Victim_Age", udf_foo("Victim_Age"))
df.select('Victim_Age').show(10)

+----------+
|Victim_Age|
+----------+
|   [11-15]|
|   [41-45]|
|   [31-35]|
|   [41-45]|
|   [31-35]|
|   [31-35]|
|   [41-45]|
|   [31-35]|
|   [36-40]|
|   [21-25]|
+----------+
only showing top 10 rows



In [26]:
bucketizer = Bucketizer(splits=[ 0,10,20,30,40,50,60,70,80,90,100, float('Inf') ],inputCol="Perpetrator_Age", outputCol="Perpetrator_Age_Disc")
df = bucketizer.setHandleInvalid("keep").transform(df)
df=df.drop('Perpetrator_Age')
df = df.withColumnRenamed("Perpetrator_Age_Disc", "Perpetrator_Age")

In [27]:
t = {0.0:"[0-10]", 
    1.0: "[11-20]",
    2.0:"[21-30]",
    3.0: "[31-40]",
    4.0: "[41-50]",
    5.0: "[51-60]",
    6.0: "[61-70]",
    7.0: "[71-80]",
    8.0: "[81-90]",
    9.0: "[91-100]",
    }

udf_foo = udf(lambda x: t[x], StringType())
df=df.withColumn("Perpetrator_Age", udf_foo("Perpetrator_Age"))
df.select('Perpetrator_Age').show(10)

+---------------+
|Perpetrator_Age|
+---------------+
|        [11-20]|
|        [41-50]|
|         [0-10]|
|        [41-50]|
|         [0-10]|
|        [31-40]|
|        [21-30]|
|         [0-10]|
|         [0-10]|
|        [41-50]|
+---------------+
only showing top 10 rows



In [28]:
df.dtypes

[('Agency_Code', 'string'),
 ('Agency_Type', 'string'),
 ('State', 'string'),
 ('Year', 'int'),
 ('Month', 'string'),
 ('Victim_Sex', 'string'),
 ('Victim_Race', 'string'),
 ('Victim_Ethnicity', 'string'),
 ('Perpetrator_Sex', 'string'),
 ('Perpetrator_Race', 'string'),
 ('Perpetrator_Ethnicity', 'string'),
 ('Relationship', 'string'),
 ('Weapon', 'string'),
 ('City_State', 'string'),
 ('Crime_Solved', 'string'),
 ('Victim_Age', 'string'),
 ('Perpetrator_Age', 'string')]

# Save database

In [29]:
print("The final number of row and columns: ",df.count(),",",len(df.columns))

The final number of row and columns:  618161 , 17


In [30]:
df.toPandas().to_csv("Homicide_clean_Discr.csv", index = False)