In [1]:
# importing SparkSession
from pyspark.sql import SparkSession
# Build the Spark session for this analysis (getOrCreate reuses an existing
# session if one is already running).
spark = (
    SparkSession.builder
    .appName('Analyzing London Crime Data')
    .getOrCreate()
)

In [2]:
# Read the London crime CSV; the 'header' option makes Spark take the column
# names from the first line of the file.
data = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load('../datasets/london_crime.csv')
)

In [3]:
# Print each column's name, type and nullability as Spark loaded it.
data.printSchema()

root
 |-- lsoa_code: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- major_category: string (nullable = true)
 |-- minor_category: string (nullable = true)
 |-- value: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)



In [4]:
# Total number of rows in the dataset.
data.count()

13490604

In [5]:
# Preview the data: restrict to five rows, then print them as a table.
data.limit(5).show()

+---------+----------+--------------------+--------------------+-----+----+-----+
|lsoa_code|   borough|      major_category|      minor_category|value|year|month|
+---------+----------+--------------------+--------------------+-----+----+-----+
|E01001116|   Croydon|            Burglary|Burglary in Other...|    0|2016|   11|
|E01001646| Greenwich|Violence Against ...|      Other violence|    0|2016|   11|
|E01000677|   Bromley|Violence Against ...|      Other violence|    0|2015|    5|
|E01003774| Redbridge|            Burglary|Burglary in Other...|    0|2016|    3|
|E01004563|Wandsworth|             Robbery|   Personal Property|    0|2008|    6|
+---------+----------+--------------------+--------------------+-----+----+-----+



In [6]:
# Cleaning: remove every row that has a null in any column.
# DataFrame.dropna() returns a new DataFrame and leaves `data` untouched, so
# the result must be assigned back; calling it without assignment has no effect.
data = data.dropna()
# Bare expression so the cell still displays the cleaned DataFrame's columns.
data

DataFrame[lsoa_code: string, borough: string, major_category: string, minor_category: string, value: string, year: string, month: string]

In [7]:
# Drop the lsoa_code column, which this analysis treats as not useful, and
# rebind `data` to the resulting DataFrame.
data = data.drop('lsoa_code')
# Preview the first five rows without that column.
data.show(5)

+----------+--------------------+--------------------+-----+----+-----+
|   borough|      major_category|      minor_category|value|year|month|
+----------+--------------------+--------------------+-----+----+-----+
|   Croydon|            Burglary|Burglary in Other...|    0|2016|   11|
| Greenwich|Violence Against ...|      Other violence|    0|2016|   11|
|   Bromley|Violence Against ...|      Other violence|    0|2015|    5|
| Redbridge|            Burglary|Burglary in Other...|    0|2016|    3|
|Wandsworth|             Robbery|   Personal Property|    0|2008|    6|
+----------+--------------------+--------------------+-----+----+-----+
only showing top 5 rows



In [13]:
# Distinct values of the borough column (one row per borough).
total_boroughs = data.select('borough').distinct()
# show() with no argument prints at most the first 20 rows.
total_boroughs.show()
# Number of distinct boroughs; as the last expression it becomes the cell result.
total_boroughs.count()

+--------------------+
|             borough|
+--------------------+
|             Croydon|
|          Wandsworth|
|              Bexley|
|             Lambeth|
|Barking and Dagenham|
|              Camden|
|           Greenwich|
|              Newham|
|       Tower Hamlets|
|            Hounslow|
|              Barnet|
|              Harrow|
|Kensington and Ch...|
|           Islington|
|               Brent|
|            Haringey|
|             Bromley|
|              Merton|
|         Westminster|
|             Hackney|
+--------------------+
only showing top 20 rows



33

In [11]:
# Restrict the dataset to rows whose borough is exactly 'Hackney'.
is_hackney = data['borough'] == 'Hackney'
hackney_data = data.filter(is_hackney)
# Preview the first five matching rows.
hackney_data.show(5)

+-------+--------------------+--------------------+-----+----+-----+
|borough|      major_category|      minor_category|value|year|month|
+-------+--------------------+--------------------+-----+----+-----+
|Hackney|     Criminal Damage|Criminal Damage T...|    0|2011|    6|
|Hackney|Violence Against ...|          Harassment|    1|2013|    2|
|Hackney|     Criminal Damage|Other Criminal Da...|    0|2011|    7|
|Hackney|Violence Against ...|        Wounding/GBH|    0|2013|   12|
|Hackney|  Theft and Handling|  Other Theft Person|    0|2016|    8|
+-------+--------------------+--------------------+-----+----+-----+
only showing top 5 rows



In [16]:
# Keep only rows whose year is one of the listed values. The candidates are
# given as strings because `year` is a string column in this dataset.
data_2015_to_2016 = data.filter(data['year'].isin(['2015', '2016']))
# Display roughly 10% of the filtered rows. A fixed seed is supplied so that
# re-running the cell draws the same sample instead of a new one each time.
data_2015_to_2016.sample(fraction=0.1, seed=42).show()

+--------------------+--------------------+--------------------+-----+----+-----+
|             borough|      major_category|      minor_category|value|year|month|
+--------------------+--------------------+--------------------+-----+----+-----+
|            Haringey|Violence Against ...|        Wounding/GBH|    0|2015|   12|
|            Havering|    Fraud or Forgery|  Counted per Victim|    0|2015|   11|
|               Brent|  Theft and Handling|Motor Vehicle Int...|    0|2015|    9|
|            Hounslow|Violence Against ...|        Wounding/GBH|    2|2015|    8|
|         Westminster|               Drugs|    Drug Trafficking|    0|2015|    1|
|           Redbridge|            Burglary|Burglary in Other...|    0|2015|    2|
|               Brent|  Theft and Handling|  Other Theft Person|    1|2016|   12|
|             Enfield|     Criminal Damage|Other Criminal Da...|    0|2015|    9|
|          Hillingdon|            Burglary|Burglary in Other...|    0|2016|   12|
|          Wands

In [17]:
# Filter with a comparison operator: keep rows whose year is strictly greater
# than 2014. NOTE: despite the variable name, rows for 2014 itself are
# excluded by this condition.
data_2014_onwards = data.filter(data['year'] > 2014)
# Display roughly 10% of the filtered rows. A fixed seed is supplied so that
# re-running the cell draws the same sample instead of a new one each time.
data_2014_onwards.sample(fraction=0.1, seed=42).show()

+--------------------+--------------------+--------------------+-----+----+-----+
|             borough|      major_category|      minor_category|value|year|month|
+--------------------+--------------------+--------------------+-----+----+-----+
|           Redbridge|            Burglary|Burglary in Other...|    0|2016|    3|
|          Hillingdon|  Theft and Handling|Theft/Taking Of M...|    0|2016|    2|
|            Lewisham|Violence Against ...|      Common Assault|    0|2016|    2|
|      Waltham Forest|            Burglary|Burglary in Other...|    0|2015|    6|
|          Hillingdon|Violence Against ...|      Other violence|    0|2016|   11|
|       Tower Hamlets|            Burglary|Burglary in a Dwe...|    0|2016|    3|
|       Tower Hamlets|  Theft and Handling|Handling Stolen G...|    0|2015|   10|
|         Westminster|               Drugs|    Drug Trafficking|    0|2015|    1|
|             Croydon|  Theft and Handling|  Other Theft Person|    0|2016|    2|
|             Cr

In [19]:
# Group the rows by borough and count how many rows fall in each group.
borough_crime_count = (
    data
    .groupBy('borough')
    .count()
)
# Preview five of the per-borough counts.
borough_crime_count.show(5)

+--------------------+------+
|             borough| count|
+--------------------+------+
|             Croydon|602100|
|          Wandsworth|498636|
|              Bexley|385668|
|             Lambeth|519048|
|Barking and Dagenham|311040|
+--------------------+------+
only showing top 5 rows



In [21]:
# Per-borough total of the `value` column; the aggregate column is named
# 'sum(value)' by default.
borough_conviction_sum = (
    data
    .groupBy('borough')
    .agg({'value': 'sum'})
)
# Preview five of the per-borough totals.
borough_conviction_sum.show(5)

+--------------------+----------+
|             borough|sum(value)|
+--------------------+----------+
|             Croydon|  260294.0|
|          Wandsworth|  204741.0|
|              Bexley|  114136.0|
|             Lambeth|  292178.0|
|Barking and Dagenham|  149447.0|
+--------------------+----------+
only showing top 5 rows



In [22]:
# Same per-borough total as before, with the default 'sum(value)' column
# renamed to the more readable 'convictions'.
borough_conviction_sum = (
    data
    .groupBy('borough')
    .agg({'value': 'sum'})
    .withColumnRenamed('sum(value)', 'convictions')
)
# Preview five rows to confirm the new column name.
borough_conviction_sum.show(5)

+--------------------+-----------+
|             borough|convictions|
+--------------------+-----------+
|             Croydon|   260294.0|
|          Wandsworth|   204741.0|
|              Bexley|   114136.0|
|             Lambeth|   292178.0|
|Barking and Dagenham|   149447.0|
+--------------------+-----------+
only showing top 5 rows



In [26]:
# Chain several operations: keep the rows for 2014, group them by month, sum
# `value` within each month, and rename the aggregate column to 'convictions'.
conviction_monthly = (
    data
    .filter(data['year'] == 2014)
    .groupBy('month')
    .agg({'value': 'sum'})
    .withColumnRenamed('sum(value)', 'convictions')
)
# Bare expression: displays the resulting DataFrame's column names and types.
conviction_monthly

DataFrame[month: string, convictions: double]

In [27]:
# Print the first five rows of the monthly totals.
conviction_monthly.show(5)

+-----+-----------+
|month|convictions|
+-----+-----------+
|    7|    58564.0|
|   11|    59704.0|
|    3|    57669.0|
|    8|    55641.0|
|    5|    56327.0|
+-----+-----------+
only showing top 5 rows



In [30]:
# importing sql functions
import pyspark.sql.functions as func

# Each month's share of the overall total, as a percentage.
# The grand total is a plain Python number pulled back to the driver; it is
# kept under its own name so that `total_convictions_monthly` only ever refers
# to the DataFrame and never changes type within the cell.
total_convictions = conviction_monthly.agg({'convictions': 'sum'}).collect()[0][0]
total_convictions_monthly = conviction_monthly.withColumn(
    'percent',
    # month total / grand total * 100, rounded to two decimal places
    func.round(conviction_monthly.convictions / total_convictions * 100, 2)
)
# Show the column names to confirm 'percent' was added.
total_convictions_monthly.columns

['month', 'convictions', 'percent']

In [31]:
# Print the months ranked from the highest to the lowest percentage.
by_percent_desc = total_convictions_monthly.percent.desc()
total_convictions_monthly.orderBy(by_percent_desc).show()

+-----+-----------+-------+
|month|convictions|percent|
+-----+-----------+-------+
|   10|    60537.0|    8.9|
|   11|    59704.0|   8.78|
|    7|    58564.0|   8.61|
|    3|    57669.0|   8.48|
|   12|    57565.0|   8.46|
|    6|    57039.0|   8.39|
|    9|    56933.0|   8.37|
|    5|    56327.0|   8.28|
|    8|    55641.0|   8.18|
|    1|    55515.0|   8.16|
|    4|    53467.0|   7.86|
|    2|    51222.0|   7.53|
+-----+-----------+-------+

