# Logistic Regression on the London Fire Brigade dataset

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import functions as F
import hashlib
from pyspark.ml.feature import StringIndexer
from datetime import datetime
from pyspark.sql.functions import UserDefinedFunction, col
from pyspark.sql.types import IntegerType

# TODO: REPLACE PATH with the one from your csv upload
csvPath = "/FileStore/tables/LFB_Incident_data_from_January_2017.csv"

# Read the comma-separated incident file; the first line supplies the column names.
df = spark.read.csv(csvPath, sep=",", header=True)

# Report how many rows were loaded before any filtering takes place.
totalRows = df.count()
print("Total number of rows parsed from csv files: ", totalRows)

Total number of rows parsed from csv files:  8175


In [0]:
# Show the raw DataFrame as loaded from the CSV for a first visual check.
# NOTE(review): display() is not defined in this notebook - presumably provided by the notebook environment.
display(df)

IncidentNumber,DateOfCall,TimeOfCall,IncidentGroup,StopCodeDescription,SpecialServiceType,PropertyCategory,PropertyType,AddressQualifier,Postcode_full,Postcode_district,IncGeo_BoroughCode,IncGeo_BoroughName,IncGeo_WardCode,IncGeo_WardName,Easting_m,Northing_m,Easting_rounded,Northing_rounded,FRS,IncidentStationGround,FirstPumpArriving_AttendanceTime,FirstPumpArriving_DeployedFromStation,SecondPumpArriving_AttendanceTime,SecondPumpArriving_DeployedFromStation,NumStationsWithPumpsAttending,NumPumpsAttending
000003-01012017,01.Jan.17,00:04:27,Special Service,Special Service,Lift Release,Dwelling,Purpose Built Flats/Maisonettes - 10 or more storeys,Within same building,,E9,E09000012,HACKNEY,E05009379,KING'S PARK,,,536650,185450,London,Homerton,,,,,1.0,1.0
000004-01012017,01.Jan.17,00:06:18,False Alarm,AFA,,Non Residential,Engineering manufacturing plant,Nearby address - street not listed in gazetteer,SE2 9SG,SE2,E09000011,GREENWICH,E05000214,ABBEY WOOD,547178.0,179210.0,547150,179250,London,Plumstead,835.0,Plumstead,,,1.0,1.0
000005-01012017,01.Jan.17,00:06:34,Special Service,Special Service,Effecting entry/exit,Dwelling,Purpose Built Flats/Maisonettes - Up to 3 storeys,Correct incident location,,N1,E09000019,ISLINGTON,E05000366,BARNSBURY,,,530750,183050,London,Euston,237.0,Euston,,,1.0,1.0
000006-01012017,01.Jan.17,00:07:57,Fire,Primary Fire,,Non Residential,Restaurant/cafe,Correct incident location,IG11 8TB,IG11,E09000002,BARKING AND DAGENHAM,E05000026,ABBEY,544654.0,184596.0,544650,184550,London,Barking,282.0,Barking,,,1.0,2.0
000007-01012017,01.Jan.17,00:08:59,Special Service,Special Service,Effecting entry/exit,Dwelling,Purpose Built Flats/Maisonettes - 10 or more storeys,Correct incident location,,SW8,E09000022,LAMBETH,E05000426,OVAL,,,530150,177950,London,Lambeth,,,,,1.0,1.0
000008-01012017,01.Jan.17,00:12:44,Fire,Secondary Fire,,Outdoor,Loose refuse,On land associated with building,UB6 8LY,UB6,E09000009,EALING,E05000178,GREENFORD GREEN,514211.0,183917.0,514250,183950,London,Northolt,415.0,Northolt,420.0,Northolt,1.0,2.0
000009-01012017,01.Jan.17,00:13:27,False Alarm,AFA,,Non Residential,Purpose built office,Correct incident location,EC1Y 8LZ,EC1Y,E09000019,ISLINGTON,E05000367,BUNHILL,532603.0,182151.0,532650,182150,London,Shoreditch,230.0,Shoreditch,,,1.0,1.0
000011-01012017,01.Jan.17,00:17:35,False Alarm,AFA,,Dwelling,Purpose Built Flats/Maisonettes - Up to 3 storeys,Within same building,,HA1,E09000005,BRENT,E05000096,NORTHWICK PARK,,,516750,187450,London,Wembley,421.0,Wembley,442.0,Wembley,1.0,2.0
000013-01012017,01.Jan.17,00:20:01,Fire,Primary Fire,,Outdoor Structure,Other outdoor structures,On land associated with building,IG5 0AZ,IG5,E09000026,REDBRIDGE,E05000496,BARKINGSIDE,542787.0,188990.0,542750,188950,London,Ilford,415.0,Ilford,882.0,Woodford,2.0,2.0
000015-01012017,01.Jan.17,00:22:37,Special Service,Special Service,Lift Release,Dwelling,Purpose Built Flats/Maisonettes - 10 or more storeys,Within same building,,E9,E09000012,HACKNEY,E05009379,KING'S PARK,,,536650,185450,London,Homerton,,,,,1.0,1.0


In [0]:
# Raw (string) columns that are used as model inputs further down.
features = ["Postcode_full", "AddressQualifier", "PropertyCategory"]

# Rows with a null in any of the feature columns cannot be used for fitting, so drop them.
print("Filtering out null values...")
filteredDf = df.na.drop(subset=features)
remainingRows = filteredDf.count()
print("Now remaining: ", remainingRows)

# For faster experiments the input can be truncated (e.g. with .limit(1000)) before this step.


Filtering out null values...
Now remaining:  4057


In [0]:
# Map the named incident groups to a binary numeric label:
# "False Alarm" -> 0.0, every other incident group -> 1.0.
# The column is referenced by name (F.col) so the condition is resolved against
# filteredDf itself instead of going through the unfiltered parent DataFrame `df`.
filteredDf = filteredDf.withColumn(
    "label",
    F.when(F.col("IncidentGroup") == "False Alarm", 0.0).otherwise(1.0),
)

# NOTE(review): this indexer is only configured here; no visible cell fits or applies it.
indexer_ZipCode = StringIndexer(inputCol="Postcode_district", outputCol="Postcode_district_indexed")

# Names of the indexed feature columns: one "<feature>_indexed" entry per raw feature.
indexedFeatures = [featureCol + "_indexed" for featureCol in features]

# Starting point for the indexing step; at this stage it is the labelled, null-free data.
df_withIndexedFeatures = filteredDf
display(df_withIndexedFeatures)

IncidentNumber,DateOfCall,TimeOfCall,IncidentGroup,StopCodeDescription,SpecialServiceType,PropertyCategory,PropertyType,AddressQualifier,Postcode_full,Postcode_district,IncGeo_BoroughCode,IncGeo_BoroughName,IncGeo_WardCode,IncGeo_WardName,Easting_m,Northing_m,Easting_rounded,Northing_rounded,FRS,IncidentStationGround,FirstPumpArriving_AttendanceTime,FirstPumpArriving_DeployedFromStation,SecondPumpArriving_AttendanceTime,SecondPumpArriving_DeployedFromStation,NumStationsWithPumpsAttending,NumPumpsAttending,label
000004-01012017,01.Jan.17,00:06:18,False Alarm,AFA,,Non Residential,Engineering manufacturing plant,Nearby address - street not listed in gazetteer,SE2 9SG,SE2,E09000011,GREENWICH,E05000214,ABBEY WOOD,547178,179210,547150,179250,London,Plumstead,835.0,Plumstead,,,1.0,1.0,0.0
000006-01012017,01.Jan.17,00:07:57,Fire,Primary Fire,,Non Residential,Restaurant/cafe,Correct incident location,IG11 8TB,IG11,E09000002,BARKING AND DAGENHAM,E05000026,ABBEY,544654,184596,544650,184550,London,Barking,282.0,Barking,,,1.0,2.0,1.0
000008-01012017,01.Jan.17,00:12:44,Fire,Secondary Fire,,Outdoor,Loose refuse,On land associated with building,UB6 8LY,UB6,E09000009,EALING,E05000178,GREENFORD GREEN,514211,183917,514250,183950,London,Northolt,415.0,Northolt,420.0,Northolt,1.0,2.0,1.0
000009-01012017,01.Jan.17,00:13:27,False Alarm,AFA,,Non Residential,Purpose built office,Correct incident location,EC1Y 8LZ,EC1Y,E09000019,ISLINGTON,E05000367,BUNHILL,532603,182151,532650,182150,London,Shoreditch,230.0,Shoreditch,,,1.0,1.0,0.0
000013-01012017,01.Jan.17,00:20:01,Fire,Primary Fire,,Outdoor Structure,Other outdoor structures,On land associated with building,IG5 0AZ,IG5,E09000026,REDBRIDGE,E05000496,BARKINGSIDE,542787,188990,542750,188950,London,Ilford,415.0,Ilford,882.0,Woodford,2.0,2.0,1.0
000031-01012017,01.Jan.17,00:36:52,Fire,Primary Fire,,Non Residential,Pub/wine bar/bar,Correct incident location,W3 9BH,W3,E09000009,EALING,E05000188,SOUTH ACTON,519802,180251,519850,180250,London,Acton,131.0,Acton,302.0,Chiswick,4.0,4.0,1.0
000045-01012017,01.Jan.17,00:42:14,Fire,Secondary Fire,,Outdoor,Loose refuse,On land associated with building,UB5 4JF,UB5,E09000009,EALING,E05000184,NORTHOLT MANDEVILLE,512576,184822,512550,184850,London,Northolt,298.0,Northolt,,,1.0,1.0,1.0
000046-01012017,01.Jan.17,00:46:32,Fire,Secondary Fire,,Outdoor Structure,Small refuse/rubbish container,In street outside gazetteer location,SE18 6LQ,SE18,E09000011,GREENWICH,E05000230,WOOLWICH RIVERSIDE,543535,178991,543550,178950,London,Plumstead,286.0,Plumstead,,,1.0,1.0,1.0
000048-01012017,01.Jan.17,00:47:35,False Alarm,AFA,,Non Residential,Multi-Storey car park,Within same building,E14 9SD,E14,E09000030,TOWER HAMLETS,E05009318,BLACKWALL & CUBITT TOWN,538153,179788,538150,179750,London,Millwall,104.0,Millwall,,,1.0,1.0,0.0
000049-01012017,01.Jan.17,00:47:43,Fire,Secondary Fire,,Outdoor,Park,On land associated with building,N1 6TA,N1,E09000012,HACKNEY,E05009377,HOXTON EAST & SHOREDITCH,532806,183457,532850,183450,London,Shoreditch,332.0,Shoreditch,,,1.0,1.0,1.0


In [0]:
# Convert every raw string feature into a numeric "<feature>_indexed" column.
# The chain always restarts from filteredDf (the labelled, null-free data), so this
# cell can be re-run safely: transforming df_withIndexedFeatures in place would fail
# on a second run because the "*_indexed" output columns would already exist.
indexedDf = filteredDf
for featureCol in features:
    indexer = StringIndexer(inputCol=featureCol, outputCol=featureCol + "_indexed")
    indexedDf = indexer.fit(indexedDf).transform(indexedDf)
df_withIndexedFeatures = indexedDf

# Show the first rows including the newly added indexed columns.
display(df_withIndexedFeatures)


IncidentNumber,DateOfCall,TimeOfCall,IncidentGroup,StopCodeDescription,SpecialServiceType,PropertyCategory,PropertyType,AddressQualifier,Postcode_full,Postcode_district,IncGeo_BoroughCode,IncGeo_BoroughName,IncGeo_WardCode,IncGeo_WardName,Easting_m,Northing_m,Easting_rounded,Northing_rounded,FRS,IncidentStationGround,FirstPumpArriving_AttendanceTime,FirstPumpArriving_DeployedFromStation,SecondPumpArriving_AttendanceTime,SecondPumpArriving_DeployedFromStation,NumStationsWithPumpsAttending,NumPumpsAttending,label,Postcode_full_indexed,AddressQualifier_indexed,PropertyCategory_indexed
000004-01012017,01.Jan.17,00:06:18,False Alarm,AFA,,Non Residential,Engineering manufacturing plant,Nearby address - street not listed in gazetteer,SE2 9SG,SE2,E09000011,GREENWICH,E05000214,ABBEY WOOD,547178,179210,547150,179250,London,Plumstead,835.0,Plumstead,,,1.0,1.0,0.0,2259.0,9.0,0.0
000006-01012017,01.Jan.17,00:07:57,Fire,Primary Fire,,Non Residential,Restaurant/cafe,Correct incident location,IG11 8TB,IG11,E09000002,BARKING AND DAGENHAM,E05000026,ABBEY,544654,184596,544650,184550,London,Barking,282.0,Barking,,,1.0,2.0,1.0,1400.0,0.0,0.0
000008-01012017,01.Jan.17,00:12:44,Fire,Secondary Fire,,Outdoor,Loose refuse,On land associated with building,UB6 8LY,UB6,E09000009,EALING,E05000178,GREENFORD GREEN,514211,183917,514250,183950,London,Northolt,415.0,Northolt,420.0,Northolt,1.0,2.0,1.0,2964.0,3.0,3.0
000009-01012017,01.Jan.17,00:13:27,False Alarm,AFA,,Non Residential,Purpose built office,Correct incident location,EC1Y 8LZ,EC1Y,E09000019,ISLINGTON,E05000367,BUNHILL,532603,182151,532650,182150,London,Shoreditch,230.0,Shoreditch,,,1.0,1.0,0.0,1092.0,0.0,0.0
000013-01012017,01.Jan.17,00:20:01,Fire,Primary Fire,,Outdoor Structure,Other outdoor structures,On land associated with building,IG5 0AZ,IG5,E09000026,REDBRIDGE,E05000496,BARKINGSIDE,542787,188990,542750,188950,London,Ilford,415.0,Ilford,882.0,Woodford,2.0,2.0,1.0,1430.0,3.0,4.0
000031-01012017,01.Jan.17,00:36:52,Fire,Primary Fire,,Non Residential,Pub/wine bar/bar,Correct incident location,W3 9BH,W3,E09000009,EALING,E05000188,SOUTH ACTON,519802,180251,519850,180250,London,Acton,131.0,Acton,302.0,Chiswick,4.0,4.0,1.0,419.0,0.0,0.0
000045-01012017,01.Jan.17,00:42:14,Fire,Secondary Fire,,Outdoor,Loose refuse,On land associated with building,UB5 4JF,UB5,E09000009,EALING,E05000184,NORTHOLT MANDEVILLE,512576,184822,512550,184850,London,Northolt,298.0,Northolt,,,1.0,1.0,1.0,2949.0,3.0,3.0
000046-01012017,01.Jan.17,00:46:32,Fire,Secondary Fire,,Outdoor Structure,Small refuse/rubbish container,In street outside gazetteer location,SE18 6LQ,SE18,E09000011,GREENWICH,E05000230,WOOLWICH RIVERSIDE,543535,178991,543550,178950,London,Plumstead,286.0,Plumstead,,,1.0,1.0,1.0,2241.0,1.0,4.0
000048-01012017,01.Jan.17,00:47:35,False Alarm,AFA,,Non Residential,Multi-Storey car park,Within same building,E14 9SD,E14,E09000030,TOWER HAMLETS,E05009318,BLACKWALL & CUBITT TOWN,538153,179788,538150,179750,London,Millwall,104.0,Millwall,,,1.0,1.0,0.0,152.0,2.0,0.0
000049-01012017,01.Jan.17,00:47:43,Fire,Secondary Fire,,Outdoor,Park,On land associated with building,N1 6TA,N1,E09000012,HACKNEY,E05009377,HOXTON EAST & SHOREDITCH,532806,183457,532850,183450,London,Shoreditch,332.0,Shoreditch,,,1.0,1.0,1.0,1534.0,3.0,3.0


In [0]:
def dayOfWeek(text):
    """Return the weekday index of a date given as text.

    Two layouts are accepted: ``27.Jan.09`` (dot separated) and ``27-Jan-09``
    (dash separated), i.e. day, abbreviated month name, two-digit year.

    Args:
        text: the date string, or None for a missing value.

    Returns:
        The result of ``datetime.weekday()``: an int where Monday is 0 and
        Sunday is 6. None is returned for a None input so that a null date
        yields a null result instead of failing inside ``strptime``.

    Raises:
        ValueError: if the text matches neither of the accepted layouts.
    """
    if text is None:
        return None
    for fmt in ('%d.%b.%y', '%d-%b-%y'):
        try:
            return datetime.strptime(text, fmt).weekday()
        except ValueError:
            # This layout did not match; try the next one.
            continue
    raise ValueError('no valid date format found')
  
  
# Wrap the Python helper as a Spark UDF that produces an integer column.
udf_getDayOfWeek = F.udf(dayOfWeek, IntegerType())

# Derive the two time-based features from the call date and the call time columns.
df_withIndexedFeatures = (
    df_withIndexedFeatures
    .withColumn("DayOfWeek", udf_getDayOfWeek(col("DateOfCall")))
    .withColumn("HourOfDay", F.hour(col("TimeOfCall")))
)

# Columns kept for modelling: the label first, then the indexed categorical
# features, then the two derived time features.
finalFeaturesList = ["label"] + indexedFeatures + ["DayOfWeek", "HourOfDay"]

featureVectorDf = df_withIndexedFeatures.select(finalFeaturesList)
display(featureVectorDf)

label,Postcode_full_indexed,AddressQualifier_indexed,PropertyCategory_indexed,DayOfWeek,HourOfDay
0.0,2259.0,9.0,0.0,6,0
1.0,1400.0,0.0,0.0,6,0
1.0,2964.0,3.0,3.0,6,0
0.0,1092.0,0.0,0.0,6,0
1.0,1430.0,3.0,4.0,6,0
1.0,419.0,0.0,0.0,6,0
1.0,2949.0,3.0,3.0,6,0
1.0,2241.0,1.0,4.0,6,0
0.0,152.0,2.0,0.0,6,0
1.0,1534.0,3.0,3.0,6,0


In [0]:
# Split into training (60%) and test (40%) data; the fixed seed keeps the split reproducible.
training, testing = featureVectorDf.randomSplit([0.6, 0.4], seed=42)

# Every column except the label is a model input. Selecting by name (instead of
# slicing columns[1:]) keeps this correct even if the column order changes.
featureColumns = [columnName for columnName in training.columns if columnName != "label"]

# Configure an ML pipeline, which consists of two stages: feature assembler and lr.
# The assembler merges the individual feature columns into one single vector column.
# NOTE(review): the "*_indexed" columns are category indices fed to the model as plain
# numbers; consider one-hot encoding them before the assembler.
assembler = VectorAssembler(inputCols=featureColumns, outputCol='features')
lr = LogisticRegression(maxIter=10, regParam=0.01)
pipeline = Pipeline(stages=[assembler, lr])

print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

LogisticRegression parameters:
aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The bou

In [0]:
# Fit the pipeline on the training split, then score the held-out test split.
model = pipeline.fit(training)
prediction = model.transform(testing)

# printSchema() writes the schema to stdout itself and returns None, so it is called
# on its own line; wrapping it in print() would print "None" after the label.
print("prediction-schema: ")
prediction.printSchema()


root
 |-- label: double (nullable = false)
 |-- Postcode_full_indexed: double (nullable = false)
 |-- AddressQualifier_indexed: double (nullable = false)
 |-- PropertyCategory_indexed: double (nullable = false)
 |-- DayOfWeek: integer (nullable = true)
 |-- HourOfDay: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)

prediction-schema:  None


In [0]:
# Keep only the columns needed to compare the model output with the true label.
selectedColumns = ["features", "label", "probability", "prediction"]
selected = prediction.select(selectedColumns)

# Printing the DataFrame object shows its column names and types, not its rows.
print(selected)

DataFrame[features: vector, label: double, probability: vector, prediction: double]


In [0]:
# Calculate prediction accuracy - step 1: the number of scored test rows.
numRows = selected.count()
print("Total Number of Predictions: ", numRows, sep="")

Total Number of Predictions: 1564


In [0]:
# To compute the number of correct predictions,
# select all rows where the prediction matches the label and count them.
correct = selected.filter(selected.label == selected.prediction).count()

# The accuracy is between 0 and 1. The total is taken from numRows (the count of
# `selected` computed in the previous cell) instead of triggering a second, identical
# count job. float() keeps this a true division on any Python version, and an empty
# test set yields NaN rather than a ZeroDivisionError.
accuracy = float(correct) / numRows if numRows else float("nan")
print("Accuracy: ", accuracy)

Accuracy:  0.7135549872122762
