In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.functions import desc
from pyspark.sql.types import *
from pyspark import SparkContext

In [2]:
# Build (or reuse) the session through the builder so this cell can be re-run
# on a live kernel: constructing SparkContext directly raises when a context
# is already running, whereas getOrCreate() returns the existing one.
spark = SparkSession.builder.master("local").appName("TP1").getOrCreate()
# Keep `sc` available; it is the context owned by the session.
sc = spark.sparkContext

In [26]:
# Load the inspections CSV: comma-separated, first line used as the header,
# column types inferred from the data.
df1 = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load("food.csv")
)

In [27]:
# Print a preview table of the loaded rows, then the column names and types.
df1.show() 
df1.printSchema()

+-------------+--------------------+--------------------+---------+--------------------+---------------+--------------------+-------+-----+-----+---------------+--------------------+------------------+--------------------+------------------+------------------+--------------------+--------------------------+---------+---------------+-------------+-----+
|Inspection ID|            DBA Name|            AKA Name|License #|       Facility Type|           Risk|             Address|   City|State|  Zip|Inspection Date|     Inspection Type|           Results|          Violations|          Latitude|         Longitude|            Location|Historical Wards 2003-2015|Zip Codes|Community Areas|Census Tracts|Wards|
+-------------+--------------------+--------------------+---------+--------------------+---------------+--------------------+-------+-----+-----+---------------+--------------------+------------------+--------------------+------------------+------------------+--------------------+---------

In [31]:
# Keep only the rows where the inspection id, the business name and the
# result are all present.
inspections = df1.dropna(subset=['Inspection ID', 'DBA Name', 'Results'])

In [32]:
# Print a preview table of the rows kept in `inspections`.
inspections.show()

+-------------+--------------------+--------------------+---------+--------------------+---------------+--------------------+-------+-----+-----+---------------+--------------------+------------------+--------------------+------------------+------------------+--------------------+--------------------------+---------+---------------+-------------+-----+
|Inspection ID|            DBA Name|            AKA Name|License #|       Facility Type|           Risk|             Address|   City|State|  Zip|Inspection Date|     Inspection Type|           Results|          Violations|          Latitude|         Longitude|            Location|Historical Wards 2003-2015|Zip Codes|Community Areas|Census Tracts|Wards|
+-------------+--------------------+--------------------+---------+--------------------+---------------+--------------------+-------+-----+-----+---------------+--------------------+------------------+--------------------+------------------+------------------+--------------------+---------

In [34]:
# Number of inspections per "Results" value, smallest count first.
# Note: this is computed on the raw frame `df1`, not on `inspections`.
nbre = (
    df1.groupBy("Results")
    .count()
    .orderBy("count")
)

In [54]:
# show() is a display function: it prints the table (up to 20 rows by default)
# and returns nothing, so `x` ends up being None.
x = nbre.show()
# take(4) returns a Python list holding the first 4 rows; the list is not
# displayed here because this is not the last expression of the cell.
nbre.take(4)
# explain(True) prints the execution plans of the query.
nbre.explain(True)
# describe() returns a new DataFrame; as the last expression of the cell only
# its repr is displayed, not the statistics themselves.
nbre.describe()

+--------------------+------+
|             Results| count|
+--------------------+------+
|Business Not Located|    67|
|           Not Ready|  1859|
|            No Entry|  6188|
|     Out of Business| 16784|
|  Pass w/ Conditions| 26780|
|                Fail| 37698|
|                Pass|105528|
+--------------------+------+

== Parsed Logical Plan ==
'Sort ['count ASC NULLS FIRST], true
+- Aggregate [Results#1032], [Results#1032, count(1) AS count#1456L]
   +- Relation[Inspection ID#1020,DBA Name#1021,AKA Name#1022,License ##1023,Facility Type#1024,Risk#1025,Address#1026,City#1027,State#1028,Zip#1029,Inspection Date#1030,Inspection Type#1031,Results#1032,Violations#1033,Latitude#1034,Longitude#1035,Location#1036,Historical Wards 2003-2015#1037,Zip Codes#1038,Community Areas#1039,Census Tracts#1040,Wards#1041] csv

== Analyzed Logical Plan ==
Results: string, count: bigint
Sort [count#1456L ASC NULLS FIRST], true
+- Aggregate [Results#1032], [Results#1032, count(1) AS count#1456L]
 

DataFrame[summary: string, Results: string, count: string]

In [51]:
def labelForResults(s):
    """Map an inspection result string to a numeric label.

    Returns 0.0 for 'Fail', 1.0 for 'Pass' or 'Pass w/ Conditions', and -1.0
    for any other value (including None). The comparison is exact and
    case-sensitive.
    """
    passing = ('Pass w/ Conditions', 'Pass')
    if s == 'Fail':
        label = 0.0
    else:
        label = 1.0 if s in passing else -1.0
    return label

In [53]:
# Wrap labelForResults as a Spark UDF that produces a double column.
monudf = udf(labelForResults, returnType=DoubleType())

In [82]:
# Build the (label, violations) frame and discard the rows whose result maps
# to the -1.0 "other" label.
labeledData = (
    inspections
    .select(monudf(inspections["results"]).alias('label'), "violations")
    .filter('label >= 0')
)

In [83]:
# Print a preview table of the labelled rows (label, violations).
labeledData.show()

+-----+--------------------+
|label|          violations|
+-----+--------------------+
|  1.0|                null|
|  1.0|                null|
|  1.0|37. FOOD PROPERLY...|
|  0.0|53. TOILET FACILI...|
|  1.0|54. GARBAGE & REF...|
|  1.0|                null|
|  1.0|                null|
|  1.0|                null|
|  1.0|47. FOOD & NON-FO...|
|  0.0|                null|
|  1.0|                null|
|  1.0|22. PROPER COLD H...|
|  1.0|                null|
|  1.0|5. PROCEDURES FOR...|
|  0.0|3. MANAGEMENT, FO...|
|  1.0|                null|
|  1.0|                null|
|  0.0|3. MANAGEMENT, FO...|
|  1.0|37. FOOD PROPERLY...|
|  1.0|9. NO BARE HAND C...|
+-----+--------------------+
only showing top 20 rows



In [85]:
# Seeded random split: the first part receives roughly 25% of the rows
# (training), the second roughly 75% (validation).
training, validationDf = labeledData.randomSplit(weights=[0.25, 0.75], seed=105)

In [86]:
# Print up to 100 rows of the validation split for a visual check.
validationDf.show(100)

+-----+----------+
|label|violations|
+-----+----------+
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|      null|
|  0.0|     