# Chargement des données

Nous allons travailler sur des données consommateurs de restaurants via Spark SQL

* Créez une session Spark configurée pour accéder au compte de stockage AWS (S3)

In [None]:
!pip install boto3

In [None]:
import boto3
import pyspark

# Build (or reuse) the Spark session. The surrounding parentheses already let
# the builder chain span several lines, so no backslash continuations are
# needed.
spark = (
    pyspark.sql.SparkSession.builder
    .master('local')
    .appName('Introduction to PySpark')
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

sc = spark.sparkContext

# Placeholders: replace them with the credentials of the student account.
# Do not share or commit the notebook with real keys filled in.
ACCESS_KEY_ID = "ACCESS_KEY_ID"  # access key of the student account
SECRET_ACCESS_KEY = "SECRET_ACCESS_KEY"  # secret key of the student account
BUCKET_NAME = "full-stack-bigdata-datasets"
S3_RESOURCE = "s3"
PREFIX = "Big Data/S8-5/Exos/restaurant-data-with-consumer-ratings/"

# Hand the credentials to Spark's S3A connector through the Hadoop
# configuration of the running session.
# NOTE(review): only fs.s3a.* options are set here, while get_s3_path builds
# "s3://" URIs by default -- confirm the runtime maps the s3 scheme to S3A.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", ACCESS_KEY_ID)
hadoop_conf.set("fs.s3a.secret.key", SECRET_ACCESS_KEY)
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

# We create a S3 resource and a Bucket from this same resource; boto3 is used
# to list the objects of the bucket, Spark to read them.
session = boto3.Session(
    region_name='eu-west-3',  # Datacenters located in Paris, FR
    aws_access_key_id=ACCESS_KEY_ID,
    aws_secret_access_key=SECRET_ACCESS_KEY
)
s3 = session.resource('s3')
bucket = s3.Bucket(BUCKET_NAME)

def get_s3_path(key, bucket_name=BUCKET_NAME, scheme=S3_RESOURCE):
    """Build the URI of an object stored in a bucket.

    Args:
        key: Object key inside the bucket (inserted as-is, no escaping).
        bucket_name: Name of the bucket; defaults to BUCKET_NAME.
        scheme: URI scheme placed before "://"; defaults to S3_RESOURCE.

    Returns:
        The string "<scheme>://<bucket_name>/<key>".
    """
    return "{}://{}/{}".format(scheme, bucket_name, key)

* Importez les fichiers csv suivants dans une variable `datasets` :
  * chefmozaccepts.csv
  * chefmozcuisine.csv
  * chefmozhours4.csv
  * chefmozparking.csv
  * geoplaces2.csv
  * rating_final.csv
  * usercuisine.csv
  * userpayment.csv
  * userprofile.csv

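Conseil : Vous pouvez utiliser une *dict comprehension* (l'équivalent d'une *list comprehension* pour les dictionnaires). La variable `datasets` doit être un dictionnaire dont les clés sont les noms des fichiers sans leur extension.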

In [None]:
import os

# Keys of every CSV object stored under PREFIX in the bucket.
csv_keys = [
    s3_object.key
    for s3_object in bucket.objects.filter(Prefix=PREFIX)
    if s3_object.key.endswith('.csv')
]

# One Spark DataFrame per CSV file, indexed by the file name without its
# extension. The first line of each file is used as the header.
datasets = {
    os.path.splitext(os.path.basename(csv_key))[0]:
        spark.read.option("header", "true").csv(get_s3_path(csv_key))
    for csv_key in csv_keys
}
print(datasets)

* Visualisez le dataset `userprofile`

In [None]:
# Render the "userprofile" DataFrame stored in `datasets`.
display(datasets["userprofile"])

userID,latitude,longitude,smoker,drink_level,dress_preference,ambience,transport,marital_status,hijos,birth_year,interest,personality,religion,activity,color,weight,budget,height
U1001,22.139997,-100.978803,false,abstemious,informal,family,on foot,single,independent,1989,variety,thrifty-protector,none,student,black,69,medium,1.77
U1002,22.150087,-100.983325,false,abstemious,informal,family,public,single,independent,1990,technology,hunter-ostentatious,Catholic,student,red,40,low,1.87
U1003,22.119847,-100.946527,false,social drinker,formal,family,public,single,independent,1989,none,hard-worker,Catholic,student,blue,60,low,1.69
U1004,18.867,-99.183,false,abstemious,informal,family,public,single,independent,1940,variety,hard-worker,none,professional,green,44,medium,1.53
U1005,22.183477,-100.959891,false,abstemious,no preference,family,public,single,independent,1992,none,thrifty-protector,Catholic,student,black,65,medium,1.69
U1006,22.15,-100.983,true,social drinker,no preference,friends,car owner,single,independent,1989,variety,hard-worker,none,student,blue,75,medium,1.8
U1007,22.118464,-100.938256,false,casual drinker,informal,solitary,public,single,independent,1989,variety,thrifty-protector,Catholic,student,purple,60,low,1.59
U1008,22.122989,-100.923811,false,social drinker,formal,solitary,public,single,independent,1989,technology,hard-worker,Catholic,student,green,68,low,1.72
U1009,22.159427,-100.990448,false,abstemious,formal,family,on foot,single,kids,1991,variety,thrifty-protector,Catholic,student,green,75,medium,1.78
U1010,22.190889,-100.998669,false,social drinker,no preference,friends,car owner,married,kids,1987,technology,hard-worker,none,student,green,40,medium,1.67


* Visualisez le dataset `userpayment`

In [None]:
# Render the "userpayment" DataFrame stored in `datasets`.
display(datasets["userpayment"])

userID,Upayment
U1001,cash
U1002,cash
U1003,cash
U1004,cash
U1004,bank_debit_cards
U1005,cash
U1006,cash
U1007,cash
U1008,cash
U1009,cash


* Faites de même avec `usercuisine`

In [None]:
# Render the "usercuisine" DataFrame stored in `datasets`.
display(datasets["usercuisine"])

userID,Rcuisine
U1001,American
U1002,Mexican
U1003,Mexican
U1004,Bakery
U1004,Breakfast-Brunch
U1004,Japanese
U1004,Contemporary
U1004,Mexican
U1004,Bagels
U1004,Cafe-Coffee_Shop


* Affichez les datasets restants afin d'en voir le contenu

In [None]:
# Render the "rating_final" DataFrame stored in `datasets`.
display(datasets["rating_final"])

userID,placeID,rating,food_rating,service_rating
U1077,135085,2,2,2
U1077,135038,2,2,1
U1077,132825,2,2,2
U1077,135060,1,2,2
U1068,135104,1,1,2
U1068,132740,0,0,0
U1068,132663,1,1,1
U1068,132732,0,0,0
U1068,132630,1,1,1
U1067,132584,2,2,2


In [None]:
# Render the "geoplaces2" DataFrame stored in `datasets`.
display(datasets["geoplaces2"])

placeID,latitude,longitude,the_geom_meter,name,address,city,state,country,fax,zip,alcohol,smoking_area,dress_code,accessibility,price,url,Rambience,franchise,area,other_services
134999,18.915421,-99.184871,0101000020957F000088568DE356715AC138C0A525FC464A41,Kiku Cuernavaca,Revolucion,Cuernavaca,Morelos,Mexico,?,?,No_Alcohol_Served,none,informal,no_accessibility,medium,kikucuernavaca.com.mx,familiar,f,closed,none
132825,22.1473922,-100.983092,0101000020957F00001AD016568C4858C1243261274BA54B41,puesto de tacos,esquina santos degollado y leon guzman,s.l.p.,s.l.p.,mexico,?,78280,No_Alcohol_Served,none,informal,completely,low,?,familiar,f,open,none
135106,22.1497088,-100.9760928,0101000020957F0000649D6F21634858C119AE9BF528A34B41,El Rinc�n de San Francisco,Universidad 169,San Luis Potosi,San Luis Potosi,Mexico,?,78000,Wine-Beer,only at bar,informal,partially,medium,?,familiar,f,open,none
132667,23.7526973,-99.1633594,0101000020957F00005D67BCDDED8157C1222A2DC8D84D4941,little pizza Emilio Portes Gil,calle emilio portes gil,victoria,tamaulipas,?,?,?,No_Alcohol_Served,none,informal,completely,low,?,familiar,t,closed,none
132613,23.7529035,-99.165076,0101000020957F00008EBA2D06DC8157C194E03B7B504E4941,carnitas_mata,lic. Emilio portes gil,victoria,Tamaulipas,Mexico,?,?,No_Alcohol_Served,permitted,informal,completely,medium,?,familiar,t,closed,none
135040,22.135617,-100.969709,0101000020957F00001B552189B84A58C15A2AAEFD2CA24B41,Restaurant los Compadres,Camino a Simon Diaz 155 Centro,San Luis Potosi,SLP,Mexico,?,74000,Wine-Beer,none,informal,no_accessibility,high,?,familiar,f,closed,none
132732,23.7543569,-99.171288,0101000020957F00008A20E615808157C16272FECBF84F4941,Taqueria EL amigo,Calle Mezquite Fracc Framboyanes,Cd Victoria,Tamaulipas,Mexico,?,87018,No_Alcohol_Served,none,casual,completely,low,?,familiar,f,open,none
132875,22.1499013,-100.9937793,0101000020957F00008A2A0747DE4758C11EB31D2A31A84B41,shi ro ie,?,?,?,?,?,?,Wine-Beer,section,informal,no_accessibility,high,?,familiar,t,open,Internet
132609,23.7602683,-99.1658646,0101000020957F0000A478418BBA8057C133851EB22C4E4941,Pollo_Frito_Buenos_Aires,tampico,victoria,Tamaulipas,Mexico,?,?,No_Alcohol_Served,not permitted,informal,completely,low,?,quiet,t,closed,none
135082,22.151448,-100.915099,0101000020957F0000A29FAF95CD4958C1FEEEBB73A9914B41,la Estrella de Dimas,Villa de Pozos 192 Villa de Pozos,San Luis Potosi,SLP,Mexico,?,78421,No_Alcohol_Served,none,informal,no_accessibility,medium,?,familiar,f,closed,none


In [None]:
# Render the "userpayment" DataFrame stored in `datasets`.
# NOTE(review): "userpayment" was already displayed earlier in the notebook,
# while "chefmozparking" is never displayed -- possibly the intended dataset
# here; confirm before changing.
display(datasets["userpayment"])

userID,Upayment
U1001,cash
U1002,cash
U1003,cash
U1004,cash
U1004,bank_debit_cards
U1005,cash
U1006,cash
U1007,cash
U1008,cash
U1009,cash


In [None]:
# Render the "chefmozhours4" DataFrame stored in `datasets`.
display(datasets["chefmozhours4"])

placeID,hours,days
135111,00:00-23:30;,Mon;Tue;Wed;Thu;Fri;
135111,00:00-23:30;,Sat;
135111,00:00-23:30;,Sun;
135110,08:00-19:00;,Mon;Tue;Wed;Thu;Fri;
135110,00:00-00:00;,Sat;
135110,00:00-00:00;,Sun;
135109,08:00-21:00;,Mon;Tue;Wed;Thu;Fri;
135109,08:00-21:00;,Sat;
135109,08:00-21:00;,Sun;
135108,00:00-23:30;,Mon;Tue;Wed;Thu;Fri;


In [None]:
# Render the "chefmozcuisine" DataFrame stored in `datasets`.
display(datasets["chefmozcuisine"])

placeID,Rcuisine
135110,Spanish
135109,Italian
135107,Latin_American
135106,Mexican
135105,Fast_Food
135104,Mexican
135103,Burgers
135103,Dessert-Ice_Cream
135103,Fast_Food
135103,Hot_Dogs


In [None]:
# Render the "chefmozaccepts" DataFrame stored in `datasets`.
display(datasets["chefmozaccepts"])

placeID,Rpayment
135110,cash
135110,VISA
135110,MasterCard-Eurocard
135110,American_Express
135110,bank_debit_cards
135109,cash
135107,cash
135107,VISA
135107,MasterCard-Eurocard
135107,American_Express


# 1. Prospects

* Créez deux variables :
  * `usercuisine`
  * `chefmozcuisine`

Celles-ci doivent stocker respectivement le dataset ==> `datasets["usercuisine"]` & `datasets["chefmozcuisine"]`

In [None]:
# Short aliases for the two cuisine tables used in this section.
usercuisine, chefmozcuisine = (
    datasets[table_name] for table_name in ("usercuisine", "chefmozcuisine")
)

* Visualisez les deux variables

In [None]:
# Render the user-side cuisine table.
display(usercuisine)

userID,Rcuisine
U1001,American
U1002,Mexican
U1003,Mexican
U1004,Bakery
U1004,Breakfast-Brunch
U1004,Japanese
U1004,Contemporary
U1004,Mexican
U1004,Bagels
U1004,Cafe-Coffee_Shop


In [None]:
# Render the restaurant-side cuisine table.
display(chefmozcuisine)

placeID,Rcuisine
135110,Spanish
135109,Italian
135107,Latin_American
135106,Mexican
135105,Fast_Food
135104,Mexican
135103,Burgers
135103,Dessert-Ice_Cream
135103,Fast_Food
135103,Hot_Dogs


* En utilisant les fonctions de `pyspark.sql`, comptez le nombre de lignes par `userID` dans le dataset et ordonnez-les par ordre décroissant.

In [None]:
# Number of rows per userID, largest count first.
rows_per_user = usercuisine.groupBy("userID").count()
display(rows_per_user.sort("count", ascending=False))

userID,count
U1135,103
U1108,18
U1101,15
U1016,14
U1060,13
U1008,10
U1004,9
U1009,7
U1045,4
U1021,4


* Faites de même avec le nombre de `placeID`

In [None]:
# Number of rows per placeID, largest count first.
rows_per_place = chefmozcuisine.groupBy("placeID").count()
display(rows_per_place.sort("count", ascending=False))

placeID,count
132774,9
135099,6
135097,6
135103,4
135098,4
132237,3
132166,3
132555,3
132177,3
132296,3


* Créez un dataset contenant, pour chaque `placeID`, la liste des `userID` dans une colonne et le nombre de `userID` dans une autre colonne. Vous ajouterez la colonne `Rcuisine` pour connaître le type de cuisine de chaque `placeID`. Vous ordonnerez le dataset par nombre de `userID` décroissant.

Indice : la méthode [join](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=dataframe#pyspark.sql.DataFrame.join) et la fonction [collect_set](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=dataframe#pyspark.sql.functions.collect_set), en plus d'autres fonctions d'agrégation du cours, vont vous aider à réaliser cette tâche.

In [None]:
from pyspark.sql import functions as F

# For every place: the distinct users who listed at least one cuisine that
# the place serves, and how many such users there are.
prospects_per_place = (
    chefmozcuisine
    .join(usercuisine, on="Rcuisine", how="inner")
    .groupBy("placeID")
    .agg(F.collect_set("userID").alias("users"))
    .withColumn("count_users", F.size(F.col("users")))
)

# Attach the prospects back to each (placeID, Rcuisine) row so the cuisine
# type stays visible; places with the most prospects come first.
users = (
    chefmozcuisine
    .join(prospects_per_place, on="placeID")
    .orderBy("count_users", ascending=False)
)

display(users)

placeID,Rcuisine,users,count_users
132774,Bakery,"List(U1075, U1002, U1134, U1007, U1119, U1076, U1025, U1054, U1020, U1111, U1084, U1100, U1045, U1021, U1027, U1128, U1068, U1047, U1135, U1055, U1123, U1083, U1081, U1038, U1037, U1005, U1079, U1103, U1048, U1089, U1136, U1016, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1009, U1116, U1042, U1019, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1008, U1066, U1033, U1073, U1056, U1015, U1126, U1006, U1036, U1088, U1035, U1053, U1052, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1105, U1091, U1018, U1137, U1113, U1127, U1125, U1065, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1004, U1057, U1107, U1064, U1001, U1094)",112
132774,Cafeteria,"List(U1075, U1002, U1134, U1007, U1119, U1076, U1025, U1054, U1020, U1111, U1084, U1100, U1045, U1021, U1027, U1128, U1068, U1047, U1135, U1055, U1123, U1083, U1081, U1038, U1037, U1005, U1079, U1103, U1048, U1089, U1136, U1016, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1009, U1116, U1042, U1019, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1008, U1066, U1033, U1073, U1056, U1015, U1126, U1006, U1036, U1088, U1035, U1053, U1052, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1105, U1091, U1018, U1137, U1113, U1127, U1125, U1065, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1004, U1057, U1107, U1064, U1001, U1094)",112
132774,American,"List(U1075, U1002, U1134, U1007, U1119, U1076, U1025, U1054, U1020, U1111, U1084, U1100, U1045, U1021, U1027, U1128, U1068, U1047, U1135, U1055, U1123, U1083, U1081, U1038, U1037, U1005, U1079, U1103, U1048, U1089, U1136, U1016, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1009, U1116, U1042, U1019, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1008, U1066, U1033, U1073, U1056, U1015, U1126, U1006, U1036, U1088, U1035, U1053, U1052, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1105, U1091, U1018, U1137, U1113, U1127, U1125, U1065, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1004, U1057, U1107, U1064, U1001, U1094)",112
132774,Cafe-Coffee_Shop,"List(U1075, U1002, U1134, U1007, U1119, U1076, U1025, U1054, U1020, U1111, U1084, U1100, U1045, U1021, U1027, U1128, U1068, U1047, U1135, U1055, U1123, U1083, U1081, U1038, U1037, U1005, U1079, U1103, U1048, U1089, U1136, U1016, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1009, U1116, U1042, U1019, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1008, U1066, U1033, U1073, U1056, U1015, U1126, U1006, U1036, U1088, U1035, U1053, U1052, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1105, U1091, U1018, U1137, U1113, U1127, U1125, U1065, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1004, U1057, U1107, U1064, U1001, U1094)",112
132774,Mexican,"List(U1075, U1002, U1134, U1007, U1119, U1076, U1025, U1054, U1020, U1111, U1084, U1100, U1045, U1021, U1027, U1128, U1068, U1047, U1135, U1055, U1123, U1083, U1081, U1038, U1037, U1005, U1079, U1103, U1048, U1089, U1136, U1016, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1009, U1116, U1042, U1019, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1008, U1066, U1033, U1073, U1056, U1015, U1126, U1006, U1036, U1088, U1035, U1053, U1052, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1105, U1091, U1018, U1137, U1113, U1127, U1125, U1065, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1004, U1057, U1107, U1064, U1001, U1094)",112
132774,Family,"List(U1075, U1002, U1134, U1007, U1119, U1076, U1025, U1054, U1020, U1111, U1084, U1100, U1045, U1021, U1027, U1128, U1068, U1047, U1135, U1055, U1123, U1083, U1081, U1038, U1037, U1005, U1079, U1103, U1048, U1089, U1136, U1016, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1009, U1116, U1042, U1019, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1008, U1066, U1033, U1073, U1056, U1015, U1126, U1006, U1036, U1088, U1035, U1053, U1052, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1105, U1091, U1018, U1137, U1113, U1127, U1125, U1065, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1004, U1057, U1107, U1064, U1001, U1094)",112
132774,Burgers,"List(U1075, U1002, U1134, U1007, U1119, U1076, U1025, U1054, U1020, U1111, U1084, U1100, U1045, U1021, U1027, U1128, U1068, U1047, U1135, U1055, U1123, U1083, U1081, U1038, U1037, U1005, U1079, U1103, U1048, U1089, U1136, U1016, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1009, U1116, U1042, U1019, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1008, U1066, U1033, U1073, U1056, U1015, U1126, U1006, U1036, U1088, U1035, U1053, U1052, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1105, U1091, U1018, U1137, U1113, U1127, U1125, U1065, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1004, U1057, U1107, U1064, U1001, U1094)",112
132774,Diner,"List(U1075, U1002, U1134, U1007, U1119, U1076, U1025, U1054, U1020, U1111, U1084, U1100, U1045, U1021, U1027, U1128, U1068, U1047, U1135, U1055, U1123, U1083, U1081, U1038, U1037, U1005, U1079, U1103, U1048, U1089, U1136, U1016, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1009, U1116, U1042, U1019, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1008, U1066, U1033, U1073, U1056, U1015, U1126, U1006, U1036, U1088, U1035, U1053, U1052, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1105, U1091, U1018, U1137, U1113, U1127, U1125, U1065, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1004, U1057, U1107, U1064, U1001, U1094)",112
132774,Breakfast-Brunch,"List(U1075, U1002, U1134, U1007, U1119, U1076, U1025, U1054, U1020, U1111, U1084, U1100, U1045, U1021, U1027, U1128, U1068, U1047, U1135, U1055, U1123, U1083, U1081, U1038, U1037, U1005, U1079, U1103, U1048, U1089, U1136, U1016, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1009, U1116, U1042, U1019, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1008, U1066, U1033, U1073, U1056, U1015, U1126, U1006, U1036, U1088, U1035, U1053, U1052, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1105, U1091, U1018, U1137, U1113, U1127, U1125, U1065, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1004, U1057, U1107, U1064, U1001, U1094)",112
132917,American,"List(U1075, U1002, U1134, U1025, U1119, U1076, U1020, U1054, U1111, U1084, U1100, U1045, U1128, U1068, U1047, U1135, U1123, U1081, U1083, U1038, U1037, U1005, U1079, U1103, U1048, U1089, U1136, U1016, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1116, U1042, U1009, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1008, U1066, U1033, U1073, U1056, U1015, U1126, U1006, U1036, U1088, U1053, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1091, U1018, U1137, U1127, U1065, U1125, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1064, U1057, U1107, U1004, U1001, U1094)",103


# 2. NPS

* Stockez `datasets["rating_final"]` dans une variable `rating` puis affichez-la.

In [None]:
# Keep a handle on the ratings table under a short name, then render it.
rating = datasets["rating_final"]
display(rating)

userID,placeID,rating,food_rating,service_rating
U1077,135085,2,2,2
U1077,135038,2,2,1
U1077,132825,2,2,2
U1077,135060,1,2,2
U1068,135104,1,1,2
U1068,132740,0,0,0
U1068,132663,1,1,1
U1068,132732,0,0,0
U1068,132630,1,1,1
U1067,132584,2,2,2


* Nous allons tenter de calculer un score NPS pour chaque `placeID`. Pour débuter cette démarche, nous allons avoir besoin de savoir les notes exactes données à chaque placeID en colonne. Nous compterons le nombre de notes sur chacune des placeID 

Conseil : Nous utiliserons [.pivot()](http://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html?highlight=pivot#pyspark.sql.GroupedData.pivot) pour obtenir les notes en colonnes

In [None]:
# One row per place and one column per distinct value of "rating"; each cell
# holds how many ratings of that value the place received.
ratings_by_place = rating.groupBy("placeID")
pivot = ratings_by_place.pivot("rating").count()
display(pivot)

placeID,0,1,2
132834,7.0,11.0,7.0
132626,1.0,1.0,2.0
135042,4.0,7.0,9.0
135058,4.0,8.0,6.0
132767,2.0,1.0,3.0
135026,1.0,5.0,5.0
135063,2.0,2.0,4.0
132862,3.0,5.0,10.0
132884,2.0,1.0,3.0
135035,,2.0,2.0


* Maintenant que nous savons afficher les notes en colonnes, nous pouvons calculer le NPS Score. A partir du `rating`, appliquez la formule du NPS Score suivante : 

  `NPS = (nombre_de_notes_max / nombre_total_de_notes) - (nombre_de_notes_min / nombre_total_de_notes)`

* Joignez ensuite `chefmozcuisine` et [utilisez la window function F.rank()](http://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html?highlight=pivot#pyspark.sql.Window) pour afficher un ranking global pour chaque `placeID` et un ranking pour chaque `Rcuisine`

In [None]:
from pyspark.sql.window import Window

# The pivot leaves a null where a place never received a given rating.
# Replace those nulls with 0, otherwise Total and NPS become null for the
# place and it drops to the bottom of the descending ranking.
rating_counts = pivot
for rating_level in ("0", "1", "2"):
    rating_counts = rating_counts.withColumn(
        rating_level, F.coalesce(F.col(rating_level), F.lit(0))
    )

place_scores = (
    rating_counts
    .withColumn("Total", F.col("0") + F.col("1") + F.col("2"))
    # NPS = share of top ratings ("2") minus share of bottom ratings ("0").
    # Subtracting the counts before the single division makes mathematically
    # equal scores (e.g. 6/9 and 10/15) yield the same double, so they tie in
    # the ranking instead of differing in the last digit.
    .withColumn("NPS", (F.col("2") - F.col("0")) / F.col("Total"))
    # Rank the places BEFORE joining the cuisines: the join emits one row per
    # (place, cuisine), which would otherwise count multi-cuisine places
    # several times and inflate the rank of every place below them.
    .withColumn("Global Rank", F.rank().over(Window.orderBy(F.desc("NPS"))))
)

nps = (
    place_scores
    # Left join: places without any known cuisine are kept with a null Rcuisine.
    .join(chefmozcuisine, "placeID", "left")
    .withColumn("Rank", F.rank().over(Window.partitionBy("Rcuisine").orderBy(F.desc("NPS"))))
    # Keep the column order callers of `nps` have seen so far.
    .select("placeID", "0", "1", "2", "Total", "NPS", "Rcuisine", "Global Rank", "Rank")
    .orderBy("Global Rank")
)
display(nps)

placeID,0,1,2,Total,NPS,Rcuisine,Global Rank,Rank
135075,1.0,2.0,10.0,13.0,0.6923076923076923,Seafood,1,1
135059,1.0,1.0,7.0,9.0,0.6666666666666667,Bar,2,1
135025,1.0,3.0,11.0,15.0,0.6666666666666666,Mexican,3,1
132768,1.0,2.0,7.0,10.0,0.6,Family,4,1
134996,1.0,2.0,6.0,9.0,0.5555555555555556,,5,1
135028,1.0,5.0,9.0,15.0,0.5333333333333333,Mexican,6,2
135066,1.0,4.0,7.0,12.0,0.5,,7,2
135045,2.0,3.0,8.0,13.0,0.4615384615384615,,8,3
132754,2.0,3.0,8.0,13.0,0.4615384615384615,Mexican,8,3
135051,1.0,6.0,7.0,14.0,0.4285714285714286,,10,4


* Nous aimerions savoir si ce ranking est représentatif. Pour cela, nous avons besoin du nombre d'utilisateurs associés à chaque `placeID`. En utilisant le résultat ci-dessus, joignez le nombre d'utilisateurs pour chaque `placeID`.

In [None]:
# Both `nps` and `users` hold one row per (placeID, Rcuisine). Joining on
# placeID alone would duplicate the Rcuisine column and pair every cuisine of
# a place with every other cuisine of that place, so join on both keys.
display(nps.join(users, ["placeID", "Rcuisine"]).orderBy("Global Rank"))

placeID,0,1,2,Total,NPS,Rcuisine,Global Rank,Rank,Rcuisine.1,users,count_users
135075,1.0,2.0,10.0,13.0,0.6923076923076923,Seafood,1,1,Seafood,"List(U1108, U1135)",2
135059,1.0,1.0,7.0,9.0,0.6666666666666667,Bar,2,1,Bar,"List(U1135, U1101, U1046)",3
135025,1.0,3.0,11.0,15.0,0.6666666666666666,Mexican,3,1,Mexican,"List(U1075, U1002, U1134, U1025, U1119, U1076, U1020, U1054, U1111, U1084, U1100, U1045, U1128, U1068, U1123, U1135, U1081, U1038, U1083, U1037, U1079, U1103, U1048, U1089, U1136, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1116, U1042, U1009, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1033, U1066, U1015, U1056, U1008, U1126, U1006, U1036, U1088, U1053, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1091, U1018, U1137, U1127, U1065, U1125, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1064, U1107, U1004, U1094)",97
132768,1.0,2.0,7.0,10.0,0.6,Family,4,1,Family,"List(U1108, U1007, U1135, U1101, U1009, U1019, U1027, U1035)",8
135028,1.0,5.0,9.0,15.0,0.5333333333333333,Mexican,6,2,Mexican,"List(U1075, U1002, U1134, U1025, U1119, U1076, U1020, U1054, U1111, U1084, U1100, U1045, U1128, U1068, U1123, U1135, U1081, U1038, U1083, U1037, U1079, U1103, U1048, U1089, U1136, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1116, U1042, U1009, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1033, U1066, U1015, U1056, U1008, U1126, U1006, U1036, U1088, U1053, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1091, U1018, U1137, U1127, U1065, U1125, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1064, U1107, U1004, U1094)",97
132754,2.0,3.0,8.0,13.0,0.4615384615384615,Mexican,8,3,Mexican,"List(U1075, U1002, U1134, U1025, U1119, U1076, U1020, U1054, U1111, U1084, U1100, U1045, U1128, U1068, U1123, U1135, U1081, U1038, U1083, U1037, U1079, U1103, U1048, U1089, U1136, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1116, U1042, U1009, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1033, U1066, U1015, U1056, U1008, U1126, U1006, U1036, U1088, U1053, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1091, U1018, U1137, U1127, U1065, U1125, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1064, U1107, U1004, U1094)",97
132723,1.0,5.0,6.0,12.0,0.4166666666666667,Mexican,12,4,Mexican,"List(U1075, U1002, U1134, U1025, U1119, U1076, U1020, U1054, U1111, U1084, U1100, U1045, U1128, U1068, U1123, U1135, U1081, U1038, U1083, U1037, U1079, U1103, U1048, U1089, U1136, U1030, U1101, U1078, U1072, U1071, U1110, U1069, U1097, U1096, U1032, U1003, U1116, U1042, U1009, U1132, U1077, U1029, U1114, U1085, U1028, U1010, U1033, U1066, U1015, U1056, U1008, U1126, U1006, U1036, U1088, U1053, U1109, U1130, U1098, U1022, U1099, U1080, U1070, U1112, U1133, U1044, U1050, U1051, U1061, U1049, U1060, U1131, U1067, U1011, U1082, U1091, U1018, U1137, U1127, U1065, U1125, U1059, U1026, U1031, U1034, U1120, U1023, U1108, U1124, U1063, U1062, U1129, U1121, U1064, U1107, U1004, U1094)",97
132866,1.0,1.0,3.0,5.0,0.3999999999999999,Bakery,13,1,Cafeteria,"List(U1108, U1128, U1008, U1135, U1105, U1101, U1009, U1004, U1052, U1060)",10
132866,1.0,1.0,3.0,5.0,0.3999999999999999,Cafeteria,13,1,Bakery,"List(U1108, U1128, U1008, U1135, U1105, U1101, U1009, U1004, U1052, U1060)",10
132866,1.0,1.0,3.0,5.0,0.3999999999999999,Bakery,13,1,Bakery,"List(U1108, U1128, U1008, U1135, U1105, U1101, U1009, U1004, U1052, U1060)",10


# 3. Formatage

* Affichez `datasets["chefmozhours4"]`

In [None]:
# Keep a handle on the opening-hours table under a short name, then render it.
chefmozhours4 = datasets["chefmozhours4"]
display(chefmozhours4)

placeID,hours,days
135111,00:00-23:30;,Mon;Tue;Wed;Thu;Fri;
135111,00:00-23:30;,Sat;
135111,00:00-23:30;,Sun;
135110,08:00-19:00;,Mon;Tue;Wed;Thu;Fri;
135110,00:00-00:00;,Sat;
135110,00:00-00:00;,Sun;
135109,08:00-21:00;,Mon;Tue;Wed;Thu;Fri;
135109,08:00-21:00;,Sat;
135109,08:00-21:00;,Sun;
135108,00:00-23:30;,Mon;Tue;Wed;Thu;Fri;


* La façon dont le dataset est affiché n'est pas très lisible, tentez de pivoter votre table de façon à avoir un dataset qui ressemble à celui de l'output ci-dessous.

In [None]:
# Turn each "Mon;Tue;...;" list into one row per day. The trailing ";" makes
# the split produce an empty last element, which is discarded.
hours_per_day = (
    chefmozhours4
    .withColumn("days", F.explode(F.split(F.col("days"), ";")))
    .where(F.col("days") != "")
)

# One row per place, one column per day; when a place has several rows for
# the same day, the first "hours" value encountered is kept.
weekly_hours = hours_per_day.groupBy("placeID").pivot("days").agg(F.first("hours"))
display(weekly_hours)

placeID,Fri,Mon,Sat,Sun,Thu,Tue,Wed
132012,12:00-22:00;,12:00-22:00;,12:00-22:00;,12:00-22:00;,12:00-22:00;,12:00-22:00;,12:00-22:00;
132023,11:00-00:00;,11:00-00:00;,11:00-00:00;,11:00-00:00;,11:00-00:00;,11:00-00:00;,11:00-00:00;
132024,11:00-21:00;,11:00-21:00;,11:00-21:00;,11:00-21:00;,11:00-21:00;,11:00-21:00;,11:00-21:00;
132026,12:00-14:30;,12:00-14:30;,,,12:00-14:30;,12:00-14:30;,12:00-14:30;
132030,12:00-15:00;15:00-21:00;,12:00-15:00;15:00-21:00;,12:00-15:00;15:00-21:00;,12:00-15:00;15:00-21:00;,12:00-15:00;15:00-21:00;,12:00-15:00;15:00-21:00;,12:00-15:00;15:00-21:00;
132097,06:30-21:00;,06:30-21:00;,06:30-22:00;,06:30-22:00;,06:30-21:00;,06:30-21:00;,06:30-21:00;
132103,11:00-16:00;16:00-13:00;16:00-12:00;16:00-21:00;,11:00-16:00;16:00-13:00;16:00-12:00;16:00-21:00;,,11:00-16:00;16:00-13:00;16:00-12:00;16:00-21:00;,11:00-16:00;16:00-13:00;16:00-12:00;16:00-21:00;,11:00-16:00;16:00-13:00;16:00-12:00;16:00-21:00;,11:00-16:00;16:00-13:00;16:00-12:00;16:00-21:00;
132107,12:00-22:00;,12:00-22:00;,12:00-22:00;,12:00-22:00;,12:00-22:00;,12:00-22:00;,12:00-22:00;
132108,17:00-01:00;,17:00-01:00;,17:00-01:00;,17:00-01:00;,17:00-01:00;,17:00-01:00;,17:00-01:00;
132109,17:00-21:00;,17:00-21:00;,17:00-21:00;,17:00-21:00;,17:00-21:00;,17:00-21:00;,17:00-21:00;
