Nous souhaitons étudier le comportement des trajets des taxis new yorkais. Pour cela nous
allons calculer les indicateurs ci-dessous :  
● la vitesse moyenne de chaque trajet,  
● le nombre de trajets effectués en fonction du jour de la semaine,  
● le nombre de trajets effectués en fonction de l’horaire de la journée par tranche de 4h,  
● le nombre de km parcourus par jour de la semaine.  
  
Les données et leurs descriptions sont sur le lien ci-dessous :
https://www.kaggle.com/c/nyc-taxi-trip-duration/data

In [1]:
# Install dependencies
!pip install pyspark



In [2]:
# Get Spark imports
from pyspark.sql import SparkSession
from pyspark.sql.context import SQLContext     # add
from pyspark.sql import functions as f
from pyspark.sql.types import StructType,StructField, StringType, IntegerType , BooleanType
#import pyspark.sql.functions as F             # to delete

# Other imports
import datetime
from math import sqrt, cos

In [3]:
# Get Spark Context
spark = SparkSession.builder.appName('pyspark - NYC Taxi Trip Duration').getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [5]:
sc

In [6]:
df = spark.read.csv("data/trainExtract.csv", header=True, inferSchema=True)
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- vendor_id: integer (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- dropoff_datetime: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- trip_duration: integer (nullable = true)



In [14]:
print(df.count(), len(df.columns))

999 11


In [15]:
df.show(1,False,vertical=True)

-RECORD 0---------------------------------
 id                 | id2875421           
 vendor_id          | 2                   
 pickup_datetime    | 2016-03-14 17:24:55 
 dropoff_datetime   | 2016-03-14 17:32:30 
 passenger_count    | 1                   
 pickup_longitude   | -73.9821548461914   
 pickup_latitude    | 40.76793670654297   
 dropoff_longitude  | -73.96463012695312  
 dropoff_latitude   | 40.765602111816406  
 store_and_fwd_flag | N                   
 trip_duration      | 455                 
only showing top 1 row



In [17]:
def getKmDistance(pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude):
    x = (dropoff_longitude - pickup_longitude) * cos((pickup_latitude + dropoff_latitude) / 2 )
    y = dropoff_latitude - pickup_latitude
    z = sqrt(pow(x, 2) + pow(y, 2))
    k = 1.852 * 60
    return k * z

In [18]:
df.registerTempTable('df_table')
df_with_distance = sqlContext.sql('select *, \
    1.852*60*sqrt(pow((dropoff_longitude-pickup_longitude)*cos((pickup_latitude+dropoff_latitude)/2),2) + pow(dropoff_latitude-pickup_latitude,2)) \
    as distance from df_table')
df_with_distance.show(1)

+---------+---------+-------------------+-------------------+---------------+-----------------+-----------------+------------------+------------------+------------------+-------------+-----------------+
|       id|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count| pickup_longitude|  pickup_latitude| dropoff_longitude|  dropoff_latitude|store_and_fwd_flag|trip_duration|         distance|
+---------+---------+-------------------+-------------------+---------------+-----------------+-----------------+------------------+------------------+------------------+-------------+-----------------+
|id2875421|        2|2016-03-14 17:24:55|2016-03-14 17:32:30|              1|-73.9821548461914|40.76793670654297|-73.96463012695312|40.765602111816406|                 N|          455|1.959277081692363|
+---------+---------+-------------------+-------------------+---------------+-----------------+-----------------+------------------+------------------+------------------+-------------+----

In [19]:
def getDayOfWeek(date_time_str):
    date_time_obj = datetime.datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S')
    return date_time_obj.strftime('%A').lower()
test_date = getDayOfWeek('2016-01-01 03:49:48')
test_date

'friday'

In [20]:
df_with_day_of_week = df_with_distance.withColumn("pickup_datetime",
    f.to_timestamp(f.col("pickup_datetime"))).withColumn("day_of_week", f.date_format(f.col("pickup_datetime"), "EEEE"))
df_with_day_of_week.show(2)

+---------+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+-----------------+-----------+
|       id|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count|  pickup_longitude|   pickup_latitude| dropoff_longitude|  dropoff_latitude|store_and_fwd_flag|trip_duration|         distance|day_of_week|
+---------+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+-----------------+-----------+
|id2875421|        2|2016-03-14 17:24:55|2016-03-14 17:32:30|              1| -73.9821548461914| 40.76793670654297|-73.96463012695312|40.765602111816406|                 N|          455|1.959277081692363|     Monday|
|id2377394|        1|2016-06-12 00:43:35|2016-06-12 00:54:38|              1|-73.98041534423828|40.738563537597656|-73.9994812011718

In [22]:
# le nombre de trajets effectués en fonction du jour de la semaine
df_with_day_of_week.groupBy('day_of_week').count().withColumnRenamed("count","number of trips").show()

+-----------+---------------+
|day_of_week|number of trips|
+-----------+---------------+
|  Wednesday|            140|
|    Tuesday|            141|
|     Friday|            152|
|   Thursday|            142|
|   Saturday|            150|
|     Monday|            136|
|     Sunday|            138|
+-----------+---------------+



In [27]:
# le nombre de km parcourus par jour de la semaine.
df_with_day_of_week.groupby('day_of_week').agg({'distance': 'sum'}).withColumnRenamed("sum(distance)","number of kilometers travelled").show()

+-----------+------------------------------+
|day_of_week|number of kilometers travelled|
+-----------+------------------------------+
|  Wednesday|             637.3384980282934|
|    Tuesday|              535.447840446507|
|     Friday|             525.1827969908102|
|   Thursday|            488.91298518289153|
|   Saturday|             544.5255557385256|
|     Monday|             545.3308556108145|
|     Sunday|             644.6818696098692|
+-----------+------------------------------+

