In [1]:
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

In [5]:
!wget https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet

--2022-06-05 20:07:34--  https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet
Resolving nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)... 52.216.178.171
Connecting to nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)|52.216.178.171|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11886281 (11M) [binary/octet-stream]
Saving to: 'fhv_tripdata_2021-01.parquet'

     0K .......... .......... .......... .......... ..........  0%  158K 73s
    50K .......... .......... .......... .......... ..........  0%  121K 84s
   100K .......... .......... .......... .......... ..........  1%  204K 75s
   150K .......... .......... .......... .......... ..........  1%  675K 60s
   200K .......... .......... .......... .......... ..........  2% 3.50M 48s
   250K .......... .......... .......... .......... ..........  2%  205K 49s
   300K .......... .......... .......... .......... ..........  3%  565K 45s
   350K .......... .......... .......... ........

In [23]:
import pandas as pd
import pyarrow.parquet as pq
trips = pq.read_table('fhvhv_tripdata_2021-01.parquet')
df = trips.to_pandas()
df.to_csv('fhvhv_tripdata_2021-01.csv', index=False)

In [10]:
df = spark.read \
    .option('haeder', 'true') \
    .csv('fhv_tripdata_2021-01.csv')

In [15]:
df.schema

StructType(List(StructField(_c0,StringType,true),StructField(_c1,StringType,true),StructField(_c2,StringType,true),StructField(_c3,StringType,true),StructField(_c4,StringType,true),StructField(_c5,StringType,true),StructField(_c6,StringType,true)))

In [None]:
# Run this on terminal
# head -n 101 fhv_tripdata_2021-01.csv > head.csv

In [16]:
df_pandas = pd.read_csv('head.csv')

In [17]:
df_pandas.dtypes

dispatching_base_num       object
pickup_datetime            object
dropOff_datetime           object
PUlocationID              float64
DOlocationID              float64
SR_Flag                   float64
Affiliated_base_number     object
dtype: object

In [20]:
spark.createDataFrame(df_pandas).schema
# Integer - 4 bytes
# Long - 8 bytes 

StructType(List(StructField(dispatching_base_num,StringType,true),StructField(pickup_datetime,StringType,true),StructField(dropOff_datetime,StringType,true),StructField(PUlocationID,DoubleType,true),StructField(DOlocationID,DoubleType,true),StructField(SR_Flag,DoubleType,true),StructField(Affiliated_base_number,StringType,true)))

In [4]:
from pyspark.sql import types

In [5]:
schema = types.StructType([
    types.StructField('dispatching_base_num', types.StringType(), True),
    types.StructField('pickup_datetime', types.TimestampType(), True),
    types.StructField('dropOff_datetime', types.TimestampType(), True),
    types.StructField('PUlocationID', types.IntegerType(), True),
    types.StructField('DOlocationID', types.IntegerType(), True),
    types.StructField('SR_Flag', types.StringType(), True),
    types.StructField('Affiliated_base_number', types.StringType(), True)
])

In [24]:
df = spark.read \
    .option('haeder', 'true') \
    .schema(schema) \
    .csv('fhv_tripdata_2021-01.csv')

In [7]:
df = df.repartition(24)

In [8]:
df.write.parquet('fhv/2021/01/', mode='overwrite')

In [8]:
df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropOff_datetime: timestamp (nullable = true)
 |-- PUlocationID: integer (nullable = true)
 |-- DOlocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [11]:
from pyspark.sql import functions as F

In [15]:
df.show()

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PUlocationID|DOlocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B02975|2021-01-04 11:08:40|2021-01-04 11:40:40|        null|        null|   null|                B02871|
|              B02657|2021-01-01 23:59:36|2021-01-02 00:17:21|        null|        null|   null|                B02657|
|              B02930|2021-01-02 15:34:06|2021-01-02 15:41:33|        null|        null|   null|                B02930|
|              B01437|2021-01-02 17:12:45|2021-01-02 17:21:42|        null|        null|   null|                B01437|
|              B01738|2021-01-04 07:18:18|2021-01-04 07:34:11|        null|        null|   null|                B01738|
|              B02437|2021-01-03 06:49:1

In [16]:
def random_stuff(base_num):
    num = int(base_num[1:])
    if num % 7 == 0:
        return f's/{num:03X}'
    else:
        return f'e/{num:03X}'

In [17]:
random_stuff('B01553')

'e/611'

In [18]:
random_stuff_udf = F.udf(random_stuff, returnType=types.StringType())

In [20]:
df \
    .withColumn('pickup_date', F.to_date(df.pickup_datetime)) \
    .withColumn('dropoff_date', F.to_date(df.dropOff_datetime)) \
    .withColumn('base_id', random_stuff_udf(df.dispatching_base_num)) \
    .select('pickup_date', 'dropoff_date', 'PUlocationID', 'DOlocationID') \
    .show(5)

+-----------+------------+------------+------------+
|pickup_date|dropoff_date|PUlocationID|DOlocationID|
+-----------+------------+------------+------------+
| 2021-01-04|  2021-01-04|        null|        null|
| 2021-01-03|  2021-01-03|        null|        null|
| 2021-01-03|  2021-01-03|        null|        null|
| 2021-01-03|  2021-01-03|        null|        null|
| 2021-01-03|  2021-01-03|        null|        null|
+-----------+------------+------------+------------+
only showing top 5 rows



In [10]:
df.select('pickup_datetime', 'dropOff_datetime', 'PUlocationID', 'DOlocationID') \
    .filter(df.dispatching_base_num == 'B00009') \
    .head(5)

# TODO equal to
# SELECT * FROM df WHERE dispatching_base_num == 'B00009'

[Row(pickup_datetime=datetime.datetime(2021, 1, 1, 7, 21), dropOff_datetime=datetime.datetime(2021, 1, 1, 7, 43), PUlocationID=None, DOlocationID=None),
 Row(pickup_datetime=datetime.datetime(2021, 1, 4, 6, 59), dropOff_datetime=datetime.datetime(2021, 1, 4, 7, 19), PUlocationID=None, DOlocationID=None),
 Row(pickup_datetime=datetime.datetime(2021, 1, 2, 5, 43), dropOff_datetime=datetime.datetime(2021, 1, 2, 5, 57), PUlocationID=None, DOlocationID=None),
 Row(pickup_datetime=datetime.datetime(2021, 1, 5, 16, 39), dropOff_datetime=datetime.datetime(2021, 1, 5, 16, 47), PUlocationID=None, DOlocationID=None),
 Row(pickup_datetime=datetime.datetime(2021, 1, 6, 6, 43), dropOff_datetime=datetime.datetime(2021, 1, 6, 7, 5), PUlocationID=None, DOlocationID=None)]

In [20]:
!head -n 10 head.csv

dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037
B00037,2021-01-01 00:59:02,2021-01-01 01:08:05,,71.0,,B00037
B00037,2021-01-01 00:18:12,2021-01-01 00:30:04,,91.0,,B00037
B00037,2021-01-01 00:36:15,2021-01-01 00:45:08,,39.0,,B00037
B00037,2021-01-01 00:55:04,2021-01-01 01:13:02,,37.0,,B00037
