- Padronizar nomes de estação para maiúsculo
- Renomear campo member_casual
- Calcular duração da viagem
- Padronizar tipo da bike (electric ou classic)

In [33]:
# Databricks alternative (demonstrate how to upload the file to DBFS):
# file_path = "dbfs:/FileStore/datasets/202212_divvy_tripdata.csv"

# Path of the CSV file read by the cells below.
file_path = "/home/jovyan/work/data/202212_divvy_tripdata.csv"

In [34]:
# Create the Spark session for the local environment
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName("manipulando-dados")
spark = session_builder.getOrCreate()

# With eager evaluation on, a DataFrame left as the last expression of a
# cell is rendered as a table instead of only showing its schema.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [35]:
# Read the CSV file, treating its first line as the header row
df = spark.read.csv(file_path, header=True)
df.limit(5)

ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
65DBD2F447EC51C2,electric_bike,2022-12-05 10:47:18,2022-12-05 10:56:34,Clifton Ave & Arm...,TA1307000163,Sedgwick St & Web...,13191,41.9182435,-87.65711466666667,41.922167,-87.638888,member
0C201AA7EA0EA1AD,classic_bike,2022-12-18 06:42:33,2022-12-18 07:08:44,Broadway & Belmon...,13277,Sedgwick St & Web...,13191,41.940106,-87.645451,41.922167,-87.638888,casual
E0B148CCB358A49D,electric_bike,2022-12-13 08:47:45,2022-12-13 08:59:51,Sangamon St & Lak...,TA1306000015,St. Clair St & Er...,13016,41.885918856,-87.65113318,41.89434513742426,-87.62279838323593,member
54C5775D2B7C9188,classic_bike,2022-12-13 18:50:47,2022-12-13 19:19:48,Shields Ave & 31s...,KA1503000038,Damen Ave & Madis...,13134,41.838464,-87.635406,41.88137,-87.67493,member
A4891F78776D35DF,classic_bike,2022-12-14 16:13:39,2022-12-14 16:27:50,Ashland Ave & Chi...,13247,Damen Ave & Charl...,13288,41.89595435734305,-87.6677280664444,41.920082,-87.677855,casual


# Utilizando PySpark

In [36]:
from pyspark.sql.functions import upper, expr, split

# Column expressions, named up front so the chain below reads as a list of steps.
start_station_upper = upper(df["start_station_name"])
end_station_upper = upper(df["end_station_name"])

# Difference between the two timestamps in seconds, converted to whole minutes
# (the cast to int drops the fractional part).
trip_seconds = expr("unix_timestamp(ended_at) - unix_timestamp(started_at)")
trip_minutes = (trip_seconds / 60).cast("int")

# Keep only the part of rideable_type before the first underscore
# (e.g. "electric_bike" -> "electric").
bike_kind = split("rideable_type", "_")[0]

result_df_py = df.withColumn("start_station_name", start_station_upper)
result_df_py = result_df_py.withColumn("end_station_name", end_station_upper)
result_df_py = result_df_py.withColumn("trip_time", trip_minutes)
result_df_py = result_df_py.withColumn("rideable_type", bike_kind)
result_df_py = result_df_py.withColumnRenamed("member_casual", "member_category")

result_df_py.limit(5)



ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_category,trip_time
65DBD2F447EC51C2,electric,2022-12-05 10:47:18,2022-12-05 10:56:34,CLIFTON AVE & ARM...,TA1307000163,SEDGWICK ST & WEB...,13191,41.9182435,-87.65711466666667,41.922167,-87.638888,member,9
0C201AA7EA0EA1AD,classic,2022-12-18 06:42:33,2022-12-18 07:08:44,BROADWAY & BELMON...,13277,SEDGWICK ST & WEB...,13191,41.940106,-87.645451,41.922167,-87.638888,casual,26
E0B148CCB358A49D,electric,2022-12-13 08:47:45,2022-12-13 08:59:51,SANGAMON ST & LAK...,TA1306000015,ST. CLAIR ST & ER...,13016,41.885918856,-87.65113318,41.89434513742426,-87.62279838323593,member,12
54C5775D2B7C9188,classic,2022-12-13 18:50:47,2022-12-13 19:19:48,SHIELDS AVE & 31S...,KA1503000038,DAMEN AVE & MADIS...,13134,41.838464,-87.635406,41.88137,-87.67493,member,29
A4891F78776D35DF,classic,2022-12-14 16:13:39,2022-12-14 16:27:50,ASHLAND AVE & CHI...,13247,DAMEN AVE & CHARL...,13288,41.89595435734305,-87.6677280664444,41.920082,-87.677855,casual,14


In [None]:
# Show the start/end timestamps next to the computed trip duration (minutes).
# The DataFrame holding the trip_time column in this notebook is result_df_py;
# the previously referenced df_trip_time is never defined, so the cell raised
# NameError on a fresh kernel.
display(result_df_py.select("started_at", "ended_at", "trip_time"))

# Utilizando SQL

In [40]:
# Reload the CSV (first line as header) and expose it to Spark SQL
# under the temporary view name "bike_shared_sql".
df = spark.read.csv(file_path, header=True)
df.createOrReplaceTempView("bike_shared_sql")

In [41]:
# Rewriting the transformation above with Spark SQL.
# NOTE(review): unlike the DataFrame version, trip_time is not cast to INT here
# (it stays as fractional minutes) and only a subset of the columns is
# selected -- confirm whether that difference is intended.
transform_query = """
    SELECT
        ride_id,
        upper(start_station_name) AS start_station_name,
        upper(end_station_name) AS end_station_name,
        (unix_timestamp(ended_at) - unix_timestamp(started_at)) / 60 AS trip_time,
        split(rideable_type, '_')[0] AS rideable_type,
        member_casual AS member_category
    FROM
        bike_shared_sql
"""
result_df_sql = spark.sql(transform_query)

display(result_df_sql)

ride_id,start_station_name,end_station_name,trip_time,rideable_type,member_category
65DBD2F447EC51C2,CLIFTON AVE & ARM...,SEDGWICK ST & WEB...,9.266666666666667,electric,member
0C201AA7EA0EA1AD,BROADWAY & BELMON...,SEDGWICK ST & WEB...,26.183333333333334,classic,casual
E0B148CCB358A49D,SANGAMON ST & LAK...,ST. CLAIR ST & ER...,12.1,electric,member
54C5775D2B7C9188,SHIELDS AVE & 31S...,DAMEN AVE & MADIS...,29.016666666666666,classic,member
A4891F78776D35DF,ASHLAND AVE & CHI...,DAMEN AVE & CHARL...,14.183333333333334,classic,casual
DB91D9B8DFACA07A,WABASH AVE & 9TH ST,WACKER DR & WASHI...,9.45,electric,member
6AD396C5760CC992,LEAVITT ST & CHIC...,WACKER DR & WASHI...,17.85,classic,member
8D736E35E0075504,HUMBOLDT BLVD & A...,SPAULDING AVE & A...,8.65,classic,member
DDAB881F96C51DC8,CANAL ST & ADAMS ST,ST. CLAIR ST & ER...,18.9,classic,member
8DDBAE51E55DAEEF,GREENVIEW AVE & F...,SOUTHPORT AVE & W...,13.283333333333331,classic,member


In [45]:
# Display two columns of the DataFrame registered as bike_shared_sql
two_columns_df = spark.sql("select ride_id, start_station_name from bike_shared_sql")
two_columns_df

ride_id,start_station_name
65DBD2F447EC51C2,Clifton Ave & Arm...
0C201AA7EA0EA1AD,Broadway & Belmon...
E0B148CCB358A49D,Sangamon St & Lak...
54C5775D2B7C9188,Shields Ave & 31s...
A4891F78776D35DF,Ashland Ave & Chi...
DB91D9B8DFACA07A,Wabash Ave & 9th St
6AD396C5760CC992,Leavitt St & Chic...
8D736E35E0075504,Humboldt Blvd & A...
DDAB881F96C51DC8,Canal St & Adams St
8DDBAE51E55DAEEF,Greenview Ave & F...


In [46]:
# Register the result of the SQL transformation as a temporary view so it can
# itself be queried with Spark SQL.
result_df_sql.createOrReplaceTempView("bike_shared_sql_transformada")

In [48]:
# Query every column of the transformed temporary view
transformed_view_df = spark.sql("select * from bike_shared_sql_transformada")
transformed_view_df

ride_id,start_station_name,end_station_name,trip_time,rideable_type,member_category
65DBD2F447EC51C2,CLIFTON AVE & ARM...,SEDGWICK ST & WEB...,9.266666666666667,electric,member
0C201AA7EA0EA1AD,BROADWAY & BELMON...,SEDGWICK ST & WEB...,26.183333333333334,classic,casual
E0B148CCB358A49D,SANGAMON ST & LAK...,ST. CLAIR ST & ER...,12.1,electric,member
54C5775D2B7C9188,SHIELDS AVE & 31S...,DAMEN AVE & MADIS...,29.016666666666666,classic,member
A4891F78776D35DF,ASHLAND AVE & CHI...,DAMEN AVE & CHARL...,14.183333333333334,classic,casual
DB91D9B8DFACA07A,WABASH AVE & 9TH ST,WACKER DR & WASHI...,9.45,electric,member
6AD396C5760CC992,LEAVITT ST & CHIC...,WACKER DR & WASHI...,17.85,classic,member
8D736E35E0075504,HUMBOLDT BLVD & A...,SPAULDING AVE & A...,8.65,classic,member
DDAB881F96C51DC8,CANAL ST & ADAMS ST,ST. CLAIR ST & ER...,18.9,classic,member
8DDBAE51E55DAEEF,GREENVIEW AVE & F...,SOUTHPORT AVE & W...,13.283333333333331,classic,member
