- Criar um dataframe, filtrar viagens com duração maior do que 30 minutos
- Criar 2 dataframes. Um com dados de viagens de bikes eletricas e outro com bikes classicas, ambos ordenado por data


https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html
  

In [5]:
file_path="/home/jovyan/work/data/202212_divvy_tripdata.csv"

#Cria sessão para ambiente local
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("manipulando-dados") \
    .getOrCreate()

spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

df = spark.read.option("header", "true").csv(file_path)
df.limit(5)

ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
65DBD2F447EC51C2,electric_bike,2022-12-05 10:47:18,2022-12-05 10:56:34,Clifton Ave & Arm...,TA1307000163,Sedgwick St & Web...,13191,41.9182435,-87.65711466666667,41.922167,-87.638888,member
0C201AA7EA0EA1AD,classic_bike,2022-12-18 06:42:33,2022-12-18 07:08:44,Broadway & Belmon...,13277,Sedgwick St & Web...,13191,41.940106,-87.645451,41.922167,-87.638888,casual
E0B148CCB358A49D,electric_bike,2022-12-13 08:47:45,2022-12-13 08:59:51,Sangamon St & Lak...,TA1306000015,St. Clair St & Er...,13016,41.885918856,-87.65113318,41.89434513742426,-87.62279838323593,member
54C5775D2B7C9188,classic_bike,2022-12-13 18:50:47,2022-12-13 19:19:48,Shields Ave & 31s...,KA1503000038,Damen Ave & Madis...,13134,41.838464,-87.635406,41.88137,-87.67493,member
A4891F78776D35DF,classic_bike,2022-12-14 16:13:39,2022-12-14 16:27:50,Ashland Ave & Chi...,13247,Damen Ave & Charl...,13288,41.89595435734305,-87.6677280664444,41.920082,-87.677855,casual


In [6]:
from pyspark.sql.functions import upper, expr, split

result_df_py = (
  df
  .withColumn("start_station_name", upper(df["start_station_name"]))
  .withColumn("end_station_name", upper(df["end_station_name"]))
  .withColumn("trip_time", (expr("unix_timestamp(ended_at) - unix_timestamp(started_at)") / 60).cast("int"))
  .withColumn("rideable_type", split("rideable_type", "_")[0])
  .withColumnRenamed("member_casual", "member_category")
  
)

result_df_py.limit(5)


ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_category,trip_time
65DBD2F447EC51C2,electric,2022-12-05 10:47:18,2022-12-05 10:56:34,CLIFTON AVE & ARM...,TA1307000163,SEDGWICK ST & WEB...,13191,41.9182435,-87.65711466666667,41.922167,-87.638888,member,9
0C201AA7EA0EA1AD,classic,2022-12-18 06:42:33,2022-12-18 07:08:44,BROADWAY & BELMON...,13277,SEDGWICK ST & WEB...,13191,41.940106,-87.645451,41.922167,-87.638888,casual,26
E0B148CCB358A49D,electric,2022-12-13 08:47:45,2022-12-13 08:59:51,SANGAMON ST & LAK...,TA1306000015,ST. CLAIR ST & ER...,13016,41.885918856,-87.65113318,41.89434513742426,-87.62279838323593,member,12
54C5775D2B7C9188,classic,2022-12-13 18:50:47,2022-12-13 19:19:48,SHIELDS AVE & 31S...,KA1503000038,DAMEN AVE & MADIS...,13134,41.838464,-87.635406,41.88137,-87.67493,member,29
A4891F78776D35DF,classic,2022-12-14 16:13:39,2022-12-14 16:27:50,ASHLAND AVE & CHI...,13247,DAMEN AVE & CHARL...,13288,41.89595435734305,-87.6677280664444,41.920082,-87.677855,casual,14


In [8]:
df_maior_30_min = (
    result_df_py.filter(result_df_py.trip_time > 30)
)

In [10]:
display(df_maior_30_min)

ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_category,trip_time
E1DCD2C3B9471006,electric,2022-12-06 17:43:11,2022-12-06 19:10:01,CLINTON ST & ROOS...,WL-008,ST. CLAIR ST & ER...,13016,41.867122293,-87.641112328,41.89434513742426,-87.62279838323593,member,86
0B4AC802FF1F65A6,classic,2022-12-20 10:19:56,2022-12-20 13:13:57,WESTERN AVE & LEL...,TA1307000140,SOUTHPORT AVE & W...,13235,41.966399801840986,-87.68870428204536,41.94815,-87.66394,casual,174
DB68885CB978F021,electric,2022-12-05 18:12:40,2022-12-05 19:23:21,CLINTON ST & ROOS...,WL-008,WABASH AVE & 16TH ST,SL-012,41.8671255,-87.64102233333334,41.860384,-87.625813,member,70
8602A572ADB87616,classic,2022-12-19 12:07:51,2022-12-19 12:46:16,ROCKWELL ST & EAS...,KA1504000093,ROCKWELL ST & EAS...,KA1504000093,41.96590013976,-87.6936384935,41.96590013976,-87.6936384935,member,38
C05F37F09125FF03,docked,2022-12-28 17:07:56,2022-12-28 17:50:39,CANAL ST & ADAMS ST,13011,WABASH AVE & 16TH ST,SL-012,41.879255,-87.639904,41.860384,-87.625813,casual,42
35CFF1597C070CD2,classic,2022-12-04 14:37:03,2022-12-04 15:10:49,WABASH AVE & 16TH ST,SL-012,WABASH AVE & 16TH ST,SL-012,41.860384,-87.625813,41.860384,-87.625813,member,33
DB0C8326DA0608C4,electric,2022-12-19 23:11:36,2022-12-19 23:44:12,TRIPP AVE & 15TH ST,363.0,TRIPP AVE & 15TH ST,363.0,41.86,-87.73,41.86,-87.73,member,32
53B5A87A76E6D0CD,classic,2022-12-02 18:53:43,2022-12-02 19:52:01,CLARK ST & ELM ST,TA1307000039,MCCLURG CT & OHIO ST,TA1306000029,41.902973,-87.63128,41.892592119709725,-87.61728912591934,member,58
FA85E9AFDA4C9546,classic,2022-12-14 16:00:41,2022-12-14 16:31:52,WABASH AVE & 9TH ST,TA1309000010,MCCLURG CT & OHIO ST,TA1306000029,41.870769,-87.625734,41.892592119709725,-87.61728912591934,casual,31
C8F349D6915E4DA9,electric,2022-12-06 21:26:45,2022-12-06 22:00:34,CLINTON ST & ROOS...,WL-008,HARPER AVE & 59TH ST,KA1503000070,41.867166281,-87.640986681,41.78794281287,-87.5883151702,member,33


In [11]:
from pyspark.sql.functions import col

In [12]:
df_electric = (
    df_maior_30_min.filter(col("rideable_type") == "electric").orderBy("started_at")
)

display(df_electric)

ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_category,trip_time
B1D52B01956FFC46,electric,2022-12-01 01:01:09,2022-12-01 01:32:11,,,,,41.77,-87.57,41.74,-87.68,member,31
9ED6246AE12FF95D,electric,2022-12-01 03:42:34,2022-12-01 04:42:42,,,ASHLAND AVE & LAK...,13073,41.88,-87.67,41.88592,-87.66717,casual,60
A005B5A8068DDD18,electric,2022-12-01 03:53:06,2022-12-01 04:29:04,,,,,41.99,-87.66,41.99,-87.66,member,35
736CED10A7020EFD,electric,2022-12-01 06:10:39,2022-12-01 07:02:40,,,,,41.8,-87.6,41.79,-87.59,member,52
2F5B64527AEC3DF8,electric,2022-12-01 07:18:39,2022-12-01 08:09:37,,,DESPLAINES ST & J...,15539,41.99,-87.67,41.87811890091227,-87.64394760131836,member,50
F390EDB71035C820,electric,2022-12-01 07:25:31,2022-12-01 08:00:16,,,SHERIDAN RD & IRV...,13063,41.92,-87.76,41.954245,-87.654406,member,34
0C5FA74D7CFB6328,electric,2022-12-01 07:40:53,2022-12-01 08:12:31,ASHLAND AVE & BLA...,13224,,,41.907081723,-87.667278647,41.88,-87.64,member,31
A81704545E70797B,electric,2022-12-01 09:02:23,2022-12-01 09:42:19,,,HERMITAGE AVE & P...,13080,41.95,-87.65,41.871514,-87.669886,member,39
83B378A01809EAD8,electric,2022-12-01 09:09:55,2022-12-01 09:51:31,,,LARRABEE ST & ARM...,TA1309000006,42.0,-87.67,41.918084,-87.643749,member,41
60E446FC5975E758,electric,2022-12-01 09:46:43,2022-12-01 10:34:07,PUBLIC RACK - LON...,899,HOMEWOOD AVE & 11...,20206,41.69,-87.67,41.68459507202618,-87.67070800065994,member,47


In [13]:
df_classic = (
    df_maior_30_min.filter(col("rideable_type") == "classic").orderBy("started_at")
)

display(df_classic)

ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_category,trip_time
5042C02FA60D3D30,classic,2022-12-01 00:57:33,2022-12-01 01:28:35,ST. LOUIS AVE & F...,KA1504000090,KINGSBURY ST & KI...,KA1503000043,41.92481559681187,-87.71449506282805,41.88917683258,-87.6385057718,casual,31
59C8309C280E95E5,classic,2022-12-01 06:51:25,2022-12-01 07:28:31,BROADWAY & CORNEL...,13278,WESTERN AVE & MON...,15667,41.945529,-87.646439,41.8797877089,-87.686112808,member,37
EDB08365EDAFAC2B,classic,2022-12-01 07:03:22,2022-12-01 07:37:47,HALSTED ST & WRIG...,TA1309000061,WOLCOTT AVE & POL...,TA1309000064,41.929143,-87.649077,41.871262,-87.673688,member,34
0314A4E50FD16BA1,classic,2022-12-01 07:06:28,2022-12-01 07:38:52,CLARK ST & SCHILL...,TA1309000024,MICHIGAN AVE & 8T...,623,41.907993,-87.631501,41.872773,-87.623981,member,32
146DB3EC88A9CD75,classic,2022-12-01 07:08:34,2022-12-01 07:43:20,CLARENDON AVE & G...,13379,ST. CLAIR ST & ER...,13016,41.95786652415174,-87.64950513839722,41.89434513742426,-87.62279838323593,member,34
E1AA6E1DAF164AD9,classic,2022-12-01 07:24:32,2022-12-01 07:57:12,SHERIDAN RD & BUE...,TA1309000027,STATE ST & VAN BU...,TA1305000035,41.958494,-87.654966,41.877181,-87.627844,member,32
31B4FE39146EA449,classic,2022-12-01 07:30:09,2022-12-01 08:01:29,CANAL ST & ADAMS ST,13011,LINCOLN AVE & FUL...,TA1309000058,41.879255,-87.639904,41.924161029067626,-87.64638036489487,member,31
6BC767B34FC83904,classic,2022-12-01 07:38:13,2022-12-01 09:17:48,LOOMIS ST & 89TH ST,20102,LOOMIS ST & 89TH ST,20102,41.73237975329554,-87.65806943178177,41.73237975329554,-87.65806943178177,casual,99
556D6650BB4DBD3A,classic,2022-12-01 07:44:11,2022-12-02 08:43:57,AUSTIN BLVD & CHI...,16921,,,41.894887,-87.774704,,,member,1499
E3453155C603E8EB,classic,2022-12-01 07:55:12,2022-12-01 08:32:49,WILTON AVE & DIVE...,chargingstx0,DEARBORN ST & MON...,TA1305000006,41.932418,-87.652705,41.881319815,-87.6295209193,member,37


# Utilizando SQL

In [14]:
df = spark.read.option("header", "true").csv(file_path)
df.createOrReplaceTempView("bike_shared")

In [15]:
# Reescrevendo a transformação acima com SPARK SQL
result_df_sql = spark.sql("""
    SELECT
        started_at,
        ride_id,
        upper(start_station_name) AS start_station_name,
        upper(end_station_name) AS end_station_name,
        (unix_timestamp(ended_at) - unix_timestamp(started_at)) / 60 AS trip_time,
        split(rideable_type, '_')[0] AS rideable_type,
        member_casual AS member_category
    FROM
        bike_shared
""")

display(result_df_sql)

started_at,ride_id,start_station_name,end_station_name,trip_time,rideable_type,member_category
2022-12-05 10:47:18,65DBD2F447EC51C2,CLIFTON AVE & ARM...,SEDGWICK ST & WEB...,9.266666666666667,electric,member
2022-12-18 06:42:33,0C201AA7EA0EA1AD,BROADWAY & BELMON...,SEDGWICK ST & WEB...,26.183333333333334,classic,casual
2022-12-13 08:47:45,E0B148CCB358A49D,SANGAMON ST & LAK...,ST. CLAIR ST & ER...,12.1,electric,member
2022-12-13 18:50:47,54C5775D2B7C9188,SHIELDS AVE & 31S...,DAMEN AVE & MADIS...,29.016666666666666,classic,member
2022-12-14 16:13:39,A4891F78776D35DF,ASHLAND AVE & CHI...,DAMEN AVE & CHARL...,14.183333333333334,classic,casual
2022-12-02 15:24:47,DB91D9B8DFACA07A,WABASH AVE & 9TH ST,WACKER DR & WASHI...,9.45,electric,member
2022-12-20 09:02:03,6AD396C5760CC992,LEAVITT ST & CHIC...,WACKER DR & WASHI...,17.85,classic,member
2022-12-13 14:19:49,8D736E35E0075504,HUMBOLDT BLVD & A...,SPAULDING AVE & A...,8.65,classic,member
2022-12-13 07:38:07,DDAB881F96C51DC8,CANAL ST & ADAMS ST,ST. CLAIR ST & ER...,18.9,classic,member
2022-12-21 08:57:42,8DDBAE51E55DAEEF,GREENVIEW AVE & F...,SOUTHPORT AVE & W...,13.283333333333331,classic,member


In [16]:
result_df_sql.createOrReplaceTempView("bike_trip_time")

In [17]:
df_maior_30_min_sql = spark.sql("select * from bike_trip_time where trip_time > 30")
display(df_maior_30_min_sql)

started_at,ride_id,start_station_name,end_station_name,trip_time,rideable_type,member_category
2022-12-10 09:49:06,7D451A9C489FE4D2,BROADWAY & BELMON...,ST. CLAIR ST & ER...,30.183333333333334,classic,member
2022-12-06 17:43:11,E1DCD2C3B9471006,CLINTON ST & ROOS...,ST. CLAIR ST & ER...,86.83333333333333,electric,member
2022-12-20 10:19:56,0B4AC802FF1F65A6,WESTERN AVE & LEL...,SOUTHPORT AVE & W...,174.01666666666668,classic,casual
2022-12-05 18:12:40,DB68885CB978F021,CLINTON ST & ROOS...,WABASH AVE & 16TH ST,70.68333333333334,electric,member
2022-12-19 12:07:51,8602A572ADB87616,ROCKWELL ST & EAS...,ROCKWELL ST & EAS...,38.41666666666666,classic,member
2022-12-28 17:07:56,C05F37F09125FF03,CANAL ST & ADAMS ST,WABASH AVE & 16TH ST,42.71666666666667,docked,casual
2022-12-04 14:37:03,35CFF1597C070CD2,WABASH AVE & 16TH ST,WABASH AVE & 16TH ST,33.766666666666666,classic,member
2022-12-19 23:11:36,DB0C8326DA0608C4,TRIPP AVE & 15TH ST,TRIPP AVE & 15TH ST,32.6,electric,member
2022-12-02 18:53:43,53B5A87A76E6D0CD,CLARK ST & ELM ST,MCCLURG CT & OHIO ST,58.3,classic,member
2022-12-14 16:00:41,FA85E9AFDA4C9546,WABASH AVE & 9TH ST,MCCLURG CT & OHIO ST,31.183333333333334,classic,casual


In [18]:
df_maior_30_min_sql.createOrReplaceTempView("bike_trip_greather_than_30")
df_eletric = spark.sql("select * from bike_trip_greather_than_30 where rideable_type == 'electric' order by started_at")
display(df_eletric)

started_at,ride_id,start_station_name,end_station_name,trip_time,rideable_type,member_category
2022-12-01 01:01:09,B1D52B01956FFC46,,,31.03333333333333,electric,member
2022-12-01 03:42:34,9ED6246AE12FF95D,,ASHLAND AVE & LAK...,60.13333333333333,electric,casual
2022-12-01 03:53:06,A005B5A8068DDD18,,,35.96666666666667,electric,member
2022-12-01 06:10:39,736CED10A7020EFD,,,52.016666666666666,electric,member
2022-12-01 07:18:39,2F5B64527AEC3DF8,,DESPLAINES ST & J...,50.96666666666667,electric,member
2022-12-01 07:25:31,F390EDB71035C820,,SHERIDAN RD & IRV...,34.75,electric,member
2022-12-01 07:40:53,0C5FA74D7CFB6328,ASHLAND AVE & BLA...,,31.633333333333333,electric,member
2022-12-01 09:02:23,A81704545E70797B,,HERMITAGE AVE & P...,39.93333333333333,electric,member
2022-12-01 09:09:55,83B378A01809EAD8,,LARRABEE ST & ARM...,41.6,electric,member
2022-12-01 09:46:43,60E446FC5975E758,PUBLIC RACK - LON...,HOMEWOOD AVE & 11...,47.4,electric,member


In [19]:
df_classic = spark.sql("select * from bike_trip_greather_than_30 where rideable_type == 'classic' order by started_at")
display(df_classic)

started_at,ride_id,start_station_name,end_station_name,trip_time,rideable_type,member_category
2022-12-01 00:57:33,5042C02FA60D3D30,ST. LOUIS AVE & F...,KINGSBURY ST & KI...,31.03333333333333,classic,casual
2022-12-01 06:23:00,FA5F010ECFACC85C,MICHIGAN AVE & 8T...,INDIANA AVE & ROO...,30.03333333333333,classic,member
2022-12-01 06:51:25,59C8309C280E95E5,BROADWAY & CORNEL...,WESTERN AVE & MON...,37.1,classic,member
2022-12-01 07:03:22,EDB08365EDAFAC2B,HALSTED ST & WRIG...,WOLCOTT AVE & POL...,34.416666666666664,classic,member
2022-12-01 07:06:28,0314A4E50FD16BA1,CLARK ST & SCHILL...,MICHIGAN AVE & 8T...,32.4,classic,member
2022-12-01 07:08:34,146DB3EC88A9CD75,CLARENDON AVE & G...,ST. CLAIR ST & ER...,34.766666666666666,classic,member
2022-12-01 07:24:32,E1AA6E1DAF164AD9,SHERIDAN RD & BUE...,STATE ST & VAN BU...,32.666666666666664,classic,member
2022-12-01 07:30:09,31B4FE39146EA449,CANAL ST & ADAMS ST,LINCOLN AVE & FUL...,31.33333333333333,classic,member
2022-12-01 07:38:13,6BC767B34FC83904,LOOMIS ST & 89TH ST,LOOMIS ST & 89TH ST,99.58333333333331,classic,casual
2022-12-01 07:44:11,556D6650BB4DBD3A,AUSTIN BLVD & CHI...,,1499.7666666666669,classic,member
