In [0]:
# Create SparkSession from builder
import pyspark
from pyspark.sql import SparkSession
from functools import reduce  # For Python 3.x
from pyspark.sql import DataFrame
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pyspark.pandas as ps
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window


def unionAll(*dfs):
    return reduce(DataFrame.unionAll, dfs)


from pyspark.sql.functions import monotonically_increasing_id
def get_mode(df):
    column_lst = df.columns
    res = [df.select(i).groupby(i).count().orderBy("count", ascending=False) for i in column_lst]
    df_mode = res[0].limit(1).select(column_lst[0]).withColumn("temp_name_monotonically_increasing_id", monotonically_increasing_id())
    
    for i in range(1, len(res)):
        df2 = res[i].limit(1).select(column_lst[i]).withColumn("temp_name_monotonically_increasing_id", monotonically_increasing_id())
        df_mode = df_mode.join(df2, (df_mode.temp_name_monotonically_increasing_id == df2.temp_name_monotonically_increasing_id)).drop(df2.temp_name_monotonically_increasing_id)
        
    return df_mode.drop("temp_name_monotonically_increasing_id")





spark = SparkSession.builder \
                    .appName('SparkByExamples.com') \
                    .getOrCreate()
parquets=["/FileStore/tables/yellow_tripdata_2019/yellow_tripdata_2019_01.parquet",
          "/FileStore/tables/yellow_tripdata_2019/yellow_tripdata_2019_02.parquet",
          "/FileStore/tables/yellow_tripdata_2019/yellow_tripdata_2019_03.parquet",
          "/FileStore/tables/yellow_tripdata_2019/yellow_tripdata_2019_04.parquet",
          "/FileStore/tables/yellow_tripdata_2019/yellow_tripdata_2019_05.parquet",
          "/FileStore/tables/yellow_tripdata_2019/yellow_tripdata_2019_06.parquet",
          "/FileStore/tables/yellow_tripdata_2019/yellow_tripdata_2019_07.parquet",
          "/FileStore/tables/yellow_tripdata_2019/yellow_tripdata_2019_08.parquet",
          "/FileStore/tables/yellow_tripdata_2019/yellow_tripdata_2019_09.parquet",
          "/FileStore/tables/yellow_tripdata_2019/yellow_tripdata_2019_10.parquet",
          "/FileStore/tables/yellow_tripdata_2019/yellow_tripdata_2019_11.parquet",
          "/FileStore/tables/yellow_tripdata_2019/yellow_tripdata_2019_12.parquet",]
    
data=[]
for i in parquets:
    d = spark.read.option("header","true").parquet(i,inferSchema=True)
    data.append(d)
    
#Merge all the dataframes in list
df_complete=unionAll(*data)
print("Total Rows and Columns:",(df_complete.count(), len(df_complete.columns)))
columns=df_complete.columns

print("=============================================")

#Null values
print("Null values for every columns in DataFrame:")
print("=============================================")
for i in df_complete.columns:
      print(i,df_complete.count()-(df_complete.na.drop(subset=i).count()))

print("=============================================")


#Mode values for the DataFrame
mode_values=get_mode(df_complete)


Total Rows and Columns: (84598444, 19)
Null values for every columns in DataFrame:
VendorID 0
tpep_pickup_datetime 0
tpep_dropoff_datetime 0
passenger_count 444383
trip_distance 0
RatecodeID 444383
store_and_fwd_flag 444383
PULocationID 0
DOLocationID 0
payment_type 0
fare_amount 0
extra 0
mta_tax 0
tip_amount 0
tolls_amount 0
improvement_surcharge 0
total_amount 0
congestion_surcharge 5300601
airport_fee 84598444


In [0]:
print("Mode value of each column")
print("================================")
for i in range(len(df_complete.columns)):
    print(columns[i],":",mode_values.collect()[0][i])
print("================================")
# print("Grouping by each column")
# print("================================")
# for i in df_complete.columns:
#     display(df_complete.groupBy(i).count())

Mode value of each column
VendorID : 2
tpep_pickup_datetime : 2019-02-02 12:19:58
tpep_dropoff_datetime : 2019-10-27 00:00:00
passenger_count : 1.0
trip_distance : 0.9
RatecodeID : 1.0
store_and_fwd_flag : N
PULocationID : 237
DOLocationID : 236
payment_type : 1
fare_amount : 6.0
extra : 0.0
mta_tax : 0.5
tip_amount : 0.0
tolls_amount : 0.0
improvement_surcharge : 0.3
total_amount : 9.8
congestion_surcharge : 2.5
airport_fee : None


In [0]:
import pyspark.pandas as ps
pd=ps.DataFrame(df_complete)
df_corr=pd.to_spark()
df_corr=df_corr.withColumn("date_format",to_date(df_complete.tpep_pickup_datetime,"MM-dd-yyyy"))
df_corr=df_corr.where(df_corr.date_format.contains("2019"))


In [0]:
week_level=df_corr.drop('RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',)
week_level=week_level.withColumn("week_date",date_trunc("week",week_level.date_format))
week_level=week_level.fillna({"passenger_count":1.0,
                   "congestion_surcharge":2.5,  
                  })
week_level.columns

Out[5]: ['VendorID',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'congestion_surcharge',
 'airport_fee',
 'date_format',
 'week_date']

In [0]:
g=week_level.groupBy("VendorID","week_date").sum()

In [0]:

#I made a little helper function for this that might help some people out.

import re

from functools import partial

def rename_cols(agg_df, ignore_first_n=1):
    """changes the default spark aggregate names `avg(colname)` 
    to something a bit more useful. Pass an aggregated dataframe
    and the number of aggregation columns to ignore.
    """
    delimiters = "(", ")"
    split_pattern = '|'.join(map(re.escape, delimiters))
    splitter = partial(re.split, split_pattern)
    split_agg = lambda x: '_'.join(splitter(x))[0:-ignore_first_n]
    renamed = map(split_agg, agg_df.columns[ignore_first_n:])
    renamed = zip(agg_df.columns[ignore_first_n:], renamed)
    for old, new in renamed:
        agg_df = agg_df.withColumnRenamed(old, new)
    return agg_df

In [0]:
final=rename_cols(g)
final.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- week_dat: timestamp (nullable = true)
 |-- sum_VendorID: long (nullable = true)
 |-- sum_passenger_count: double (nullable = true)
 |-- sum_trip_distance: double (nullable = true)
 |-- sum_payment_type: long (nullable = true)
 |-- sum_fare_amount: double (nullable = true)
 |-- sum_extra: double (nullable = true)
 |-- sum_mta_tax: double (nullable = true)
 |-- sum_tip_amount: double (nullable = true)
 |-- sum_tolls_amount: double (nullable = true)
 |-- sum_improvement_surcharge: double (nullable = true)
 |-- sum_total_amount: double (nullable = true)
 |-- sum_congestion_surcharge: double (nullable = true)
 |-- sum_airport_fee: long (nullable = true)



In [0]:
display(final.drop("sum_VendorID","sum_payment_type","sum_passenger_count"))

In [0]:
month_level=df_corr.drop('RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',)
month_level=month_level.withColumn("Month",month(month_level.date_format))
month_level=month_level.fillna({"passenger_count":1.0,
                   "congestion_surcharge":2.5,  
                  })
month_level.columns

Out[10]: ['VendorID',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'congestion_surcharge',
 'airport_fee',
 'date_format',
 'Month']

In [0]:
f=month_level.groupBy("VendorID","Month").sum()
final2=rename_cols(f)
final2.printSchema()


root
 |-- VendorID: long (nullable = true)
 |-- Mont: integer (nullable = true)
 |-- sum_VendorID: long (nullable = true)
 |-- sum_passenger_count: double (nullable = true)
 |-- sum_trip_distance: double (nullable = true)
 |-- sum_payment_type: long (nullable = true)
 |-- sum_fare_amount: double (nullable = true)
 |-- sum_extra: double (nullable = true)
 |-- sum_mta_tax: double (nullable = true)
 |-- sum_tip_amount: double (nullable = true)
 |-- sum_tolls_amount: double (nullable = true)
 |-- sum_improvement_surcharge: double (nullable = true)
 |-- sum_total_amount: double (nullable = true)
 |-- sum_congestion_surcharge: double (nullable = true)
 |-- sum_airport_fee: long (nullable = true)
 |-- sum_Month: long (nullable = true)



In [0]:
display(final2.drop("sum_VendorID","sum_payment_type","sum_Month","sum_passenger_count"))

VendorID,Mont,sum_trip_distance,sum_fare_amount,sum_extra,sum_mta_tax,sum_tip_amount,sum_tolls_amount,sum_improvement_surcharge,sum_total_amount,sum_congestion_surcharge,sum_airport_fee
2,8,12825559.939997597,54322064.77000523,1428884.7000000172,1932889.75,8681133.410006532,1727795.660010087,1168148.7000855682,78080881.76741132,8880404.0,
5,1,285.6800000000001,1245.6,0.0,49.5,220.17000000000004,46.16,29.70000000000005,1410.5099999999998,247.5,
1,1,7738709.10000006,36071384.92000035,973381.2,1463008.1600000004,5200263.300002536,828683.6500009553,881517.6000410882,45418238.829568855,4642730.0,
2,6,14134425.479996435,61357935.29000942,1527938.3600000106,2175797.5,10161066.53001961,1923840.9900131545,1314760.2001083218,88443276.86681466,10025428.25,
2,9,13610311.489996975,59751313.04000386,1527361.3500000222,2104219.2,9931022.890016943,1822876.660011463,1271598.6001016262,86108856.32705237,9762034.5,
2,7,13070629.219998509,56209058.130004734,1475661.1000000157,2002161.01,9056172.78000944,1738726.3600103313,1211000.7000922195,80861290.94732818,9224474.5,
2,3,15421740.449998418,65959548.55002728,1702020.8000000145,2414141.25,10944588.630031554,1958690.0600024883,1459723.2001308238,95443507.5362394,11058970.25,
2,5,15108384.169995522,66065186.59002966,1721331.1000000108,2364913.0,10980074.16002508,2069197.230015553,1428489.9001259676,95505232.48636,10921443.0,
2,2,13237018.359997438,56998947.870031945,1509601.10000001,2146157.5,9499749.37000949,1632507.6900019788,1296768.600105523,82457820.47714151,9424652.25,
2,1,13846184.569998171,59471312.71003062,1596370.0500000096,2319821.25,8675271.550004618,1633460.1400019906,1399253.4001214372,75126040.03727005,7398427.75,


In [0]:
avg_month=df_corr.drop('RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',)
avg_month=avg_month.withColumn("Month",month(month_level.date_format))
avg_month=avg_month.fillna({"passenger_count":1.0,
                   "congestion_surcharge":2.5,  
                  })
avg_month.columns
avg_month.groupby("VendorID","Month").agg({"congestion_surcharge":"avg"}).alias('Avg_congestion_surcharge').orderBy("Month", ascending=False).show(100)
  

+--------+-----+-------------------------+
|VendorID|Month|avg(congestion_surcharge)|
+--------+-----+-------------------------+
|       5|   12|                      2.5|
|       1|   12|        2.301466098925982|
|       2|   12|       2.2898577555082156|
|       2|   11|       2.3007396702511227|
|       5|   11|                      2.5|
|       1|   11|       2.3094858841879424|
|       1|   10|       2.3137281958316755|
|       2|   10|       2.2967218808453715|
|       5|   10|                      2.5|
|       5|    9|                      2.5|
|       2|    9|        2.284518289420906|
|       1|    9|        2.295024391221862|
|       4|    9|       1.6346153846153846|
|       4|    8|       1.8181818181818181|
|       2|    8|       2.2617148570842067|
|       5|    8|                      2.5|
|       1|    8|       2.2786958490650426|
|       2|    7|        2.268817562656751|
|       4|    7|        2.326229977116705|
|       5|    7|                      2.5|
|       1| 

In [0]:
pass_month=df_corr.drop('RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',)
pass_month=pass_month.withColumn("Month",month(month_level.date_format))
pass_month=pass_month.fillna({"passenger_count":1.0,
                   "congestion_surcharge":2.5,  
                  })
pass_month.columns
pass_month.groupby("VendorID","Month").agg({"passenger_count":"sum"}).alias('Total_passenger_count').orderBy("Month").show(100)

+--------+-----+--------------------+
|VendorID|Month|sum(passenger_count)|
+--------+-----+--------------------+
|       2|    1|           8428590.0|
|       4|    1|             78704.0|
|       5|    1|                99.0|
|       1|    1|           3536755.0|
|       1|    2|           3198458.0|
|       4|    2|             52934.0|
|       5|    2|                68.0|
|       2|    2|           7807682.0|
|       1|    3|           3573563.0|
|       2|    3|           8754361.0|
|       5|    3|                46.0|
|       4|    3|             43528.0|
|       1|    4|           3411199.0|
|       4|    4|             33126.0|
|       2|    4|           8292821.0|
|       5|    4|                17.0|
|       4|    5|             29740.0|
|       1|    5|           3408856.0|
|       5|    5|                10.0|
|       2|    5|           8467842.0|
|       2|    6|           7737157.0|
|       1|    6|           3144416.0|
|       5|    6|                 5.0|
|       4|  

In [0]:
trip_hour=df_corr.drop('RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',)
trip_hour=trip_hour.fillna({"passenger_count":1.0,
                   "congestion_surcharge":2.5,  
                  })

#trip_hour.columns
from pyspark.sql import functions as F
trip_hour=trip_hour.withColumn("hour", F.date_trunc('hour',F.to_timestamp("tpep_pickup_datetime","yyyy-MM-dd HH:mm:ss 'UTC'")))
trip_hour=trip_hour.withColumn("hour2", hour(trip_hour.tpep_pickup_datetime))

In [0]:
trip_hour.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = false)
 |-- trip_distance: double (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = false)
 |-- airport_fee: integer (nullable = true)
 |-- date_format: date (nullable = true)
 |-- hour: timestamp (nullable = true)
 |-- hour2: integer (nullable = true)



In [0]:
trip=trip_hour.groupby("hour").agg(count("tpep_pickup_datetime").alias('total_trip_count')).orderBy("hour")
import pyspark.sql.functions as f
from pyspark.sql.window import Window
trip = trip.withColumn('percent', f.col('total_trip_count')/f.sum('total_trip_count').over(Window.partitionBy()))
trip.orderBy('percent', ascending=False).show()

+-------------------+----------------+--------------------+
|               hour|total_trip_count|             percent|
+-------------------+----------------+--------------------+
|2019-03-15 21:00:00|           21506|2.542170466040865E-4|
|2019-01-11 18:00:00|           20742|2.451859937069637...|
|2019-04-12 19:00:00|           20233|2.391692320255037E-4|
|2019-03-07 18:00:00|           20198|2.387555057802166...|
|2019-01-31 18:00:00|           20157|2.382708550357375...|
|2019-02-01 18:00:00|           20104|2.376443552928743...|
|2019-01-25 18:00:00|           20074| 2.37289732796914E-4|
|2019-01-30 18:00:00|           20014|2.365804878049933...|
|2019-02-06 18:00:00|           19952|2.358476013133420...|
|2019-05-02 20:00:00|           19914|2.353984128184589...|
|2019-01-25 19:00:00|           19879|2.349846865731719...|
|2019-04-09 18:00:00|           19814|2.342163378319245...|
|2019-03-06 18:00:00|           19805|2.341099510831364...|
|2019-04-05 19:00:00|           19718|2.

In [0]:
trip2=trip_hour.groupby("hour2").agg(count("tpep_pickup_datetime").alias('total_trip_count')).orderBy("hour2")
import pyspark.sql.functions as f
from pyspark.sql.window import Window
trip2 = trip2.withColumn('percent', f.col('total_trip_count')/f.sum('total_trip_count').over(Window.partitionBy()))
trip2.orderBy('percent', ascending=False).show(50)

+-----+----------------+--------------------+
|hour2|total_trip_count|             percent|
+-----+----------------+--------------------+
|   18|         5548948| 0.06559272632380046|
|   19|         5240544| 0.06194715978232893|
|   17|         5036391| 0.05953391823506937|
|   20|         4783081|  0.0565396040866791|
|   15|         4745926| 0.05610040412543225|
|   21|         4733473| 0.05595320032735912|
|   14|         4720305| 0.05579754469313227|
|   13|         4484443| 0.05300947898839252|
|   16|         4466825| 0.05280122101726489|
|   12|         4436595| 0.05244387974883554|
|   22|         4400975| 0.05202282463863199|
|   11|         4175330|0.049355531535266464|
|   10|         3976419| 0.04700425435880104|
|    9|         3932176|0.046481268922508624|
|    8|         3853839|0.045555266840307175|
|   23|         3481246| 0.04115093818572909|
|    7|         3083751|0.036452249218004204|
|    0|         2498194|0.029530526389103008|
|    6|         1774797|0.02097943

In [0]:
trip.agg(sum("percent")).show()

+------------------+
|      sum(percent)|
+------------------+
|1.0000000000000002|
+------------------+



In [0]:
payment_month=df_corr.drop('RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',)
payment_month=payment_month.withColumn("Month",month(month_level.date_format))
payment_month=payment_month.fillna({"passenger_count":1.0,
                   "congestion_surcharge":2.5,  
                  })

In [0]:
from pyspark.sql.functions import when

payment_month = payment_month.withColumn("payment_type_name", 
              when(payment_month.payment_type == 0, "Cash")
             .when(payment_month.payment_type == 1, "Credit_card")
             .when(payment_month.payment_type == 2, "Net_Banking")                       
             .when(payment_month.payment_type == 3, "UPI")  
             .when(payment_month.payment_type == 4, "Debit_Card")  
             .when(payment_month.payment_type == 5, "Unknown")  
             .otherwise(payment_month.payment_type))
from pyspark.sql.functions import when

payment_month = payment_month.withColumn("payment_type_name2", 
              when(payment_month.payment_type == 0, "Credit card")
             .when(payment_month.payment_type == 1, "Cash")
             .when(payment_month.payment_type == 2, "No charge")                       
             .when(payment_month.payment_type == 3, "Dispute")  
             .when(payment_month.payment_type == 4, "Unknown")  
             .when(payment_month.payment_type == 5, "Voided trip")  
             .otherwise(payment_month.payment_type))

In [0]:
payment_month.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = false)
 |-- trip_distance: double (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = false)
 |-- airport_fee: integer (nullable = true)
 |-- date_format: date (nullable = true)
 |-- Month: integer (nullable = true)
 |-- payment_type_name: string (nullable = true)
 |-- payment_type_name2: string (nullable = true)



In [0]:
payment_month1=payment_month.groupby("Month","payment_type_name").agg(count("payment_type_name").alias('payment_type_name_count')).orderBy("Month")
import pyspark.sql.functions as f
from pyspark.sql.window import Window
payment_month1 = payment_month1.withColumn('percent', f.col('payment_type_name_count')/f.sum('payment_type_name_count').over(Window.partitionBy()))
payment_month1.orderBy('percent', ascending=False).show(10000)

+-----+-----------------+-----------------------+--------------------+
|Month|payment_type_name|payment_type_name_count|             percent|
+-----+-----------------+-----------------------+--------------------+
|    3|      Credit_card|                5721654| 0.06763424075004455|
|    1|      Credit_card|                5486204| 0.06485104519424932|
|    5|      Credit_card|                5456422| 0.06449899962175965|
|    4|      Credit_card|                5354015| 0.06328847209029936|
|   10|      Credit_card|                5216086| 0.06165804788212235|
|    2|      Credit_card|                5184852|0.061288838580828196|
|    6|      Credit_card|                4983310| 0.05890646101146705|
|   11|      Credit_card|                4977584| 0.05883877539773809|
|   12|      Credit_card|                4875930|0.057637148890926415|
|    9|      Credit_card|                4726409| 0.05586969855031033|
|    7|      Credit_card|                4424183|0.052297160601506895|
|    8

In [0]:
payment_month1=payment_month.groupby("Month","payment_type_name2").agg(count("payment_type_name2").alias('payment_type_name_count')).orderBy("Month")
import pyspark.sql.functions as f
from pyspark.sql.window import Window
payment_month1 = payment_month1.withColumn('percent', f.col('payment_type_name_count')/f.sum('payment_type_name_count').over(Window.partitionBy()))
payment_month1.orderBy('percent', ascending=False).show(10000)

+-----+------------------+-----------------------+--------------------+
|Month|payment_type_name2|payment_type_name_count|             percent|
+-----+------------------+-----------------------+--------------------+
|    3|              Cash|                5721654| 0.06763424075004455|
|    1|              Cash|                5486204| 0.06485104519424932|
|    5|              Cash|                5456422| 0.06449899962175965|
|    4|              Cash|                5354015| 0.06328847209029936|
|   10|              Cash|                5216086| 0.06165804788212235|
|    2|              Cash|                5184852|0.061288838580828196|
|    6|              Cash|                4983310| 0.05890646101146705|
|   11|              Cash|                4977584| 0.05883877539773809|
|   12|              Cash|                4875930|0.057637148890926415|
|    9|              Cash|                4726409| 0.05586969855031033|
|    7|              Cash|                4424183|0.052297160601

In [0]:
import pandas as pd
#import findspark
#findspark.init()
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext 
#sc = SparkContext("local", "App Name")
sql = SQLContext(sc)
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

payment_month_top=payment_month.groupby("Month","payment_type_name2").agg(count("payment_type_name2").alias('payment_type_name_count')).orderBy("Month")
window = Window.partitionBy(payment_month_top['Month']).orderBy(payment_month_top['payment_type_name_count'].desc())
payment_month_top.select('*', rank().over(window).alias('rank')).filter(col('rank') <= 5).show(1000) 

+-----+------------------+-----------------------+----+
|Month|payment_type_name2|payment_type_name_count|rank|
+-----+------------------+-----------------------+----+
|    1|              Cash|                5486204|   1|
|    1|         No charge|                2137168|   2|
|    1|           Dispute|                  33181|   3|
|    1|       Credit card|                  28672|   4|
|    1|           Unknown|                  11165|   5|
|    2|              Cash|                5184852|   1|
|    2|         No charge|                1787291|   2|
|    2|           Dispute|                  34801|   3|
|    2|       Credit card|                  29663|   4|
|    2|           Unknown|                  12571|   5|
|    3|              Cash|                5721654|   1|
|    3|         No charge|                2057314|   2|
|    3|           Dispute|                  39280|   3|
|    3|       Credit card|                  33474|   4|
|    3|           Unknown|                  1464

In [0]:
final_week_level=final.drop("sum_VendorID","sum_payment_type","sum_passenger_count")
final_month_level=final2.drop("sum_VendorID","sum_payment_type","sum_Month","sum_passenger_count")
final_avg_month=avg_month.groupby("VendorID","Month").agg({"congestion_surcharge":"avg"}).alias('Avg_congestion_surcharge').orderBy("Month", ascending=False)
final_pass_month=pass_month.groupby("VendorID","Month").agg({"passenger_count":"sum"}).alias('Total_passenger_count').orderBy("Month")
final_trip=trip2.orderBy('percent', ascending=False)
final_payment_month=payment_month1.orderBy('percent', ascending=False)
fianl_payment_month_top=payment_month_top.select('*', rank().over(window).alias('rank')).filter(col('rank') <= 5)

final_list=[final_week_level,final_month_level,final_avg_month,final_pass_month,final_trip,final_payment_month,fianl_payment_month_top]

In [0]:
for i in final_list:
    display(i)

In [0]:
display(final_avg_month)

VendorID,Month,avg(congestion_surcharge)
5,12,2.5
1,12,2.301466098925982
2,12,2.289857755508216
2,11,2.3007396702511227
5,11,2.5
1,11,2.3094858841879424
1,10,2.3137281958316755
2,10,2.2967218808453715
5,10,2.5
5,9,2.5


In [0]:
display(final_pass_month)

VendorID,Month,sum(passenger_count)
2,1,8428590.0
4,1,78704.0
5,1,99.0
1,1,3536755.0
1,2,3198458.0
4,2,52934.0
5,2,68.0
2,2,7807682.0
1,3,3573563.0
2,3,8754361.0


In [0]:
display(final_payment_month)

Month,payment_type_name2,payment_type_name_count,percent
3,Cash,5721654,0.0676342407500445
1,Cash,5486204,0.0648510451942493
5,Cash,5456422,0.0644989996217596
4,Cash,5354015,0.0632884720902993
10,Cash,5216086,0.0616580478821223
2,Cash,5184852,0.0612888385808281
6,Cash,4983310,0.058906461011467
11,Cash,4977584,0.058838775397738
12,Cash,4875930,0.0576371488909264
9,Cash,4726409,0.0558696985503103


In [0]:
display(fianl_payment_month_top)

Month,payment_type_name2,payment_type_name_count,rank
1,Cash,5486204,1
1,No charge,2137168,2
1,Dispute,33181,3
1,Credit card,28672,4
1,Unknown,11165,5
2,Cash,5184852,1
2,No charge,1787291,2
2,Dispute,34801,3
2,Credit card,29663,4
2,Unknown,12571,5


In [0]:
display(final_trip)

hour2,total_trip_count,percent
18,5548948,0.0655927263238004
19,5240544,0.0619471597823289
17,5036391,0.0595339182350693
20,4783081,0.0565396040866791
15,4745926,0.0561004041254322
21,4733473,0.0559532003273591
14,4720305,0.0557975446931322
13,4484443,0.0530094789883925
16,4466825,0.0528012210172648
12,4436595,0.0524438797488355


In [0]:
import pyspark.pandas as ps
pd_final_week_level=ps.DataFrame(final_week_level)
pd_final_month_level=ps.DataFrame(final_month_level)
pd_final_avg_month=ps.DataFrame(final_avg_month)
pd_final_pass_month=ps.DataFrame(final_pass_month)
pd_final_trip=ps.DataFrame(final_trip)
pd_final_payment_month=ps.DataFrame(final_payment_month)
pd_fianl_payment_month_top=ps.DataFrame(fianl_payment_month_top)

In [0]:
pd_final_week_level.to_orc("final_week_level")
pd_final_month_level.to_orc("final_month_level")
pd_final_avg_month.to_orc("final_avg_month")
pd_final_pass_month.to_orc("final_pass_month")
pd_final_trip.to_orc("pd_final_tripfinal_trip")
pd_final_payment_month.to_orc("final_payment_month")
pd_fianl_payment_month_top.to_orc("fianl_payment_month_top")

In [0]:
df_complete.select("*").write.format("jdbc")\
    .option("url", "jdbc:postgresql://localhost:5432/yellow_tripdata_2019") \
    .option("driver", "org.postgresql.Driver").option("dbtable", "yellow_tripdata_2019") \
    .option("user", "postgres").option("password", "vivek@3011").save()

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
[0;32m<command-3940188177395034>[0m in [0;36m<module>[0;34m[0m
[0;32m----> 1[0;31m [0mdf_complete[0m[0;34m.[0m[0mselect[0m[0;34m([0m[0;34m"*"[0m[0;34m)[0m[0;34m.[0m[0mwrite[0m[0;34m.[0m[0mformat[0m[0;34m([0m[0;34m"jdbc"[0m[0;34m)[0m[0;31m\[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      2[0m     [0;34m.[0m[0moption[0m[0;34m([0m[0;34m"url"[0m[0;34m,[0m [0;34m"jdbc:postgresql://localhost:5432/yellow_tripdata_2019"[0m[0;34m)[0m[0;31m [0m[0;31m\[0m[0;34m[0m[0;34m[0m[0m
[1;32m      3[0m     [0;34m.[0m[0moption[0m[0;34m([0m[0;34m"driver"[0m[0;34m,[0m [0;34m"org.postgresql.Driver"[0m[0;34m)[0m[0;34m.[0m[0moption[0m[0;34m([0m[0;34m"dbtable"[0m[0;34m,[0m [0;34m"yellow_tripdata_2019"[0m[0;34m)[0m[0;31m [0m[0;31m\[0m[0;34m[0m[0;34m[

In [0]:
#Connection details
PSQL_SERVERNAME = "localhost"
PSQL_PORTNUMBER = 5432
PSQL_DBNAME = "test"
PSQL_USRRNAME = "postgres"
PSQL_PASSWORD = "vivek@3011"

URL = f"jdbc:postgresql://{PSQL_SERVERNAME}/{PSQL_DBNAME}"

#Table details
TABLE_MYTABLE = "t1"
TABLE_EMPLOYEE = "complete_data"
df_complete.write\
        .format("jdbc")\
        .option("url", URL)\
        .option("dbtable", TABLE_EMPLOYEE)\
        .option("user", PSQL_USRRNAME)\
        .option("password", PSQL_PASSWORD)\
        .mode("append")\
        .save()

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
[0;32m<command-1522843009367740>[0m in [0;36m<module>[0;34m[0m
[1;32m     11[0m [0mTABLE_MYTABLE[0m [0;34m=[0m [0;34m"t1"[0m[0;34m[0m[0;34m[0m[0m
[1;32m     12[0m [0mTABLE_EMPLOYEE[0m [0;34m=[0m [0;34m"complete_data"[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 13[0;31m [0mdf_complete[0m[0;34m.[0m[0mwrite[0m[0;31m\[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m     14[0m         [0;34m.[0m[0mformat[0m[0;34m([0m[0;34m"jdbc"[0m[0;34m)[0m[0;31m\[0m[0;34m[0m[0;34m[0m[0m
[1;32m     15[0m         [0;34m.[0m[0moption[0m[0;34m([0m[0;34m"url"[0m[0;34m,[0m [0mURL[0m[0;34m)[0m[0;31m\[0m[0;34m[0m[0;34m[0m[0m

[0;32m/databricks/spark/python/pyspark/sql/readwriter.py[0m in [0;36msave[0;34m(self, path, format, mode, partitionBy, **options)[0m
[1;32m    736[0

In [0]:
mode=[]
for i in range(len(df_complete.columns)): 
    mode.append(mode_values.collect()[0][i])
    



In [0]:
check=zip(df_complete.columns,mode)
mydict=dict(check)
a_dict = {key: mydict[key] for key in mydict if (key != 'VendorID' and key !='tpep_pickup_datetime' and key !='tpep_dropoff_datetime')}



In [0]:
a_dict 



In [0]:
df_complete.printSchema()

