# Aggregating DataFrames in PySpark

 - GroupBy
 - Pivot
 - Aggregate methods
 - Combinations of the above

In [1]:
from pyspark.sql import SparkSession

# Start a Spark session named 'agg2', or attach to one that already exists.
spark = (
    SparkSession.builder
    .appName('agg2')
    .getOrCreate()
)

# NOTE(review): the value printed as "cores" is the number of keys in the JVM
# SparkContext's executor memory-status map, reached through the private
# `_jsc` handle. That looks like a count of executors / block managers rather
# than CPU cores -- confirm before relying on the label.
executor_memory_status = spark._jsc.sc().getExecutorMemoryStatus()
cores = executor_memory_status.keySet().size()
print('cores : {}'.format(cores))

# Bare last expression so the notebook displays the session object.
spark

cores : 1


In [2]:
import pandas as pd
# Read the NYC Airbnb listings with pandas first, then hand them to Spark.
listings_pd = pd.read_csv('s3://****************/nyc_air_bnb.csv')

# Convert every column to string in a single frame-level call so that the
# Spark DataFrame is created with a uniform all-string schema.
listings_pd = listings_pd.astype('str')
airbnb = spark.createDataFrame(listings_pd)

In [3]:
# Print the column names and types of the Spark DataFrame created above.
airbnb.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: string (nullable = true)
 |-- minimum_nights: string (nullable = true)
 |-- number_of_reviews: string (nullable = true)
 |-- last_review: string (nullable = true)
 |-- reviews_per_month: string (nullable = true)
 |-- calculated_host_listings_count: string (nullable = true)
 |-- availability_365: string (nullable = true)



In [5]:
# Preview only the first two rows, converted to pandas so the notebook renders
# them as a table.
airbnb.limit(2).toPandas()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355


In [7]:
# Columns that hold numeric data, annotated with the type each should have.
numeric_col = [
    'latitude',                        # double
    'longitude',                       # double
    'price',                           # int
    'minimum_nights',                  # int
    'number_of_reviews',               # int
    'reviews_per_month',               # double
    'calculated_host_listings_count',  # int
    'availability_365',                # int
]

In [17]:
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType, IntegerType

# Target Spark type for each numeric column; columns not listed here are
# passed through unchanged.
numeric_casts = {
    'latitude': DoubleType(),
    'longitude': DoubleType(),
    'price': IntegerType(),
    'minimum_nights': IntegerType(),
    'number_of_reviews': IntegerType(),
    'reviews_per_month': DoubleType(),
    'calculated_host_listings_count': IntegerType(),
    'availability_365': IntegerType(),
}

# Build the typed frame with a single projection instead of one withColumn
# call per column. Iterating over airbnb.columns keeps the original column
# order, and alias() keeps each cast column under its original name.
df = airbnb.select([
    col(name).cast(numeric_casts[name]).alias(name) if name in numeric_casts else col(name)
    for name in airbnb.columns
])
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- number_of_reviews: integer (nullable = true)
 |-- last_review: string (nullable = true)
 |-- reviews_per_month: double (nullable = true)
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- availability_365: integer (nullable = true)

