# Aggregating DataFrames

First let's start up our PySpark instance

In [8]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('agg1').getOrCreate()
cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
print('your are working with {} core(s)'.format(cores))
spark

your are working with 1 core(s)


## Read in the dataFrame for this Notebook

In [10]:
import pandas as pd

In [14]:
df = pd.read_csv('s3://************/nyc_air_bnb.csv')
for col in df.columns:
    df[col] = df[col].astype('str')

airbnb = spark.createDataFrame(df)    

In [15]:
print(airbnb.printSchema())

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: string (nullable = true)
 |-- minimum_nights: string (nullable = true)
 |-- number_of_reviews: string (nullable = true)
 |-- last_review: string (nullable = true)
 |-- reviews_per_month: string (nullable = true)
 |-- calculated_host_listings_count: string (nullable = true)
 |-- availability_365: string (nullable = true)

None


Notice here that some of the columns that are obviously numeric have been incorrectly identified as "strings". Let's edit that. Otherwise we cannot aggregate any of the numeric columns.

In [17]:
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType
df = airbnb.withColumn('price', col('price').cast(IntegerType())) \
        .withColumn('minimum_nights', col('minimum_nights').cast(IntegerType())) \
        .withColumn('number_of_reviews', col('number_of_reviews').cast(IntegerType())) \
        .withColumn('minimum_nights', col('minimum_nights').cast(IntegerType())) \
        .withColumn('reviews_per_month', col('reviews_per_month').cast(IntegerType())) \
        .withColumn('calculated_host_listings_count', col('calculated_host_listings_count').cast(IntegerType()))
print(df.printSchema())

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- number_of_reviews: integer (nullable = true)
 |-- last_review: string (nullable = true)
 |-- reviews_per_month: integer (nullable = true)
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- availability_365: string (nullable = true)

None


### 1. How many rows are in this dataset?

In [18]:
df.count()

48895

### 2. How many total reviews does each host have?

In [22]:
from pyspark.sql import functions as F

df.groupBy('host_id').agg(F.sum('number_of_reviews').alias('reviews')).orderBy(F.sum('number_of_reviews').desc()).show(5)

+--------+-------+
| host_id|reviews|
+--------+-------+
|37312959|   2273|
|  344035|   2205|
|26432133|   2017|
|35524316|   1971|
|40176101|   1818|
+--------+-------+
only showing top 5 rows



### 3. Show the min and max of all the numeric variables in the dataset

In [30]:
from pyspark.sql.functions import min, max

In [33]:
numeric_col = ['price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count']
df.select(numeric_col).summary('min', 'max').show()

+-------+-----+--------------+-----------------+-----------------+------------------------------+
|summary|price|minimum_nights|number_of_reviews|reviews_per_month|calculated_host_listings_count|
+-------+-----+--------------+-----------------+-----------------+------------------------------+
|    min|    0|             1|                0|                0|                             1|
|    max|10000|          1250|              629|               58|                           327|
+-------+-----+--------------+-----------------+-----------------+------------------------------+



### 4. Which host had the highest number of reviews?

In [41]:
from pyspark.sql.functions import sum
df.groupBy('host_id').agg(sum('number_of_reviews')).orderBy(sum('number_of_reviews').desc()).show(5)

+--------+----------------------+
| host_id|sum(number_of_reviews)|
+--------+----------------------+
|37312959|                  2273|
|  344035|                  2205|
|26432133|                  2017|
|35524316|                  1971|
|40176101|                  1818|
+--------+----------------------+
only showing top 5 rows



### 5. On average, how many nights did most hosts specify for a minimum?

In [44]:
from pyspark.sql.functions import avg
df.agg({'minimum_nights' : 'avg'}) \
    .withColumnRenamed('avg(minimum_nights)', 'Avg_Min_Nights').show()

+-----------------+
|   Avg_Min_Nights|
+-----------------+
|7.029962163820431|
+-----------------+



### 6. What is the most expensive neighborhood to stay in on average?

In [50]:
from pyspark.sql.functions import avg, col
df.groupBy('neighbourhood').agg(avg(col('price'))).orderBy(avg(col('price')).desc()).show()

+------------------+------------------+
|     neighbourhood|        avg(price)|
+------------------+------------------+
|    Fort Wadsworth|             800.0|
|           Woodrow|             700.0|
|           Tribeca|  490.638418079096|
|          Sea Gate|487.85714285714283|
|         Riverdale|442.09090909090907|
|      Prince's Bay|             409.5|
| Battery Park City| 367.5571428571429|
| Flatiron District|           341.925|
|     Randall Manor|             336.0|
|              NoHo|295.71794871794873|
|              SoHo| 287.1033519553073|
|           Midtown| 282.7190938511327|
|          Neponsit| 274.6666666666667|
|      West Village| 267.6822916666667|
| Greenwich Village|263.40561224489795|
|           Chelsea|249.73854447439354|
|       Willowbrook|             249.0|
|  Theater District|248.01388888888889|
|            Nolita|230.13833992094862|
|Financial District|225.49059139784947|
+------------------+------------------+
only showing top 20 rows



Note: only show the one result

### 7. Display a two by two table that shows the average prices by room type (private and shared only) and neighborhood group (Manhattan and Brooklyn only)

In [59]:
from pyspark.sql.functions import avg
df.filter("room_type IN('Private room','Shared room')")\
    .groupBy('room_type').pivot('neighbourhood_group', ['Manhattan', 'Brooklyn']).avg('price').show()

+------------+-----------------+-----------------+
|   room_type|        Manhattan|         Brooklyn|
+------------+-----------------+-----------------+
| Shared room|88.97708333333334|50.52784503631961|
|Private room|116.7766224004009|  76.500098697197|
+------------+-----------------+-----------------+

