# **PySpark: Introduction to DataFrames**

## DataFrame creation

In [1]:
## Librerías
import numpy as np
import pandas as pd 
import pyspark

In [2]:
df_pandas = pd.read_csv('data/housing.csv')

df_pandas.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0,USS Barnett\nFPO AP 44820
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5,USNS Raymond\nFPO AE 09386


In [3]:
## Crear sesión de Spark 
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Practise").getOrCreate()

## Crear un DataFrame
df = spark.read.csv('data/housing.csv', header=True, sep=',')
df.show(5)

+----------------+-------------------+-------------------------+----------------------------+---------------+-----------+--------------------+
|Avg. Area Income|Avg. Area House Age|Avg. Area Number of Rooms|Avg. Area Number of Bedrooms|Area Population|      Price|             Address|
+----------------+-------------------+-------------------------+----------------------------+---------------+-----------+--------------------+
|     79545.45857|        5.682861322|              7.009188143|                        4.09|     23086.8005|1059033.558|208 Michael Ferry...|
|       Laurabury|     NE 37010-5101"|                     NULL|                        NULL|           NULL|       NULL|                NULL|
|     79248.64245|        6.002899808|              6.730821019|                        3.09|    40173.07217|1505890.915|188 Johnson Views...|
|   Lake Kathleen|          CA 48958"|                     NULL|                        NULL|           NULL|       NULL|                NULL|

In [4]:
## the second row is just the continuation of the previous row
df = spark.read.format("csv").load("data/housing.csv", header=True, sep=',',inferSchema=True, multiLine=True)
df.show(5)

+----------------+-------------------+-------------------------+----------------------------+---------------+-----------+--------------------+
|Avg. Area Income|Avg. Area House Age|Avg. Area Number of Rooms|Avg. Area Number of Bedrooms|Area Population|      Price|             Address|
+----------------+-------------------+-------------------------+----------------------------+---------------+-----------+--------------------+
|     79545.45857|        5.682861322|              7.009188143|                        4.09|     23086.8005|1059033.558|208 Michael Ferry...|
|     79248.64245|        6.002899808|              6.730821019|                        3.09|    40173.07217|1505890.915|188 Johnson Views...|
|     61287.06718|         5.86588984|               8.51272743|                        5.13|     36882.1594|1058987.988|9127 Elizabeth St...|
|     63345.24005|        7.188236095|              5.586728665|                        3.26|    34310.24283|1260616.807|USS Barnett\nFPO ...|

In [5]:
## rename Avg. Area Income to Avg_Area_Income, Avg. Area House Age to Avg_Area_House_Age, Avg. Area Number of Rooms to Avg_Area_Number_of_Rooms, Avg. Area Number of Bedrooms to Avg_Area_Number_of_Bedrooms, Area Population to Area_Population
df = df.withColumnRenamed("Avg. Area Income", "Avg_Area_Income") \
       .withColumnRenamed("Avg. Area House Age", "Avg_Area_House_Age") \
       .withColumnRenamed("Avg. Area Number of Rooms", "Avg_Area_Number_of_Rooms") \
       .withColumnRenamed("Avg. Area Number of Bedrooms", "Avg_Area_Number_of_Bedrooms") \
       .withColumnRenamed("Area Population", "Area_Population")
df.show(5)

+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|Avg_Area_Income|Avg_Area_House_Age|Avg_Area_Number_of_Rooms|Avg_Area_Number_of_Bedrooms|Area_Population|      Price|             Address|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|    79545.45857|       5.682861322|             7.009188143|                       4.09|     23086.8005|1059033.558|208 Michael Ferry...|
|    79248.64245|       6.002899808|             6.730821019|                       3.09|    40173.07217|1505890.915|188 Johnson Views...|
|    61287.06718|        5.86588984|              8.51272743|                       5.13|     36882.1594|1058987.988|9127 Elizabeth St...|
|    63345.24005|       7.188236095|             5.586728665|                       3.26|    34310.24283|1260616.807|USS Barnett\nFPO ...|
|    59982.19723|       5.0

In [6]:
## equivalent to df.info in pandas 
df.printSchema()

root
 |-- Avg_Area_Income: double (nullable = true)
 |-- Avg_Area_House_Age: double (nullable = true)
 |-- Avg_Area_Number_of_Rooms: double (nullable = true)
 |-- Avg_Area_Number_of_Bedrooms: double (nullable = true)
 |-- Area_Population: double (nullable = true)
 |-- Price: double (nullable = true)
 |-- Address: string (nullable = true)



In [7]:
## select items from the dataframe just one column (Avg. Area Income)
df.select('Avg_Area_Income').show(5)

+---------------+
|Avg_Area_Income|
+---------------+
|    79545.45857|
|    79248.64245|
|    61287.06718|
|    63345.24005|
|    59982.19723|
+---------------+
only showing top 5 rows



In [8]:
df.select(['Avg_Area_Income','Avg_Area_Number_of_Rooms']).show(5)

+---------------+------------------------+
|Avg_Area_Income|Avg_Area_Number_of_Rooms|
+---------------+------------------------+
|    79545.45857|             7.009188143|
|    79248.64245|             6.730821019|
|    61287.06718|              8.51272743|
|    63345.24005|             5.586728665|
|    59982.19723|             7.839387785|
+---------------+------------------------+
only showing top 5 rows



In [9]:
df.dtypes

[('Avg_Area_Income', 'double'),
 ('Avg_Area_House_Age', 'double'),
 ('Avg_Area_Number_of_Rooms', 'double'),
 ('Avg_Area_Number_of_Bedrooms', 'double'),
 ('Area_Population', 'double'),
 ('Price', 'double'),
 ('Address', 'string')]

In [10]:
df.describe().show()

+-------+------------------+------------------+------------------------+---------------------------+-----------------+------------------+--------------------+
|summary|   Avg_Area_Income|Avg_Area_House_Age|Avg_Area_Number_of_Rooms|Avg_Area_Number_of_Bedrooms|  Area_Population|             Price|             Address|
+-------+------------------+------------------+------------------------+---------------------------+-----------------+------------------+--------------------+
|  count|              5000|              5000|                    5000|                       5000|             5000|              5000|                5000|
|   mean| 68583.10898397019| 5.977222035287008|       6.987791850909204|         3.9813299999999967|36163.51603854035|1232072.6541452995|                NULL|
| stddev|10657.991213888685|0.9914561798324225|      1.0058332312754115|         1.2341372654846832|9925.650113546026| 353117.6265836953|                NULL|
|    min|       17796.63119|       2.644304186

In [11]:
## Addding columns to the dataframe
# Add a new column called 'Avg_Area_Income_1000' which is the 'Avg_Area_Income' divided by 1000
df = df.withColumn('Avg_Area_Income_1000', df['Avg_Area_Income']/1000)
df.show(5)

+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+--------------------+
|Avg_Area_Income|Avg_Area_House_Age|Avg_Area_Number_of_Rooms|Avg_Area_Number_of_Bedrooms|Area_Population|      Price|             Address|Avg_Area_Income_1000|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+--------------------+
|    79545.45857|       5.682861322|             7.009188143|                       4.09|     23086.8005|1059033.558|208 Michael Ferry...|   79.54545857000001|
|    79248.64245|       6.002899808|             6.730821019|                       3.09|    40173.07217|1505890.915|188 Johnson Views...|         79.24864245|
|    61287.06718|        5.86588984|              8.51272743|                       5.13|     36882.1594|1058987.988|9127 Elizabeth St...|         61.28706718|
|    63345.24005|       7.188236095|    

In [12]:
## delete a column from the dataframe
df = df.drop('Avg_Area_Income_1000')
df.show(5)

+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|Avg_Area_Income|Avg_Area_House_Age|Avg_Area_Number_of_Rooms|Avg_Area_Number_of_Bedrooms|Area_Population|      Price|             Address|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|    79545.45857|       5.682861322|             7.009188143|                       4.09|     23086.8005|1059033.558|208 Michael Ferry...|
|    79248.64245|       6.002899808|             6.730821019|                       3.09|    40173.07217|1505890.915|188 Johnson Views...|
|    61287.06718|        5.86588984|              8.51272743|                       5.13|     36882.1594|1058987.988|9127 Elizabeth St...|
|    63345.24005|       7.188236095|             5.586728665|                       3.26|    34310.24283|1260616.807|USS Barnett\nFPO ...|
|    59982.19723|       5.0

In [13]:
## generate random nan values in the dataframe
import random
from pyspark.sql.functions import lit
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

## read again the data
df = spark.read.format("csv").load("data/housing.csv", header=True, sep=',',inferSchema=True, multiLine=True)
df = df.withColumnRenamed("Avg. Area Income", "Avg_Area_Income") \
       .withColumnRenamed("Avg. Area House Age", "Avg_Area_House_Age") \
       .withColumnRenamed("Avg. Area Number of Rooms", "Avg_Area_Number_of_Rooms") \
       .withColumnRenamed("Avg. Area Number of Bedrooms", "Avg_Area_Number_of_Bedrooms") \
       .withColumnRenamed("Area Population", "Area_Population")

def random_nan(value):
    if random.random() < 0.1:
        return None
    return value

random_nan_udf = udf(random_nan, FloatType())

df = df.withColumn('Avg_Area_Income', random_nan_udf(df['Avg_Area_Income']))

df.show(5)

+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|Avg_Area_Income|Avg_Area_House_Age|Avg_Area_Number_of_Rooms|Avg_Area_Number_of_Bedrooms|Area_Population|      Price|             Address|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|       79545.46|       5.682861322|             7.009188143|                       4.09|     23086.8005|1059033.558|208 Michael Ferry...|
|       79248.64|       6.002899808|             6.730821019|                       3.09|    40173.07217|1505890.915|188 Johnson Views...|
|      61287.066|        5.86588984|              8.51272743|                       5.13|     36882.1594|1058987.988|9127 Elizabeth St...|
|       63345.24|       7.188236095|             5.586728665|                       3.26|    34310.24283|1260616.807|USS Barnett\nFPO ...|
|      59982.195|       5.0

In [14]:
## show rows with nan values
df.filter(df['Avg_Area_Income'].isNull()).show()

+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|Avg_Area_Income|Avg_Area_House_Age|Avg_Area_Number_of_Rooms|Avg_Area_Number_of_Bedrooms|Area_Population|      Price|             Address|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|      66394.875|       7.069512154|             7.204639709|                       3.18|    39741.07751| 1499988.88|71956 Jenkins Fal...|
|       74399.84|       6.382452912|             7.252665322|                       6.36|    41084.66282| 1417819.74|03819 Lee Junctio...|
|       75795.58|       5.786780024|             7.327325343|                       6.25|    33197.77774|1534479.907|562 Brown Junctio...|
|       82061.56|       4.182828228|             5.963031884|                        4.1|     36271.4085|1102821.438|708 Pena Ramp\nTe...|
|      67579.234|       5.9

In [15]:
## Handling missing values
df.count()

5000

In [16]:
df = df.na.drop()
df.count()

4499

## Filling NaN Values

In [47]:
## Filling missing values

# read again the data
df = spark.read.format("csv").load("data/housing.csv", header=True, sep=',',inferSchema=True, multiLine=True)
df = df = df.withColumnRenamed("Avg. Area Income", "Avg_Area_Income") \
       .withColumnRenamed("Avg. Area House Age", "Avg_Area_House_Age") \
       .withColumnRenamed("Avg. Area Number of Rooms", "Avg_Area_Number_of_Rooms") \
       .withColumnRenamed("Avg. Area Number of Bedrooms", "Avg_Area_Number_of_Bedrooms") \
       .withColumnRenamed("Area Population", "Area_Population")

## pyspark seed
random.seed(42)

## generate random values over the first, second and third columns
def random_nan(value):
    if random.random() < 0.1:
        return None
    return value

## modify values randomly over the first column
random_nan_udf = udf(random_nan, FloatType())

df = df.withColumn('Avg_Area_Income', random_nan_udf(df['Avg_Area_Income']))
df = df.withColumn('Avg_Area_House_Age', random_nan_udf(df['Avg_Area_House_Age']))
df = df.withColumn('Avg_Area_Number_of_Rooms', random_nan_udf(df['Avg_Area_Number_of_Rooms']))


## show rows with nan values
df.show(20)


+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|Avg_Area_Income|Avg_Area_House_Age|Avg_Area_Number_of_Rooms|Avg_Area_Number_of_Bedrooms|Area_Population|      Price|             Address|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|       79545.46|         5.6828613|                7.009188|                       4.09|     23086.8005|1059033.558|208 Michael Ferry...|
|       79248.64|         6.0028996|                6.730821|                       3.09|    40173.07217|1505890.915|188 Johnson Views...|
|      61287.066|           5.86589|                8.512728|                       5.13|     36882.1594|1058987.988|9127 Elizabeth St...|
|       63345.24|          7.188236|               5.5867286|                       3.26|    34310.24283|1260616.807|USS Barnett\nFPO ...|
|      59982.195|         5

In [48]:
## filling missing values with the mean of the column  
from pyspark.sql.functions import mean
df.na.fill(df.select(mean(df['Avg_Area_Income'])).collect()[0][0], ['Avg_Area_Income']).show(5)

+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|Avg_Area_Income|Avg_Area_House_Age|Avg_Area_Number_of_Rooms|Avg_Area_Number_of_Bedrooms|Area_Population|      Price|             Address|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|       79545.46|         5.6828613|                7.009188|                       4.09|     23086.8005|1059033.558|208 Michael Ferry...|
|       79248.64|         6.0028996|                6.730821|                       3.09|    40173.07217|1505890.915|188 Johnson Views...|
|      61287.066|           5.86589|                8.512728|                       5.13|     36882.1594|1058987.988|9127 Elizabeth St...|
|       63345.24|          7.188236|                    NULL|                       3.26|    34310.24283|1260616.807|USS Barnett\nFPO ...|
|      59982.195|          

In [49]:
# read again the data
df = spark.read.format("csv").load("data/housing.csv", header=True, sep=',',inferSchema=True, multiLine=True)
df = df = df.withColumnRenamed("Avg. Area Income", "Avg_Area_Income") \
       .withColumnRenamed("Avg. Area House Age", "Avg_Area_House_Age") \
       .withColumnRenamed("Avg. Area Number of Rooms", "Avg_Area_Number_of_Rooms") \
       .withColumnRenamed("Avg. Area Number of Bedrooms", "Avg_Area_Number_of_Bedrooms") \
       .withColumnRenamed("Area Population", "Area_Population")

## pyspark seed
random.seed(42)

## generate random values over the first, second and third columns
def random_nan(value):
    if random.random() < 0.1:
        return None
    return value

## modify values randomly over the first column
random_nan_udf = udf(random_nan, FloatType())

df = df.withColumn('Avg_Area_Income', random_nan_udf(df['Avg_Area_Income']))
df = df.withColumn('Avg_Area_House_Age', random_nan_udf(df['Avg_Area_House_Age']))
df = df.withColumn('Avg_Area_Number_of_Rooms', random_nan_udf(df['Avg_Area_Number_of_Rooms']))


## show rows with nan values
df.show(20)

+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|Avg_Area_Income|Avg_Area_House_Age|Avg_Area_Number_of_Rooms|Avg_Area_Number_of_Bedrooms|Area_Population|      Price|             Address|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|           NULL|         5.6828613|                7.009188|                       4.09|     23086.8005|1059033.558|208 Michael Ferry...|
|       79248.64|              NULL|                6.730821|                       3.09|    40173.07217|1505890.915|188 Johnson Views...|
|      61287.066|           5.86589|                8.512728|                       5.13|     36882.1594|1058987.988|9127 Elizabeth St...|
|       63345.24|          7.188236|                    NULL|                       3.26|    34310.24283|1260616.807|USS Barnett\nFPO ...|
|      59982.195|         5

In [50]:
## replace nan values with the mean of the column
from pyspark.ml.feature import Imputer

imputer = Imputer(inputCols=['Avg_Area_Income', 'Avg_Area_House_Age'], 
                  outputCols=[".{}_imputed".format(c) for c in ['Avg_Area_Income', 'Avg_Area_House_Age']]).setStrategy("mean")


imputer.fit(df).transform(df).show(20)

+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+------------------------+---------------------------+
|Avg_Area_Income|Avg_Area_House_Age|Avg_Area_Number_of_Rooms|Avg_Area_Number_of_Bedrooms|Area_Population|      Price|             Address|.Avg_Area_Income_imputed|.Avg_Area_House_Age_imputed|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+------------------------+---------------------------+
|       79545.46|         5.6828613|                7.009188|                       4.09|     23086.8005|1059033.558|208 Michael Ferry...|                79545.46|                  5.6828613|
|       79248.64|         6.0028996|                    NULL|                       3.09|    40173.07217|1505890.915|188 Johnson Views...|                79248.64|                  6.0028996|
|      61287.066|           5.86589|    

## Filter Operation

In [52]:
## read dataset
# read again the data
df = spark.read.format("csv").load("data/housing.csv", header=True, sep=',',inferSchema=True, multiLine=True)
df = df = df.withColumnRenamed("Avg. Area Income", "Avg_Area_Income") \
       .withColumnRenamed("Avg. Area House Age", "Avg_Area_House_Age") \
       .withColumnRenamed("Avg. Area Number of Rooms", "Avg_Area_Number_of_Rooms") \
       .withColumnRenamed("Avg. Area Number of Bedrooms", "Avg_Area_Number_of_Bedrooms") \
       .withColumnRenamed("Area Population", "Area_Population")
       
## pyspark seed
random.seed(42)

df.show(5)

+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|Avg_Area_Income|Avg_Area_House_Age|Avg_Area_Number_of_Rooms|Avg_Area_Number_of_Bedrooms|Area_Population|      Price|             Address|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|    79545.45857|       5.682861322|             7.009188143|                       4.09|     23086.8005|1059033.558|208 Michael Ferry...|
|    79248.64245|       6.002899808|             6.730821019|                       3.09|    40173.07217|1505890.915|188 Johnson Views...|
|    61287.06718|        5.86588984|              8.51272743|                       5.13|     36882.1594|1058987.988|9127 Elizabeth St...|
|    63345.24005|       7.188236095|             5.586728665|                       3.26|    34310.24283|1260616.807|USS Barnett\nFPO ...|
|    59982.19723|       5.0

In [56]:
## filter avg area income greater than 80000
df.filter(df['Avg_Area_Income'] > 80000).show(5)

## filter avg area income greater than 80000
df.filter("Avg_Area_Income > 80000").show(5)

+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|Avg_Area_Income|Avg_Area_House_Age|Avg_Area_Number_of_Rooms|Avg_Area_Number_of_Bedrooms|Area_Population|      Price|             Address|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|    80175.75416|       4.988407758|             6.104512439|                       4.04|    26748.42842|1068138.074|06039 Jennifer Is...|
|    81885.92718|        4.42367179|             8.167688003|                        6.1|    40149.96575|1545154.813|Unit 9446 Box 095...|
|    80527.47208|       8.093512681|               5.0427468|                        4.1|    47224.35984|1707045.722|6368 John Motorwa...|
|    86294.99909|        6.62745694|             8.011897853|                       4.07|    47560.77534| 2146925.34|030 Larry Park Su...|
|    82173.62608|       4.0

In [57]:
## filter avg area income greater than 80000
df.filter("Avg_Area_Income > 80000").select(['Avg_Area_Income','Area_Population']).show(5)

+---------------+---------------+
|Avg_Area_Income|Area_Population|
+---------------+---------------+
|    80175.75416|    26748.42842|
|    81885.92718|    40149.96575|
|    80527.47208|    47224.35984|
|    86294.99909|    47560.77534|
|    82173.62608|    38853.91807|
+---------------+---------------+
only showing top 5 rows



## Groupby operations

In [58]:
## read again the data
df = spark.read.format("csv").load("data/housing.csv", header=True, sep=',',inferSchema=True, multiLine=True)
df = df = df.withColumnRenamed("Avg. Area Income", "Avg_Area_Income") \
       .withColumnRenamed("Avg. Area House Age", "Avg_Area_House_Age") \
       .withColumnRenamed("Avg. Area Number of Rooms", "Avg_Area_Number_of_Rooms") \
       .withColumnRenamed("Avg. Area Number of Bedrooms", "Avg_Area_Number_of_Bedrooms") \
       .withColumnRenamed("Area Population", "Area_Population")
       
## pyspark seed
random.seed(42)

## show the first 5 rows
df.show(5)

+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|Avg_Area_Income|Avg_Area_House_Age|Avg_Area_Number_of_Rooms|Avg_Area_Number_of_Bedrooms|Area_Population|      Price|             Address|
+---------------+------------------+------------------------+---------------------------+---------------+-----------+--------------------+
|    79545.45857|       5.682861322|             7.009188143|                       4.09|     23086.8005|1059033.558|208 Michael Ferry...|
|    79248.64245|       6.002899808|             6.730821019|                       3.09|    40173.07217|1505890.915|188 Johnson Views...|
|    61287.06718|        5.86588984|              8.51272743|                       5.13|     36882.1594|1058987.988|9127 Elizabeth St...|
|    63345.24005|       7.188236095|             5.586728665|                       3.26|    34310.24283|1260616.807|USS Barnett\nFPO ...|
|    59982.19723|       5.0

In [59]:
## group by avg area number of bedrooms
df.groupBy("Avg_Area_Number_of_Bedrooms").count().show()

+---------------------------+-----+
|Avg_Area_Number_of_Bedrooms|count|
+---------------------------+-----+
|                       3.26|   25|
|                       4.19|   33|
|                        2.4|   19|
|                       5.05|   13|
|                       5.13|   13|
|                       5.48|   10|
|                       2.41|    9|
|                       6.44|    7|
|                       6.17|    7|
|                       3.02|   34|
|                       4.23|   33|
|                        5.4|   13|
|                       6.27|   11|
|                       4.02|   33|
|                       6.43|   18|
|                       3.08|   26|
|                       4.36|   29|
|                        3.5|   32|
|                        6.1|   19|
|                       2.28|   12|
+---------------------------+-----+
only showing top 20 rows



In [60]:
## group by avg area number of bedrooms and sum avg area income
df.groupBy("Avg_Area_Number_of_Bedrooms").sum('Avg_Area_Income').show()

+---------------------------+--------------------+
|Avg_Area_Number_of_Bedrooms|sum(Avg_Area_Income)|
+---------------------------+--------------------+
|                       3.26|  1678189.9352099996|
|                       4.19|  2197466.3429199997|
|                        2.4|  1288603.9395100002|
|                       5.05|        891658.42447|
|                       5.13|   820151.4083499999|
|                       5.48|        705207.80489|
|                       2.41|        634125.52629|
|                       6.44|        466709.38056|
|                       6.17|        521614.37352|
|                       3.02|  2315050.3622700004|
|                       4.23|  2249686.5653600004|
|                        5.4|   859719.3765799999|
|                       6.27|        725897.71919|
|                       4.02|  2206801.2514700005|
|                       6.43|  1219523.4166499998|
|                       3.08|  1776050.5639700003|
|                       4.36|  