In [1]:
# Run the Spark setup script before anything else.
# NOTE(review): presumably this defines `sc` and `SQLContext` used below — confirm.
%run -i /opt/setup_spark.py

In [2]:
# SQL entry point built on the existing SparkContext; used below to turn RDDs
# into DataFrames via createDataFrame.
sqlContext = SQLContext(sc)

## Load house prices

In [3]:
# Raw text of houses.txt; each line is parsed into a (neighborhood, price) pair below.
text_RDD = sc.textFile("houses.txt")

In [17]:
def mapper_parse_lines(line):
    """Turn one whitespace-separated text line into a (neighborhood, price) tuple.

    The neighborhood is taken from the second field and the price from the
    third; the first field is ignored. The price is converted to float.
    """
    fields = line.split()
    neighborhood, price = fields[1], fields[2]
    return (neighborhood, float(price))

In [18]:
# Parse every raw line into a (neighborhood, price) pair.
house_prices_RDD = text_RDD.map(mapper_parse_lines)

In [19]:
# Inspect the parsed (neighborhood, price) pairs.
house_prices_RDD.collect()

[(u'Downtown', 400000.0), (u'Downtown', 240000.0), (u'Hilltop', 650000.0)]

In [20]:
# Name the two tuple fields so the pair RDD becomes a two-column DataFrame.
house_prices_df = sqlContext.createDataFrame(
    house_prices_RDD,
    ["neighborhood", "price"]
)

In [21]:
# Display the house prices as a table.
house_prices_df.show()

+------------+--------+
|neighborhood|   price|
+------------+--------+
|    Downtown|400000.0|
|    Downtown|240000.0|
|     Hilltop|650000.0|
+------------+--------+



In [22]:
# Check the column types that were inferred from the Python tuples.
house_prices_df.printSchema()

root
 |-- neighborhood: string (nullable = true)
 |-- price: double (nullable = true)



## Load inflation

In [23]:
# Raw text of inflation.txt; each line is parsed into a (neighborhood, inflation) pair below.
inflation_text_RDD = sc.textFile("inflation.txt")

In [24]:
def mapper_parse__inflation_lines(line):
    """Turn one whitespace-separated text line into a (neighborhood, inflation) tuple.

    The neighborhood is the first field and the inflation value the second,
    converted to float.
    """
    fields = line.split()
    return fields[0], float(fields[1])

In [25]:
# Parse every raw line into a (neighborhood, inflation) pair.
inflation_RDD = inflation_text_RDD.map(mapper_parse__inflation_lines)

In [26]:
# Inspect the parsed (neighborhood, inflation) pairs.
inflation_RDD.collect()

[(u'Downtown', 2.1), (u'Hilltop', 4.5)]

In [27]:
# Name the two tuple fields so the pair RDD becomes a two-column DataFrame.
inflation_df = sqlContext.createDataFrame(
    inflation_RDD,
    ["neighborhood", "inflation"]
)

In [28]:
# Check the column types that were inferred from the Python tuples.
inflation_df.printSchema()

root
 |-- neighborhood: string (nullable = true)
 |-- inflation: double (nullable = true)



## Join house prices with inflation

In [30]:
# Pitfall: no join key is given here, so every house row is paired with every
# inflation row (3 x 2 = 6 rows in the output) and "neighborhood" appears twice
# in each Row. The keyed join further down avoids this.
house_prices_df.join(inflation_df).collect()

[Row(neighborhood=u'Downtown', price=400000.0, neighborhood=u'Downtown', inflation=2.1),
 Row(neighborhood=u'Downtown', price=400000.0, neighborhood=u'Hilltop', inflation=4.5),
 Row(neighborhood=u'Downtown', price=240000.0, neighborhood=u'Downtown', inflation=2.1),
 Row(neighborhood=u'Downtown', price=240000.0, neighborhood=u'Hilltop', inflation=4.5),
 Row(neighborhood=u'Hilltop', price=650000.0, neighborhood=u'Downtown', inflation=2.1),
 Row(neighborhood=u'Hilltop', price=650000.0, neighborhood=u'Hilltop', inflation=4.5)]

In [31]:
def mapper_multiply_price_inflation(row):
    """Project a house's price one year ahead using its neighborhood's inflation.

    Args:
        row: a record exposing "neighborhood", "price" and "inflation" by key.
            Inflation is expressed as a percentage (e.g. 2.1 means 2.1%).

    Returns:
        A (neighborhood, next_year_price) tuple, where
        next_year_price = price * (1 + inflation / 100).
    """
    # Inflation is a percentage, so turn it into a growth factor first;
    # multiplying the price by the raw percentage would scale it several-fold.
    growth_factor = 1 + row["inflation"] / 100.
    return (row["neighborhood"], row["price"] * growth_factor)

In [38]:
# Join on the shared "neighborhood" column so each house is matched only with
# the inflation of its own neighborhood.
house_prices_nextyear_join = house_prices_df.join(inflation_df, "neighborhood")

In [59]:
# Next year's price: inflation is a percentage, so scale each price by
# (1 + inflation / 100) and keep the result under the "price" column name.
house_prices_nextyear_df = house_prices_nextyear_join.select(
    "neighborhood",
    (
        house_prices_nextyear_join["price"]
        * (1 + house_prices_nextyear_join["inflation"] / 100.)
    ).alias("price")
)

In [60]:
# Display the projected prices; the long decimals are floating-point rounding
# in the multiplication.
house_prices_nextyear_df.show()

+------------+------------------+
|neighborhood|             price|
+------------+------------------+
|    Downtown|408399.99999999994|
|    Downtown|245039.99999999997|
|     Hilltop|          679250.0|
+------------+------------------+



## Reduce: total next-year price per neighborhood

In [61]:
# Total projected price per neighborhood.
house_prices_nextyear_df.groupBy("neighborhood").sum("price").show()

+------------+-----------------+
|neighborhood|       sum(price)|
+------------+-----------------+
|    Downtown|653439.9999999999|
|     Hilltop|         679250.0|
+------------+-----------------+

