# WakeTeam Stream Processing

## Prerequisites

In [1]:
pip install tweepy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import sys
import os
import requests
import tweepy
import json
import pandas as pd
import numpy as np
from pyspark import sql
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

## Create tables with city hobby scores and city image URLs

In [3]:
# Cities known to the recommender; this list is also the row index of the table.
city_list = ['Barcelona', 'Bilbao', 'Ibiza', 'Madrid', 'Oviedo', 'Sevilla', 'Valencia']

# One score per city for each hobby, given in city_list order.
beach = [8, 2, 9, 0, 1, 0, 8]
city = [9, 5, 3, 9, 2, 7, 7]
nature = [4, 7, 9, 3, 8, 3, 6]
party = [8, 4, 9, 9, 3, 6, 7]

# Column order is Beach, City, Nature, Party; process_row builds its hobbies
# list in that same order before handing it to best_city.
d = dict(zip(['Beach', 'City', 'Nature', 'Party'], [beach, city, nature, party]))
cities = pd.DataFrame(d, index=city_list)

cities

Unnamed: 0,Beach,City,Nature,Party
Barcelona,8,9,4,8
Bilbao,2,5,7,4
Ibiza,9,3,9,9
Madrid,0,9,3,9
Oviedo,1,2,8,3
Sevilla,0,7,3,6
Valencia,8,7,6,7


In [4]:
# Picture attached to the reply tweet for each city, given in city_list order.
urls = [
    "https://cdn-image.departures.com/sites/default/files/1559596629/sagrada-familia-barcelona-BARCELONA0619.jpg",
    "https://www.telegraph.co.uk/content/dam/Travel/Destinations/Europe/Spain/Bilbao/art-scene-bilbao-attractions-xlarge.jpg",
    "https://www.athenaadvisers.com/wp-content/uploads/2018/07/Ibiza-3-787x564.jpeg",
    "https://www.fodors.com/wp-content/uploads/2018/12/UltimateMadrid__HERO_shutterstock_624145955.jpg",
    "https://static3.elcomercio.es/www/multimedia/202007/17/media/cortadas/pisos19072020-ktDD-U110825374864LV-624x385@RC.jpg",
    "https://aws.traveler.es/prod/designs/v1/assets/940x633/109723.jpg",
    "https://image.jimcdn.com/app/cms/image/transf/none/path/s528009aeedef3395/image/if4bf85251cca6c2d/version/1541518713/image.jpg",
]
data = dict(image_url=urls)

# Single-column table indexed by city name; reply_tweet_image looks URLs up here.
cities_images = pd.DataFrame(data, index=city_list)

cities_images

Unnamed: 0,image_url
Barcelona,https://cdn-image.departures.com/sites/default...
Bilbao,https://www.telegraph.co.uk/content/dam/Travel...
Ibiza,https://www.athenaadvisers.com/wp-content/uplo...
Madrid,https://www.fodors.com/wp-content/uploads/2018...
Oviedo,https://static3.elcomercio.es/www/multimedia/2...
Sevilla,https://aws.traveler.es/prod/designs/v1/assets...
Valencia,https://image.jimcdn.com/app/cms/image/transf/...


## Create streaming dataframe for Flats

In [5]:
# Obtain the Spark session (reusing an existing one if present) that the
# streaming readers and SQL queries below are built on.
spark = (
    SparkSession.builder
    .appName("QualityLife")
    .getOrCreate()
)

In [6]:
# Subscribe to the "flats" Kafka topic as a streaming DataFrame.
flats_df_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "broker:29092")
    .option("subscribe", "flats")
    .load()
)

# Debug aid: flats_df_stream.printSchema()

In [7]:
# Fields read from the JSON payload of each "flats" message.
# Every field is declared as a nullable string.
schema_flats = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ('house_city', 'house_rooms', 'house_code', 'house_rent')
])

# Decode key and value to strings, parse the JSON value with the schema above,
# then flatten the parsed struct so each field becomes its own column.
flats_df = (
    flats_df_stream
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)", "timestamp")
    .withColumn("value", from_json("value", schema_flats))
    .select(col('key'), col("timestamp"), col('value.*'))
)

# Debug aid: flats_df.printSchema()

In [8]:
# Append every parsed flat to the in-memory table "flats_all" so that it can
# be queried with spark.sql (flats_city relies on this table).
(
    flats_df.writeStream
    .outputMode("append")
    .format("memory")
    .option("truncate", "false")
    .queryName("flats_all")
    .start()
)

<pyspark.sql.streaming.StreamingQuery at 0x7f00db26cad0>

In [9]:
# Show the flats received so far, newest first, without truncating values.
flats_newest_first = spark.sql("select * from flats_all order by timestamp desc")
flats_newest_first.show(truncate=False)

+---+---------+----------+-----------+----------+----------+
|key|timestamp|house_city|house_rooms|house_code|house_rent|
+---+---------+----------+-----------+----------+----------+
+---+---------+----------+-----------+----------+----------+



## Create streaming dataframe for Families

In [10]:
# Subscribe to the "families" Kafka topic as a streaming DataFrame.
families_df_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "broker:29092")
    .option("subscribe", "families")
    .load()
)

# Debug aid: families_df_stream.printSchema()

In [11]:
# Fields read from the JSON payload of each "families" message.
# Every field is declared as a nullable string; process_row converts the
# numeric ones with int().
schema_families = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        'people_city',
        'people_members',
        'people_party',
        'people_beach',
        'tweet_id',
        'people_salary',
        'people_age',
        'people_name',
        'people_nature',
    )
])

# Decode key and value to strings, parse the JSON value with the schema above,
# then flatten the parsed struct so each field becomes its own column.
families_df = (
    families_df_stream
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)", "timestamp")
    .withColumn("value", from_json("value", schema_families))
    .select(col('key'), col("timestamp"), col('value.*'))
)

# Debug aid: families_df.printSchema()

In [12]:
# Append every parsed family to the in-memory table "families_all" so that it
# can be inspected with spark.sql.
(
    families_df.writeStream
    .outputMode("append")
    .format("memory")
    .option("truncate", "false")
    .queryName("families_all")
    .start()
)

<pyspark.sql.streaming.StreamingQuery at 0x7f00fc24d250>

In [13]:
# Show the families received so far, newest first, without truncating values.
families_newest_first = spark.sql("select * from families_all order by timestamp desc")
families_newest_first.show(truncate=False)

+---+---------+-----------+--------------+------------+------------+--------+-------------+----------+-----------+-------------+
|key|timestamp|people_city|people_members|people_party|people_beach|tweet_id|people_salary|people_age|people_name|people_nature|
+---+---------+-----------+--------------+------------+------------+--------+-------------+----------+-----------+-------------+
+---+---------+-----------+--------------+------------+------------+--------+-------------+----------+-----------+-------------+



In [14]:
# TWITTER AUTHENTICATION
# Credentials come from environment variables so that secrets never have to be
# typed into (and saved or committed with) the notebook. Each one falls back to
# an empty string when its variable is not set.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "")
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "")

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Let the client wait (and print a notice) when Twitter's rate limit is hit,
# instead of failing the call.
twitter_api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [15]:
# DEFINE FUNCTIONS FOR DATA STREAM

# best_city
def best_city(df, hobbies):
    """Return the index label of the row of ``df`` that is closest to ``hobbies``.

    Closeness is the sum of absolute differences between a row and
    ``hobbies``; ``hobbies`` is matched to the columns of ``df`` by position.
    The row with the smallest sum wins. ``df`` itself is left untouched.

    Args:
        df: DataFrame of numeric scores, one row per candidate.
        hobbies: list-like with one value per column of ``df``.

    Returns:
        The index label of the closest row.
    """
    distance = df.sub(hobbies, axis=1).abs().sum(axis=1)
    ranked = distance.sort_values()
    return ranked.index.values[0]
 

# flats_city
def flats_city(city_name, members, max_rent):
    """Return the code of the most expensive flat that fits a family in a city.

    Queries the in-memory ``flats_all`` table for flats located in
    ``city_name`` with at least ``members`` rooms and a rent of at most
    ``max_rent``, and keeps the one with the highest rent.

    Args:
        city_name: value to match against ``house_city``.
        members: minimum number of rooms.
        max_rent: maximum acceptable rent.

    Returns:
        int: the ``house_code`` of the selected flat, or 0 when none matches.
    """
    # house_rent is declared as a string in schema_flats, so the ORDER BY must
    # compare numeric values; a plain string sort would rank "900" above "1000".
    query = (
        "SELECT * FROM flats_all "
        "WHERE house_city='{}' AND house_rooms>={} AND house_rent<={} "
        "ORDER BY CAST(house_rent AS DOUBLE) DESC LIMIT 1".format(city_name, members, max_rent)
    )
    city_flats = spark.sql(query).toPandas()
    if city_flats.empty:
        return 0
    # LIMIT 1 leaves a single row; read its value as a scalar rather than
    # calling int() on the whole Series.
    return int(city_flats['house_code'].iloc[0])


# reply_tweet_image
def reply_tweet_image(tweet_id, reply_text, city_name, timeout=10):
    """Reply to a tweet, attaching the picture of ``city_name`` when possible.

    The city's picture is downloaded from the URL stored in ``cities_images``.
    If the download does not succeed (non-200 status or a network error) the
    reply is posted as text only.

    Args:
        tweet_id: id of the tweet being replied to.
        reply_text: text of the reply.
        city_name: index label in ``cities_images`` whose picture is attached.
        timeout: seconds to wait for the image server before giving up.

    Raises:
        KeyError: if ``city_name`` is not in ``cities_images``.
    """
    import tempfile  # local import: only this helper needs it

    image_url = cities_images.loc[city_name, 'image_url']

    filename = None
    try:
        # The context manager releases the connection once the body is read.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            if response.status_code == 200:
                # A unique temp file keeps concurrent replies from overwriting
                # each other; the .jpg suffix is kept so the upload can infer
                # the image type from the file name.
                with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as image:
                    filename = image.name
                    for chunk in response.iter_content(chunk_size=8192):
                        image.write(chunk)
    except requests.RequestException:
        # Download failed part-way or never started: drop any partial file and
        # fall back to a text-only reply.
        if filename is not None:
            os.remove(filename)
            filename = None

    if filename is None:
        twitter_api.update_status(status=reply_text, in_reply_to_status_id=tweet_id)
        return

    try:
        twitter_api.update_with_media(filename=filename, status=reply_text, in_reply_to_status_id=tweet_id)
    finally:
        # Remove the temp file even when the upload raises.
        os.remove(filename)
     

In [16]:
# Define process_row function for each stream row
# process_row
def process_row(row):
    """Answer one family's tweet with a recommended city and, if found, a flat.

    Picks the city whose scores are closest to the family's hobby scores,
    looks for a flat there within the family's rent budget, and replies to the
    originating tweet with the result and the city's picture.

    Args:
        row: record of the families stream, indexable by field name. The
            hobby, members, salary and tweet id fields are converted with
            ``int()``.
    """
    # Hobby scores in the column order of the ``cities`` table.
    hobbies = [
        int(row[field])
        for field in ('people_beach', 'people_city', 'people_nature', 'people_party')
    ]
    members = int(row['people_members'])
    salary = int(row['people_salary'])
    name = row['people_name']
    tweet_id = int(row['tweet_id'])

    city_name = best_city(cities, hobbies)

    # Rent ceiling: 30% of the salary spread over 12
    # (presumably an annual salary turned into a monthly budget - TODO confirm).
    max_rent = salary * 0.3 / 12
    flat_code = flats_city(city_name, members, max_rent)

    # flats_city returns 0 when no suitable flat exists.
    if flat_code != 0:
        reply_text = (
            "@dlpexercisepro1 Hi {}, your future house is waiting for you in {} with code {}. "
            "Please, contact #QualityLife for further details! "
            "#WakeTeam #mdaedem"
        ).format(name, city_name, flat_code)
    else:
        reply_text = (
            "@dlpexercisepro1 Hi {}, your future house is waiting for you in {}. "
            "Please, contact #QualityLife and we'll gladly help you find it! "
            "#WakeTeam #mdaedem"
        ).format(name, city_name)

    reply_tweet_image(tweet_id, reply_text, city_name)


In [17]:
# Start stream
def process_batch(batch_df, batch_id):
    """Run process_row on every family of one micro-batch.

    Args:
        batch_df: DataFrame holding the rows of the micro-batch.
        batch_id: identifier of the micro-batch (unused).
    """
    for row in batch_df.collect():
        process_row(row)


# process_row reaches the ``spark`` session through flats_city, and a Spark
# session can only be used on the driver. foreach() would ship process_row to
# the executors, so the rows are handled with foreachBatch(), whose function
# runs on the driver.
families_df_query = (
    families_df.writeStream
    .foreachBatch(process_batch)
    .start()
)