In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

def IntegerSafe(value):
    """Convert ``value`` to ``int``, or return ``None`` if it cannot be converted.

    Used when parsing CSV fields that are expected to hold integers but may
    contain non-integer text.

    Args:
        value: Any object accepted by ``int()`` (e.g. a numeric string).

    Returns:
        The integer value, or ``None`` when ``int(value)`` fails with a
        conversion error.
    """
    try:
        return int(value)
    # Catch only conversion failures; a bare ``except`` would also swallow
    # KeyboardInterrupt / SystemExit and hide unrelated errors.
    except (TypeError, ValueError, OverflowError):
        return None

# Reuse the active SparkSession if one exists, otherwise start a new one.
ss = (
    SparkSession
    .builder
    .getOrCreate()
)
# The SparkContext is needed for the RDD-based text file loading.
sc = ss.sparkContext

# Business records: split each line on commas, convert the first field to an
# integer with IntegerSafe (None if it does not parse) and keep the next four
# fields as strings.
# NOTE: a plain split(',') does not honor quoted fields, and a line with fewer
# than five fields raises IndexError when the RDD is evaluated.
business = (
    sc.textFile("../Data/SF_business/filtered_registered_business_sf.csv")
    .map(lambda line: line.split(','))
    .map(lambda cols: (IntegerSafe(cols[0]), cols[1], cols[2], cols[3], cols[4]))
)

# Supervisor records: split each line on commas and convert the first two
# fields to integers with IntegerSafe (None if a field does not parse).
supervisor = (
    sc.textFile("../Data/SF_business/supervisor_sf.csv")
    .map(lambda line: line.split(","))
    .map(lambda cols: (IntegerSafe(cols[0]), IntegerSafe(cols[1])))
)
              

# Schema for the business rows: (zip, name, street, city, state).
# Arguments to add() are (column name, data type, nullable).
business_schema = (
    StructType()
    .add("zip", IntegerType(), True)
    .add("name", StringType(), False)
    .add("street", StringType(), True)
    .add("city", StringType(), True)
    .add("state", StringType(), True)
)

# Schema for the supervisor rows: (zip, id).
# Both values are produced by IntegerSafe, which returns None for text that is
# not an integer. createDataFrame verifies rows against the schema and rejects
# None in a non-nullable field, so both columns are declared nullable to keep a
# single malformed (or header) line from failing the whole load.
supervisor_schema = StructType([ StructField("zip", IntegerType(), True),
                    StructField("id", IntegerType(), True)
                    ])

# Turn the parsed RDDs into DataFrames using the explicit schemas above.
business_df = ss.createDataFrame(data=business, schema=business_schema)
supervisor_df = ss.createDataFrame(data=supervisor, schema=supervisor_schema)

22/02/22 19:39:31 WARN Utils: Your hostname, Fans-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.167.87 instead (on interface en0)
22/02/22 19:39:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/22 19:39:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## inner joins

In [4]:
# Preview the first row of each DataFrame before joining them.
business_df.show(n=1)
supervisor_df.show(n=1)

+-----+-----------------+-----------------+-------------+-----+
|  zip|             name|           street|         city|state|
+-----+-----------------+-----------------+-------------+-----+
|94123|Tournahu George L|3301 Broderick St|San Francisco|   CA|
+-----+-----------------+-----------------+-------------+-----+
only showing top 1 row

+-----+---+
|  zip| id|
+-----+---+
|94102|  8|
+-----+---+
only showing top 1 row



In [6]:
# Inner join on the shared 'zip' column ('inner' is the default join type).
business_df.join(supervisor_df, on='zip', how='inner').show(n=1)

[Stage 6:>                                                          (0 + 2) / 2]

+-----+--------------------+--------------------+-------------+-----+---+
|  zip|                name|              street|         city|state| id|
+-----+--------------------+--------------------+-------------+-----+---+
|94109|Stephens Institut...|1835-49 Van Ness Ave|San Francisco|   CA|  2|
+-----+--------------------+--------------------+-------------+-----+---+
only showing top 1 row



                                                                                

## outer joins

In [9]:
# Full outer join on 'zip': rows without a match on the other side are kept,
# with nulls in the other side's columns.
business_df.join(supervisor_df, on='zip', how='outer').show(n=5)

[Stage 21:>                                                         (0 + 2) / 2]

+----+--------------------+--------------------+-----------+-----+----+
| zip|                name|              street|       city|state|  id|
+----+--------------------+--------------------+-----------+-----+----+
|1752|Communication Tec...|400 Donald Lynch ...|Marlborough|   MA|null|
|1752|Hologic Limited P...|       250 Campus Dr|Marlborough|   MA|null|
|1752|Navilyst Medical Inc|        26 Forest St|Marlborough|   MA|null|
|1752|Sunovion Pharmace...|     84 Waterford Dr|Marlborough|   MA|null|
|1752| Rewalk Robotics Inc|200 Donald Lynch ...|Marlborough|   MA|null|
+----+--------------------+--------------------+-----------+-----+----+
only showing top 5 rows



                                                                                

## leftsemi join

In [10]:
# Left semi join on 'zip': keeps only business rows whose zip appears in
# supervisor_df, and returns only the business columns.
business_df.join(supervisor_df, on='zip', how='leftsemi').show(n=5)

+-----+--------------------+--------------------+-------------+-----+
|  zip|                name|              street|         city|state|
+-----+--------------------+--------------------+-------------+-----+
|94109|Stephens Institut...|1835-49 Van Ness Ave|San Francisco|   CA|
|94109|Stephens Institut...|        1055 Pine St|San Francisco|   CA|
|94109|     Alioto F Co Inc|    440 Jefferson St|San Francisco|   CA|
|94109|     Haines Robert D|   786-792 Sutter St|San Francisco|   CA|
|94109|Avis Rent A Car S...|         675 Post St|San Francisco|   CA|
+-----+--------------------+--------------------+-------------+-----+
only showing top 5 rows



In [11]:
# Shut down the SparkContext behind the session once the notebook is done.
ss.sparkContext.stop()