In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Create (or reuse) the Spark session for this notebook, running in local
# mode with the "local[*]" master setting.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("CompanyBroadcast")
    .getOrCreate()
)

In [3]:
# Load the companies table; the first CSV line supplies the column names.
# No schema is given, so every column is read as a string.
company_df = spark.read.csv("data/company_data/companies.csv", header=True)

In [4]:
# Preview the companies table (first 20 rows, long cell values truncated).
company_df.show(n=20, truncate=True)

+---+-------------+----------+
| id|      company|country_id|
+---+-------------+----------+
|  1|       Mybuzz|        11|
|  2|Chatterbridge|         3|
|  3|       Skyble|         7|
|  4|   Brainverse|         4|
|  5|   Jabbertype|         7|
|  6|     Zoombeat|        12|
|  7|     Tanoodle|         8|
|  8|      Feedmix|        13|
|  9|      Meembee|        20|
| 10|     Riffpath|         7|
| 11|      Dynabox|        19|
| 12|   Browsetype|         3|
| 13|      Dynazzy|        20|
| 14|       Demizz|        19|
| 15|    Riffpedia|        18|
| 16|         Zava|        13|
| 17|      Pixonyx|        20|
| 18|       Yambee|        15|
| 19|        Yombu|         7|
| 20|        Voomm|        14|
+---+-------------+----------+
only showing top 20 rows



In [5]:
# Load the country lookup table; the first CSV line supplies the column names.
# No schema is given, so every column is read as a string.
country_df = spark.read.csv("data/company_data/countries.csv", header=True)

In [6]:
# Preview the country lookup table (first 20 rows, long cell values truncated).
country_df.show(n=20, truncate=True)

+---+-------------+
| id|      country|
+---+-------------+
|  1|       Russia|
|  2|        Yemen|
|  3|       Sweden|
|  4|  Philippines|
|  5|     Malaysia|
|  6|       France|
|  7|       Greece|
|  8|    Argentina|
|  9|      Ecuador|
| 10|         Peru|
| 11|        China|
| 12|United States|
| 13|        Malta|
| 14|      Somalia|
| 15|      Nigeria|
| 16|        Italy|
| 17|        Spain|
| 18|        Niger|
| 19|   Bangladesh|
| 20|      Ukraine|
+---+-------------+



In [7]:
from pyspark.sql.functions import broadcast

# Inner-join every company to its country row. broadcast() hints that the
# country lookup table is small enough to be used in a broadcast join.
# NOTE: both inputs have an `id` column, so the result carries two columns
# named `id`; disambiguate via company_df["id"] / country_df["id"].
join_df = company_df.join(
    broadcast(country_df),
    on=company_df["country_id"] == country_df["id"],
    how="inner",
)

In [8]:
# Preview the joined company/country rows (first 20 rows, long values truncated).
join_df.show(n=20, truncate=True)

+---+-------------+----------+---+-------------+
| id|      company|country_id| id|      country|
+---+-------------+----------+---+-------------+
|  1|       Mybuzz|        11| 11|        China|
|  2|Chatterbridge|         3|  3|       Sweden|
|  3|       Skyble|         7|  7|       Greece|
|  4|   Brainverse|         4|  4|  Philippines|
|  5|   Jabbertype|         7|  7|       Greece|
|  6|     Zoombeat|        12| 12|United States|
|  7|     Tanoodle|         8|  8|    Argentina|
|  8|      Feedmix|        13| 13|        Malta|
|  9|      Meembee|        20| 20|      Ukraine|
| 10|     Riffpath|         7|  7|       Greece|
| 11|      Dynabox|        19| 19|   Bangladesh|
| 12|   Browsetype|         3|  3|       Sweden|
| 13|      Dynazzy|        20| 20|      Ukraine|
| 14|       Demizz|        19| 19|   Bangladesh|
| 15|    Riffpedia|        18| 18|        Niger|
| 16|         Zava|        13| 13|        Malta|
| 17|      Pixonyx|        20| 20|      Ukraine|
| 18|       Yambee| 

In [9]:
# Print the physical plan only, to check which join strategy Spark selected.
join_df.explain(extended=False)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- BroadcastHashJoin [country_id#19], [id#57], Inner, BuildRight, false
   :- Filter isnotnull(country_id#19)
   :  +- FileScan csv [id#17,company#18,country_id#19] Batched: false, DataFilters: [isnotnull(country_id#19)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/d:/BigData/Spark/data/company_data/companies.csv], PartitionFilters: [], PushedFilters: [IsNotNull(country_id)], ReadSchema: struct<id:string,company:string,country_id:string>
   +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, false]),false), [plan_id=134]
      +- Filter isnotnull(id#57)
         +- FileScan csv [id#57,country#58] Batched: false, DataFilters: [isnotnull(id#57)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/d:/BigData/Spark/data/company_data/countries.csv], PartitionFilters: [], PushedFilters: [IsNotNull(id)], ReadSchema: struct<id:string,country:string>




In [10]:
# List the companies whose joined country is "United States".
(
    join_df
    .select("company", "country")
    .where(join_df["country"] == "United States")
    .show()
)

+----------+-------------+
|   company|      country|
+----------+-------------+
|  Zoombeat|United States|
|   Skilith|United States|
|    Meedoo|United States|
|     Twimm|United States|
|Bubbletube|United States|
|   Innojam|United States|
+----------+-------------+

