# Generic Joins Syntax

* `df1.join(df2, df1.df1Key == df2.df2Key)`
* `df1.join(df2).where(df1.df1Key == df2.df2Key)`
* `df1.join(df2).filter(df1.df1Key == df2.df2Key)`
* `df1.join(df2, df1.df1Key == df2.df2Key, "inner")`
* `df1.join(df2, ['msid'], 'left')`

# Join Types
Source: https://jaceklaskowski.gitbooks.io/mastering-spark-sql/spark-sql-joins.html


<table id="join-types" class="tableblock frame-all grid-all spread">
<caption class="title">Table 2. Join Types</caption>
<colgroup>
<col style="width: 33.3333%;">
<col style="width: 33.3333%;">
<col style="width: 33.3334%;">
</colgroup>
<thead>
<tr>
<th class="tableblock halign-left valign-top">SQL</th>
<th class="tableblock halign-left valign-top">Name (joinType)</th>
<th class="tableblock halign-left valign-top">JoinType</th>
</tr>
</thead>
<tbody>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a id="CROSS"></a> <code>CROSS</code></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a id="cross"></a> <code>cross</code></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a id="Cross"></a> <code>Cross</code></p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a id="INNER"></a> <code>INNER</code></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a id="inner"></a> <code>inner</code></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a id="Inner"></a> <code>Inner</code></p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a id="FullOuter"></a><a id="FULL_OUTER"></a> <code>FULL OUTER</code></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><code>outer</code>, <code>full</code>, <code>fullouter</code></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><code>FullOuter</code></p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a id="LEFT_ANTI"></a> <code>LEFT ANTI</code></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><code>leftanti</code></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a id="LeftAnti"></a> <code>LeftAnti</code></p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a id="LEFT_OUTER"></a> <code>LEFT OUTER</code></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><code>leftouter</code>, <code>left</code></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a id="LeftOuter"></a> <code>LeftOuter</code></p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a id="LEFT_SEMI"></a> <code>LEFT SEMI</code></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><code>leftsemi</code></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a id="LeftSemi"></a> <code>LeftSemi</code></p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a id="RIGHT_OUTER"></a> <code>RIGHT OUTER</code></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><code>rightouter</code>, <code>right</code></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a id="RightOuter"></a> <code>RightOuter</code></p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a id="NATURAL"></a> <code>NATURAL</code></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">Special case for <code>Inner</code>, <code>LeftOuter</code>, <code>RightOuter</code>, <code>FullOuter</code></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><code>NaturalJoin</code></p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a id="using"></a><a id="USING"></a> <code>USING</code></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">Special case for <code>Inner</code>, <code>LeftOuter</code>, <code>LeftSemi</code>, <code>RightOuter</code>, <code>FullOuter</code>, <code>LeftAnti</code></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a id="UsingJoin"></a> <code>UsingJoin</code></p></td>
</tr>
</tbody>
</table>
<div id="ExistenceJoin" class="paragraph">
<p><code>ExistenceJoin</code> is an artificial join type used to express an existential sub-query, which is often referred to as an <strong>existential join</strong>.</p>
</div>
<div class="admonitionblock note">
<table>
<tr>
<td class="icon">
<div class="title">Note</div>
</td>
<td class="content">
<a href="#LeftAnti">LeftAnti</a> and <a href="#ExistenceJoin">ExistenceJoin</a> are special cases of <a href="#LeftOuter">LeftOuter</a>.
</td>
</tr>
</table>
</div>
<div class="paragraph">
<p>You can also find that Spark SQL uses the following two families of joins:</p>
</div>
<div class="ulist">
<ul>
<li>
<p><a id="InnerLike"></a> <code>InnerLike</code> with <a href="#Inner">Inner</a> and <a href="#Cross">Cross</a></p>
</li>
<li>
<p><a id="LeftExistence"></a> <code>LeftExistence</code> with <a href="#LeftSemi">LeftSemi</a>, <a href="#LeftAnti">LeftAnti</a> and <a href="#ExistenceJoin">ExistenceJoin</a></p>
</li>
</ul>
</div>
<div class="admonitionblock tip">
<table>
<tr>
<td class="icon">
<div class="title">Tip</div>
</td>
<td class="content">
Names are case-insensitive and can use the underscore (<code>_</code>) at any position, i.e. <code>left_anti</code> and <code>LEFT_ANTI</code> are equivalent.
</td>
</tr>
</table>
</div>
<div class="admonitionblock note">
<table>
<tr>
<td class="icon">
<div class="title">Note</div>
</td>
<td class="content">
Spark SQL offers different <a href="spark-sql-SparkStrategy-JoinSelection.html#join-selection-requirements">join strategies</a> with <a href="spark-sql-joins-broadcast.html">Broadcast Joins (aka Map-Side Joins)</a> among them that are supposed to optimize your join queries over large distributed datasets.
</td>
</tr>
</table>
</div>

In [2]:
import sys

# Directory added to the module search path so the `helpers` import below
# resolves (presumably the workshop checkout root -- confirm for your setup).
WORKSHOP_ROOT = '/home/iceberg/notebooks/PyCon_LT_Workshop'
# Guard the append: re-running this cell must not pile up duplicate entries
# in sys.path.
if WORKSHOP_ROOT not in sys.path:
    sys.path.append(WORKSHOP_ROOT)

from helpers.utils import get_spark_session, get_yellow_taxi_data, get_dim_data
from pyspark.sql import functions as f

# Session obtained from the workshop helper; "joins" is presumably the
# application name -- see helpers.utils to confirm.
spark = get_spark_session("joins")

# Trip-level data; used as the left-hand side of the join examples below.
yellow_taxi_data = get_yellow_taxi_data(spark=spark)

# Lookup tables, unpacked in the order the helper returns them.
dim_taxi_zones, dim_rates, dim_payments, dim_vendor = get_dim_data(spark)



In [3]:
# Column names of the trip data and of the vendor table, displayed as a pair
# so the join keys (VendorID on one side, vendor_id on the other) can be
# compared side by side.
(yellow_taxi_data.columns, dim_vendor.columns)

(['VendorID',
  'tpep_pickup_datetime',
  'tpep_dropoff_datetime',
  'passenger_count',
  'trip_distance',
  'RatecodeID',
  'store_and_fwd_flag',
  'PULocationID',
  'DOLocationID',
  'payment_type',
  'fare_amount',
  'extra',
  'mta_tax',
  'tip_amount',
  'tolls_amount',
  'improvement_surcharge',
  'total_amount',
  'congestion_surcharge',
  'airport_fee'],
 ['vendor_id', 'vendor_name'])

In [4]:
# Keep only trips with fare_amount > 100, then full-outer-join them to the
# vendor table on VendorID == vendor_id ("full" is the FULL OUTER join type).
joined = (
    yellow_taxi_data
    .filter("fare_amount>100")
    .join(
        dim_vendor,
        on=yellow_taxi_data.VendorID == dim_vendor.vendor_id,
        how="full",
    )
)

In [5]:
# Up to five distinct (VendorID, vendor_name) pairs from the joined result.
# A vendor_name of None means that VendorID found no match in dim_vendor.
(
    joined
    .select("VendorID", "vendor_name")
    .distinct()
    .head(5)
)

[Row(VendorID=2, vendor_name='VeriFone Inc.'),
 Row(VendorID=1, vendor_name='Creative Mobile Technologies'),
 Row(VendorID=6, vendor_name=None)]