In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Obtain (or reuse) the Spark session for this notebook; the application
# name is what shows up in the Spark UI.
spark = (
    SparkSession.builder
    .appName("df-merge")
    .getOrCreate()
)

In [2]:
# Load the file_id -> s3_location lookup table. The first row is treated as
# the header and column types are inferred from the data.
mp_files = spark.read.csv("mp_files.csv", header=True, inferSchema=True)

In [3]:
# Preview the first five rows without truncating the long S3 paths.
mp_files.show(n=5, truncate=False)

+-------+-----------------------------------------------------------+
|file_id|s3_location                                                |
+-------+-----------------------------------------------------------+
|0      |s3://bucket/prefix/367a3ab9-2b5a-4997-804b-8cea2d32b2e5.mp3|
|1      |s3://bucket/prefix/1bb77dfa-8576-4f0c-a96a-027ac646b631.mp3|
|2      |s3://bucket/prefix/edf7ea24-0287-4836-b286-fa769734d0a7.mp3|
|3      |s3://bucket/prefix/8e7ae1ff-5b45-438e-ac09-23f30f3f63e2.mp3|
|4      |s3://bucket/prefix/f80cf03c-4deb-4e65-9604-6e9ded2d1b68.mp3|
+-------+-----------------------------------------------------------+
only showing top 5 rows



In [4]:
# Load the per-file metadata table. The first row is treated as the header
# and column types are inferred from the data.
mp_meta = spark.read.csv("mp_meta.csv", header=True, inferSchema=True)

In [5]:
# Preview the first five metadata rows with full-width column values.
mp_meta.show(n=5, truncate=False)

+-------+----+----------+----------+--------------------------+----+
|file_id|lob |region    |cust_id   |created                   |uid |
+-------+----+----------+----------+--------------------------+----+
|0      |lob1|us-west   |8555632908|2020-07-23 21:55:36.587912|obhd|
|1      |lob2|us-west   |7221348949|2020-07-23 21:55:36.587958|x62d|
|2      |lob1|us-central|4218789251|2020-07-23 21:55:36.588151|oxjd|
|3      |lob2|us-central|2473658569|2020-07-23 21:55:36.588178|33nd|
|4      |lob1|us-central|3667278745|2020-07-23 21:55:36.588200|a1td|
+-------+----+----------+----------+--------------------------+----+
only showing top 5 rows



In [6]:
# Column names of both inputs side by side; note that `file_id` appears in
# each frame, so a naive join would carry it twice.
[*mp_meta.columns, *mp_files.columns]

['file_id',
 'lob',
 'region',
 'cust_id',
 'created',
 'uid',
 'file_id',
 's3_location']

In [7]:
# Join metadata ('m') to file locations ('f') on file_id, then project every
# metadata column plus the S3 path. Selecting through the aliases keeps a
# single, unambiguous file_id column in the result.
mp = (
    mp_meta.alias('m')
    .join(mp_files.alias('f'), F.col('m.file_id') == F.col('f.file_id'))
    .select(
        *[F.col('m.' + name) for name in mp_meta.columns],
        F.col('f.s3_location'),
    )
)

In [8]:
# Sanity-check the merged frame: five rows, nothing truncated.
mp.show(n=5, truncate=False)

+-------+----+----------+----------+--------------------------+----+-----------------------------------------------------------+
|file_id|lob |region    |cust_id   |created                   |uid |s3_location                                                |
+-------+----+----------+----------+--------------------------+----+-----------------------------------------------------------+
|0      |lob1|us-west   |8555632908|2020-07-23 21:55:36.587912|obhd|s3://bucket/prefix/367a3ab9-2b5a-4997-804b-8cea2d32b2e5.mp3|
|1      |lob2|us-west   |7221348949|2020-07-23 21:55:36.587958|x62d|s3://bucket/prefix/1bb77dfa-8576-4f0c-a96a-027ac646b631.mp3|
|2      |lob1|us-central|4218789251|2020-07-23 21:55:36.588151|oxjd|s3://bucket/prefix/edf7ea24-0287-4836-b286-fa769734d0a7.mp3|
|3      |lob2|us-central|2473658569|2020-07-23 21:55:36.588178|33nd|s3://bucket/prefix/8e7ae1ff-5b45-438e-ac09-23f30f3f63e2.mp3|
|4      |lob1|us-central|3667278745|2020-07-23 21:55:36.588200|a1td|s3://bucket/prefix/f80cf03c-4

In [10]:
# Persist the merged frame as CSV with a header row, replacing any output
# already present at this path.
mp.write.mode("overwrite").option("header", True).csv('mp_merge.csv')

In [11]:
# List everything matching *.csv in the working directory. `mp_merge.csv` is
# a directory written by Spark, so `ls` prints its part file(s) and the
# _SUCCESS marker rather than a single file.
!ls *.csv

mp_files.csv  mp_meta.csv

mp_merge.csv:
part-00000-5ac967f6-a463-47a9-8d9f-aa1b219a76d5-c000.csv  _SUCCESS


In [14]:
# Also persist the merged frame as Parquet, replacing any existing output
# at this path.
mp.write.parquet('mp_merge.parquet', mode="overwrite")