In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Obtain the Spark session for this notebook, registered under the
# application name "df-merge".
spark = (
    SparkSession.builder
    .appName("df-merge")
    .getOrCreate()
)

In [2]:
# Load the file-location table from CSV, with the header and inferSchema
# reader options both enabled.
vt_files = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("vt_files.csv")
)

In [3]:
# Preview the first 5 rows without truncating long cell values.
vt_files.show(n=5, truncate=False)

+-------+-----------------------------------------------------------+
|file_id|s3_location                                                |
+-------+-----------------------------------------------------------+
|0      |s3://bucket/prefix/367a3ab9-2b5a-4997-804b-8cea2d32b2e5.mp3|
|1      |s3://bucket/prefix/1bb77dfa-8576-4f0c-a96a-027ac646b631.mp3|
|2      |s3://bucket/prefix/edf7ea24-0287-4836-b286-fa769734d0a7.mp3|
|3      |s3://bucket/prefix/8e7ae1ff-5b45-438e-ac09-23f30f3f63e2.mp3|
|4      |s3://bucket/prefix/f80cf03c-4deb-4e65-9604-6e9ded2d1b68.mp3|
+-------+-----------------------------------------------------------+
only showing top 5 rows



In [4]:
# Load the per-file metadata table from CSV, with the header and inferSchema
# reader options both enabled.
vt_meta = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("vt_meta.csv")
)

In [8]:
# Preview the first 5 rows without truncating long cell values.
vt_meta.show(n=5, truncate=False)

+-------+------+----------+-------+----------+--------------------------+
|file_id|lob   |team      |crew_id|client_id |creation_ts               |
+-------+------+----------+-------+----------+--------------------------+
|0      |retail|us-west   |uobh   |8555632908|2020-07-23 21:55:36.587912|
|1      |fas   |us-west   |ux62   |7221348949|2020-07-23 21:55:36.587958|
|2      |retail|us-central|uoxj   |4218789251|2020-07-23 21:55:36.588151|
|3      |fas   |us-central|u33n   |2473658569|2020-07-23 21:55:36.588178|
|4      |retail|us-central|ua1t   |3667278745|2020-07-23 21:55:36.588200|
+-------+------+----------+-------+----------+--------------------------+
only showing top 5 rows



In [9]:
# Column names of both tables, metadata first; file_id is listed twice because
# it is present in each table.
[*vt_meta.columns, *vt_files.columns]

['file_id',
 'lob',
 'team',
 'crew_id',
 'client_id',
 'creation_ts',
 'file_id',
 's3_location']

In [10]:
# Join metadata ('m') to file locations ('f') where file_id matches, then keep
# every metadata column plus s3_location so file_id is selected only once.
vt = (
    vt_meta.alias('m')
    .join(vt_files.alias('f'), F.col('m.file_id') == F.col('f.file_id'))
    .select(
        *[F.col('m.' + meta_col) for meta_col in vt_meta.columns],
        F.col('f.s3_location'),
    )
)

In [11]:
# Preview the first 5 merged rows without truncating long cell values.
vt.show(n=5, truncate=False)

+-------+------+----------+-------+----------+--------------------------+-----------------------------------------------------------+
|file_id|lob   |team      |crew_id|client_id |creation_ts               |s3_location                                                |
+-------+------+----------+-------+----------+--------------------------+-----------------------------------------------------------+
|0      |retail|us-west   |uobh   |8555632908|2020-07-23 21:55:36.587912|s3://bucket/prefix/367a3ab9-2b5a-4997-804b-8cea2d32b2e5.mp3|
|1      |fas   |us-west   |ux62   |7221348949|2020-07-23 21:55:36.587958|s3://bucket/prefix/1bb77dfa-8576-4f0c-a96a-027ac646b631.mp3|
|2      |retail|us-central|uoxj   |4218789251|2020-07-23 21:55:36.588151|s3://bucket/prefix/edf7ea24-0287-4836-b286-fa769734d0a7.mp3|
|3      |fas   |us-central|u33n   |2473658569|2020-07-23 21:55:36.588178|s3://bucket/prefix/8e7ae1ff-5b45-438e-ac09-23f30f3f63e2.mp3|
|4      |retail|us-central|ua1t   |3667278745|2020-07-23 21:55

In [12]:
# Write the merged table as CSV with a header row. Spark creates a directory
# named 'vt_merge.csv' that holds the part file(s) and a _SUCCESS marker.
# mode='overwrite' replaces output from an earlier run; the writer's default
# mode raises an error when the path already exists, which would break
# re-running this notebook from a fresh kernel.
vt.write.csv('vt_merge.csv', mode='overwrite', header=True)

In [13]:
# Shell escape: list every path in the working directory matching *.csv.
# A matching directory (such as Spark's CSV output) has its contents listed.
!ls *.csv

vt_files.csv  vt_meta.csv

vt_merge.csv:
part-00000-ae6affa4-6224-4e57-bf60-2fe695a03218-c000.csv  _SUCCESS


In [14]:
# Write the merged table in Parquet format to the path 'vt_merge.parquet'.
# mode='overwrite' replaces output from an earlier run; the writer's default
# mode raises an error when the path already exists, which would break
# re-running this notebook from a fresh kernel.
vt.write.parquet('vt_merge.parquet', mode='overwrite')