In [0]:
# Load every gzipped DBLP v13 split matched by the glob below.
# The multiline option lets a single JSON record span several lines.
df = (
    spark.read
    .option("multiline", True)
    .json('dbfs:/user/dblpv13/dblpv13.*.json.gz')
)

In [0]:
# Print the inferred schema tree to check the nested `authors` structure.
df.printSchema()

root
 |-- _id: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- avatar: string (nullable = true)
 |    |    |-- bio: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- gid: string (nullable = true)
 |    |    |-- homepage: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- name_zh: string (nullable = true)
 |    |    |-- oid: string (nullable = true)
 |    |    |-- oid_zh: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |    |    |-- org: string (nullable = true)
 |    |    |-- org_zh: string (nullable = true)
 |    |    |-- orgid: string (nullable = true)
 |    |    |-- orgs: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- orgs_zh: array (nullable = true)
 |    |    |    |-- element:

In [0]:
import pyspark.sql.functions as F
from delta.tables import *

In [0]:
# Keep the publication id, one row per author id, and the author names.
# Only the id array is exploded here, so 'Name' still holds the complete
# list of author names of the publication on every row.
df1 = df.select(
    F.col('_id').alias('Publication ID'),
    F.explode('authors._id').alias('Author ID'),
    F.col('authors.name').alias('Name'),
)
display(df1.limit(10))

Publication ID,Author ID,Name
5c8b2a1c4895d9cbc670bab1,5602b09945cedb3396015cb2,List(John Berry)
5c8b2a1c4895d9cbc670bab2,53f42e0ddabfaee2a1c85188,"List(Matthias Hernandez, Tal Hassner, Jongmoo Choi, Gérard G. Medioni)"
5c8b2a1c4895d9cbc670bab2,53f459e4dabfaeecd69f841e,"List(Matthias Hernandez, Tal Hassner, Jongmoo Choi, Gérard G. Medioni)"
5c8b2a1c4895d9cbc670bab2,5607936245cedb3396b025d9,"List(Matthias Hernandez, Tal Hassner, Jongmoo Choi, Gérard G. Medioni)"
5c8b2a1c4895d9cbc670bab2,53f43a4adabfaefedbaed4ed,"List(Matthias Hernandez, Tal Hassner, Jongmoo Choi, Gérard G. Medioni)"
5c8b2a1c4895d9cbc670baba,,"List(Muhammad Amjad, Muhammad Khalil Afzal, Tariq Umer, Byung-Seo Kim)"
5c8b2a1c4895d9cbc670baba,53f43802dabfaec09f18c0ce,"List(Muhammad Amjad, Muhammad Khalil Afzal, Tariq Umer, Byung-Seo Kim)"
5c8b2a1c4895d9cbc670baba,53f4296ddabfaec09f0e16f1,"List(Muhammad Amjad, Muhammad Khalil Afzal, Tariq Umer, Byung-Seo Kim)"
5c8b2a1c4895d9cbc670baba,53f45adbdabfaedf4361c2d8,"List(Muhammad Amjad, Muhammad Khalil Afzal, Tariq Umer, Byung-Seo Kim)"
5c8b2a1c4895d9cbc670bac0,56cb18c7c35f4f3c656624c2,"List(Long-jun Dong, Weiwei Shu, Xibing Li, Guangjie Han, Wei Zou)"


In [0]:
# Build one row per (publication, author).
# The `authors` struct array is exploded exactly once, straight from `df`, so
# every author id stays paired with that same author's name. Exploding the id
# array and then the name array separately produces the cross product of ids
# and names within a publication, attaching ids to other authors' names.
df2 = (
    df.select(
        F.col('_id').alias('PublicationID'),
        F.explode('authors').alias('author'),
    )
    .select(
        'PublicationID',
        F.col('author._id').alias('AuthorID'),
        F.col('author.name').alias('Names'),
    )
)

In [0]:
# Tokenise the full name on single spaces once and reuse the expression.
# FirstName is the first token and LastName the last one, so a single-token
# name yields the same value in both columns. MiddleName_Arr keeps all tokens.
name_tokens = F.split(F.col('Names'), ' ')
df2 = (
    df2
    .withColumn('FirstName', name_tokens[0])
    .withColumn('MiddleName_Arr', name_tokens)
    .withColumn('LastName', F.reverse(name_tokens).getItem(0))
    .distinct()  # collapse exact duplicate rows
)

In [0]:
# Count the tokens strictly between the first and the last name token.
# Clamp at 0: a single-token name gives size - 2 == -1, and F.slice raises at
# runtime when it is asked for a negative length.
df2 = df2.withColumn(
    'MNArr_Len',
    F.greatest(F.size('MiddleName_Arr') - 2, F.lit(0)),
)

In [0]:
# Middle names are the tokens from position 2 (slice is 1-based), taking
# MNArr_Len of them. Start and length are both passed as Columns.
middle_tokens = F.slice(F.col('MiddleName_Arr'), F.lit(2), F.col('MNArr_Len'))
df3 = df2.withColumn('MiddleName', middle_tokens)

In [0]:
# Preview a few rows to check the first / middle / last name split.
display(df3.limit(5))

PublicationID,AuthorID,Names,FirstName,MiddleName_Arr,LastName,MNArr_Len,MiddleName
5c8b2a1c4895d9cbc670bac0,53f43e68dabfaee4dc7bbb3b,Wei Zou,Wei,"List(Wei, Zou)",Zou,0,List()
5c8b2ae64895d9cbc670fb65,,Alaa Mohasseb,Alaa,"List(Alaa, Mohasseb)",Mohasseb,0,List()
5c8b2bd34895d9cbc6714dd0,53f42c92dabfaec09f108ca3,Jordan DeKraker,Jordan,"List(Jordan, DeKraker)",DeKraker,0,List()
5c8b2d3d4895d9cbc671c24d,562ceb3445cedb3398d00982,Mengxing Huang,Mengxing,"List(Mengxing, Huang)",Huang,0,List()
5c8b2d3d4895d9cbc671c25a,53f42d28dabfaeb1a7b87db1,Mehdi Behzad,Mehdi,"List(Mehdi, Behzad)",Behzad,0,List()


In [0]:
# Turn the MiddleName array into a single comma-separated string.
middle_name_joined = F.concat_ws(",", "MiddleName")
df3 = df3.withColumn("MiddleName", middle_name_joined)

In [0]:
# Remove the helper columns that were only needed to derive the name parts.
helper_columns = ['Names', 'MiddleName_Arr', 'MNArr_Len']
df4 = df3.drop(*helper_columns)

In [0]:
# Attach a generated row identifier column named 'ID'.
df5 = df4.withColumn(colName='ID', col=F.monotonically_increasing_id())

In [0]:
# Copy `_id` into a `PublicationID` column on the raw frame so it can serve
# as the join key.
first_filter = df.withColumn("PublicationID", df["_id"])

In [0]:
# Inner-join the per-author name columns back onto the original publication
# rows, matching on PublicationID.
author_columns = ['ID', 'PublicationID', 'AuthorID', 'FirstName', 'LastName', 'MiddleName']
last_df = first_filter.join(
    df5.select(*author_columns),
    on=['PublicationID'],
    how='inner',
)

In [0]:
# Persist the joined result as the Delta table `authors_table`, replacing any
# existing contents.
(
    last_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("authors_table")
)