In [2]:
import pyspark
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [3]:
#Let us create our spark session
spark = (
    SparkSession.builder
        .appName("Stack Overflow Data Wrangling")
        .config("spark.jars", "../jars/postgresql-42.2.8.jar")
        .getOrCreate()
)

### Step1: Data Extraction 

In [4]:
#Now let's load all the datasets we'll be using.
answers = spark.read.csv("stackoverflow/answers.csv", header=True, inferSchema=True, multiLine=True)
questions = spark.read.csv("stackoverflow/questions.csv",header=True, inferSchema=True, multiLine=True)
users = spark.read.csv("stackoverflow/users.csv",header=True, inferSchema=True, multiLine=True)
questiontags = spark.read.csv("stackoverflow/question_tags.csv", header=True, inferSchema=True, multiLine=True)

In [4]:
answers.dtypes

[('id', 'string'),
 ('user_id', 'string'),
 ('question_id', 'string'),
 ('body', 'string'),
 ('score', 'string'),
 ('comment_count', 'string'),
 ('created_at', 'string')]

In [115]:
#Function to know the shape of our dataframes
def spark_shape(self):
    return(self.count(), len(self.columns))
pyspark.sql.dataframe.DataFrame.shape = spark_shape

In [116]:
#Let's see the total number of rows and columns
answers.shape()

(9367215, 7)

In [117]:
#Let's see the shape of questions dataframe
questions.shape()

(6773193, 9)

In [118]:
#Let's see shape of users
users.shape()

(273489, 12)

In [119]:
#Let's also see the shape of question_tags
questiontags.shape()

(633700, 2)

In [120]:
#Overview of the columns in users dataframe
users.columns

['id',
 'display_name',
 'reputation',
 'website_url',
 'location',
 'about_me',
 'views',
 'up_votes',
 'down_votes',
 'image_url',
 'created_at',
 'updated_at']

In [121]:
print('Total Records of Users = {}'.format(users.count()))
users.show(2)

Total Records of Users = 273489
+-------+------------+----------+--------------------+--------------------+--------+-----+--------+----------+--------------------+-------------------+-------------------+
|     id|display_name|reputation|         website_url|            location|about_me|views|up_votes|down_votes|           image_url|         created_at|         updated_at|
+-------+------------+----------+--------------------+--------------------+--------+-----+--------+----------+--------------------+-------------------+-------------------+
|8357266|      suryan|         7|https://twitter.c...|Bangalore, Karnat...|    null|    8|       0|         0|https://www.grava...|2017-07-24 10:55:23|2019-06-19 05:00:16|
|2602456|         Avi|         1|https://avtechtoo...|              Canada|    null|    0|       0|         0|                null|2013-07-20 15:10:25|2019-07-08 20:43:40|
+-------+------------+----------+--------------------+--------------------+--------+-----+--------+---------

In [122]:
#Let's see the distinct countries we have.
countries = users.groupBy('location').count()
print(countries.show())

+--------------------+-----+
|            location|count|
+--------------------+-----+
|  Nowshera, Pakistan|    1|
|           Bangalore|  165|
|San Francisco Bay...|   18|
|Eden Prairie, MN,...|    4|
|     Beograd, Serbia|    4|
|Cluj-Napoca, Cluj...|   33|
|Montreal, Quebec,...|    2|
|                Utah|   46|
| Aalsmeer, Nederland|    1|
|    Tlemcen, Algérie|    2|
|Tirupur, Tamil Na...|    4|
|São Gonçalo, RJ, ...|    1|
|       Suzhou, China|    3|
|Izmir, İzmir, Turkey|   11|
| Bayern, Deutschland|   16|
|       Toruń, Polska|    4|
|Newtown, Kolkata,...|    1|
|  Verona, VR, Italia|   19|
|Santa Marta, Magd...|    1|
|           kathmandu|    5|
+--------------------+-----+
only showing top 20 rows

None


In [123]:
users.select('display_name', 'location').show(10)

+------------------+--------------------+
|      display_name|            location|
+------------------+--------------------+
|            suryan|Bangalore, Karnat...|
|               Avi|              Canada|
|              Matt|Pennsylvania, Uni...|
|          Wing Fan|                null|
|             A.Raw|New Delhi, Delhi,...|
|           Ringo64|                null|
|Hirotaka Nishimiya|          日本 Tōkyō|
|           Anuroop|                null|
|      Franco Buhay|                null|
|     Kartik Juneja|Gharaunda, Haryan...|
+------------------+--------------------+
only showing top 10 rows



### Step2: Data Transformation

In [124]:
#Let's store users coming from Canada in a new dataframe called country
country = users.where(users.location.contains('Canada'))

In [125]:
#Let us see the first 5 of our new dataframe
country.select('display_name','location').show(100)

+--------------------+--------------------+
|        display_name|            location|
+--------------------+--------------------+
|                 Avi|              Canada|
|               0-DAY|              Canada|
|        Jeremy Banks|              Canada|
|            siyi wei| Toronto, ON, Canada|
|    Michael Sheinman| Grimsby, ON, Canada|
|               James|British Columbia,...|
|             Mohamed| Toronto, ON, Canada|
|             PhillyJ|Newmarket, ON, Ca...|
|               Simon|              Canada|
|         Judd Foster|British Columbia,...|
|                 MPG|Mississauga, ON, ...|
|          Tejas Alva| Toronto, ON, Canada|
|         e.b_al-issa|              Canada|
|               Basil|Etobicoke, Toront...|
|     MirageCommander|Montreal, QC, Canada|
|       Alex O'Malley|              Canada|
|           max pinch|Quebec City, QC, ...|
|                Sare|              Canada|
|                Doum|Québec City, QC, ...|
|        Alex Manuele| Halifax, 

In [126]:
country.shape()

(3329, 12)

In [127]:
#Extracting city and contry into new columns
city_coun = F.split(country['location'], ',')
country = country.withColumn('city', city_coun.getItem(0))
country = country.withColumn('country', city_coun.getItem(2))

In [128]:
#Showing results after spliting city and country. 
#NB:We'll need to refine the code so as to make location with just country appear on country column.
country.select('display_name','city','country').show(5)

+----------------+-------+-------+
|    display_name|   city|country|
+----------------+-------+-------+
|             Avi| Canada|   null|
|           0-DAY| Canada|   null|
|    Jeremy Banks| Canada|   null|
|        siyi wei|Toronto| Canada|
|Michael Sheinman|Grimsby| Canada|
+----------------+-------+-------+
only showing top 5 rows



In [143]:
#Renaming our columns which has same names as other columns from our datasets
answers = answers.withColumnRenamed('id', 'answer_id')
answers = answers.withColumnRenamed('answer_user_id', 'user_id')
answers = answers.withColumnRenamed('score', 'answer_score')
answers = answers.withColumnRenamed('body', 'answer_body')
answers = answers.withColumnRenamed('created_at', 'answer_created_at')
answers = answers.withColumnRenamed('comment_count', 'answer_comment_count')
answers = answers.withColumnRenamed('question_id', 'ans_question_id')

In [144]:
answers.columns

['answer_id',
 'user_id',
 'ans_question_id',
 'answer_body',
 'answer_score',
 'answer_comment_count',
 'answer_created_at']

In [132]:
#Renaming our column from id to user_id
country = country.withColumnRenamed('id', 'user_id')

In [133]:
#Let's see our columns in our country dataframe after splitting location into city and country
country.columns

['user_id',
 'display_name',
 'reputation',
 'website_url',
 'location',
 'about_me',
 'views',
 'up_votes',
 'down_votes',
 'image_url',
 'created_at',
 'updated_at',
 'city',
 'country']

In [134]:
#Let's see the columns we have in our questions dataset
questions.columns

['id',
 'user_id',
 'title',
 'body',
 'accepted_answer_id',
 'score',
 'view_count',
 'comment_count',
 'created_at']

In [135]:
#Noticing that some columns have names similar to the ones in users dataframe, we have to rename them.
questions = questions.withColumnRenamed('id','question_id')
questions = questions.withColumnRenamed('created_at', 'question_created_at')
questions = questions.withColumnRenamed('body', 'question_body')
questions = questions.withColumnRenamed('score', 'question_score')
questions = questions.withColumnRenamed('comment_count', 'question_comment_count')

In [136]:
#Let's confirm if our rename was successful
questions.columns

['question_id',
 'user_id',
 'title',
 'question_body',
 'accepted_answer_id',
 'question_score',
 'view_count',
 'question_comment_count',
 'question_created_at']

In [145]:
#Let's pick question with 20 view counts before we join.
v_counts = questions.filter(questions['view_count'] >= 20)

In [155]:
#Joining our new dataframe with contains view_counts >=20 to our users from a particular country dataframe
v_counts = v_counts.join(country, on='user_id', how='left')

In [156]:
#Let's show the first 2 rows from our joined dataframe
v_counts.show(2)

+-------+-----------+--------------------+--------------------+------------------+--------------+----------+----------------------+-------------------+---------------+----------+-----------------+-------------------+--------------------+-----+--------+----------+--------------------+-------------------+-------------------+-------+-------+---------------+----------+-----------------+-------------------+--------------------+-----+--------+----------+--------------------+-------------------+-------------------+-------+-------+
|user_id|question_id|               title|       question_body|accepted_answer_id|question_score|view_count|question_comment_count|question_created_at|   display_name|reputation|      website_url|           location|            about_me|views|up_votes|down_votes|           image_url|         created_at|         updated_at|   city|country|   display_name|reputation|      website_url|           location|            about_me|views|up_votes|down_votes|           image_u

In [157]:
v_counts.shape()

(18, 35)

In [159]:
#For our final task, let's join answers dataframe to our v_counts dataframe which we just joined from users from a particular country and questions
users_ques_ans = v_counts.join(answers, on='user_id', how='left')

In [160]:
#Now, lets see how our new dataframe looks like. 
#Its supposed to have all users from Canada, questions and answers
users_ques_ans.shape()

(18, 41)

### Step3: Data Loading

In [168]:
#After creating our schema and table in sql. we now write our data from users_ques_ans dataframe into the table.
#Let's use spark to write the results into this table
users_ques_ans.write.format('jdbc').options(
    url='jdbc:postgresql://localhost/postgres',
    driver ='org.postgresql.Driver',
    user = 'postgres',
    password = 'postgres1234',
    dbtable = 'stackoverflow_filtered.results'
).save(mode='append')

Py4JJavaError: An error occurred while calling o1089.save.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 22 in stage 154.0 failed 1 times, most recent failure: Lost task 22.0 in stage 154.0 (TID 1073, localhost, executor driver): java.sql.BatchUpdateException: Batch entry 0 INSERT INTO stackoverflow_filtered.results ("user_id","question_id","title","question_body","accepted_answer_id","question_score","view_count","question_comment_count","question_created_at","display_name","reputation","website_url","location","about_me","views","up_votes","down_votes","image_url","created_at","updated_at","city","country","display_name","reputation","website_url","location","about_me","views","up_votes","down_votes","image_url","created_at","updated_at","city","country","answer_id","ans_question_id","answer_body","answer_score","answer_comment_count","answer_created_at") VALUES ('501102','56537914','Stripe - Monthly Subscription with First Period Length','<p>We offer a <strong>Pro Monthly Subscription</strong> where the monthly rate is $50 but we require the first 2-months to be pre-paid upfront. </p>

<p>Or you could rephrase another way; the first month of a Subscription is paid and the second month is pre-paid.</p>

<p>With another payment processor (FastSpring) we could configure a <strong>First Period Length</strong> on Subscriptions. For example, a <strong>Pro Monthly Subscription</strong> is purchased but the <strong>First Period Length</strong> is set to <code>2-months</code>.</p>

<p>Here''s a scenario:</p>

<ol>
<li>Subscription is purchased on July 1st</li>
<li>First 2-months pre-paid on July 1st = $100</li>
<li>Next charge date is September 1st = $50</li>
<li>Charges ($50) continue monthly until Subscription is cancelled</li>
</ol>

<p>We haven''t found a clean way to accomplish the same <strong>First Period Length</strong> with Stripe.</p>

<p>Any idea how something similar could be configured with Stripe?</p>
','56567424','0','30','0','2019-06-11 06:40:58','geoffrey.mcgill','2310','http://object.net','Canada','"<p><a href=""http://object.net/"" rel=""nofollow"">Object.NET</a> - Frameworks and Tools for .NET Developers</p>',NULL,NULL,NULL,NULL,NULL,NULL,'Canada',NULL,'geoffrey.mcgill','2310','http://object.net','Canada','"<p><a href=""http://object.net/"" rel=""nofollow"">Object.NET</a> - Frameworks and Tools for .NET Developers</p>',NULL,NULL,NULL,NULL,NULL,NULL,'Canada',NULL,NULL,NULL,NULL,NULL,NULL,NULL) was aborted: ERROR: column "display_name" specified more than once
  Position: 342  Call getNextException to see other errors in the batch.
	at org.postgresql.jdbc.BatchResultHandler.handleError(BatchResultHandler.java:148)
	at org.postgresql.core.ResultHandlerDelegate.handleError(ResultHandlerDelegate.java:50)
	at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2234)
	at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:510)
	at org.postgresql.jdbc.PgStatement.executeBatch(PgStatement.java:853)
	at org.postgresql.jdbc.PgPreparedStatement.executeBatch(PgPreparedStatement.java:1546)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:672)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:834)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:834)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:935)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:935)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.postgresql.util.PSQLException: ERROR: column "display_name" specified more than once
  Position: 342
	at org.postgresql.core.v3.QueryExecutorImpl.receiveErrorResponse(QueryExecutorImpl.java:2497)
	at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2233)
	... 18 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:935)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:933)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:933)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.saveTable(JdbcUtils.scala:834)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:68)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:285)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.sql.BatchUpdateException: Batch entry 0 INSERT INTO stackoverflow_filtered.results ("user_id","question_id","title","question_body","accepted_answer_id","question_score","view_count","question_comment_count","question_created_at","display_name","reputation","website_url","location","about_me","views","up_votes","down_votes","image_url","created_at","updated_at","city","country","display_name","reputation","website_url","location","about_me","views","up_votes","down_votes","image_url","created_at","updated_at","city","country","answer_id","ans_question_id","answer_body","answer_score","answer_comment_count","answer_created_at") VALUES ('501102','56537914','Stripe - Monthly Subscription with First Period Length','<p>We offer a <strong>Pro Monthly Subscription</strong> where the monthly rate is $50 but we require the first 2-months to be pre-paid upfront. </p>

<p>Or you could rephrase another way; the first month of a Subscription is paid and the second month is pre-paid.</p>

<p>With another payment processor (FastSpring) we could configure a <strong>First Period Length</strong> on Subscriptions. For example, a <strong>Pro Monthly Subscription</strong> is purchased but the <strong>First Period Length</strong> is set to <code>2-months</code>.</p>

<p>Here''s a scenario:</p>

<ol>
<li>Subscription is purchased on July 1st</li>
<li>First 2-months pre-paid on July 1st = $100</li>
<li>Next charge date is September 1st = $50</li>
<li>Charges ($50) continue monthly until Subscription is cancelled</li>
</ol>

<p>We haven''t found a clean way to accomplish the same <strong>First Period Length</strong> with Stripe.</p>

<p>Any idea how something similar could be configured with Stripe?</p>
','56567424','0','30','0','2019-06-11 06:40:58','geoffrey.mcgill','2310','http://object.net','Canada','"<p><a href=""http://object.net/"" rel=""nofollow"">Object.NET</a> - Frameworks and Tools for .NET Developers</p>',NULL,NULL,NULL,NULL,NULL,NULL,'Canada',NULL,'geoffrey.mcgill','2310','http://object.net','Canada','"<p><a href=""http://object.net/"" rel=""nofollow"">Object.NET</a> - Frameworks and Tools for .NET Developers</p>',NULL,NULL,NULL,NULL,NULL,NULL,'Canada',NULL,NULL,NULL,NULL,NULL,NULL,NULL) was aborted: ERROR: column "display_name" specified more than once
  Position: 342  Call getNextException to see other errors in the batch.
	at org.postgresql.jdbc.BatchResultHandler.handleError(BatchResultHandler.java:148)
	at org.postgresql.core.ResultHandlerDelegate.handleError(ResultHandlerDelegate.java:50)
	at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2234)
	at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:510)
	at org.postgresql.jdbc.PgStatement.executeBatch(PgStatement.java:853)
	at org.postgresql.jdbc.PgPreparedStatement.executeBatch(PgPreparedStatement.java:1546)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:672)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:834)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:834)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:935)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:935)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: org.postgresql.util.PSQLException: ERROR: column "display_name" specified more than once
  Position: 342
	at org.postgresql.core.v3.QueryExecutorImpl.receiveErrorResponse(QueryExecutorImpl.java:2497)
	at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2233)
	... 18 more
