In [1]:
import pyspark
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [2]:
#Let us create our spark session
spark = (
    SparkSession.builder
        .appName("Stack Overflow Data Wrangling")
        .config("spark.jars", "../jars/postgresql-42.2.8.jar")
        .getOrCreate()
)

### Step1: Data Extraction 

In [3]:
#Now let's load all the datasets we'll be using.
answers = spark.read.csv("stackoverflow/answers.csv", header=True, inferSchema=True, multiLine=True)
questions = spark.read.csv("stackoverflow/questions.csv",header=True, inferSchema=True, multiLine=True)
users = spark.read.csv("stackoverflow/users.csv",header=True, inferSchema=True, multiLine=True)
questiontags = spark.read.csv("stackoverflow/question_tags.csv", header=True, inferSchema=True, multiLine=True)

In [4]:
answers.dtypes

[('id', 'string'),
 ('user_id', 'string'),
 ('question_id', 'string'),
 ('body', 'string'),
 ('score', 'string'),
 ('comment_count', 'string'),
 ('created_at', 'string')]

In [5]:
#Function to know the shape of our dataframes
def spark_shape(self):
    return(self.count(), len(self.columns))
pyspark.sql.dataframe.DataFrame.shape = spark_shape

In [6]:
#Let's see the total number of rows and columns
answers.shape()

(9367215, 7)

In [7]:
#Let's see the shape of questions dataframe
questions.shape()

(6773193, 9)

In [8]:
#Let's see shape of users
users.shape()

(273489, 12)

In [9]:
#Let's also see the shape of question_tags
questiontags.shape()

(633700, 2)

In [10]:
#Overview of the columns in users dataframe
users.columns

['id',
 'display_name',
 'reputation',
 'website_url',
 'location',
 'about_me',
 'views',
 'up_votes',
 'down_votes',
 'image_url',
 'created_at',
 'updated_at']

In [11]:
print('Total Records of Users = {}'.format(users.count()))
users.show(2)

Total Records of Users = 273489
+-------+------------+----------+--------------------+--------------------+--------+-----+--------+----------+--------------------+-------------------+-------------------+
|     id|display_name|reputation|         website_url|            location|about_me|views|up_votes|down_votes|           image_url|         created_at|         updated_at|
+-------+------------+----------+--------------------+--------------------+--------+-----+--------+----------+--------------------+-------------------+-------------------+
|8357266|      suryan|         7|https://twitter.c...|Bangalore, Karnat...|    null|    8|       0|         0|https://www.grava...|2017-07-24 10:55:23|2019-06-19 05:00:16|
|2602456|         Avi|         1|https://avtechtoo...|              Canada|    null|    0|       0|         0|                null|2013-07-20 15:10:25|2019-07-08 20:43:40|
+-------+------------+----------+--------------------+--------------------+--------+-----+--------+---------

In [13]:
#Let's see the distinct countries we have.
countries = users.groupBy('location').count()
print(countries.show())

+--------------------+-----+
|            location|count|
+--------------------+-----+
|  Nowshera, Pakistan|    1|
|           Bangalore|  165|
|San Francisco Bay...|   18|
|Eden Prairie, MN,...|    4|
|     Beograd, Serbia|    4|
|Cluj-Napoca, Cluj...|   33|
|Montreal, Quebec,...|    2|
|                Utah|   46|
| Aalsmeer, Nederland|    1|
|    Tlemcen, Algérie|    2|
|Tirupur, Tamil Na...|    4|
|São Gonçalo, RJ, ...|    1|
|       Suzhou, China|    3|
|Izmir, İzmir, Turkey|   11|
| Bayern, Deutschland|   16|
|       Toruń, Polska|    4|
|Newtown, Kolkata,...|    1|
|  Verona, VR, Italia|   19|
|Santa Marta, Magd...|    1|
|           kathmandu|    5|
+--------------------+-----+
only showing top 20 rows

None


In [14]:
users.select('display_name', 'location').show(10)

+------------------+--------------------+
|      display_name|            location|
+------------------+--------------------+
|            suryan|Bangalore, Karnat...|
|               Avi|              Canada|
|              Matt|Pennsylvania, Uni...|
|          Wing Fan|                null|
|             A.Raw|New Delhi, Delhi,...|
|           Ringo64|                null|
|Hirotaka Nishimiya|          日本 Tōkyō|
|           Anuroop|                null|
|      Franco Buhay|                null|
|     Kartik Juneja|Gharaunda, Haryan...|
+------------------+--------------------+
only showing top 10 rows



### Step2: Data Transformation

In [34]:
users = users.withColumnRenamed('id', 'user_id').withColumnRenamed('created_at', 'user_created_at').withColumnRenamed('updated_at','user_updated_at')

In [35]:
#Let's store users coming from Canada in a new dataframe called country
country = users.where(users.location.contains('Canada'))

In [36]:
#Let us see the first 5 of our new dataframe
country.select('display_name','location').show(100)

+--------------------+--------------------+
|        display_name|            location|
+--------------------+--------------------+
|                 Avi|              Canada|
|               0-DAY|              Canada|
|        Jeremy Banks|              Canada|
|            siyi wei| Toronto, ON, Canada|
|    Michael Sheinman| Grimsby, ON, Canada|
|               James|British Columbia,...|
|             Mohamed| Toronto, ON, Canada|
|             PhillyJ|Newmarket, ON, Ca...|
|               Simon|              Canada|
|         Judd Foster|British Columbia,...|
|                 MPG|Mississauga, ON, ...|
|          Tejas Alva| Toronto, ON, Canada|
|         e.b_al-issa|              Canada|
|               Basil|Etobicoke, Toront...|
|     MirageCommander|Montreal, QC, Canada|
|       Alex O'Malley|              Canada|
|           max pinch|Quebec City, QC, ...|
|                Sare|              Canada|
|                Doum|Québec City, QC, ...|
|        Alex Manuele| Halifax, 

In [37]:
country.shape()

(3329, 12)

In [38]:
#Extracting city and contry into new columns
city_coun = F.split(country['location'], ',')
country = country.withColumn('city', city_coun.getItem(0))
country = country.withColumn('country', city_coun.getItem(2))

In [39]:
#Showing results after spliting city and country. 
#NB:We'll need to refine the code so as to make location with just country appear on country column.
country.select('display_name','city','country').show(5)

+----------------+-------+-------+
|    display_name|   city|country|
+----------------+-------+-------+
|             Avi| Canada|   null|
|           0-DAY| Canada|   null|
|    Jeremy Banks| Canada|   null|
|        siyi wei|Toronto| Canada|
|Michael Sheinman|Grimsby| Canada|
+----------------+-------+-------+
only showing top 5 rows



In [79]:
#Renaming our columns which has same names as other columns from our datasets
answers = answers.withColumnRenamed('id', 'answer_id')
answers = answers.withColumnRenamed('created_at', 'answer_created_at')
answers = answers.withColumnRenamed('body', 'answer_body')
answers = answers.withColumnRenamed('score', 'answer_score')
answers = answers.withColumnRenamed('comment_count', 'answer_comment_count')

answers = answers.withColumnRenamed('answer_user_id', 'user_id')
answers = answers.withColumnRenamed('ans_question_id', 'question_id')

In [80]:
answers.columns

['answer_id',
 'user_id',
 'question_id',
 'answer_body',
 'answer_score',
 'answer_comment_count',
 'answer_created_at']

In [81]:
#Renaming our column from id to user_id
country = country.withColumnRenamed('id', 'user_id')

In [82]:
#Let's see our columns in our country dataframe after splitting location into city and country
country.columns

['user_id',
 'display_name',
 'reputation',
 'website_url',
 'location',
 'about_me',
 'views',
 'up_votes',
 'down_votes',
 'image_url',
 'user_created_at',
 'user_updated_at',
 'city',
 'country']

In [83]:
#Let's see the columns we have in our questions dataset
questions.columns

['question_id',
 'user_id',
 'title',
 'question_body',
 'accepted_answer_id',
 'question_score',
 'view_count',
 'comment_count',
 'question_created_at']

In [84]:
#Noticing that some columns have names similar to the ones in users dataframe, we have to rename them.
questions = questions.withColumnRenamed('id','question_id')
questions = questions.withColumnRenamed('created_at', 'question_created_at')
questions = questions.withColumnRenamed('body', 'question_body')
questions = questions.withColumnRenamed('score', 'question_score')
questions = questions.withColumnRenamed('question_comment_count', 'comment_count')

In [85]:
#Let's confirm if our rename was successful
questions.columns

['question_id',
 'user_id',
 'title',
 'question_body',
 'accepted_answer_id',
 'question_score',
 'view_count',
 'comment_count',
 'question_created_at']

In [86]:
#An inner join of users in Canada and questions dataframe.
users_country = country.join(questions, on='user_id', how='left')
users_country.columns

['user_id',
 'display_name',
 'reputation',
 'website_url',
 'location',
 'about_me',
 'views',
 'up_votes',
 'down_votes',
 'image_url',
 'user_created_at',
 'user_updated_at',
 'city',
 'country',
 'question_id',
 'title',
 'question_body',
 'accepted_answer_id',
 'question_score',
 'view_count',
 'comment_count',
 'question_created_at']

In [87]:
#Let's pick questions with at least 20 view counts.
users_country = users_country.filter(users_country['view_count'] >= 20)

In [94]:
#Final task is to join our resultant table to the answers dataframe.
users_country = users_country.join(answers, on=['question_id','user_id'], how='left')
users_country.columns

['question_id',
 'user_id',
 'display_name',
 'reputation',
 'website_url',
 'location',
 'about_me',
 'views',
 'up_votes',
 'down_votes',
 'image_url',
 'user_created_at',
 'user_updated_at',
 'city',
 'country',
 'title',
 'question_body',
 'accepted_answer_id',
 'question_score',
 'view_count',
 'comment_count',
 'question_created_at',
 'answer_id',
 'answer_body',
 'answer_score',
 'answer_comment_count',
 'answer_created_at',
 'answer_id',
 'answer_body',
 'answer_score',
 'answer_comment_count',
 'answer_created_at',
 'answer_id',
 'answer_body',
 'answer_score',
 'answer_comment_count',
 'answer_created_at']

### Step3: Data Loading

In [93]:
#After creating our schema and table in sql. we now write our data from users_ques_ans dataframe into the table.
#Let's use spark to write the results into this table
users_country.write.format('jdbc').options(
    url='jdbc:postgresql://localhost:5432/postgres',
    driver ='org.postgresql.Driver',
    user = 'postgres',
    password = 'postgres1234',
    dbtable = 'stackoverflow_filtered.results'
).save(mode='append')

Py4JJavaError: An error occurred while calling o1185.save.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 52.0 failed 1 times, most recent failure: Lost task 1.0 in stage 52.0 (TID 681, localhost, executor driver): java.sql.BatchUpdateException: Batch entry 0 INSERT INTO stackoverflow_filtered.results ("question_id","user_id","display_name","reputation","website_url","location","about_me","views","up_votes","down_votes","image_url","user_created_at","user_updated_at","city","country","title","question_body","accepted_answer_id","question_score","view_count","comment_count","question_created_at","answer_id","answer_body","answer_score","answer_comment_count","answer_created_at","answer_id","answer_body","answer_score","answer_comment_count","answer_created_at") VALUES ('54927792','10923030','t.lore','44',NULL,'London, ON, Canada',NULL,'17','11','0','https://www.gravatar.com/avatar/69b82512196f0b8773e826f7e8ac64e7?s=128&d=identicon&r=PG&f=1','2019-01-16 14:40:02','2019-05-17 18:31:42','London',' Canada','Error Code: 1054. Unknown column ''sdate'' in ''where clause''','<p>I got this error in my Query, do you have any idea how can I put the <code>sdate</code> in the 2 layer subquery?</p>

<pre><code>select
at.startDate as sdate, at.dau as DAU,
(
select count(distinct d.uid) from
 (select ses.uid from dsession as ses where ses.startDate = sdate group by ses.uid
  union all
  select res.uid from rsession as res where res.startDate = sdate group by res.uid) as te
) as MAU, (SELECT DAU/MAU) as AVG
from
attendance as at 
</code></pre>

<p>it works if I query alone the subquery but when I merge it to the main query, the <code>sdate</code> got unknown. any idea?</p>

<p>I tried to replace <code>sdate</code> on <code>where</code> as <code>at.startDate</code> but still got unknown <code>at.startDate</code> column.</p>
','54928187','0','41','0','2019-02-28 14:20:09',NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL) was aborted: ERROR: column "answer_id" specified more than once
  Position: 427  Call getNextException to see other errors in the batch.
	at org.postgresql.jdbc.BatchResultHandler.handleError(BatchResultHandler.java:148)
	at org.postgresql.core.ResultHandlerDelegate.handleError(ResultHandlerDelegate.java:50)
	at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2234)
	at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:510)
	at org.postgresql.jdbc.PgStatement.executeBatch(PgStatement.java:853)
	at org.postgresql.jdbc.PgPreparedStatement.executeBatch(PgPreparedStatement.java:1546)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:672)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:834)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:834)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:935)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:935)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.postgresql.util.PSQLException: ERROR: column "answer_id" specified more than once
  Position: 427
	at org.postgresql.core.v3.QueryExecutorImpl.receiveErrorResponse(QueryExecutorImpl.java:2497)
	at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2233)
	... 18 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:935)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:933)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:933)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.saveTable(JdbcUtils.scala:834)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:68)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:285)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.sql.BatchUpdateException: Batch entry 0 INSERT INTO stackoverflow_filtered.results ("question_id","user_id","display_name","reputation","website_url","location","about_me","views","up_votes","down_votes","image_url","user_created_at","user_updated_at","city","country","title","question_body","accepted_answer_id","question_score","view_count","comment_count","question_created_at","answer_id","answer_body","answer_score","answer_comment_count","answer_created_at","answer_id","answer_body","answer_score","answer_comment_count","answer_created_at") VALUES ('54927792','10923030','t.lore','44',NULL,'London, ON, Canada',NULL,'17','11','0','https://www.gravatar.com/avatar/69b82512196f0b8773e826f7e8ac64e7?s=128&d=identicon&r=PG&f=1','2019-01-16 14:40:02','2019-05-17 18:31:42','London',' Canada','Error Code: 1054. Unknown column ''sdate'' in ''where clause''','<p>I got this error in my Query, do you have any idea how can I put the <code>sdate</code> in the 2 layer subquery?</p>

<pre><code>select
at.startDate as sdate, at.dau as DAU,
(
select count(distinct d.uid) from
 (select ses.uid from dsession as ses where ses.startDate = sdate group by ses.uid
  union all
  select res.uid from rsession as res where res.startDate = sdate group by res.uid) as te
) as MAU, (SELECT DAU/MAU) as AVG
from
attendance as at 
</code></pre>

<p>it works if I query alone the subquery but when I merge it to the main query, the <code>sdate</code> got unknown. any idea?</p>

<p>I tried to replace <code>sdate</code> on <code>where</code> as <code>at.startDate</code> but still got unknown <code>at.startDate</code> column.</p>
','54928187','0','41','0','2019-02-28 14:20:09',NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL) was aborted: ERROR: column "answer_id" specified more than once
  Position: 427  Call getNextException to see other errors in the batch.
	at org.postgresql.jdbc.BatchResultHandler.handleError(BatchResultHandler.java:148)
	at org.postgresql.core.ResultHandlerDelegate.handleError(ResultHandlerDelegate.java:50)
	at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2234)
	at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:510)
	at org.postgresql.jdbc.PgStatement.executeBatch(PgStatement.java:853)
	at org.postgresql.jdbc.PgPreparedStatement.executeBatch(PgPreparedStatement.java:1546)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:672)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:834)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$saveTable$1.apply(JdbcUtils.scala:834)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:935)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:935)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: org.postgresql.util.PSQLException: ERROR: column "answer_id" specified more than once
  Position: 427
	at org.postgresql.core.v3.QueryExecutorImpl.receiveErrorResponse(QueryExecutorImpl.java:2497)
	at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2233)
	... 18 more
