In [1]:
# different with the code for cluster
import findspark
findspark.init()
#
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from datetime import datetime, date, timedelta
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.mllib.stat import Statistics
from pyspark.mllib.linalg import Vectors
from pyspark.sql.window import Window
from pyspark.sql import DataFrame
import pyspark.sql.functions as func
import re
import math
import sys
import os
from optparse import OptionParser
import string
from typing import Iterable
from pyspark.sql.functions import udf, col
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover

In [2]:
spark = SparkSession.builder \
    .appName("Spark NLP2")\
    .master("local[*]")\
    .config("spark.driver.memory","4G")\
    .config("spark.driver.maxResultSize", "2G") \
    .config("spark.kryoserializer.buffer.max", "500m")\
    .getOrCreate()



In [3]:
#import sparknlp
#from sparknlp.pretrained import PretrainedPipeline

In [4]:
# Functions
def classify_Food(discription):
    b=any([(x in new_dict['Food']) for x in set(discription)])
    if b:
        return 1
    else:
        return 0
    
classify_Food_udf=udf(classify_Food,IntegerType())

def classify_Event(discription):
    b=any([(x in new_dict['Event']) for x in set(discription)])
    if b:
        return 1
    else:
        return 0
    
classify_Event_udf=udf(classify_Event,IntegerType())

def classify_People(discription):
    b=any([(x in new_dict['People']) for x in set(discription)])
    if b:
        return 1
    else:
        return 0
    
classify_People_udf=udf(classify_People,IntegerType())

def classify_Activity(discription):
    b=any([(x in new_dict['Activity']) for x in set(discription)])
    if b:
        return 1
    else:
        return 0
    
classify_Activity_udf=udf(classify_Activity,IntegerType())

def classify_Travel(discription):
    b=any([(x in new_dict['Travel']) for x in set(discription)])
    if b:
        return 1
    else:
        return 0
    
classify_Travel_udf=udf(classify_Travel,IntegerType())

def classify_Transportation(discription):
    b=any([(x in new_dict['Transportation']) for x in set(discription)])
    if b:
        return 1
    else:
        return 0
    
classify_Transportation_udf=udf(classify_Transportation,IntegerType())

def classify_Utility(discription):
    b=any([(x in new_dict['Utility']) for x in set(discription)])
    if b:
        return 1
    else:
        return 0
    
classify_Utility_udf=udf(classify_Utility,IntegerType())

def classify_Cash(discription):
    b=any([(x in new_dict['Cash']) for x in set(discription)])
    if b:
        return 1
    else:
        return 0
    
classify_Cash_udf=udf(classify_Cash,IntegerType())

def classify_Illegal(discription):
    b=any([(x in new_dict['Illegal/Sarcasm']) for x in set(discription)])
    if b:
        return 1
    else:
        return 0
    
classify_Illegal_udf=udf(classify_Illegal,IntegerType())

# convert to dummy variable
def convert_dummy(x):
    if x>0:
        return 1
    else:
        return 0
convert_dummy_udf=udf(convert_dummy,IntegerType())

countTokens = udf(lambda words: len(words), IntegerType())


In [5]:
inputFile = spark\
    .read\
    .option("inferSchema","true")\
    .option("header", "true")\
    .csv("/Users/yichuan/Desktop/Venmo project/data/venmoSample.csv")

In [6]:
inputFile = inputFile.na.fill('')

In [7]:
my_window = Window.partitionBy('user1')
inputFile = inputFile.withColumn("min_date", min(inputFile['datetime']).over(my_window))
inputFile = inputFile.withColumn("diff_date", datediff('datetime','min_date'))
inputFile = inputFile.withColumn("customer_lifetime", F.when(inputFile['diff_date']==0,0).otherwise(inputFile['diff_date']/30+1).cast(IntegerType()))


In [8]:
#-----2. emoji explore -------------------------------------------------------------------------------
# remove punctuation
# remove punctuation first, so punctuation will not count as emoji
punctuations = '~|`|\!|@|#|$|%|^|&|\*|\(|\)|-|\+|=|_|\{|\}|\[|\]|;|:|\?|\.|,|<|>|/|\'|\"'
# remove punctuation
inputFile=inputFile\
.withColumn('description_rm_pun',regexp_replace(col('description'),punctuations, ' '))
# keep emoji
inputFile=inputFile\
.withColumn('description_emoji',regexp_replace(col('description_rm_pun'),'[\w\s]', ''))
# keep text
inputFile=inputFile\
.withColumn('description_word',regexp_replace(col('description_rm_pun'),'[^\w\s]', ''))

inputFile = inputFile.withColumn('lower_words', lower(col('description_word')))


In [9]:
inputFile = inputFile.withColumn("total_tokens", countTokens(col("description")))
inputFile = inputFile.withColumn("year", year("datetime"))
inputFile = inputFile.withColumn("month", month("datetime"))

In [9]:


## count tokens in each transaction
inputFile = inputFile.withColumn("total_word_tokens", countTokens(col("description_word"))) 
inputFile = inputFile.withColumn("total_emoji_tokens", countTokens(col("description_emoji"))) 
inputFile = inputFile.withColumn("if_emoji_only", inputFile['total_emoji_tokens'] == inputFile['total_tokens'])
inputFile = inputFile.withColumn("is_emoji", inputFile['total_emoji_tokens'] > 0)


In [10]:
## Emoji Analysis
agg_data = inputFile.groupBy("year", "month").agg(
                    count('user1').alias("total_transactions_per_month"),
                    sum('total_tokens').alias("total_tokens_per_month"),
                    sum('total_emoji_tokens').alias("total_emoji_tokens_per_month"),
                    (F.sum(F.col("if_emoji_only").cast("long")).alias("total_emoji_only_per_month"))).orderBy("year", "month")


agg_data = agg_data.withColumn('percent_of_emoji', (F.col("total_emoji_tokens_per_month") / F.col("total_tokens_per_month")))           
agg_data = agg_data.withColumn('percent_of_emoji_only', (F.col("total_emoji_only_per_month") / F.col("total_transactions_per_month")))                     
agg_data = agg_data.sort('year', 'month', ascending=True)

user_averageEmoji = inputFile.groupBy("year", "month", "user1").agg(
                    count('user1').alias("total_transactions_per_month"),
                    (F.sum(F.col("is_emoji").cast("long")).alias("total_emoji_transactions_per_month")))    

user_averageEmoji = user_averageEmoji.withColumn('emoji_avg',user_averageEmoji['total_emoji_transactions_per_month']/user_averageEmoji['total_transactions_per_month'])

### We need this as output ###
plot_user_emoji = user_averageEmoji.groupBy("year", "month").agg(
    avg("emoji_avg").alias("avg_emoji_usage"),
stddev_pop("emoji_avg").alias("sd_emoji_usage"))
plot_user_emoji = plot_user_emoji.sort('year', 'month', ascending=True)


first_emoji_date = inputFile.filter(col("total_emoji_tokens") > 0).groupBy("user1").agg(min('datetime'))
first_emoji_date = first_emoji_date.withColumn("year", year("min(datetime)"))
first_emoji_date = first_emoji_date.withColumn("month", month("min(datetime)"))

plot_first_emoji = first_emoji_date.groupBy("year", "month").count().sort('year', 'month', ascending=True)


In [11]:
inputFile.show()

+-----+-------+----------------+-------------------+--------------------+-----------+--------------------+-------------------+---------+-----------------+--------------------+-----------------+--------------------+------------+----+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------------+-----------------+------------------+-------------+--------+
|user1|  user2|transaction_type|           datetime|         description|is_business|            story_id|           min_date|diff_date|customer_lifetime|  description_rm_pun|description_emoji|    description_word|total_tokens|year|month|                text|            document|            sentence|               token|             checked|               lemma|                stem|                 pos|        lemma_result|tokenized_words_filtered|total_word_tokens|total_emoji_token

-----

In [3]:
emoji_dict = spark\
    .read\
    .option("inferSchema","true")\
    .option("header", "true")\
    .csv("/Users/yichuan/Desktop/Venmo project/data/Venmo_Emoji_Classification_Dictionary.csv")
word_dict = spark\
    .read\
    .option("inferSchema","true")\
    .option("header", "true")\
    .csv("/Users/yichuan/Desktop/Venmo project/data/Venmo_Word_Classification_Dictionary.csv")

In [4]:
new_dict = word_dict.toPandas().to_dict(orient='list')
filtered = {k: [x for x in v if x is not None] for k, v in new_dict.items()}
new_dict.clear()
new_dict.update(filtered)

emoji_dict = emoji_dict.toPandas().to_dict(orient='list')
filtered = {k: [x for x in v if x is not None] for k, v in emoji_dict.items()}
emoji_dict.clear()
emoji_dict.update(filtered)


In [16]:
for col in emoji_dict.keys():
    new_dict[col]=new_dict[col]+ emoji_dict[col]
    
inputFile = inputFile.withColumn("Food_emoji", classify_Food_udf("description_emoji"))
inputFile = inputFile.withColumn("Event_emoji", classify_Event_udf("description_emoji"))
inputFile = inputFile.withColumn("People_emoji", classify_People_udf("description_emoji"))
inputFile = inputFile.withColumn("Activity_emoji", classify_Activity_udf("description_emoji"))
inputFile = inputFile.withColumn("Travel_emoji", classify_Travel_udf("description_emoji"))
inputFile = inputFile.withColumn("Transportation_emoji", classify_Transportation_udf("description_emoji"))
inputFile = inputFile.withColumn("Utility_emoji", classify_Utility_udf("description_emoji"))
inputFile = inputFile.withColumn("Food_word", classify_Food_udf("tokenized_words_filtered"))
inputFile = inputFile.withColumn("Event_word", classify_Event_udf("tokenized_words_filtered"))
inputFile = inputFile.withColumn("People_word", classify_People_udf("tokenized_words_filtered"))
inputFile = inputFile.withColumn("Activity_word", classify_Activity_udf("tokenized_words_filtered"))
inputFile = inputFile.withColumn("Travel_word", classify_Travel_udf("tokenized_words_filtered"))
inputFile = inputFile.withColumn("Transportation_word", classify_Transportation_udf("tokenized_words_filtered"))
inputFile = inputFile.withColumn("Utility_word", classify_Utility_udf("tokenized_words_filtered"))
inputFile = inputFile.withColumn("Cash_word", classify_Cash_udf("tokenized_words_filtered"))
inputFile = inputFile.withColumn("Illegal_word", classify_Illegal_udf("tokenized_words_filtered"))
inputFile = inputFile.withColumn("Event", F.col("Event_emoji")+F.col('Event_word'))
inputFile = inputFile.withColumn("Travel", F.col("Travel_emoji")+F.col('Travel_word'))
inputFile = inputFile.withColumn("Food", F.col("Food_emoji")+F.col('Food_word'))
inputFile = inputFile.withColumn("Activity", F.col("Activity_emoji")+F.col('Activity_word'))
inputFile = inputFile.withColumn("Transportation", F.col("Transportation_emoji")+F.col('Transportation_word'))
inputFile = inputFile.withColumn("People", F.col("People_emoji")+F.col('People_word'))
inputFile = inputFile.withColumn("Utility", F.col("Utility_emoji")+F.col('Utility_word'))
inputFile = inputFile.withColumn("Event",  convert_dummy_udf("Event"))
inputFile = inputFile.withColumn("Travel",  convert_dummy_udf("Travel"))
inputFile = inputFile.withColumn("Food",  convert_dummy_udf("Food"))
inputFile = inputFile.withColumn("Activity",  convert_dummy_udf("Activity"))
inputFile = inputFile.withColumn("Transportation",  convert_dummy_udf("Transportation"))
inputFile = inputFile.withColumn("People",  convert_dummy_udf("People"))
inputFile = inputFile.withColumn("Utility",  convert_dummy_udf("Utility"))
inputFile = inputFile.withColumn("Total_Sum_Category_Dummies", F.col("Event")+ F.col('Travel') + F.col('Food') +F.col('Activity') + F.col('Transportation') + F.col('People') + F.col('Utility') + F.col('Illegal_word') + F.col('Cash_word'))

    

In [17]:
agg_categories = inputFile.groupBy("year", "month").agg(
                    count('user1').alias("total_transactions_per_month"),
                    sum('Total_Sum_Category_Dummies').alias("total_dummies_per_month"),
                    sum('Event').alias("Events_per_month"),
                    sum('Travel').alias("Travel_per_month"),
                    sum('Food').alias("Food_per_month"),
                    sum('Activity').alias("Activity_per_month"),
                    sum('Transportation').alias("Transportation_per_month"),
                    sum('People').alias("People_per_month"),
                    sum('Utility').alias("Utility_per_month"),
                    sum('Illegal_word').alias("Illegal_word_per_month"),
                    sum('Cash_word').alias("Cash_word_per_month")).orderBy("year", "month")


agg_categories_customer_lifetime = inputFile.groupBy("customer_lifetime").agg(
                    count('user1').alias("total_transactions_per_customer_stage"),
                    sum('Total_Sum_Category_Dummies').alias("total_dummies_per_customer_stage"),
                    sum('Event').alias("Events_per_customer_stage"),
                    sum('Travel').alias("Travel_per_customer_stage"),
                    sum('Food').alias("Food_per_customer_stage"),
                    sum('Activity').alias("Activity_per_month"),
                    sum('Transportation').alias("Transportation_per_customer_stage"),
                    sum('People').alias("People_per_customer_stage"),
                    sum('Utility').alias("Utility_per_customer_stage"),
                    sum('Illegal_word').alias("Illegal_word_per_customer_stage"),
                    sum('Cash_word').alias("Cash_word_per_customer_stage")).orderBy("customer_lifetime")

In [19]:
agg_categories.show()

Py4JJavaError: An error occurred while calling o797.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 34.0 failed 1 times, most recent failure: Lost task 2.0 in stage 34.0 (TID 75, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/anaconda3/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 267, in main
    ("%d.%d" % sys.version_info[:2], version))
Exception: Python in worker has different version 2.7 than that in driver 3.7, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1124)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1130)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:224)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.writeIteratorToStream(PythonUDFRunner.scala:50)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread$$anonfun$run$1.apply(PythonRunner.scala:346)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1945)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:195)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
	at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:1035)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.reduce(RDD.scala:1017)
	at org.apache.spark.rdd.RDD$$anonfun$takeOrdered$1.apply(RDD.scala:1439)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.takeOrdered(RDD.scala:1426)
	at org.apache.spark.sql.execution.TakeOrderedAndProjectExec.executeCollect(limit.scala:136)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/anaconda3/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 267, in main
    ("%d.%d" % sys.version_info[:2], version))
Exception: Python in worker has different version 2.7 than that in driver 3.7, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1124)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1130)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:224)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.writeIteratorToStream(PythonUDFRunner.scala:50)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread$$anonfun$run$1.apply(PythonRunner.scala:346)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1945)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:195)
