# Install Dependencies

In [1]:
!pip --default-timeout=100 install pyspark -i https://pypi.tuna.tsinghua.edu.cn/simple

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


# Read Json

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("COMP5349 A2 Data Loading") \
    .getOrCreate()

data = "Assignment_2_data/test.json"
init_df = spark.read.json(data)

In [3]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import udf

data_df= init_df.select((explode("data").alias('data')))
paragraph_df = data_df.select(explode("data.paragraphs").alias("paragraph"))

# data_df.printSchema()
# paragraph_df.printSchema()
# paragraph_df.show(5)

In [4]:
paragraph_unrolled_df = paragraph_df.select("paragraph.context" , explode("paragraph.qas").alias("qas")) \
                                    .select("context", "qas.id", "qas.question", "qas.is_impossible", explode("qas.answers").alias("answer")) \
                                    .select("context", "id", "question", "is_impossible", "answer.answer_start", "answer.text")

paragraph_unrolled_df.show(5)
# paragraph_unrolled_df.printSchema()

+--------------------+--------------------+--------------------+-------------+------------+--------------------+
|             context|                  id|            question|is_impossible|answer_start|                text|
+--------------------+--------------------+--------------------+-------------+------------+--------------------+
|Exhibit 10.16 SUP...|LohaCompanyltd_20...|Highlight the par...|        false|          14|     SUPPLY CONTRACT|
|Exhibit 10.16 SUP...|LohaCompanyltd_20...|Highlight the par...|        false|         143|         The seller:|
|Exhibit 10.16 SUP...|LohaCompanyltd_20...|Highlight the par...|        false|          49|The buyer/End-Use...|
|Exhibit 10.16 SUP...|LohaCompanyltd_20...|Highlight the par...|        false|       10985|The Contract is v...|
|Exhibit 10.16 SUP...|LohaCompanyltd_20...|Highlight the par...|        false|       10691|It will be govern...|
+--------------------+--------------------+--------------------+-------------+------------+-----

In [99]:
import re
from pyspark.sql.types import ArrayType, StringType, LongType
from pyspark.sql.functions import split

@udf(returnType = ArrayType(StringType()))
def segmentContext(str):
    res = [repr(i)+','+str[i:i+4096] for i in range(0, len(str) - 2048, 2048)]
    return res

# res = segmentContext("1111122222\n\n\n\n\n333334444455")
# print(res)

paragraph_preproc_df = paragraph_unrolled_df.withColumn("sequence", explode(segmentContext("context"))).drop("context") \
                                            .withColumn("index", split("sequence",",").getItem(0)) \
                                            .withColumn("source", split("sequence",",").getItem(1)) \
                                            .drop("sequence")
# paragraph_preproc_df.show(5)
# paragraph_preproc_df.printSchema()

+--------------------+--------------------+-------------+------------+---------------+-----+--------------------+
|                  id|            question|is_impossible|answer_start|           text|index|              source|
+--------------------+--------------------+-------------+------------+---------------+-----+--------------------+
|LohaCompanyltd_20...|Highlight the par...|        false|          14|SUPPLY CONTRACT|    0|Exhibit 10.16 SUP...|
|LohaCompanyltd_20...|Highlight the par...|        false|          14|SUPPLY CONTRACT| 2048|                 F-1|
|LohaCompanyltd_20...|Highlight the par...|        false|          14|SUPPLY CONTRACT| 4096|r the goods in ac...|
|LohaCompanyltd_20...|Highlight the par...|        false|          14|SUPPLY CONTRACT| 6144|alue and date of ...|
|LohaCompanyltd_20...|Highlight the par...|        false|          14|SUPPLY CONTRACT| 8192|e Sellers shall a...|
+--------------------+--------------------+-------------+------------+---------------+--

In [101]:
from pyspark.sql.functions import col
@udf(returnType = LongType())
def getEndPos(startPos, text):
    length = len(text)
    return startPos + length

@udf(returnType = StringType())
def getOverlappedPos(start, end, index):
    if start > int(index) and start < int(index) + 4096:
        if end < int(index) + 4096 :
            return str(start) + ','+ str(end)
        else:
            return str(start) + ','+ str(index+4096)
    if end > int(index) and end < int(index) + 4096:
        if start < int(index):
            return str(index) + ','+ str(end)
    else:
        return "0,0"
    

positive_sample_preproc_df = paragraph_preproc_df.filter("is_impossible==False").drop("is_impossible") \
                                         .withColumn("answer_end", getEndPos("answer_start", "text")) \
                                         .withColumn("overlapped", getOverlappedPos("answer_start", "answer_end", "index")) \
                                         .drop("answer_start").drop("answer_end") \
                                         .withColumn("answer_start", split("overlapped",",").getItem(0).cast("int")) \
                                         .withColumn("answer_end", split("overlapped",",").getItem(1).cast("int")) \
                                         .drop("overlapped").drop("index") \
                                         .filter((col("answer_start") != 0) & (col("answer_end") != 0))
#                                          .select("source", "question", "answer_start", "answer_end")
# positive_sample_preproc_df.printSchema()               
positive_sample_preproc_df.show(5)

Py4JJavaError: An error occurred while calling o2345.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 50.0 failed 1 times, most recent failure: Lost task 0.0 in stage 50.0 (TID 51) (MyMechRevo executor driver): java.net.SocketException: Software caused connection abort: socket write error
	at java.net.SocketOutputStream.socketWrite0(Native Method)
	at java.net.SocketOutputStream.socketWrite(Unknown Source)
	at java.net.SocketOutputStream.write(Unknown Source)
	at java.io.BufferedOutputStream.flushBuffer(Unknown Source)
	at java.io.BufferedOutputStream.write(Unknown Source)
	at java.io.DataOutputStream.write(Unknown Source)
	at java.io.FilterOutputStream.write(Unknown Source)
	at org.apache.spark.api.python.PythonRDD$.write$1(PythonRDD.scala:295)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1(PythonRDD.scala:307)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1$adapted(PythonRDD.scala:307)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:307)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.writeIteratorToStream(PythonUDFRunner.scala:53)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:434)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:2019)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:269)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2454)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2402)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2402)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1160)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1160)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1160)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2642)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2584)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2573)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:938)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:476)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:429)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:48)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3715)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2728)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3706)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3704)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2728)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2935)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:287)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:326)
	at sun.reflect.GeneratedMethodAccessor65.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.net.SocketException: Software caused connection abort: socket write error
	at java.net.SocketOutputStream.socketWrite0(Native Method)
	at java.net.SocketOutputStream.socketWrite(Unknown Source)
	at java.net.SocketOutputStream.write(Unknown Source)
	at java.io.BufferedOutputStream.flushBuffer(Unknown Source)
	at java.io.BufferedOutputStream.write(Unknown Source)
	at java.io.DataOutputStream.write(Unknown Source)
	at java.io.FilterOutputStream.write(Unknown Source)
	at org.apache.spark.api.python.PythonRDD$.write$1(PythonRDD.scala:295)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1(PythonRDD.scala:307)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1$adapted(PythonRDD.scala:307)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:307)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.writeIteratorToStream(PythonUDFRunner.scala:53)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:434)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:2019)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:269)
