In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("yarn")\
.config('spark.executor.cores', '2')\
.config('spark.executor.memory', '7G')\
.config('spark.driver.memory','4G')\
.config('spark.executor.instances','4')\
.appName('bdse62')\
.getOrCreate()

In [2]:
abstract=spark.read.csv('hdfs:///bdse71/ABS')

# Feedforward Neural Network

In [3]:
testDF = spark.read.parquet('hdfs:///data/jso_hierarchy.parquet')

In [4]:
testDF.show()

+------------------+--------+-------+-------+
|Application_Number|mono_ipc|tri_ipc|qua_ipc|
+------------------+--------+-------+-------+
| PCT/CA2018/000243|       B|    A61|   B25J|
| PCT/CA2019/050200|       G|    G02|   G02C|
| PCT/CL2016/050019|       G|    G06|   G06Q|
| PCT/CN2012/070253|       A|    A43|   A43B|
| PCT/DE2014/100160|       F|    F41|   F41H|
| PCT/DE2016/100332|       C|    C10|   C10B|
| PCT/DK2005/000435|       A|    A61|   A61K|
| PCT/EP1999/004038|       E|    E21|   E21B|
| PCT/EP1999/008598|       A|    A61|   A61K|
| PCT/EP2000/004724|       A|    A23|   A23G|
| PCT/EP2000/011562|       C|    C11|   C11D|
| PCT/EP2001/002279|       C|    A01|   A01N|
| PCT/EP2001/003614|       B|    F16|   F16B|
| PCT/EP2001/003789|       A|    A22|   A22B|
| PCT/EP2001/010090|       C|    C12|   C12N|
| PCT/EP2001/013071|       C|    C07|   C08F|
| PCT/EP2001/014943|       B|    B60|   B60T|
| PCT/EP2003/007473|       C|    A01|   A01N|
| PCT/EP2003/011110|       A|    A

## TF-IDF計算

testDF = spark.read.json('hdfs:///data/df_mono_ipc.json')

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer
from pyspark.ml.feature import StopWordsRemover

In [6]:
abstract=abstract.dropna()
abstract.show()

+-----------------+--------------------+
|              _c0|                 _c1|
+-----------------+--------------------+
|PCT/IB2015/050868|For the productio...|
|PCT/EP2013/003311|A composition com...|
|PCT/AU2010/000148|An agricultural a...|
|PCT/CN2013/072341|A detection metho...|
|PCT/EP2013/058979|The invention rel...|
|PCT/EP2016/063487|The present inven...|
|PCT/GB2009/001774|A compound having...|
|PCT/EP2013/077705|A compound with t...|
|PCT/EP2009/006204|The invention rel...|
|PCT/KR2017/010450|The present inven...|
|PCT/EP2010/000270|The present inven...|
|PCT/EP2013/077695|A compound of for...|
|PCT/KR2012/004595|The present funct...|
|PCT/JP2015/004803|A zoom lens accor...|
|PCT/AT2017/000070|The invention rel...|
|PCT/KR2015/006148|The present inven...|
|PCT/IB2014/001029|A conjugate of fo...|
|PCT/GB2012/052526|A method for prom...|
|PCT/EP2001/009832|Liquid crystal mi...|
|PCT/KR2000/000814|The present inven...|
+-----------------+--------------------+
only showing top

abstractJoin two tables

In [7]:
joinExpression = testDF['Application_Number']==abstract["_c0"]
joinType = "inner"

In [8]:
testDF.schema

StructType(List(StructField(Application_Number,StringType,true),StructField(mono_ipc,StringType,true),StructField(tri_ipc,StringType,true),StructField(qua_ipc,StringType,true)))

In [9]:
joinedDF=testDF.join(abstract,joinExpression,joinType).drop('_c0')
joinedDF.filter(joinedDF.mono_ipc=='E').show()

+------------------+--------+-------+-------+--------------------+
|Application_Number|mono_ipc|tri_ipc|qua_ipc|                 _c1|
+------------------+--------+-------+-------+--------------------+
| PCT/CN2014/000929|       E|    E21|   E21C|A harrow excavato...|
| PCT/EP2000/011063|       E|    E05|   E05B|The invention rel...|
| PCT/DE2000/002722|       E|    E05|   G06F|The invention rel...|
| PCT/EP2001/011433|       E|    E01|   E02D|The invention rel...|
| PCT/IB2016/055812|       E|    E21|   E02D|A percussion devi...|
| PCT/KR2015/007369|       E|    E01|   E01C|The present inven...|
| PCT/US2001/006951|       E|    E21|   E21B|A petroleum well ...|
| PCT/EP2001/003208|       E|    E05|   E05B|The invention rel...|
| PCT/EP2012/069088|       E|    E21|   E21B|The present inven...|
| PCT/CN2015/071782|       E|    E21|   E21C|A reciprocating i...|
| PCT/CA2001/000121|       E|    E05|   E05B|A latch assembly ...|
| PCT/CN2014/000009|       E|    E21|   E21C|An easily removab

In [10]:
joinedDF=joinedDF.filter(joinedDF.mono_ipc=='E').select('Application_Number','qua_ipc','_c1');joinedDF.show()

+------------------+-------+--------------------+
|Application_Number|qua_ipc|                 _c1|
+------------------+-------+--------------------+
| PCT/CN2014/000929|   E21C|A harrow excavato...|
| PCT/EP2000/011063|   E05B|The invention rel...|
| PCT/DE2000/002722|   G06F|The invention rel...|
| PCT/EP2001/011433|   E02D|The invention rel...|
| PCT/IB2016/055812|   E02D|A percussion devi...|
| PCT/KR2015/007369|   E01C|The present inven...|
| PCT/US2001/006951|   E21B|A petroleum well ...|
| PCT/EP2001/003208|   E05B|The invention rel...|
| PCT/EP2012/069088|   E21B|The present inven...|
| PCT/CN2015/071782|   E21C|A reciprocating i...|
| PCT/CA2001/000121|   E05B|A latch assembly ...|
| PCT/CN2014/000009|   E21C|An easily removab...|
| PCT/AU2016/050241|   E21B|Orientation data ...|
| PCT/US2001/009607|   E21B|A flow completion...|
| PCT/CA2000/001056|   E05B|A latch assembly ...|
| PCT/SE2002/000043|   E04F|A floorboard and ...|
| PCT/NO2010/000081|   E21B|Method and system...|


### stack the model

In [11]:
tokenizer = RegexTokenizer(inputCol="_c1", outputCol="words",pattern="\\W")
remover = StopWordsRemover(inputCol="words", outputCol="removededWords") # stopWords=yourownstopwords
vectorizer = CountVectorizer(inputCol="removededWords", outputCol="rawFeatures")
#idf = IDF(inputCol="rawFeatures", outputCol="features") # minDocFreq=2, TF小於2的就忽略
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2)
pipeline = Pipeline(stages=[tokenizer,remover ,vectorizer, idf])
model = pipeline.fit(joinedDF)

In [12]:
import numpy as np
total_counts = model.transform(joinedDF).select('rawFeatures').rdd.map(lambda row: row['rawFeatures'].toArray()).reduce(lambda x,y: [x[i]+y[i] for i in range(len(y))])
vocabList = model.stages[2].vocabulary
d = {'vocabList':vocabList,'counts':total_counts}
spark.createDataFrame(np.array(list(d.values())).T.tolist(),list(d.keys())).show()

+----------+------+
| vocabList|counts|
+----------+------+
|       one|1128.0|
|      said|1094.0|
| invention| 942.0|
|     least| 887.0|
|     means| 842.0|
|     first| 824.0|
|   element| 811.0|
|    device| 761.0|
|    member| 661.0|
|  provided| 644.0|
|   surface| 636.0|
| comprises| 634.0|
|comprising| 631.0|
|    second| 625.0|
|  position| 565.0|
|       end| 558.0|
|      part| 515.0|
|    system| 503.0|
|     fluid| 496.0|
|   relates| 488.0|
+----------+------+
only showing top 20 rows



```spark.createDataFrame(np.array(list(d.values())).T.tolist(),list(d.keys())).write.csv('hdfs:///data/G_words_Of_Bad_Jan14.csv',header=True,mode='overwrite')```

In [13]:
tfidf=model.transform(joinedDF)

In [14]:
tfidf.createOrReplaceTempView("table1")
spark.sql('select count(*) as num_of_sample from table1').show()

+-------------+
|num_of_sample|
+-------------+
|         1545|
+-------------+



In [15]:
tfidf.columns

['Application_Number',
 'qua_ipc',
 '_c1',
 'words',
 'removededWords',
 'rawFeatures',
 'features']

In [16]:
tfidfForTrain=spark.sql('select qua_ipc,rawFeatures, features from table1')

In [17]:
tfidfForTrain.show()

+-------+--------------------+--------------------+
|qua_ipc|         rawFeatures|            features|
+-------+--------------------+--------------------+
|   E21C|(7842,[0,9,11,14,...|(7842,[0,9,11,14,...|
|   E05B|(7842,[0,1,2,3,4,...|(7842,[0,1,2,3,4,...|
|   G06F|(7842,[1,2,4,6,11...|(7842,[1,2,4,6,11...|
|   E02D|(7842,[0,1,2,3,8,...|(7842,[0,1,2,3,8,...|
|   E02D|(7842,[0,3,7,15,1...|(7842,[0,3,7,15,1...|
|   E01C|(7842,[2,9,10,12,...|(7842,[2,9,10,12,...|
|   E21B|(7842,[0,7,9,10,1...|(7842,[0,7,9,10,1...|
|   E05B|(7842,[0,1,2,3,6,...|(7842,[0,1,2,3,6,...|
|   E21B|(7842,[0,2,3,5,6,...|(7842,[0,2,3,5,6,...|
|   E21C|(7842,[9,12,16,24...|(7842,[9,12,16,24...|
|   E05B|(7842,[5,13,14,35...|(7842,[5,13,14,35...|
|   E21C|(7842,[7,11,12,15...|(7842,[7,11,12,15...|
|   E21B|(7842,[3,7,10,15,...|(7842,[3,7,10,15,...|
|   E21B|(7842,[5,8,11,12,...|(7842,[5,8,11,12,...|
|   E05B|(7842,[5,13,14,31...|(7842,[5,13,14,31...|
|   E04F|(7842,[0,3,4,5,10...|(7842,[0,3,4,5,10...|
|   E21B|(78

In [18]:
from pyspark.ml.linalg import Vectors # !!!!caution: not from pyspark.mllib.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString,StringIndexer, VectorIndexer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [19]:
labelIndexer = StringIndexer(inputCol='qua_ipc',outputCol='indexedLabel').fit(tfidfForTrain)
labelIndexer.transform(tfidfForTrain).show(5, True)
stage1=labelIndexer.transform(tfidfForTrain)

+-------+--------------------+--------------------+------------+
|qua_ipc|         rawFeatures|            features|indexedLabel|
+-------+--------------------+--------------------+------------+
|   E21C|(7842,[0,9,11,14,...|(7842,[0,9,11,14,...|        14.0|
|   E05B|(7842,[0,1,2,3,4,...|(7842,[0,1,2,3,4,...|         3.0|
|   G06F|(7842,[1,2,4,6,11...|(7842,[1,2,4,6,11...|        37.0|
|   E02D|(7842,[0,1,2,3,8,...|(7842,[0,1,2,3,8,...|         4.0|
|   E02D|(7842,[0,3,7,15,1...|(7842,[0,3,7,15,1...|         4.0|
+-------+--------------------+--------------------+------------+
only showing top 5 rows



In [20]:
stage1.select('qua_ipc','indexedLabel').createOrReplaceTempView('table2')
spark.sql('select distinct indexedLabel, qua_ipc from table2').show(n=100)

+------------+-------+
|indexedLabel|qua_ipc|
+------------+-------+
|        37.0|   G06F|
|        28.0|   E03B|
|        29.0|   E05G|
|        45.0|   G08G|
|        36.0|   G02B|
|        42.0|   G01M|
|         7.0|   E04H|
|         4.0|   E02D|
|         6.0|   E03D|
|         0.0|   E21B|
|        41.0|   G01H|
|         2.0|   E06B|
|         3.0|   E05B|
|        19.0|   E21D|
|        32.0|   D07B|
|         1.0|   E04B|
|        24.0|   E05C|
|        43.0|   G05B|
|        30.0|   E21F|
|         5.0|   E04F|
|        44.0|   G07D|
|        22.0|   E03F|
|         9.0|   E05D|
|        25.0|   E06C|
|        23.0|   E01H|
|        12.0|   E04D|
|        21.0|   E03C|
|        18.0|   E02B|
|        26.0|   G01N|
|        11.0|   E04G|
|        34.0|   G01K|
|        20.0|   E01D|
|        16.0|   E01C|
|        15.0|   E01F|
|        38.0|   D05C|
|        14.0|   E21C|
|        10.0|   E04C|
|         8.0|   E01B|
|        27.0|   D06N|
|        35.0|   G01V|
|        40

In [21]:
(trainingData, testData) = stage1.randomSplit([0.6, 0.4])
#trainingData.show(2,truncate=False)
#testData.show(2,truncate=False)

In [22]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
layers = [7729, 64, 46]
FNN = MultilayerPerceptronClassifier(featuresCol="features" ,labelCol="indexedLabel", maxIter=300, layers=layers)
FNNModel = FNN.fit(trainingData).transform(testData)

In [23]:
FNNModel.show()

Py4JJavaError: An error occurred while calling o557.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 12 in stage 126.0 failed 4 times, most recent failure: Lost task 12.3 in stage 126.0 (TID 264426, bdse51.example.org, executor 2): org.apache.spark.SparkException: Failed to execute user defined function(ProbabilisticClassificationModel$$Lambda$3544/0x000000084130e040: (struct<type:tinyint,size:int,indices:array<int>,values:array<double>>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:340)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.IllegalArgumentException: requirement failed: A & B Dimension mismatch!
	at scala.Predef$.require(Predef.scala:281)
	at org.apache.spark.ml.ann.BreezeUtil$.dgemm(BreezeUtil.scala:41)
	at org.apache.spark.ml.ann.AffineLayerModel.eval(Layer.scala:164)
	at org.apache.spark.ml.ann.FeedForwardModel.forward(Layer.scala:508)
	at org.apache.spark.ml.ann.FeedForwardModel.predictRaw(Layer.scala:561)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel.predictRaw(MultilayerPerceptronClassifier.scala:297)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel.predictRaw(MultilayerPerceptronClassifier.scala:260)
	at org.apache.spark.ml.classification.ProbabilisticClassificationModel.$anonfun$transform$2(ProbabilisticClassifier.scala:120)
	... 17 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2120)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2139)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:467)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:420)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3627)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2697)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3618)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3616)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2697)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2904)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:300)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:337)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function(ProbabilisticClassificationModel$$Lambda$3544/0x000000084130e040: (struct<type:tinyint,size:int,indices:array<int>,values:array<double>>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:340)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: java.lang.IllegalArgumentException: requirement failed: A & B Dimension mismatch!
	at scala.Predef$.require(Predef.scala:281)
	at org.apache.spark.ml.ann.BreezeUtil$.dgemm(BreezeUtil.scala:41)
	at org.apache.spark.ml.ann.AffineLayerModel.eval(Layer.scala:164)
	at org.apache.spark.ml.ann.FeedForwardModel.forward(Layer.scala:508)
	at org.apache.spark.ml.ann.FeedForwardModel.predictRaw(Layer.scala:561)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel.predictRaw(MultilayerPerceptronClassifier.scala:297)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel.predictRaw(MultilayerPerceptronClassifier.scala:260)
	at org.apache.spark.ml.classification.ProbabilisticClassificationModel.$anonfun$transform$2(ProbabilisticClassifier.scala:120)
	... 17 more


In [None]:
prob_FNN=FNNModel.select('probability').collect();prob_FNN[:10]

In [None]:
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",labels=labelIndexer.labels)
labelConverter.transform(FNNModel).select('features','qua_ipc','predictedLabel','probability').show()

In [None]:
labelConverter.transform(FNNModel).select('indexedLabel','prediction').createOrReplaceTempView('table2')
predictionAndTarget = spark.sql('select indexedLabel as label, prediction from table2')

evaluatorMulti = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
acc = evaluatorMulti.evaluate(predictionAndTarget)

In [None]:
print(acc)

labelConverter.transform(FNNModel).select('features','qua_ipc','predictedLabel','probability').write.json('hdfs:///data/predicted_results_FNN.json',compression='bzip2',mode='overwrite')