In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("yarn")\
.config('spark.executor.cores', '2')\
.config('spark.executor.memory', '7G')\
.config('spark.driver.memory','4G')\
.config('spark.executor.instances','8')\
.appName('bdse62')\
.getOrCreate()

In [2]:
abstract=spark.read.csv('hdfs:///bdse71/ABS')

# Feedforward Neural Network

In [3]:
testDF = spark.read.parquet('hdfs:///data/jso_hierarchy.parquet')

In [4]:
testDF.show()

+------------------+--------+-------+-------+
|Application_Number|mono_ipc|tri_ipc|qua_ipc|
+------------------+--------+-------+-------+
| PCT/CA2018/000243|       B|    A61|   B25J|
| PCT/CA2019/050200|       G|    G02|   G02C|
| PCT/CL2016/050019|       G|    G06|   G06Q|
| PCT/CN2012/070253|       A|    A43|   A43B|
| PCT/DE2014/100160|       F|    F41|   F41H|
| PCT/DE2016/100332|       C|    C10|   C10B|
| PCT/DK2005/000435|       A|    A61|   A61K|
| PCT/EP1999/004038|       E|    E21|   E21B|
| PCT/EP1999/008598|       A|    A61|   A61K|
| PCT/EP2000/004724|       A|    A23|   A23G|
| PCT/EP2000/011562|       C|    C11|   C11D|
| PCT/EP2001/002279|       C|    A01|   A01N|
| PCT/EP2001/003614|       B|    F16|   F16B|
| PCT/EP2001/003789|       A|    A22|   A22B|
| PCT/EP2001/010090|       C|    C12|   C12N|
| PCT/EP2001/013071|       C|    C07|   C08F|
| PCT/EP2001/014943|       B|    B60|   B60T|
| PCT/EP2003/007473|       C|    A01|   A01N|
| PCT/EP2003/011110|       A|    A

## TF-IDF計算

testDF = spark.read.json('hdfs:///data/df_mono_ipc.json')

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer
from pyspark.ml.feature import StopWordsRemover

In [6]:
abstract=abstract.dropna()
abstract.show()

+-----------------+--------------------+
|              _c0|                 _c1|
+-----------------+--------------------+
|PCT/IB2015/050868|For the productio...|
|PCT/EP2013/003311|A composition com...|
|PCT/AU2010/000148|An agricultural a...|
|PCT/CN2013/072341|A detection metho...|
|PCT/EP2013/058979|The invention rel...|
|PCT/EP2016/063487|The present inven...|
|PCT/GB2009/001774|A compound having...|
|PCT/EP2013/077705|A compound with t...|
|PCT/EP2009/006204|The invention rel...|
|PCT/KR2017/010450|The present inven...|
|PCT/EP2010/000270|The present inven...|
|PCT/EP2013/077695|A compound of for...|
|PCT/KR2012/004595|The present funct...|
|PCT/JP2015/004803|A zoom lens accor...|
|PCT/AT2017/000070|The invention rel...|
|PCT/KR2015/006148|The present inven...|
|PCT/IB2014/001029|A conjugate of fo...|
|PCT/GB2012/052526|A method for prom...|
|PCT/EP2001/009832|Liquid crystal mi...|
|PCT/KR2000/000814|The present inven...|
+-----------------+--------------------+
only showing top

abstractJoin two tables

In [7]:
joinExpression = testDF['Application_Number']==abstract["_c0"]
joinType = "inner"

In [8]:
testDF.schema

StructType(List(StructField(Application_Number,StringType,true),StructField(mono_ipc,StringType,true),StructField(tri_ipc,StringType,true),StructField(qua_ipc,StringType,true)))

In [9]:
joinedDF=testDF.join(abstract,joinExpression,joinType).drop('_c0')
joinedDF.filter(joinedDF.mono_ipc=='E').show()

+------------------+--------+-------+-------+--------------------+
|Application_Number|mono_ipc|tri_ipc|qua_ipc|                 _c1|
+------------------+--------+-------+-------+--------------------+
| PCT/CN2014/000929|       E|    E21|   E21C|A harrow excavato...|
| PCT/EP2000/011063|       E|    E05|   E05B|The invention rel...|
| PCT/DE2000/002722|       E|    E05|   G06F|The invention rel...|
| PCT/EP2001/011433|       E|    E01|   E02D|The invention rel...|
| PCT/IB2016/055812|       E|    E21|   E02D|A percussion devi...|
| PCT/KR2015/007369|       E|    E01|   E01C|The present inven...|
| PCT/US2001/006951|       E|    E21|   E21B|A petroleum well ...|
| PCT/EP2001/003208|       E|    E05|   E05B|The invention rel...|
| PCT/EP2012/069088|       E|    E21|   E21B|The present inven...|
| PCT/CN2015/071782|       E|    E21|   E21C|A reciprocating i...|
| PCT/CA2001/000121|       E|    E05|   E05B|A latch assembly ...|
| PCT/CN2014/000009|       E|    E21|   E21C|An easily removab

In [10]:
joinedDF=joinedDF.filter(joinedDF.mono_ipc=='E').select('Application_Number','qua_ipc','_c1');joinedDF.show()

+------------------+-------+--------------------+
|Application_Number|qua_ipc|                 _c1|
+------------------+-------+--------------------+
| PCT/CN2014/000929|   E21C|A harrow excavato...|
| PCT/EP2000/011063|   E05B|The invention rel...|
| PCT/DE2000/002722|   G06F|The invention rel...|
| PCT/EP2001/011433|   E02D|The invention rel...|
| PCT/IB2016/055812|   E02D|A percussion devi...|
| PCT/KR2015/007369|   E01C|The present inven...|
| PCT/US2001/006951|   E21B|A petroleum well ...|
| PCT/EP2001/003208|   E05B|The invention rel...|
| PCT/EP2012/069088|   E21B|The present inven...|
| PCT/CN2015/071782|   E21C|A reciprocating i...|
| PCT/CA2001/000121|   E05B|A latch assembly ...|
| PCT/CN2014/000009|   E21C|An easily removab...|
| PCT/AU2016/050241|   E21B|Orientation data ...|
| PCT/US2001/009607|   E21B|A flow completion...|
| PCT/CA2000/001056|   E05B|A latch assembly ...|
| PCT/SE2002/000043|   E04F|A floorboard and ...|
| PCT/NO2010/000081|   E21B|Method and system...|


### stack the model

In [11]:
tokenizer = RegexTokenizer(inputCol="_c1", outputCol="words",pattern="\\W")
remover = StopWordsRemover(inputCol="words", outputCol="removededWords") # stopWords=yourownstopwords
vectorizer = CountVectorizer(inputCol="removededWords", outputCol="rawFeatures")
#idf = IDF(inputCol="rawFeatures", outputCol="features") # minDocFreq=2, TF小於2的就忽略
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2)
pipeline = Pipeline(stages=[tokenizer,remover ,vectorizer, idf])
model = pipeline.fit(joinedDF)

In [12]:
import numpy as np
total_counts = model.transform(joinedDF).select('rawFeatures').rdd.map(lambda row: row['rawFeatures'].toArray()).reduce(lambda x,y: [x[i]+y[i] for i in range(len(y))])
vocabList = model.stages[2].vocabulary
d = {'vocabList':vocabList,'counts':total_counts}
spark.createDataFrame(np.array(list(d.values())).T.tolist(),list(d.keys())).show()

+----------+------+
| vocabList|counts|
+----------+------+
|      said|1092.0|
|       one|1088.0|
| invention| 936.0|
|     least| 859.0|
|     means| 841.0|
|   element| 808.0|
|     first| 781.0|
|    device| 745.0|
|  provided| 636.0|
|    member| 629.0|
| comprises| 628.0|
|comprising| 625.0|
|   surface| 622.0|
|    second| 587.0|
|  position| 553.0|
|       end| 543.0|
|      part| 513.0|
|   relates| 484.0|
|       two| 469.0|
|      wall| 456.0|
+----------+------+
only showing top 20 rows



In [13]:
spark.createDataFrame(np.array(list(d.values())).T.tolist(),list(d.keys())).write.csv('hdfs:///data/L_words_Of_Bad_Jan14.csv',header=True,mode='overwrite')

In [14]:
tfidf=model.transform(joinedDF)

In [15]:
tfidf.createOrReplaceTempView("table1")
spark.sql('select count(*) as num_of_sample from table1').show()

+-------------+
|num_of_sample|
+-------------+
|         1480|
+-------------+



In [16]:
tfidf.columns

['Application_Number',
 'qua_ipc',
 '_c1',
 'words',
 'removededWords',
 'rawFeatures',
 'features']

In [17]:
tfidfForTrain=spark.sql('select qua_ipc,rawFeatures, features from table1')

In [18]:
tfidfForTrain.show()

+-------+--------------------+--------------------+
|qua_ipc|         rawFeatures|            features|
+-------+--------------------+--------------------+
|   E21C|(7639,[1,8,10,14,...|(7639,[1,8,10,14,...|
|   E05B|(7639,[0,1,2,3,4,...|(7639,[0,1,2,3,4,...|
|   G06F|(7639,[0,2,4,5,10...|(7639,[0,2,4,5,10...|
|   E02D|(7639,[0,1,2,3,8,...|(7639,[0,1,2,3,8,...|
|   E02D|(7639,[1,3,7,15,1...|(7639,[1,3,7,15,1...|
|   E01C|(7639,[2,8,11,12,...|(7639,[2,8,11,12,...|
|   E21B|(7639,[1,7,8,10,1...|(7639,[1,7,8,10,1...|
|   E05B|(7639,[0,1,2,3,5,...|(7639,[0,1,2,3,5,...|
|   E21B|(7639,[1,2,3,5,6,...|(7639,[1,2,3,5,6,...|
|   E21C|(7639,[8,11,16,26...|(7639,[8,11,16,26...|
|   E05B|(7639,[6,13,14,36...|(7639,[6,13,14,36...|
|   E21C|(7639,[7,10,11,15...|(7639,[7,10,11,15...|
|   E21B|(7639,[3,7,12,15,...|(7639,[3,7,12,15,...|
|   E21B|(7639,[6,9,10,11,...|(7639,[6,9,10,11,...|
|   E05B|(7639,[6,13,14,32...|(7639,[6,13,14,32...|
|   E04F|(7639,[1,3,4,6,10...|(7639,[1,3,4,6,10...|
|   E21B|(76

In [19]:
from pyspark.ml.linalg import Vectors # !!!!caution: not from pyspark.mllib.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString,StringIndexer, VectorIndexer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [20]:
labelIndexer = StringIndexer(inputCol='qua_ipc',outputCol='indexedLabel').fit(tfidfForTrain)
labelIndexer.transform(tfidfForTrain).show(5, True)
stage1=labelIndexer.transform(tfidfForTrain)

+-------+--------------------+--------------------+------------+
|qua_ipc|         rawFeatures|            features|indexedLabel|
+-------+--------------------+--------------------+------------+
|   E21C|(7639,[1,8,10,14,...|(7639,[1,8,10,14,...|        14.0|
|   E05B|(7639,[0,1,2,3,4,...|(7639,[0,1,2,3,4,...|         3.0|
|   G06F|(7639,[0,2,4,5,10...|(7639,[0,2,4,5,10...|        42.0|
|   E02D|(7639,[0,1,2,3,8,...|(7639,[0,1,2,3,8,...|         4.0|
|   E02D|(7639,[1,3,7,15,1...|(7639,[1,3,7,15,1...|         4.0|
+-------+--------------------+--------------------+------------+
only showing top 5 rows



In [21]:
(trainingData, testData) = stage1.randomSplit([0.6, 0.4])
#trainingData.show(2,truncate=False)
#testData.show(2,truncate=False)

In [22]:
from pyspark.ml.classification import LogisticRegression
logr = LogisticRegression(featuresCol='features', labelCol='indexedLabel')
stage2=logr.fit(trainingData).transform(testData)

In [23]:
stage2.show()

+-------+--------------------+--------------------+------------+--------------------+--------------------+----------+
|qua_ipc|         rawFeatures|            features|indexedLabel|       rawPrediction|         probability|prediction|
+-------+--------------------+--------------------+------------+--------------------+--------------------+----------+
|   G06F|(7639,[0,2,4,5,10...|(7639,[0,2,4,5,10...|        42.0|[15.6640025456533...|[7.46096478311355...|      10.0|
|   E02D|(7639,[0,1,2,3,8,...|(7639,[0,1,2,3,8,...|         4.0|[1.77767684816139...|[0.00230542461085...|      10.0|
|   E21B|(7639,[1,7,8,10,1...|(7639,[1,7,8,10,1...|         0.0|[52.0229897849143...|[1.0,2.6766560717...|       0.0|
|   E21B|(7639,[3,7,12,15,...|(7639,[3,7,12,15,...|         0.0|[39.7653993751308...|[0.99999999999395...|       0.0|
|   E21B|(7639,[6,9,10,11,...|(7639,[6,9,10,11,...|         0.0|[52.0800113979501...|[0.99999999999999...|       0.0|
|   E05B|(7639,[6,13,14,32...|(7639,[6,13,14,32...|     

In [24]:
prob_log=stage2.select('probability').collect();prob_log[:10]

[Row(probability=DenseVector([0.0007, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9991, 0.0, 0.0, 0.0, 0.0, 0.0001, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
 Row(probability=DenseVector([0.0023, 0.0, 0.0003, 0.0112, 0.0911, 0.1273, 0.0002, 0.003, 0.0013, 0.2226, 0.2468, 0.0003, 0.0004, 0.0008, 0.0, 0.0, 0.1197, 0.0008, 0.0, 0.0065, 0.0001, 0.0027, 0.0013, 0.0003, 0.0024, 0.1574, 0.0, 0.0001, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0001, 0.0001, 0.0001, 0.0, 0.0, 0.0001, 0.0, 0.0001, 0.0, 0.0, 0.0, 0.0])),
 Row(probability=DenseVector([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
 Row(probability=DenseVector([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [25]:
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",labels=labelIndexer.labels)
labelConverter.transform(stage2).select('features','qua_ipc','predictedLabel').show()

+--------------------+-------+--------------+
|            features|qua_ipc|predictedLabel|
+--------------------+-------+--------------+
|(7639,[0,2,4,5,10...|   G06F|          E04C|
|(7639,[0,1,2,3,8,...|   E02D|          E04C|
|(7639,[1,7,8,10,1...|   E21B|          E21B|
|(7639,[3,7,12,15,...|   E21B|          E21B|
|(7639,[6,9,10,11,...|   E21B|          E21B|
|(7639,[6,13,14,32...|   E05B|          E05B|
|(7639,[0,4,10,11,...|   E21B|          E21B|
|(7639,[0,1,2,4,5,...|   E01F|          E21B|
|(7639,[2,4,7,9,10...|   E04G|          E03D|
|(7639,[0,1,4,9,10...|   E06B|          E06B|
|(7639,[0,1,3,4,5,...|   E04F|          E04F|
|(7639,[0,1,2,3,4,...|   E02D|          E04D|
|(7639,[4,5,6,8,9,...|   E04F|          E04B|
|(7639,[5,8,11,14,...|   E05B|          E05B|
|(7639,[0,6,14,30,...|   E05B|          E05B|
|(7639,[1,2,9,10,2...|   E21B|          E21B|
|(7639,[26,31,32,3...|   E21B|          E21B|
|(7639,[2,5,6,10,1...|   E04B|          E04B|
|(7639,[1,2,4,15,1...|   E21B|    

In [26]:
labelConverter.transform(stage2).select('indexedLabel','prediction').createOrReplaceTempView('table2')
predictionAndTarget = spark.sql('select indexedLabel as label, prediction from table2')

evaluatorMulti = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
acc = evaluatorMulti.evaluate(predictionAndTarget)

In [27]:
print(acc)

0.4483591662799168
