In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("yarn")\
.config('spark.executor.cores', '2')\
.config('spark.executor.memory', '7G')\
.config('spark.driver.memory','4G')\
.config('spark.executor.instances','8')\
.appName('bdse62')\
.getOrCreate()

In [2]:
abstract=spark.read.csv('hdfs:///bdse71/ABS')

# Random Forest

In [3]:
testDF = spark.read.parquet('hdfs:///data/jso_hierarchy.parquet')

In [4]:
testDF.show()

+------------------+--------+-------+-------+
|Application_Number|mono_ipc|tri_ipc|qua_ipc|
+------------------+--------+-------+-------+
| PCT/CA2018/000243|       B|    A61|   B25J|
| PCT/CA2019/050200|       G|    G02|   G02C|
| PCT/CL2016/050019|       G|    G06|   G06Q|
| PCT/CN2012/070253|       A|    A43|   A43B|
| PCT/DE2014/100160|       F|    F41|   F41H|
| PCT/DE2016/100332|       C|    C10|   C10B|
| PCT/DK2005/000435|       A|    A61|   A61K|
| PCT/EP1999/004038|       E|    E21|   E21B|
| PCT/EP1999/008598|       A|    A61|   A61K|
| PCT/EP2000/004724|       A|    A23|   A23G|
| PCT/EP2000/011562|       C|    C11|   C11D|
| PCT/EP2001/002279|       C|    A01|   A01N|
| PCT/EP2001/003614|       B|    F16|   F16B|
| PCT/EP2001/003789|       A|    A22|   A22B|
| PCT/EP2001/010090|       C|    C12|   C12N|
| PCT/EP2001/013071|       C|    C07|   C08F|
| PCT/EP2001/014943|       B|    B60|   B60T|
| PCT/EP2003/007473|       C|    A01|   A01N|
| PCT/EP2003/011110|       A|    A

## TF-IDF計算

testDF = spark.read.json('hdfs:///data/df_mono_ipc.json')

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer
from pyspark.ml.feature import StopWordsRemover

In [6]:
abstract=abstract.dropna()
abstract.show()

+-----------------+--------------------+
|              _c0|                 _c1|
+-----------------+--------------------+
|PCT/IB2015/050868|For the productio...|
|PCT/EP2013/003311|A composition com...|
|PCT/AU2010/000148|An agricultural a...|
|PCT/CN2013/072341|A detection metho...|
|PCT/EP2013/058979|The invention rel...|
|PCT/EP2016/063487|The present inven...|
|PCT/GB2009/001774|A compound having...|
|PCT/EP2013/077705|A compound with t...|
|PCT/EP2009/006204|The invention rel...|
|PCT/KR2017/010450|The present inven...|
|PCT/EP2010/000270|The present inven...|
|PCT/EP2013/077695|A compound of for...|
|PCT/KR2012/004595|The present funct...|
|PCT/JP2015/004803|A zoom lens accor...|
|PCT/AT2017/000070|The invention rel...|
|PCT/KR2015/006148|The present inven...|
|PCT/IB2014/001029|A conjugate of fo...|
|PCT/GB2012/052526|A method for prom...|
|PCT/EP2001/009832|Liquid crystal mi...|
|PCT/KR2000/000814|The present inven...|
+-----------------+--------------------+
only showing top

abstractJoin two tables

In [7]:
joinExpression = testDF['Application_Number']==abstract["_c0"]
joinType = "inner"

In [8]:
testDF.schema

StructType(List(StructField(Application_Number,StringType,true),StructField(mono_ipc,StringType,true),StructField(tri_ipc,StringType,true),StructField(qua_ipc,StringType,true)))

In [9]:
joinedDF=testDF.join(abstract,joinExpression,joinType).drop('_c0')
joinedDF.filter(joinedDF.mono_ipc=='E').show()

+------------------+--------+-------+-------+--------------------+
|Application_Number|mono_ipc|tri_ipc|qua_ipc|                 _c1|
+------------------+--------+-------+-------+--------------------+
| PCT/CN2014/000929|       E|    E21|   E21C|A harrow excavato...|
| PCT/EP2000/011063|       E|    E05|   E05B|The invention rel...|
| PCT/DE2000/002722|       E|    E05|   G06F|The invention rel...|
| PCT/EP2001/011433|       E|    E01|   E02D|The invention rel...|
| PCT/IB2016/055812|       E|    E21|   E02D|A percussion devi...|
| PCT/KR2015/007369|       E|    E01|   E01C|The present inven...|
| PCT/US2001/006951|       E|    E21|   E21B|A petroleum well ...|
| PCT/EP2001/003208|       E|    E05|   E05B|The invention rel...|
| PCT/EP2012/069088|       E|    E21|   E21B|The present inven...|
| PCT/CN2015/071782|       E|    E21|   E21C|A reciprocating i...|
| PCT/CA2001/000121|       E|    E05|   E05B|A latch assembly ...|
| PCT/CN2014/000009|       E|    E21|   E21C|An easily removab

In [10]:
joinedDF=joinedDF.filter(joinedDF.mono_ipc=='E').select('Application_Number','qua_ipc','_c1');joinedDF.show()

+------------------+-------+--------------------+
|Application_Number|qua_ipc|                 _c1|
+------------------+-------+--------------------+
| PCT/CN2014/000929|   E21C|A harrow excavato...|
| PCT/EP2000/011063|   E05B|The invention rel...|
| PCT/DE2000/002722|   G06F|The invention rel...|
| PCT/EP2001/011433|   E02D|The invention rel...|
| PCT/IB2016/055812|   E02D|A percussion devi...|
| PCT/KR2015/007369|   E01C|The present inven...|
| PCT/US2001/006951|   E21B|A petroleum well ...|
| PCT/EP2001/003208|   E05B|The invention rel...|
| PCT/EP2012/069088|   E21B|The present inven...|
| PCT/CN2015/071782|   E21C|A reciprocating i...|
| PCT/CA2001/000121|   E05B|A latch assembly ...|
| PCT/CN2014/000009|   E21C|An easily removab...|
| PCT/AU2016/050241|   E21B|Orientation data ...|
| PCT/US2001/009607|   E21B|A flow completion...|
| PCT/CA2000/001056|   E05B|A latch assembly ...|
| PCT/SE2002/000043|   E04F|A floorboard and ...|
| PCT/NO2010/000081|   E21B|Method and system...|


### stack the model

In [11]:
tokenizer = RegexTokenizer(inputCol="_c1", outputCol="words",pattern="\\W")
remover = StopWordsRemover(inputCol="words", outputCol="removededWords") # stopWords=yourownstopwords
vectorizer = CountVectorizer(inputCol="removededWords", outputCol="rawFeatures")
#idf = IDF(inputCol="rawFeatures", outputCol="features") # minDocFreq=2, TF小於2的就忽略
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2)
pipeline = Pipeline(stages=[tokenizer,remover ,vectorizer, idf])
model = pipeline.fit(joinedDF)

In [12]:
import numpy as np
total_counts = model.transform(joinedDF).select('rawFeatures').rdd.map(lambda row: row['rawFeatures'].toArray()).reduce(lambda x,y: [x[i]+y[i] for i in range(len(y))])
vocabList = model.stages[2].vocabulary
d = {'vocabList':vocabList,'counts':total_counts}
spark.createDataFrame(np.array(list(d.values())).T.tolist(),list(d.keys())).show()

+----------+------+
| vocabList|counts|
+----------+------+
|      said|1092.0|
|       one|1089.0|
| invention| 938.0|
|     least| 859.0|
|     means| 841.0|
|   element| 808.0|
|     first| 781.0|
|    device| 746.0|
|  provided| 636.0|
|    member| 629.0|
| comprises| 628.0|
|comprising| 625.0|
|   surface| 622.0|
|    second| 587.0|
|  position| 556.0|
|       end| 543.0|
|      part| 513.0|
|   relates| 484.0|
|       two| 469.0|
|      wall| 456.0|
+----------+------+
only showing top 20 rows



In [13]:
spark.createDataFrame(np.array(list(d.values())).T.tolist(),list(d.keys())).write.csv('hdfs:///data/L_words_Of_Bad_Jan14.csv',header=True,mode='overwrite')

In [14]:
tfidf=model.transform(joinedDF)

In [15]:
tfidf.createOrReplaceTempView("table1")
spark.sql('select count(*) as num_of_sample from table1').show()

+-------------+
|num_of_sample|
+-------------+
|         1483|
+-------------+



In [16]:
tfidf.columns

['Application_Number',
 'qua_ipc',
 '_c1',
 'words',
 'removededWords',
 'rawFeatures',
 'features']

In [17]:
tfidfForTrain=spark.sql('select qua_ipc,rawFeatures, features from table1')

In [18]:
tfidfForTrain.show()

+-------+--------------------+--------------------+
|qua_ipc|         rawFeatures|            features|
+-------+--------------------+--------------------+
|   E21C|(7650,[1,8,10,14,...|(7650,[1,8,10,14,...|
|   E05B|(7650,[0,1,2,3,4,...|(7650,[0,1,2,3,4,...|
|   G06F|(7650,[0,2,4,5,10...|(7650,[0,2,4,5,10...|
|   E02D|(7650,[0,1,2,3,8,...|(7650,[0,1,2,3,8,...|
|   E02D|(7650,[1,3,7,15,1...|(7650,[1,3,7,15,1...|
|   E01C|(7650,[2,8,11,12,...|(7650,[2,8,11,12,...|
|   E21B|(7650,[1,7,8,10,1...|(7650,[1,7,8,10,1...|
|   E05B|(7650,[0,1,2,3,5,...|(7650,[0,1,2,3,5,...|
|   E21B|(7650,[1,2,3,5,6,...|(7650,[1,2,3,5,6,...|
|   E21C|(7650,[8,11,16,26...|(7650,[8,11,16,26...|
|   E05B|(7650,[6,13,14,36...|(7650,[6,13,14,36...|
|   E21C|(7650,[7,10,11,15...|(7650,[7,10,11,15...|
|   E21B|(7650,[3,7,12,15,...|(7650,[3,7,12,15,...|
|   E21B|(7650,[6,9,10,11,...|(7650,[6,9,10,11,...|
|   E05B|(7650,[6,13,14,32...|(7650,[6,13,14,32...|
|   E04F|(7650,[1,3,4,6,10...|(7650,[1,3,4,6,10...|
|   E21B|(76

In [19]:
from pyspark.ml.linalg import Vectors # !!!!caution: not from pyspark.mllib.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString,StringIndexer, VectorIndexer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [20]:
labelIndexer = StringIndexer(inputCol='qua_ipc',outputCol='indexedLabel').fit(tfidfForTrain)
labelIndexer.transform(tfidfForTrain).show(5, True)
stage1=labelIndexer.transform(tfidfForTrain)

+-------+--------------------+--------------------+------------+
|qua_ipc|         rawFeatures|            features|indexedLabel|
+-------+--------------------+--------------------+------------+
|   E21C|(7650,[1,8,10,14,...|(7650,[1,8,10,14,...|        14.0|
|   E05B|(7650,[0,1,2,3,4,...|(7650,[0,1,2,3,4,...|         3.0|
|   G06F|(7650,[0,2,4,5,10...|(7650,[0,2,4,5,10...|        42.0|
|   E02D|(7650,[0,1,2,3,8,...|(7650,[0,1,2,3,8,...|         4.0|
|   E02D|(7650,[1,3,7,15,1...|(7650,[1,3,7,15,1...|         4.0|
+-------+--------------------+--------------------+------------+
only showing top 5 rows



In [21]:
(trainingData, testData) = stage1.randomSplit([0.6, 0.4])
#trainingData.show(2,truncate=False)
#testData.show(2,truncate=False)

In [24]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'indexedLabel', maxDepth=12, maxBins=7650)
rfModel = rf.fit(trainingData).transform(testData)

In [25]:
rfModel.show()

+-------+--------------------+--------------------+------------+--------------------+--------------------+----------+
|qua_ipc|         rawFeatures|            features|indexedLabel|       rawPrediction|         probability|prediction|
+-------+--------------------+--------------------+------------+--------------------+--------------------+----------+
|   G06F|(7650,[0,2,4,5,10...|(7650,[0,2,4,5,10...|        42.0|[3.34817519075010...|[0.16740875953750...|       0.0|
|   E21C|(7650,[8,11,16,26...|(7650,[8,11,16,26...|        14.0|[4.05898687169191...|[0.20294934358459...|       0.0|
|   E05B|(7650,[6,13,14,36...|(7650,[6,13,14,36...|         3.0|[4.55235016619144...|[0.22761750830957...|       0.0|
|   E21B|(7650,[0,2,8,10,1...|(7650,[0,2,8,10,1...|         0.0|[8.60157948765966...|[0.43007897438298...|       0.0|
|   E05D|(7650,[1,2,3,4,6,...|(7650,[1,2,3,4,6,...|         9.0|[2.67911807451886...|[0.13395590372594...|       2.0|
|   E04G|(7650,[2,4,7,9,10...|(7650,[2,4,7,9,10...|     

In [26]:
prob_RF=rfModel.select('probability').collect();prob_RF[:10]

[Row(probability=DenseVector([0.1674, 0.1639, 0.0944, 0.0504, 0.0543, 0.0416, 0.026, 0.0322, 0.0266, 0.0184, 0.0452, 0.0396, 0.0257, 0.0137, 0.0217, 0.0214, 0.0262, 0.0204, 0.0175, 0.015, 0.0098, 0.013, 0.0104, 0.0065, 0.0086, 0.0067, 0.0039, 0.002, 0.0028, 0.0018, 0.0017, 0.0006, 0.0, 0.0009, 0.0017, 0.0, 0.0, 0.0015, 0.0011, 0.0, 0.001, 0.0012, 0.0, 0.0019, 0.0014])),
 Row(probability=DenseVector([0.2029, 0.089, 0.091, 0.051, 0.0568, 0.0477, 0.0345, 0.0357, 0.0282, 0.0205, 0.0374, 0.033, 0.0279, 0.0155, 0.0241, 0.0237, 0.0318, 0.0232, 0.0222, 0.0186, 0.0082, 0.0143, 0.0107, 0.007, 0.0099, 0.0072, 0.0053, 0.0022, 0.0034, 0.0021, 0.0013, 0.0009, 0.0, 0.0011, 0.0018, 0.0, 0.0, 0.0022, 0.0012, 0.0, 0.0012, 0.0016, 0.0, 0.0022, 0.0013])),
 Row(probability=DenseVector([0.2276, 0.0725, 0.0812, 0.107, 0.043, 0.0374, 0.0288, 0.0296, 0.0273, 0.017, 0.0306, 0.0418, 0.0228, 0.0129, 0.0329, 0.0219, 0.0267, 0.0187, 0.0157, 0.0128, 0.0054, 0.0125, 0.0089, 0.0048, 0.0086, 0.0297, 0.0043, 0.0016, 0.0

In [27]:
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",labels=labelIndexer.labels)
labelConverter.transform(rfModel).select('features','qua_ipc','predictedLabel').show()

+--------------------+-------+--------------+
|            features|qua_ipc|predictedLabel|
+--------------------+-------+--------------+
|(7650,[0,2,4,5,10...|   G06F|          E21B|
|(7650,[8,11,16,26...|   E21C|          E21B|
|(7650,[6,13,14,36...|   E05B|          E21B|
|(7650,[0,2,8,10,1...|   E21B|          E21B|
|(7650,[1,2,3,4,6,...|   E05D|          E06B|
|(7650,[2,4,7,9,10...|   E04G|          E21B|
|(7650,[0,1,2,3,4,...|   E02D|          E21B|
|(7650,[0,3,4,7,10...|   E06B|          E21B|
|(7650,[0,1,2,3,6,...|   E21B|          E21B|
|(7650,[1,2,3,4,5,...|   E04G|          E21B|
|(7650,[4,5,6,8,9,...|   E04F|          E04F|
|(7650,[5,8,11,14,...|   E05B|          E21B|
|(7650,[0,1,2,3,4,...|   E05B|          E05B|
|(7650,[2,11,21,33...|   G07C|          E21B|
|(7650,[1,2,9,10,2...|   E21B|          E21B|
|(7650,[4,10,14,18...|   E05B|          E05B|
|(7650,[2,10,37,41...|   E03D|          E03D|
|(7650,[1,3,4,6,9,...|   E03D|          E03D|
|(7650,[0,1,3,6,8,...|   E06B|    

In [28]:
labelConverter.transform(rfModel).select('indexedLabel','prediction').createOrReplaceTempView('table2')
predictionAndTarget = spark.sql('select indexedLabel as label, prediction from table2')

evaluatorMulti = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
acc = evaluatorMulti.evaluate(predictionAndTarget)

In [29]:
print(acc)

0.29820039926648567
