In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("yarn")\
.config('spark.executor.cores', '2')\
.config('spark.executor.memory', '9G')\
.config('spark.executor.instances','10')\
.appName('bdse62')\
.getOrCreate()

In [2]:
abstract=spark.read.csv('hdfs:///bdse71/ABS')

# Random Forest

In [3]:
testDF = spark.read.json('hdfs:///data/df_mono_ipc.json')

In [4]:
testDF.show()

+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+----------------+------------------+--------------------+--------------+--------+
|              Agents|          Applicants|Application_Number|   Designated_States|           Inventors|  PublicationNo_Name|Publication_Date|Publication_Number|               Title|ipc_simplified|mono_ipc|
+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+----------------+------------------+--------------------+--------------+--------+
|'ОСИПОВА, Наталья...|[' [CY]/[CY] (All...| PCT/IB2011/002779|[[BF,  BJ,  CF,  ...|'ЯСНЕЦОВ, Владими...|1. WO2012028965 -...|      08-03-2012|    WO/2012/028965|[EN), COMBINATION...|           [A]|       A|
|                 nan|[' [BY]/[BY] (AM,...| PCT/BY2015/000005|[[BF,  BJ,  CF,  ...|'ЖАВНЕРКО, Геннад...|1. WO2017070769 -...|      04-05-2017|    WO/2017/070769|[EN), COMPO

## TF-IDF計算

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer
from pyspark.ml.feature import StopWordsRemover

In [7]:
abstract.show()

+-----------------+--------------------+
|              _c0|                 _c1|
+-----------------+--------------------+
|PCT/EP2001/009832|Liquid crystal mi...|
|PCT/KR2000/000814|The present inven...|
|PCT/IL2000/000667|The present inven...|
|PCT/DK2001/000750|The invention rel...|
|PCT/EP2001/012972|The invention rel...|
|PCT/EP2007/000257|The invention rel...|
|PCT/RU2001/000190|The inventive met...|
|PCT/IL2000/000459|The present inven...|
|PCT/KR2006/002440|A diagnostic kit ...|
|PCT/KR2001/001941|The present inven...|
|PCT/SE2002/000250|To facilitate eg ...|
|PCT/DE2001/001579|The invention rel...|
|PCT/IL2001/000088|A method and circ...|
|PCT/KR2001/000965|A preferable embo...|
|PCT/GB2001/002487|New spisulosine d...|
|PCT/HU2001/000068|A system of appar...|
|PCT/US2000/028670|Novel antigenpres...|
|PCT/GB2001/002151|A compound of for...|
|PCT/DK2000/000496|The present inven...|
|PCT/US2000/025466|A method and appa...|
+-----------------+--------------------+
only showing top

### Join two tables

In [8]:
joinExpression = testDF['Application_Number']==abstract["_c0"]
joinType = "inner"

In [9]:
joinedDF=testDF.join(abstract,joinExpression,joinType).select('Application_Number','mono_ipc','_c1')
joinedDF.show()

+------------------+--------+--------------------+
|Application_Number|mono_ipc|                 _c1|
+------------------+--------+--------------------+
| PCT/AT2002/000007|       B|The invention rel...|
| PCT/AU2001/001240|       A|An electrode asse...|
| PCT/CH2000/000616|       A|The invention rel...|
| PCT/DE2001/002185|       G|The invention rel...|
| PCT/DE2001/003336|       G|According to the ...|
| PCT/DE2001/003917|       H|The invention rel...|
| PCT/EP2001/003772|       C|The invention rel...|
| PCT/EP2001/004360|       C|A method for dete...|
| PCT/EP2001/005589|       F|With a view to ra...|
| PCT/EP2001/005723|       C|The invention rel...|
| PCT/EP2001/005780|       H|The invention rel...|
| PCT/EP2001/005780|       C|The invention rel...|
| PCT/FI2001/000982|       C|This invention re...|
| PCT/FR2000/002789|       C|The invention rel...|
| PCT/FR2000/002789|       A|The invention rel...|
| PCT/IB2001/000244|       C|The present inven...|
| PCT/IB2001/001581|       C|Th

In [10]:
joinedDF.write.json('hdfs:///data/mergedData.json',compression='bzip2')

IllegalArgumentException: Codec [true] is not available. Known codecs are bzip2, deflate, uncompressed, lz4, gzip, snappy, none.

### stack the model

In [10]:
tokenizer = RegexTokenizer(inputCol="_c1", outputCol="words",pattern="\\W")
remover = StopWordsRemover(inputCol="words", outputCol="removededWords") # stopWords=yourownstopwords
vectorizer = CountVectorizer(inputCol="removededWords", outputCol="rawFeatures")
#idf = IDF(inputCol="rawFeatures", outputCol="features") # minDocFreq=2, TF小於2的就忽略
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2)
pipeline = Pipeline(stages=[tokenizer,remover ,vectorizer, idf])
model = pipeline.fit(joinedDF)

In [11]:
import numpy as np
total_counts = model.transform(joinedDF).select('rawFeatures').rdd.map(lambda row: row['rawFeatures'].toArray()).reduce(lambda x,y: [x[i]+y[i] for i in range(len(y))])
vocabList = model.stages[2].vocabulary
d = {'vocabList':vocabList,'counts':total_counts}
spark.createDataFrame(np.array(list(d.values())).T.tolist(),list(d.keys())).show()

+----------+------+
| vocabList|counts|
+----------+------+
| invention|3494.0|
|      said|2348.0|
|       one|2238.0|
|     least|1949.0|
|   relates|1886.0|
|    method|1742.0|
|comprising|1563.0|
|    device|1280.0|
|     means|1267.0|
| comprises|1026.0|
|     first|1022.0|
|   surface| 961.0|
|    system| 955.0|
|       use| 946.0|
|      also| 942.0|
|  material| 910.0|
|    second| 910.0|
|  provided| 893.0|
|   present| 833.0|
|   wherein| 811.0|
+----------+------+
only showing top 20 rows



In [36]:
spark.createDataFrame(np.array(list(d.values())).T.tolist(),list(d.keys())).write.csv('hdfs:///data/words_Of_Bad_Jan12.csv',header=True,mode='overwrite')

In [12]:
tfidf=model.transform(joinedDF)

In [13]:
tfidf.createOrReplaceTempView("table1")
spark.sql('select count(*) as num_of_sample from table1').show()

+--------+
|count(1)|
+--------+
|    4091|
+--------+



In [14]:
tfidf.columns

['Application_Number',
 'mono_ipc',
 '_c1',
 'words',
 'removededWords',
 'rawFeatures',
 'features']

In [15]:
tfidfForTrain=spark.sql('select mono_ipc,rawFeatures, features from table1')

In [16]:
tfidfForTrain.show()

+--------+--------------------+--------------------+
|mono_ipc|         rawFeatures|            features|
+--------+--------------------+--------------------+
|       B|(17518,[0,3,4,6,7...|(17518,[0,3,4,6,7...|
|       A|(17518,[6,29,95,1...|(17518,[6,29,95,1...|
|       A|(17518,[0,3,4,5,1...|(17518,[0,3,4,5,1...|
|       G|(17518,[0,4,5,14,...|(17518,[0,4,5,14,...|
|       G|(17518,[0,1,7,15,...|(17518,[0,1,7,15,...|
|       H|(17518,[0,3,4,5,8...|(17518,[0,3,4,5,8...|
|       C|(17518,[0,1,4,5,2...|(17518,[0,1,4,5,2...|
|       C|(17518,[5,6,10,16...|(17518,[5,6,10,16...|
|       F|(17518,[1,17,71,8...|(17518,[1,17,71,8...|
|       C|(17518,[0,2,4,19,...|(17518,[0,2,4,19,...|
|       H|(17518,[0,2,3,4,5...|(17518,[0,2,3,4,5...|
|       C|(17518,[0,2,3,4,5...|(17518,[0,2,3,4,5...|
|       C|(17518,[0,4,5,15,...|(17518,[0,4,5,15,...|
|       C|(17518,[0,2,3,4,6...|(17518,[0,2,3,4,6...|
|       A|(17518,[0,2,3,4,6...|(17518,[0,2,3,4,6...|
|       C|(17518,[0,4,14,18...|(17518,[0,4,14,

In [35]:
tfidfForTrain.createOrReplaceTempView('table3')
spark.sql('select count(*) as  from table3').show()

+--------+
|count(1)|
+--------+
|    4091|
+--------+



In [19]:
from pyspark.ml.linalg import Vectors # !!!!caution: not from pyspark.mllib.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString,StringIndexer, VectorIndexer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [20]:
labelIndexer = StringIndexer(inputCol='mono_ipc',outputCol='indexedLabel').fit(tfidfForTrain)
labelIndexer.transform(tfidfForTrain).show(5, True)
stage1=labelIndexer.transform(tfidfForTrain)

+--------+--------------------+--------------------+------------+
|mono_ipc|         rawFeatures|            features|indexedLabel|
+--------+--------------------+--------------------+------------+
|       B|(17518,[0,3,4,6,7...|(17518,[0,3,4,6,7...|         2.0|
|       A|(17518,[6,29,95,1...|(17518,[6,29,95,1...|         1.0|
|       A|(17518,[0,3,4,5,1...|(17518,[0,3,4,5,1...|         1.0|
|       G|(17518,[0,4,5,14,...|(17518,[0,4,5,14,...|         3.0|
|       G|(17518,[0,1,7,15,...|(17518,[0,1,7,15,...|         3.0|
+--------+--------------------+--------------------+------------+
only showing top 5 rows



In [21]:
(trainingData, testData) = stage1.randomSplit([0.6, 0.4])
#trainingData.show(2,truncate=False)
#testData.show(2,truncate=False)

```
RandomForestClassifier(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
    probabilityCol='probability',
    rawPredictionCol='rawPrediction',
    maxDepth=5,
    maxBins=32,
    minInstancesPerNode=1,
    minInfoGain=0.0,
    maxMemoryInMB=256,
    cacheNodeIds=False,
    checkpointInterval=10,
    impurity='gini',
    numTrees=20,
    featureSubsetStrategy='auto',
    seed=None,
    subsamplingRate=1.0,
    leafCol='',
    minWeightFractionPerNode=0.0,
    weightCol=None,
    bootstrap=True,
)
Docstring:     
`Random Forest <http://en.wikipedia.org/wiki/Random_forest>`_
learning algorithm for classification.
It supports both binary and multiclass labels, as well as both continuous and categorical```

In [24]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'indexedLabel', maxDepth=30, maxBins=17518)
rfModel = rf.fit(trainingData).transform(testData)

In [25]:
rfModel.show()

+--------+--------------------+--------------------+------------+--------------------+--------------------+----------+
|mono_ipc|         rawFeatures|            features|indexedLabel|       rawPrediction|         probability|prediction|
+--------+--------------------+--------------------+------------+--------------------+--------------------+----------+
|       A|(17518,[0,3,4,5,1...|(17518,[0,3,4,5,1...|         1.0|[5.31949626363017...|[0.26597481318150...|       1.0|
|       C|(17518,[0,1,4,5,6...|(17518,[0,1,4,5,6...|         0.0|[5.75319723618016...|[0.28765986180900...|       1.0|
|       C|(17518,[0,1,4,5,2...|(17518,[0,1,4,5,2...|         0.0|[6.65036692082716...|[0.33251834604135...|       0.0|
|       C|(17518,[0,2,3,4,6...|(17518,[0,2,3,4,6...|         0.0|[6.63741379616861...|[0.33187068980843...|       0.0|
|       C|(17518,[0,2,4,19,...|(17518,[0,2,4,19,...|         0.0|[9.13415339567911...|[0.45670766978395...|       0.0|
|       C|(17518,[0,4,14,18...|(17518,[0,4,14,18

In [27]:
prob_RF=rfModel.select('probability').collect();prob_RF[:10]

[Row(probability=DenseVector([0.266, 0.2867, 0.1436, 0.1186, 0.0745, 0.055, 0.0398, 0.0158])),
 Row(probability=DenseVector([0.2877, 0.3246, 0.1246, 0.1034, 0.0664, 0.0466, 0.033, 0.0138])),
 Row(probability=DenseVector([0.3325, 0.2861, 0.123, 0.1008, 0.0634, 0.0467, 0.0337, 0.0138])),
 Row(probability=DenseVector([0.3319, 0.2657, 0.1301, 0.1028, 0.067, 0.0488, 0.0383, 0.0156])),
 Row(probability=DenseVector([0.4567, 0.3167, 0.0726, 0.0575, 0.0395, 0.0265, 0.0192, 0.0114])),
 Row(probability=DenseVector([0.3272, 0.3024, 0.1209, 0.0995, 0.0606, 0.0443, 0.0316, 0.0134])),
 Row(probability=DenseVector([0.2539, 0.267, 0.131, 0.1718, 0.0678, 0.048, 0.0425, 0.0181])),
 Row(probability=DenseVector([0.2458, 0.2331, 0.1343, 0.1853, 0.0929, 0.0541, 0.042, 0.0126])),
 Row(probability=DenseVector([0.2597, 0.2826, 0.1488, 0.121, 0.075, 0.0557, 0.0413, 0.0159])),
 Row(probability=DenseVector([0.2864, 0.2806, 0.1423, 0.1137, 0.0708, 0.0516, 0.0381, 0.0165]))]

In [29]:
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",labels=labelIndexer.labels)
labelConverter.transform(rfModel).select('features','mono_ipc','predictedLabel').show()

+--------------------+--------+--------------+
|            features|mono_ipc|predictedLabel|
+--------------------+--------+--------------+
|(17518,[0,3,4,5,1...|       A|             A|
|(17518,[0,1,4,5,6...|       C|             A|
|(17518,[0,1,4,5,2...|       C|             C|
|(17518,[0,2,3,4,6...|       C|             C|
|(17518,[0,2,4,19,...|       C|             C|
|(17518,[0,4,14,18...|       C|             C|
|(17518,[5,6,10,16...|       C|             A|
|(17518,[0,4,5,14,...|       G|             C|
|(17518,[0,2,3,4,5...|       H|             A|
|(17518,[0,4,42,52...|       A|             C|
|(17518,[0,30,37,3...|       A|             C|
|(17518,[0,1,4,19,...|       C|             C|
|(17518,[1,5,6,9,1...|       C|             C|
|(17518,[0,1,4,7,8...|       G|             A|
|(17518,[2,5,10,14...|       G|             A|
|(17518,[1,2,8,23,...|       B|             A|
|(17518,[0,1,2,4,5...|       C|             A|
|(17518,[1,2,3,6,1...|       C|             A|
|(17518,[2,20

In [33]:
labelConverter.transform(rfModel).select('indexedLabel','prediction').createOrReplaceTempView('table2')
predictionAndTarget = spark.sql('select indexedLabel as label, prediction from table2')

evaluatorMulti = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
acc = evaluatorMulti.evaluate(predictionAndTarget)

In [34]:
print(acc)

0.275723232216577
