In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("yarn")\
.config('spark.executor.cores', '2')\
.config('spark.executor.memory', '9G')\
.config('spark.executor.instances','4')\
.appName('bdse62')\
.getOrCreate()

In [2]:
abstract=spark.read.csv('hdfs:///bdse71/ABS',sep=',')

# Logistic Regression

In [3]:
testDF = spark.read.json('hdfs:///data/df_mono_ipc.json')

In [4]:
testDF.show()

+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+----------------+------------------+--------------------+--------------+--------+
|              Agents|          Applicants|Application_Number|   Designated_States|           Inventors|  PublicationNo_Name|Publication_Date|Publication_Number|               Title|ipc_simplified|mono_ipc|
+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+----------------+------------------+--------------------+--------------+--------+
|'ОСИПОВА, Наталья...|[' [CY]/[CY] (All...| PCT/IB2011/002779|[[BF,  BJ,  CF,  ...|'ЯСНЕЦОВ, Владими...|1. WO2012028965 -...|      08-03-2012|    WO/2012/028965|[EN), COMBINATION...|           [A]|       A|
|                 nan|[' [BY]/[BY] (AM,...| PCT/BY2015/000005|[[BF,  BJ,  CF,  ...|'ЖАВНЕРКО, Геннад...|1. WO2017070769 -...|      04-05-2017|    WO/2017/070769|[EN), COMPO

## TF-IDF計算

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer
from pyspark.ml.feature import StopWordsRemover

In [6]:
abstract.show()

+-----------------+--------------------+
|              _c0|                 _c1|
+-----------------+--------------------+
|PCT/EP2001/009832|Liquid crystal mi...|
|PCT/KR2000/000814|The present inven...|
|PCT/IL2000/000667|The present inven...|
|PCT/DK2001/000750|The invention rel...|
|PCT/EP2001/012972|The invention rel...|
|PCT/GB1992/000681|Compounds having ...|
|PCT/EP2007/000257|The invention rel...|
|PCT/GB1992/001189|Compounds of gene...|
|PCT/US2001/024105|The present inven...|
|PCT/RU2001/000190|The inventive met...|
|PCT/EP2004/007957|The invention rel...|
|PCT/IL2000/000459|The present inven...|
|PCT/KR2006/002440|A diagnostic kit ...|
|PCT/KR2001/001941|The present inven...|
|PCT/SE2002/000250|To facilitate eg ...|
|PCT/GR1993/000005|A technique for t...|
|PCT/DE2001/001579|The invention rel...|
|PCT/IL2001/000088|A method and circ...|
|PCT/KR2001/000965|A preferable embo...|
|PCT/GB2001/002487|New spisulosine d...|
+-----------------+--------------------+
only showing top

### Join two tables

In [7]:
joinExpression = testDF['Application_Number']==abstract["_c0"]
joinType = "inner"

In [8]:
joinedDF=testDF.join(abstract,joinExpression,joinType).select('Application_Number','mono_ipc','_c1')
joinedDF.show()

+------------------+--------+--------------------+
|Application_Number|mono_ipc|                 _c1|
+------------------+--------+--------------------+
| PCT/AT2002/000007|       B|The invention rel...|
| PCT/AU2001/001240|       A|An electrode asse...|
| PCT/CH2000/000616|       A|The invention rel...|
| PCT/DE2001/002185|       G|The invention rel...|
| PCT/DE2001/003336|       G|According to the ...|
| PCT/DE2001/003917|       H|The invention rel...|
| PCT/EP2001/003772|       C|The invention rel...|
| PCT/EP2001/004360|       C|A method for dete...|
| PCT/EP2001/005589|       F|With a view to ra...|
| PCT/EP2001/005723|       C|The invention rel...|
| PCT/EP2001/005780|       H|The invention rel...|
| PCT/EP2001/005780|       C|The invention rel...|
| PCT/FI1992/000106|       B|The invention con...|
| PCT/FI2001/000982|       C|This invention re...|
| PCT/FR2000/002789|       C|The invention rel...|
| PCT/FR2000/002789|       A|The invention rel...|
| PCT/IB2001/000244|       C|Th

### stack the model

In [9]:
tokenizer = RegexTokenizer(inputCol="_c1", outputCol="words",pattern="\\W")
remover = StopWordsRemover(inputCol="words", outputCol="removededWords") # stopWords=yourownstopwords
vectorizer = CountVectorizer(inputCol="removededWords", outputCol="rawFeatures")
#idf = IDF(inputCol="rawFeatures", outputCol="features") # minDocFreq=2, TF小於2的就忽略
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2)
pipeline = Pipeline(stages=[tokenizer,remover ,vectorizer, idf])
model = pipeline.fit(joinedDF)

In [10]:
import numpy as np
total_counts = model.transform(joinedDF).select('rawFeatures').rdd.map(lambda row: row['rawFeatures'].toArray()).reduce(lambda x,y: [x[i]+y[i] for i in range(len(y))])
vocabList = model.stages[2].vocabulary
d = {'vocabList':vocabList,'counts':total_counts}
spark.createDataFrame(np.array(list(d.values())).T.tolist(),list(d.keys())).show()

+----------+------+
| vocabList|counts|
+----------+------+
| invention|3814.0|
|      said|2590.0|
|       one|2459.0|
|     least|2110.0|
|   relates|2055.0|
|    method|1878.0|
|comprising|1672.0|
|    device|1404.0|
|     means|1352.0|
|     first|1122.0|
| comprises|1108.0|
|       use|1055.0|
|   surface|1048.0|
|      also|1035.0|
|  material|1004.0|
|    system|1003.0|
|    second| 981.0|
|  provided| 960.0|
|   present| 930.0|
|   wherein| 906.0|
+----------+------+
only showing top 20 rows



In [11]:
tfidf=model.transform(joinedDF)

In [12]:
tfidf.createOrReplaceTempView("table1")
spark.sql('select count(*) from table1').show()

+--------+
|count(1)|
+--------+
|    4548|
+--------+



In [13]:
tfidf.columns

['Application_Number',
 'mono_ipc',
 '_c1',
 'words',
 'removededWords',
 'rawFeatures',
 'features']

In [14]:
tfidfForTrain=spark.sql('select mono_ipc,rawFeatures, features from table1')

In [15]:
tfidfForTrain.show()

+--------+--------------------+--------------------+
|mono_ipc|         rawFeatures|            features|
+--------+--------------------+--------------------+
|       B|(18778,[0,3,4,6,7...|(18778,[0,3,4,6,7...|
|       A|(18778,[6,30,104,...|(18778,[6,30,104,...|
|       A|(18778,[0,3,4,5,1...|(18778,[0,3,4,5,1...|
|       G|(18778,[0,4,5,13,...|(18778,[0,4,5,13,...|
|       G|(18778,[0,1,7,14,...|(18778,[0,1,7,14,...|
|       H|(18778,[0,3,4,5,8...|(18778,[0,3,4,5,8...|
|       C|(18778,[0,1,4,5,2...|(18778,[0,1,4,5,2...|
|       C|(18778,[5,6,9,16,...|(18778,[5,6,9,16,...|
|       F|(18778,[1,17,64,8...|(18778,[1,17,64,8...|
|       C|(18778,[0,2,4,19,...|(18778,[0,2,4,19,...|
|       H|(18778,[0,2,3,4,5...|(18778,[0,2,3,4,5...|
|       C|(18778,[0,2,3,4,5...|(18778,[0,2,3,4,5...|
|       B|(18778,[0,1,6,8,4...|(18778,[0,1,6,8,4...|
|       C|(18778,[0,4,5,14,...|(18778,[0,4,5,14,...|
|       C|(18778,[0,2,3,4,6...|(18778,[0,2,3,4,6...|
|       A|(18778,[0,2,3,4,6...|(18778,[0,2,3,4

In [16]:
from pyspark.ml.linalg import Vectors # !!!!caution: not from pyspark.mllib.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString,StringIndexer, VectorIndexer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [17]:
labelIndexer = StringIndexer(inputCol='mono_ipc',outputCol='indexedLabel').fit(tfidfForTrain)
labelIndexer.transform(tfidfForTrain).show(5, True)
stage1=labelIndexer.transform(tfidfForTrain)

+--------+--------------------+--------------------+------------+
|mono_ipc|         rawFeatures|            features|indexedLabel|
+--------+--------------------+--------------------+------------+
|       B|(18778,[0,3,4,6,7...|(18778,[0,3,4,6,7...|         2.0|
|       A|(18778,[6,30,104,...|(18778,[6,30,104,...|         0.0|
|       A|(18778,[0,3,4,5,1...|(18778,[0,3,4,5,1...|         0.0|
|       G|(18778,[0,4,5,13,...|(18778,[0,4,5,13,...|         3.0|
|       G|(18778,[0,1,7,14,...|(18778,[0,1,7,14,...|         3.0|
+--------+--------------------+--------------------+------------+
only showing top 5 rows



In [18]:
(trainingData, testData) = stage1.randomSplit([0.6, 0.4])
#trainingData.show(2,truncate=False)
#testData.show(2,truncate=False)

In [19]:
from pyspark.ml.classification import LogisticRegression
logr = LogisticRegression(featuresCol='features', labelCol='indexedLabel')
stage2=logr.fit(trainingData).transform(testData)

In [20]:
stage2.show()

+--------+--------------------+--------------------+------------+--------------------+--------------------+----------+
|mono_ipc|         rawFeatures|            features|indexedLabel|       rawPrediction|         probability|prediction|
+--------+--------------------+--------------------+------------+--------------------+--------------------+----------+
|       A|(18778,[0,2,3,4,6...|(18778,[0,2,3,4,6...|         0.0|[-104.76359778962...|[0.0,1.0,0.0,0.0,...|       1.0|
|       A|(18778,[6,30,104,...|(18778,[6,30,104,...|         0.0|[-30.771753986580...|[9.68753173195840...|       4.0|
|       B|(18778,[0,3,4,6,7...|(18778,[0,3,4,6,7...|         2.0|[-44.143478410535...|[8.36028938053069...|       6.0|
|       G|(18778,[0,4,5,13,...|(18778,[0,4,5,13,...|         3.0|[-591.92806829425...|[0.0,1.0,0.0,1.41...|       1.0|
|       H|(18778,[0,2,3,4,5...|(18778,[0,2,3,4,5...|         4.0|[-128.38596039992...|[0.0,1.0,6.126823...|       1.0|
|       A|(18778,[0,2,10,12...|(18778,[0,2,10,12

In [21]:
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",labels=labelIndexer.labels)
labelConverter.transform(stage2).select('features','mono_ipc','predictedLabel').show()

+--------------------+--------+--------------+
|            features|mono_ipc|predictedLabel|
+--------------------+--------+--------------+
|(18778,[0,2,3,4,6...|       A|             C|
|(18778,[6,30,104,...|       A|             H|
|(18778,[0,3,4,6,7...|       B|             E|
|(18778,[0,4,5,13,...|       G|             C|
|(18778,[0,2,3,4,5...|       H|             C|
|(18778,[0,2,10,12...|       A|             A|
|(18778,[0,4,10,22...|       A|             B|
|(18778,[0,4,13,18...|       A|             C|
|(18778,[0,29,35,3...|       A|             C|
|(18778,[0,1,2,4,1...|       B|             A|
|(18778,[0,1,4,19,...|       C|             C|
|(18778,[1,5,6,10,...|       C|             C|
|(18778,[1,21,27,4...|       C|             A|
|(18778,[2,5,11,13...|       H|             H|
|(18778,[0,4,6,7,8...|       A|             A|
|(18778,[6,14,25,2...|       A|             A|
|(18778,[12,13,17,...|       A|             A|
|(18778,[0,1,4,5,1...|       B|             C|
|(18778,[1,2,

In [28]:
evaluatorMulti = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction")
labelConverter.transform(stage2).select('indexedLabel','prediction').createOrReplaceTempView('table1')
predictionAndTarget = spark.sql('select indexedLabel as label, prediction from table1')
acc = evaluator.evaluate(predictionAndTarget)

In [29]:
print(acc)

0.4359177129475953


In [1]:
spark.stop()

NameError: name 'spark' is not defined