In [1]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer, StopWordsRemover, StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

def create_session():
    '''Build (or reuse) the SparkSession used throughout this notebook.

    The configuration is printed before the session is created so the
    effective settings are visible in the cell output.

    Returns:
        SparkSession: session created from the configuration below, or the
        already-running one if a session exists in this process.
    '''
    sc_conf = SparkConf()
    sc_conf.setAppName('SparkPreProcessing')
    # NOTE(review): with master 'local' the driver and executor presumably
    # share a single JVM/thread, so the executor memory/cores settings below
    # may have no effect -- confirm, and consider 'local[*]' if parallelism
    # is wanted.
    sc_conf.setMaster('local')
    sc_conf.set('spark.executor.memory', '6g')
    sc_conf.set('spark.executor.cores', '8')
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())
    # Use the session builder rather than SparkContext directly: SparkContext
    # is not imported in this notebook, so referencing it fails on a fresh
    # kernel. The builder creates the context from the same SparkConf.
    return SparkSession.builder.config(conf=sc_conf).getOrCreate()

# 1. PySpark

## 1.1 Loading Files and Creating Session

In [2]:
%%time
# Parquet files are self-describing so the schema is preserved.
# The result of loading a parquet file is also a DataFrame.
RELATIVE_FOLDER_PATH = "assets/data/"
filename = "data"
pyspark_session = create_session()

ailab_df = pyspark_session.read.parquet(RELATIVE_FOLDER_PATH +"/data.parquet.gzip")
print(ailab_df.printSchema())

[('spark.executor.memory', '6g'), ('spark.master', 'local'), ('spark.logConf', 'True'), ('spark.submit.deployMode', 'client'), ('spark.executor.cores', '8'), ('spark.ui.showConsoleProgress', 'true'), ('spark.app.name', 'SparkPreProcessing')]
root
 |-- process_class: string (nullable = true)
 |-- process_id: string (nullable = true)
 |-- doc_id: string (nullable = true)
 |-- path_img: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- text: string (nullable = true)
 |-- doc_type: string (nullable = true)
 |-- num_pag: long (nullable = true)

None
CPU times: user 28.3 ms, sys: 3.14 ms, total: 31.4 ms
Wall time: 2.76 s


## 1.2 Preprocessing and Vectorizing

In [3]:
%%time
ailab_df.cache().count()

tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
remover = StopWordsRemover(inputCol="tokens", outputCol="stopWordsRemovedTokens")
hashingTF = HashingTF(inputCol="stopWordsRemovedTokens", outputCol="rawFeatures", numFeatures=2000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
label_stringIdx = StringIndexer(inputCol = "process_class", outputCol = "label")

pre_processing_pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, label_stringIdx])

pre_processing_pipeline_model = pre_processing_pipeline.fit(ailab_df)

treated_df = pre_processing_pipeline_model.transform(ailab_df)

treated_df.show(1)

+-------------+----------+---------+--------------------+--------------------+--------------------+-------+--------------------+----------------------+--------------------+--------------------+-----+
|process_class|process_id|   doc_id|            path_img|                text|            doc_type|num_pag|              tokens|stopWordsRemovedTokens|         rawFeatures|            features|label|
+-------------+----------+---------+--------------------+--------------------+--------------------+-------+--------------------+----------------------+--------------------+--------------------+-----+
|           RE|   1004784|310550039|[./processos_imgs...| Documento digita...|despacho_de_admis...|      3|[, documento, dig...|  [, documento, dig...|(2000,[15,17,20,2...|(2000,[15,17,20,2...|  1.0|
+-------------+----------+---------+--------------------+--------------------+--------------------+-------+--------------------+----------------------+--------------------+--------------------+-----+


## 1.3 Clustering

In [4]:
%%time
kmeans = KMeans(k=20)
kmeans_trained_model = kmeans.fit(treated_df)
kmeans_result_df = kmeans_trained_model.transform(treated_df)
kmeans_result_df.show(1)

+-------------+----------+---------+--------------------+--------------------+--------------------+-------+--------------------+----------------------+--------------------+--------------------+-----+----------+
|process_class|process_id|   doc_id|            path_img|                text|            doc_type|num_pag|              tokens|stopWordsRemovedTokens|         rawFeatures|            features|label|prediction|
+-------------+----------+---------+--------------------+--------------------+--------------------+-------+--------------------+----------------------+--------------------+--------------------+-----+----------+
|           RE|   1004784|310550039|[./processos_imgs...| Documento digita...|despacho_de_admis...|      3|[, documento, dig...|  [, documento, dig...|(2000,[15,17,20,2...|(2000,[15,17,20,2...|  1.0|         1|
+-------------+----------+---------+--------------------+--------------------+--------------------+-------+--------------------+----------------------+-----

## 1.4 Classifying

In [5]:
%%time
(trainingData, testData) = treated_df.randomSplit([0.7, 0.3], seed = 100)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("text","process_class","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+-------------+------------------------------+-----+----------+
|                          text|process_class|                   probability|label|prediction|
+------------------------------+-------------+------------------------------+-----+----------+
| OAB/RS 0882

LACERDA 6 JAC...|          ARE|[0.9982459593595978,0.00124...|  0.0|       0.0|
| SEÇÃO V
Da Execução

Art. ...|          ARE|[0.9973522878312651,0.00200...|  0.0|       0.0|
|  HOFFMANN

 

ADVOGADOÇSÇS...|          ARE|[0.9923195606782952,0.00594...|  0.0|       0.0|
| seus inimigos. É isto que ...|          ARE|[0.9923172708770535,0.00458...|  0.0|       0.0|
| Documento recebido eletron...|          ARE|[0.9904962761549694,0.00875...|  0.0|       0.0|
| Esso
MARCOS III NOVAES MAR...|           RE|[0.9848453619196611,0.01268...|  1.0|       0.0|
| ESTADO DE SANTA CATARINA
P...|          ARE|[0.9844945094248899,0.01347...|  0.0|       0.0|
| A

jce.j i 1652-62.2010.8....|          ARE|[0.9

## 1.5 Model Evaluation

In [6]:
%%time
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

CPU times: user 9.55 ms, sys: 849 µs, total: 10.4 ms
Wall time: 36.7 s


0.7459806286135295

In [7]:
# Stop the session created in section 1.1. `sc` only exists as a local inside
# create_session(), so it is not defined at notebook level on a fresh kernel;
# stopping the SparkSession also stops its underlying SparkContext.
pyspark_session.stop()