In [1]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer, StopWordsRemover, StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.clustering import KMeans

def create_session():
    sc_conf = SparkConf()
    sc_conf.setAppName('SparkPreProcessing')
    sc_conf.setMaster('local')
    sc_conf.set('spark.executor.memory', '6g')
    sc_conf.set('spark.executor.cores', '8')
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())
    sc = SparkContext.getOrCreate(conf=sc_conf)
    ss = SparkSession(sc)
    return ss

In [14]:
# Parquet files are self-describing so the schema is preserved.
# The result of loading a parquet file is also a DataFrame.
RELATIVE_FOLDER_PATH = "assets/data/"
filename = "data"
sc = create_session()

ailab_df = sc.read.parquet(RELATIVE_FOLDER_PATH +"/data.parquet.gzip")
print(ailab_df.printSchema())

[('spark.executor.memory', '6g'), ('spark.master', 'local'), ('spark.logConf', 'True'), ('spark.submit.deployMode', 'client'), ('spark.executor.cores', '8'), ('spark.ui.showConsoleProgress', 'true'), ('spark.app.name', 'SparkPreProcessing')]
root
 |-- process_class: string (nullable = true)
 |-- process_id: string (nullable = true)
 |-- doc_id: string (nullable = true)
 |-- path_img: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- text: string (nullable = true)
 |-- doc_type: string (nullable = true)
 |-- num_pag: long (nullable = true)

None


In [29]:
ailab_df.cache().count()

tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
remover = StopWordsRemover(inputCol="tokens", outputCol="stopWordsRemovedTokens")
hashingTF = HashingTF(inputCol="stopWordsRemovedTokens", outputCol="rawFeatures", numFeatures=2000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
label_stringIdx = StringIndexer(inputCol = "process_class", outputCol = "label")

pre_processing_pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, label_stringIdx])

pre_processing_pipeline_model = pre_processing_pipeline.fit(ailab_df)

treated_df = pre_processing_pipeline_model.transform(ailab_df)

treated_df.show(1)

+-------------+----------+---------+--------------------+--------------------+--------------------+-------+--------------------+----------------------+--------------------+--------------------+-----+
|process_class|process_id|   doc_id|            path_img|                text|            doc_type|num_pag|              tokens|stopWordsRemovedTokens|         rawFeatures|            features|label|
+-------------+----------+---------+--------------------+--------------------+--------------------+-------+--------------------+----------------------+--------------------+--------------------+-----+
|           RE|   1004784|310550039|[./processos_imgs...| Documento digita...|despacho_de_admis...|      3|[, documento, dig...|  [, documento, dig...|(2000,[15,17,20,2...|(2000,[15,17,20,2...|  1.0|
+-------------+----------+---------+--------------------+--------------------+--------------------+-------+--------------------+----------------------+--------------------+--------------------+-----+


In [30]:
kmeans = KMeans(k=20)
kmeans_trained_model = kmeans.fit(treated_df)
kmeans_result_df = kmeans_trained_model.transform(treated_df)
kmeans_result_df.show(1)

+-------------+----------+---------+--------------------+--------------------+--------------------+-------+--------------------+----------------------+--------------------+--------------------+-----+----------+
|process_class|process_id|   doc_id|            path_img|                text|            doc_type|num_pag|              tokens|stopWordsRemovedTokens|         rawFeatures|            features|label|prediction|
+-------------+----------+---------+--------------------+--------------------+--------------------+-------+--------------------+----------------------+--------------------+--------------------+-----+----------+
|           RE|   1004784|310550039|[./processos_imgs...| Documento digita...|despacho_de_admis...|      3|[, documento, dig...|  [, documento, dig...|(2000,[15,17,20,2...|(2000,[15,17,20,2...|  1.0|        11|
+-------------+----------+---------+--------------------+--------------------+--------------------+-------+--------------------+----------------------+-----

In [31]:
(trainingData, testData) = treated_df.randomSplit([0.7, 0.3], seed = 100)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("text","process_class","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+-------------+------------------------------+-----+----------+
|                          text|process_class|                   probability|label|prediction|
+------------------------------+-------------+------------------------------+-----+----------+
| i

 

 

 

 

 

 

 

  ...|          ARE|[0.9999932589578077,3.97377...|  0.0|       0.0|
| SEÇÃO V
Da Execução

Art. ...|          ARE|[0.9992210908220771,4.54681...|  0.0|       0.0|
| Rodrigues dos Santos & Sou...|           RE|[0.9904324939584316,0.00632...|  1.0|       0.0|
| IXA

privada, não é razoáv...|          ARE|[0.9887038042609323,0.01085...|  0.0|       0.0|
| JUNTADA
junto aos presente...|          ARE|[0.9851287444111012,0.01310...|  0.0|       0.0|
| mxl o

ADVOC

MARCATTO

se...|          ARE|[0.9836183978290796,0.01266...|  0.0|       0.0|
| ESTADO DE SANTA CATARINA J...|          ARE|[0.9818315239268227,0.01594...|  0.0|       0.0|
| fis. 155

ESTADO DE SANTA ...|          ARE|[0.9

In [33]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.7279028242295433

In [None]:
sc.stop()