In [9]:
# Modules used for PySpark solution
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer, StopWordsRemover, StringIndexer
from pyspark.ml import Pipeline as PySparkPipeline
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.clustering import KMeans as PySparkKMeans
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Modules used for non distributed solution
import collections

import spacy
from spacy.lang.pt.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.cluster import KMeans
from fastparquet import write 
import pandas as pd

def create_session():
    ''' Function used to instantiate a pySpark Session with 
    the specific configurations'''
    sc_conf = SparkConf()
    sc_conf.setAppName('SparkPreProcessing')
    sc_conf.setMaster('local')
    sc_conf.set('spark.executor.memory', '6g')
    sc_conf.set('spark.executor.cores', '8')
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())
    sc = SparkContext.getOrCreate(conf=sc_conf)
    ss = SparkSession(sc)
    return ss

# 1. PySpark

## 1.1 Loading Files and Creating Session

In [29]:
%%time
# Parquet files are self-describing so the schema is preserved.
# The result of loading a parquet file is also a DataFrame.
RELATIVE_FOLDER_PATH = "assets/data/"
filename = "data"
pyspark_session = create_session()

ailab_df = pyspark_session.read.parquet(RELATIVE_FOLDER_PATH +"/data.parquet.gzip")

[('spark.executor.memory', '6g'), ('spark.master', 'local'), ('spark.logConf', 'True'), ('spark.submit.deployMode', 'client'), ('spark.executor.cores', '8'), ('spark.ui.showConsoleProgress', 'true'), ('spark.app.name', 'SparkPreProcessing')]
CPU times: user 6.61 ms, sys: 4.44 ms, total: 11.1 ms
Wall time: 94.2 ms


## 1.2 Preprocessing and Vectorizing

In [30]:
%%time
ailab_df.cache().count()

tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
remover = StopWordsRemover(inputCol="tokens", outputCol="stopWordsRemovedTokens")
hashingTF = HashingTF(inputCol="stopWordsRemovedTokens", outputCol="rawFeatures", numFeatures=2000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
label_stringIdx = StringIndexer(inputCol = "process_class", outputCol = "label")

pre_processing_pipeline = PySparkPipeline(stages=[tokenizer, remover, hashingTF, idf, label_stringIdx])

pre_processing_pipeline_model = pre_processing_pipeline.fit(ailab_df)

treated_df = pre_processing_pipeline_model.transform(ailab_df)

CPU times: user 66.4 ms, sys: 16.4 ms, total: 82.8 ms
Wall time: 4.59 s


## 1.3 Clustering

In [31]:
%%time
kmeans = PySparkKMeans(k=20)
kmeans_trained_model = kmeans.fit(treated_df)
kmeans_result_df = kmeans_trained_model.transform(treated_df)

CPU times: user 13.5 ms, sys: 0 ns, total: 13.5 ms
Wall time: 10.2 s


### 1.4 Classifying

In [32]:
%%time
(trainingData, testData) = treated_df.randomSplit([0.7, 0.3], seed = 100)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions_df = lrModel.transform(testData)

CPU times: user 22.5 ms, sys: 4.51 ms, total: 27 ms
Wall time: 13.2 s


### 1.5 Model Evaluation

In [33]:
%%time
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions_df)

CPU times: user 10.8 ms, sys: 437 µs, total: 11.2 ms
Wall time: 36.7 s


0.7279028242295433

### 1.6 Storing Results

In [37]:
%%time
predictions.write.mode("overwrite").format("parquet").option("compression", "gzip").mode("overwrite").save(RELATIVE_FOLDER_PATH +"pyspark_result.parquet")
kmeans_result_df.write.mode("overwrite").format("parquet").option("compression", "gzip").mode("overwrite").save(RELATIVE_FOLDER_PATH +"pyspark_cluster_result.parquet")

CPU times: user 7.45 ms, sys: 1.2 ms, total: 8.65 ms
Wall time: 39 s


In [None]:
pyspark_session.stop()

# 2. Spacy e Sci-kit learn

## 2.1 Loading files and Models

In [40]:
%%time 
VECTOR_MODEL_NAME = "pt_core_news_sm"
NLP_SPACY = spacy.load(VECTOR_MODEL_NAME)
RELATIVE_FOLDER_PATH = "assets/data/"
filename = "data"
stopwords_set = set(STOP_WORDS)

parquet_filename = RELATIVE_FOLDER_PATH + filename + ".parquet.gzip"
ailab_df = pd.read_parquet(parquet_filename)
print(ailab_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2036 entries, 0 to 2035
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   process_class  2036 non-null   object
 1   process_id     2036 non-null   object
 2   doc_id         2036 non-null   object
 3   path_img       2036 non-null   object
 4   text           2036 non-null   object
 5   doc_type       2036 non-null   object
 6   num_pag        2036 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 111.5+ KB
None
CPU times: user 3.1 s, sys: 302 ms, total: 3.4 s
Wall time: 3.39 s


## 2.2 Preprocessing and Vectorizing

In [41]:
%%time
tokenizer = NLP_SPACY.Defaults.create_tokenizer(NLP_SPACY)
raw_text = ailab_df['text'].to_list()

tokenized_text = []
for row in raw_text[:20]:
    doc = tokenizer(row)
    preprocessed_doc = [token for token in doc if not token.norm_ in stopwords_set]
    tokenized_text.append(" ".join([word.text for word in preprocessed_doc]))

count_vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer()

''' Encapsuling components in pipeline '''
pipeline = Pipeline([
    ('count_vectorizer', count_vectorizer),
    ('tfidf_transformer', tfidf_transformer)
])

vectorized_docs = pipeline.fit_transform(tokenized_text)

CPU times: user 227 ms, sys: 205 µs, total: 227 ms
Wall time: 226 ms


## 2.3 Clustering


In [51]:
%%time
kmeans = KMeans(20)
kmeans.fit(vectorized_docs)
clustering = collections.defaultdict(list)

for idx, label in enumerate(kmeans.labels_):
    clustering[str(label)].append(str(idx))
    
kmeans_df = pd.DataFrame(clustering)

CPU times: user 953 ms, sys: 558 ms, total: 1.51 s
Wall time: 443 ms


## 2.4 Classyfing

In [43]:
%%time
targets_labels = ailab_df['process_class'].to_list()[:20]
''' Let's evaluate more deeply the best model '''
X_train, X_test, y_train, y_test = train_test_split(
     vectorized_docs,
    targets_labels,
    test_size=0.25, random_state=42)

clf = SGDClassifier()

train1 = X_train
labelsTrain1 = y_train
test1 = X_test
labelsTest1 = y_test
"""  train """
clf.fit(train1, labelsTrain1)
"""  test """
preds = clf.predict(test1)

CPU times: user 10.4 ms, sys: 7.72 ms, total: 18.1 ms
Wall time: 6.48 ms


### 2.5 Model Evaluation

In [44]:
%%time
print("accuracy:", accuracy_score(labelsTest1, preds))
print(
    classification_report(
        labelsTest1,
        preds,
        target_names=ailab_df['process_class'].unique()))

accuracy: 0.8


ValueError: Number of classes, 2, does not match size of target_names, 3. Try specifying the labels parameter

## 2.6 Storing Results

In [53]:
%%time
ailab_df['path_img'] = [ str(doc) for doc in ailab_df['path_img']]
write(RELATIVE_FOLDER_PATH +"result.parquet", ailab_df, compression='gzip')
write(RELATIVE_FOLDER_PATH +"cluster_result.parquet", kmeans_df, compression='gzip')

CPU times: user 1.93 s, sys: 42.9 ms, total: 1.97 s
Wall time: 1.99 s
