In [208]:
import os  # os.getcwd() is used below; without this import a fresh kernel raises NameError

# Load the input data to be annotated: at most 300 rows of sentiment.parquet.
# The file is located relative to the notebook's current working directory.
data = (
    spark.read
    .parquet("file:///" + os.getcwd() + "/../../../src/test/resources/sentiment.parquet")
    .limit(300)
)
data.cache()
data.count()  # count() is an action, so it triggers the read and populates the cache
data.show(5)

+------+---------+--------------------+
|itemid|sentiment|                text|
+------+---------+--------------------+
|     1|        0|                 ...|
|     2|        0|                 ...|
|     3|        1|              omg...|
|     4|        0|          .. Omga...|
|     5|        0|         i think ...|
+------+---------+--------------------+
only showing top 5 rows



In [210]:
from pyspark.sql import functions as F

# Keep the row id and expose the "text" column under the name "original".
# NOTE: `data` is rebound here, so the "text" column is gone after the first run
# and this cell cannot simply be re-executed on its own.
original_text = F.col("text").alias("original")
data = data.select("itemid", original_text)
data.show(5)

+------+--------------------+
|itemid|            original|
+------+--------------------+
|     1|                 ...|
|     2|                 ...|
|     3|              omg...|
|     4|          .. Omga...|
|     5|         i think ...|
+------+--------------------+
only showing top 5 rows



In [3]:
def exportData(data, name, header="false",
               output_dir="/home/danilo/Documents/JSL/PublicDatasets/"):
    """
    Export a dataset to disk in CSV format, overwriting any previous export.

    :param data: dataset to export; only its ``write`` attribute is used
    :param name: name of the output, placed under ``output_dir``
    :param header: value passed to the writer's "header" option
    :param output_dir: directory (or URI prefix) that receives the export;
                       defaults to the location previously hard-coded here,
                       so existing calls behave as before
    """
    import posixpath  # local import; joins with "/" regardless of the host OS

    # Leading slashes are stripped from name so join() cannot discard output_dir.
    target = posixpath.join(output_dir, name.lstrip("/"))
    data.write \
        .format("com.databricks.spark.csv") \
        .option("header", header) \
        .mode("overwrite") \
        .save(target)
    print(f"Data {name} exported")

        
# Export the current `data` frame under the name "twitter_posts" (header option left at its "false" default).
exportData(data, "twitter_posts")

Data twitter_posts exported


Get the result of JSL Spell Checker

In [318]:
import sys
sys.path.append('../../')

from sparknlp.annotator import *
from sparknlp.base import DocumentAssembler, Finisher
from pyspark.ml import Pipeline


# Preprocessing stages: "original" text, then tokens, then normalized tokens.
# The assembler's output column is not set explicitly; the tokenizer reads "document".
document_assembler = DocumentAssembler().setInputCol("original")

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normal")
)

preprocessing_stages = [document_assembler, tokenizer, normalizer]
pipeline = Pipeline(stages=preprocessing_stages)

In [319]:
# Fit the pipeline on the data, then annotate that same data with the fitted model.
fitted_pipeline = pipeline.fit(data)
ready_data = fitted_pipeline.transform(data)

In [320]:
# Preview the first 5 annotated rows.
ready_data.show(5)

+------+--------------------+--------------------+--------------------+--------------------+
|itemid|            original|            document|               token|              normal|
+------+--------------------+--------------------+--------------------+--------------------+
|     1|                 ...|[[document,0,60, ...|[[token,21,22,is,...|[[token,21,22,is,...|
|     2|                 ...|[[document,0,50, ...|[[token,19,19,I,M...|[[token,19,19,i,M...|
|     3|              omg...|[[document,0,36, ...|[[token,14,16,omg...|[[token,14,16,omg...|
|     4|          .. Omga...|[[document,0,131,...|[[token,10,11,..,...|[[token,13,18,omg...|
|     5|         i think ...|[[document,0,52, ...|[[token,9,9,i,Map...|[[token,9,9,i,Map...|
+------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [321]:
# Spell checker (NorvigSweetingApproach) configured with a corpus and a dictionary.
# Both resource paths are built from a location four directories above the
# current working directory.
home_path = "file:///" + os.getcwd() + "/../../../../"
corpus = home_path + "spark-nlp-models/src/main/resources/spell/wiki1_en.txt"
# "spark-nlp/src/test/resources/spell/sherlockholmes.txt"  (other corpus path kept from the original cell)

dictionary = home_path + "spark-nlp-models/src/main/resources/spell/words.txt"

# Reads the normalized tokens and writes its annotations to the "spell" column.
spell_checker = (
    NorvigSweetingApproach()
    .setInputCols(["normal"])
    .setOutputCol("spell")
    .setDictionary(dictionary)
    .setCorpus(corpus)
    .setShortCircuit(True)
    .setDoubleVariants(True)
)

In [322]:
# The finisher reads the "spell" annotations; keys are not included in its output.
finisher = (
    Finisher()
    .setInputCols(["spell"])
    .setIncludeKeys(False)
)

# Full pipeline: preprocessing stages followed by the spell checker and the finisher.
spell_stages = [
    document_assembler,
    tokenizer,
    normalizer,
    spell_checker,
    finisher,
]
pipeline = Pipeline(stages=spell_stages)

# Fit on the data and annotate that same data.
ready_data = pipeline.fit(data).transform(data)

In [323]:
# Preview the first 5 rows of the spell-checked result.
ready_data.show(5)

+------+--------------------+--------------------+
|itemid|            original|      finished_spell|
+------+--------------------+--------------------+
|     1|                 ...|is@so@sad@for@my@...|
|     2|                 ...|i@missed@the@new@...|
|     3|              omg...|   omg@its@already@o|
|     4|          .. Omga...|omgaga@im@so@im@g...|
|     5|         i think ...|i@think@mi@bf@is@...|
+------+--------------------+--------------------+
only showing top 5 rows



In [324]:
# Show the first finished_spell value in full (second argument False disables truncation).
ready_data.select("finished_spell").show(1, False)

+---------------------------+
|finished_spell             |
+---------------------------+
|is@so@sad@for@my@apl@friend|
+---------------------------+
only showing top 1 row



In [325]:
from pyspark.sql import functions as F

# finished_spell holds the tokens joined by "@"; replace every "@" with a space.
at_to_space = F.regexp_replace("finished_spell", "@", " ")
sc_jsl = ready_data.withColumn("finished_spell", at_to_space)

# Compare the untouched input with the cleaned-up spell checker output.
sc_jsl.select("original").show(5, False)
sc_jsl.select("finished_spell").show(5, False)

+------------------------------------------------------------------------------------------------------------------------------------+
|original                                                                                                                            |
+------------------------------------------------------------------------------------------------------------------------------------+
|                     is so sad for my APL friend.............                                                                       |
|                   I missed the New Moon trailer...                                                                                 |
|              omg its already 7:30 :O                                                                                               |
|          .. Omgaga. Im sooo  im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)...|
|         i think mi bf is cheating on me!!!       T_T 

Get the result of Intellexer Spell Checker

In [326]:
# Read the Intellexer spell checker results from corrected_text.csv
# (path relative to the working directory; the first line is a header row).
corrected_data = (
    spark.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .load("corrected_text.csv")
)

In [327]:
# Show the first 5 "text" values in full (False disables truncation).
corrected_data.select("text").show(5, False)

+------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                    |
+------------------------------------------------------------------------------------------------------------------------+
|is so sad for my APL friend.............                                                                                |
|I missed the New Moon trailer...                                                                                        |
|long its already 7:30:O                                                                                                 |
|.. Omgaga. Im so in guns Cy. I 've been at this dentist since 11.. I was supposed 2 just get a crown put on (30 mins)...|
|i think my bf is cheating on me!!! T_T                                                                                  |
+---------------

In [328]:
# Pipeline for the Intellexer output: document assembly, tokenization and
# normalization only (no spell checker stage), finished into a plain column.
# NOTE: this rebinds document_assembler, tokenizer, normalizer, finisher,
# pipeline and ready_data, replacing the objects defined in earlier cells.
document_assembler = DocumentAssembler().setInputCol("text")

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normal")
)

finisher = (
    Finisher()
    .setInputCols(["normal"])
    .setIncludeKeys(False)
)

intellexer_stages = [document_assembler, tokenizer, normalizer, finisher]
pipeline = Pipeline(stages=intellexer_stages)

# Fit on the corrected text and annotate that same frame.
ready_data = pipeline.fit(corrected_data).transform(corrected_data)

In [329]:
# Show the first 3 rows in full (False disables truncation).
ready_data.show(3, False)

+------+----------------------------------------+-----------------------------+
|itemid|text                                    |finished_normal              |
+------+----------------------------------------+-----------------------------+
|1     |is so sad for my APL friend.............|is@so@sad@for@my@apl@friend  |
|2     |I missed the New Moon trailer...        |i@missed@the@new@moon@trailer|
|3     |long its already 7:30:O                 |long@its@already@o           |
+------+----------------------------------------+-----------------------------+
only showing top 3 rows



In [330]:
# finished_normal holds the tokens joined by "@"; replace every "@" with a space.
at_to_space = F.regexp_replace("finished_normal", "@", " ")
sc_intellexer = ready_data.withColumn("finished_normal", at_to_space)

# Compare the Intellexer text with its normalized form.
sc_intellexer.select("text").show(5, False)
sc_intellexer.select("finished_normal").show(5, False)

+------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                    |
+------------------------------------------------------------------------------------------------------------------------+
|is so sad for my APL friend.............                                                                                |
|I missed the New Moon trailer...                                                                                        |
|long its already 7:30:O                                                                                                 |
|.. Omgaga. Im so in guns Cy. I 've been at this dentist since 11.. I was supposed 2 just get a crown put on (30 mins)...|
|i think my bf is cheating on me!!! T_T                                                                                  |
+---------------

In [331]:
# Show the first 3 Intellexer rows in full (False disables truncation).
sc_intellexer.show(3, False)

+------+----------------------------------------+-----------------------------+
|itemid|text                                    |finished_normal              |
+------+----------------------------------------+-----------------------------+
|1     |is so sad for my APL friend.............|is so sad for my apl friend  |
|2     |I missed the New Moon trailer...        |i missed the new moon trailer|
|3     |long its already 7:30:O                 |long its already o           |
+------+----------------------------------------+-----------------------------+
only showing top 3 rows



In [332]:
# Show the first 3 JSL rows in full (False disables truncation).
sc_jsl.show(3, False)

+------+-------------------------------------------------------------+-----------------------------+
|itemid|original                                                     |finished_spell               |
+------+-------------------------------------------------------------+-----------------------------+
|1     |                     is so sad for my APL friend.............|is so sad for my apl friend  |
|2     |                   I missed the New Moon trailer...          |i missed the new moon trailer|
|3     |              omg its already 7:30 :O                        |omg its already o            |
+------+-------------------------------------------------------------+-----------------------------+
only showing top 3 rows



In [333]:
# Pair each Intellexer row with the JSL row that has the same itemid.
# The join condition is an expression, so both itemid columns appear in the result.
same_item = sc_intellexer.itemid == sc_jsl.itemid
sc_evaluation = sc_intellexer.join(sc_jsl, same_item)

In [334]:
# Preview the first 5 joined rows.
sc_evaluation.show(5)

+------+--------------------+--------------------+------+--------------------+--------------------+
|itemid|                text|     finished_normal|itemid|            original|      finished_spell|
+------+--------------------+--------------------+------+--------------------+--------------------+
|     1|is so sad for my ...|is so sad for my ...|     1|                 ...|is so sad for my ...|
|     2|I missed the New ...|i missed the new ...|     2|                 ...|i missed the new ...|
|     3|long its already ...|  long its already o|     3|              omg...|   omg its already o|
|     4|.. Omgaga. Im so ...|omgaga im so in g...|     4|          .. Omga...|omgaga im so im g...|
|     5|i think my bf is ...|i think my bf is ...|     5|         i think ...|i think mi bf is ...|
+------+--------------------+--------------------+------+--------------------+--------------------+
only showing top 5 rows



In [335]:
from pyspark.sql import functions as F

# Keep the original text plus one output column per spell checker,
# each renamed after the checker that produced it.
intellexer_col = F.col("finished_normal").alias("sc_intellexer")
jsl_col = F.col("finished_spell").alias("sc_jsl")
sc_evaluation = sc_evaluation.select("original", intellexer_col, jsl_col)

In [336]:
# Show the first 3 comparison rows in full (False disables truncation).
sc_evaluation.show(3, False)

+-------------------------------------------------------------+-----------------------------+-----------------------------+
|original                                                     |sc_intellexer                |sc_jsl                       |
+-------------------------------------------------------------+-----------------------------+-----------------------------+
|                     is so sad for my APL friend.............|is so sad for my apl friend  |is so sad for my apl friend  |
|                   I missed the New Moon trailer...          |i missed the new moon trailer|i missed the new moon trailer|
|              omg its already 7:30 :O                        |long its already o           |omg its already o            |
+-------------------------------------------------------------+-----------------------------+-----------------------------+
only showing top 3 rows



In [337]:
# Add an "equal" flag: 1 when both checkers produced the same string, 0 otherwise.
outputs_match = F.col("sc_intellexer") == F.col("sc_jsl")
agreement_flag = F.when(outputs_match, 1).otherwise(0)
sc_evaluation = sc_evaluation.withColumn("equal", agreement_flag)

In [338]:
# Show the first 3 rows in full, now including the "equal" flag.
sc_evaluation.show(3, False)

+-------------------------------------------------------------+-----------------------------+-----------------------------+-----+
|original                                                     |sc_intellexer                |sc_jsl                       |equal|
+-------------------------------------------------------------+-----------------------------+-----------------------------+-----+
|                     is so sad for my APL friend.............|is so sad for my apl friend  |is so sad for my apl friend  |1    |
|                   I missed the New Moon trailer...          |i missed the new moon trailer|i missed the new moon trailer|1    |
|              omg its already 7:30 :O                        |long its already o           |omg its already o            |0    |
+-------------------------------------------------------------+-----------------------------+-----------------------------+-----+
only showing top 3 rows



In [339]:
# List rows on which the two spell checkers disagree (equal == 0).
sc_evaluation.where(F.col("equal") == 0).show()

+--------------------+--------------------+--------------------+-----+
|            original|       sc_intellexer|              sc_jsl|equal|
+--------------------+--------------------+--------------------+-----+
|              omg...|  long its already o|   omg its already o|    0|
|          .. Omga...|omgaga im so in g...|omgaga im so im g...|    0|
|         i think ...|i think my bf is ...|i think mi bf is ...|    0|
|       Juuuuuuuuu...|juuuuuuuuuuuuuuuu...|        just chillin|    0|
|      hmmmm.... i...|hmmmm i wonder ho...|hm i wonder how s...|    0|
|     jb isnt show...|jab is not showin...|jb isnt showing i...|    0|
|     ok thats it ...|  ok that it you win| ok thats it you win|    0|
|    awhhe man.......|awe man i m compl...|awhhe man i m com...|    0|
|    HUGE roll of ...|huge roll of thun...|huge roll of thun...|    0|
|       wompppp wompp|        wompppp womb|           womp womp|    0|
|   &lt;---Sad lev...|lt sad level is i...|ltsad level is i ...|    0|
|   ..

In [340]:
# Number of rows on which the two spell checkers disagree.
disagreements = sc_evaluation.where(F.col("equal") == 0)
diff = disagreements.count()
print(diff)

145


In [341]:
# Percentage of compared rows on which both spell checkers produced the same text.
# The denominator is the actual number of joined rows instead of a hard-coded 300,
# so the figure stays correct if the sample size or the join result changes.
total = sc_evaluation.count()
# An empty comparison has no meaningful percentage; report NaN instead of dividing by zero.
acc = (100 - (diff * 100 / total)) if total else float("nan")
print(acc)

51.666666666666664
