In [1]:
import json
import os
import time

import sparknlp
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import functions as sf
from pyspark.sql.types import DoubleType
from sparknlp.pretrained import PretrainedPipeline

In [2]:
# Start the Spark session (via the Spark NLP helper) that the cells below use to read data.
spark = sparknlp.start()

In [3]:
# Location of the Yelp review dump (one JSON object per line).
DATA_PATH = "/Users/briancai/Desktop/Datasets/yelp_dataset/yelp_academic_dataset_review.json"

# Keep the untouched sample under its own name so re-running this cell is idempotent.
reviews_raw = spark.read.json(DATA_PATH).limit(1000)

# Binary target: reviews with more than 3 stars are positive (1.0), everything else 0.0.
is_positive = sf.col("stars") > 3
data = reviews_raw.select(
    "text",
    sf.when(is_positive, 1.0).otherwise(0.0).alias("label"),
)

In [4]:
# Load the pretrained "analyze_sentimentdl_use_imdb" sentiment pipeline.
# The recorded output below shows this triggers a download of roughly 935.8 MB.
pipeline = PretrainedPipeline("analyze_sentimentdl_use_imdb")

analyze_sentimentdl_use_imdb download started this may take some time.
Approx size to download 935.8 MB
[OK!]


In [7]:
# Run the pretrained pipeline over the labelled reviews. The result keeps `text`
# and `label` and adds the annotation columns `document`, `sentence_embeddings`
# and `sentiment`.
preds = pipeline.transform(data)

In [10]:
# Peek at the raw `sentiment` annotation column for the first five rows.
preds.select("sentiment").show(5)

+--------------------+
|           sentiment|
+--------------------+
|[[category, 0, 13...|
|[[category, 0, 14...|
|[[category, 0, 43...|
|[[category, 0, 40...|
|[[category, 0, 84...|
+--------------------+
only showing top 5 rows



In [11]:
# Convert the full pipeline output to a pandas DataFrame for local inspection.
pandasDF = preds.toPandas()

In [22]:
# Inspect the first sentiment annotation of the first review: an annotation Row
# whose `result` holds the predicted class and whose `metadata` holds per-class scores.
pandasDF.loc[0, "sentiment"][0]

Row(annotatorType='category', begin=0, end=1372, result='positive', metadata={'sentence': '0', 'negative': '3.839735E-12', 'positive': '1.0'}, embeddings=[])

In [19]:
# Pull the predicted class string ("positive"/"negative") out of the first
# sentiment annotation of each review. Only the `sentiment` column is needed, so
# map over that Series instead of applying a function to every full row; this
# also stays well-defined (an empty Series) when the frame has no rows.
pandasDF["sentiment_result"] = pandasDF["sentiment"].map(
    lambda annotations: annotations[0]["result"]
)

In [20]:
# Display the frame with the new `sentiment_result` column next to `label`.
# (There is no "actual_result" column; indexing it raises KeyError.)
pandasDF

Unnamed: 0,text,label,document,sentence_embeddings,sentiment,sentiment_result
0,"As someone who has worked with many museums, I...",0.0,"[(document, 0, 1372, As someone who has worked...","[(sentence_embeddings, 0, 1372, As someone who...","[(category, 0, 1372, positive, {'sentence': '0...",positive
1,I am actually horrified this place is still in...,0.0,"[(document, 0, 1406, I am actually horrified t...","[(sentence_embeddings, 0, 1406, I am actually ...","[(category, 0, 1406, negative, {'sentence': '0...",negative
2,I love Deagan's. I do. I really do. The atmosp...,1.0,"[(document, 0, 430, I love Deagan's. I do. I r...","[(sentence_embeddings, 0, 430, I love Deagan's...","[(category, 0, 430, positive, {'sentence': '0'...",positive
3,"Dismal, lukewarm, defrosted-tasting ""TexMex"" g...",0.0,"[(document, 0, 400, Dismal, lukewarm, defroste...","[(sentence_embeddings, 0, 400, Dismal, lukewar...","[(category, 0, 400, positive, {'sentence': '0'...",positive
4,"Oh happy day, finally have a Canes near my cas...",1.0,"[(document, 0, 840, Oh happy day, finally have...","[(sentence_embeddings, 0, 840, Oh happy day, f...","[(category, 0, 840, positive, {'sentence': '0'...",positive
...,...,...,...,...,...,...
995,"my favorite is Okinawa MT or Rose MT, Lychee P...",1.0,"[(document, 0, 167, my favorite is Okinawa MT ...","[(sentence_embeddings, 0, 167, my favorite is ...","[(category, 0, 167, positive, {'sentence': '0'...",positive
996,Great food and good pricing. Food is made fres...,1.0,"[(document, 0, 109, Great food and good pricin...","[(sentence_embeddings, 0, 109, Great food and ...","[(category, 0, 109, positive, {'sentence': '0'...",positive
997,Again only 4 stars cuz i havent tried the food...,1.0,"[(document, 0, 277, Again only 4 stars cuz i h...","[(sentence_embeddings, 0, 277, Again only 4 st...","[(category, 0, 277, positive, {'sentence': '0'...",positive
998,Husband went to get us some food to go from he...,0.0,"[(document, 0, 264, Husband went to get us som...","[(sentence_embeddings, 0, 264, Husband went to...","[(category, 0, 264, positive, {'sentence': '0'...",positive


In [26]:
# MulticlassMetrics expects an RDD of (prediction, label) pairs of doubles, but
# `preds` is the raw pipeline output (text, label and annotation columns).
# Derive a numeric prediction from the first sentiment annotation: "positive"
# maps to 1.0, anything else (e.g. "negative") maps to 0.0, matching how `label`
# is encoded.
prediction_and_labels = (
    preds
    .withColumn(
        "prediction",
        sf.when(sf.col("sentiment.result").getItem(0) == "positive", 1.0).otherwise(0.0),
    )
    .select("prediction", "label")
    .rdd
    .map(lambda row: (float(row["prediction"]), float(row["label"])))
)

metrics = MulticlassMetrics(prediction_and_labels)
# F1 for the positive class (label 1.0).
f1_score = metrics.fMeasure(1.0)

In [28]:
# Display the F1 score computed for the positive class (label 1.0).
f1_score

0.7612903225806451

In [29]:
def save_dict_as_json(dictionary, path):
    """Write ``dictionary`` to ``path`` as JSON.

    Any missing parent directory of ``path`` is created first, and an existing
    file at ``path`` is overwritten.

    Args:
        dictionary: JSON-serializable mapping to persist.
        path: Destination file path (``str`` or path-like).

    Returns:
        None.

    Raises:
        TypeError: If ``dictionary`` contains values ``json.dump`` cannot serialize.
        OSError: If the directory or file cannot be created or written.
    """
    # Create the output directory on demand so a fresh checkout does not fail;
    # a bare file name has no parent directory to create.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding keeps the file identical across platforms.
    with open(path, "w", encoding="utf-8") as outfile:
        json.dump(dictionary, outfile)

In [30]:
# Summary record for this benchmark run: which model, how many reviews, and the score.
results_dict = {}
results_dict["model type"] = "SparkNLP"
results_dict["n"] = 1000
results_dict["f1 score"] = f1_score

In [31]:
# Show the summary record before it is written to disk.
results_dict

{'model type': 'SparkNLP', 'n': 1000, 'f1 score': 0.7612903225806451}

In [32]:
# Directory that collects the benchmark outputs; the file name identifies this run.
OUTPUT_DIR = "/Users/briancai/Drive/NU/Q4/Text/sparknlp_performance/output"
save_path = os.path.join(OUTPUT_DIR, "spark_nlp_1")

# Persist the summary record as JSON.
save_dict_as_json(dictionary=results_dict, path=save_path)