In [24]:
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql import functions as sf
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import DoubleType
import os
import time

from sklearn.metrics import f1_score

In [2]:
spark = sparknlp.start()

In [38]:
DATA_PATH = "/Users/briancai/Desktop/Datasets/yelp_dataset/yelp_academic_dataset_review.json"
data = spark.read.json(DATA_PATH).limit(50000)

data = (
    data
    .withColumn(
        "label",
        sf.when(data.stars > 3, 1.0).otherwise(0.0)
    )
    .select("text", "label")
)

In [39]:
pipeline = PretrainedPipeline("analyze_sentimentdl_use_imdb")

analyze_sentimentdl_use_imdb download started this may take some time.
Approx size to download 935.8 MB
[OK!]


In [42]:
preds = (
    pipeline
    .transform(data)
    .select("label", "sentiment")
)

In [43]:
pandasDF = preds.toPandas()

In [44]:
pandasDF.head()

Unnamed: 0,label,sentiment
0,0.0,"[(category, 0, 1372, positive, {'sentence': '0..."
1,0.0,"[(category, 0, 1406, negative, {'sentence': '0..."
2,1.0,"[(category, 0, 430, positive, {'sentence': '0'..."
3,0.0,"[(category, 0, 400, positive, {'sentence': '0'..."
4,1.0,"[(category, 0, 840, positive, {'sentence': '0'..."


In [45]:
pandasDF['sentiment_result'] = pandasDF.apply(lambda x: x["sentiment"][0]["result"], axis=1)

In [46]:
pandasDF["pred"] = pandasDF.apply(lambda x: 1.0 if x["sentiment_result"] == "positive" else 0.0, axis=1)

In [47]:
f1_score(pandasDF['label'], pandasDF["pred"], pos_label=1)

0.8582704857176352

In [26]:
metrics = MulticlassMetrics(preds.rdd)
f1_score = metrics.fMeasure(1.0)

In [28]:
f1_score

0.7612903225806451

In [29]:
def save_dict_as_json(dictionary, path):

    with open(path, "w") as outfile:
        json.dump(dictionary, outfile)

    return None

In [30]:
results_dict = {
    "model type": "SparkNLP",
    "n": 1000,
    "f1 score": f1_score,
}

In [31]:
results_dict

{'model type': 'SparkNLP', 'n': 1000, 'f1 score': 0.7612903225806451}

In [32]:
save_path = os.path.join("/Users/briancai/Drive/NU/Q4/Text/sparknlp_performance/output", "spark_nlp_1")
save_dict_as_json(results_dict, save_path)