# PySpark Lab

In [5]:
import sys

# Print the interpreter's module search path to help diagnose import problems.
print(sys.path)
import sparknlp  # NOTE(review): previously flagged as the line raising an error; the recorded output shows no traceback -- confirm it is resolved

['/home/john/miniconda3/envs/genai-lab/lib/python311.zip', '/home/john/miniconda3/envs/genai-lab/lib/python3.11', '/home/john/miniconda3/envs/genai-lab/lib/python3.11/lib-dynload', '', '/home/john/miniconda3/envs/genai-lab/lib/python3.11/site-packages', '/home/john/miniconda3/envs/genai-lab/lib/python3.11/site-packages/setuptools/_vendor']


In [8]:


from sparknlp.base import Pipeline
from sparknlp.annotator import DocumentAssembler, LanguageDetectorDL
import pyspark
from pyspark.sql.functions import  col, from_unixtime
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DoubleType, LongType
from genailab.infra.utils.file.io import IOService

# Display the installed PySpark version.
print(pyspark.__version__)
# Display the installed Spark NLP Python package version.
# NOTE(review): the Spark NLP JAR requested via spark.jars.packages is expected to match this version -- confirm.
sparknlp.version()



3.5.4


'5.1.2'

## Spark Session

In [None]:
# Maven coordinate of the Spark NLP JAR (Scala 2.12 build).
# The version is taken from the installed Python package so that the JVM library
# and the Python wrapper always match; previously the JAR was hard-coded to 5.3.3
# while sparknlp.version() reported a different release.
spark_nlp_package = f"com.johnsnowlabs.nlp:spark-nlp_2.12:{sparknlp.version()}"

# Local Spark session (all cores) configured for Spark NLP.
# Note: getOrCreate() returns an already-running session if one exists.
spark = (
    SparkSession.builder.appName("genailab")
    .master("local[*]")
    .config("spark.driver.memory", "32g")
    .config("spark.executor.memory", "32g")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "2000M")
    # "0" removes the cap on the size of results collected to the driver.
    .config("spark.driver.maxResultSize", "0")
    .config("spark.jars.packages", spark_nlp_package)
    # Lets Parquet nanosecond timestamps be read as long values.
    .config("spark.sql.legacy.parquet.nanosAsLong", "true")
    .getOrCreate()
)

## Data

In [3]:
# Explicit schema for the reviews data: (column name, Spark type) pairs in file order.
# Every column is declared nullable. `date` is declared as a long here.
schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in [
            ("id", StringType()),
            ("app_id", StringType()),
            ("app_name", StringType()),
            ("category_id", StringType()),
            ("category", StringType()),
            ("author", StringType()),
            ("rating", DoubleType()),
            ("title", StringType()),
            ("content", StringType()),
            ("eda_review_length", LongType()),
            ("vote_count", LongType()),
            ("vote_sum", LongType()),
            ("date", LongType()),
        ]
    ]
)

In [4]:
# Paths of the raw reviews pickle (input) and its Parquet copy (output).
fp1 = "data/dev/00_raw/reviews.pkl"
fp2 = "data/dev/00_raw/reviews.parquet"
# NOTE(review): presumably IOService.read unpickles the .pkl file -- only load pickles from trusted sources.
df1 = IOService.read(filepath=fp1)
# Write the loaded object to Parquet at fp2.
df1.to_parquet(fp2)


## Create Spark DataFrame

In [None]:
# Divisor applied to the raw `date` long before converting it to a timestamp
# (presumably nanoseconds since the epoch, given the nanosAsLong session setting).
NANOS_PER_SECOND = 1_000_000_000

# Read the reviews Parquet file with the explicit schema and convert `date`
# to a timestamp in a single expression, so re-running the cell is idempotent.
# The former .option("header", "true") was dropped: it is a CSV reader option
# and has no effect on Parquet.
df2 = (
    spark.read.schema(schema)
    .parquet(fp2)
    .withColumn(
        "date",
        from_unixtime(col("date") / NANOS_PER_SECOND).cast("timestamp"),
    )
)
df2.printSchema()
df2.take(5)

## Pipeline

In [None]:
# Stage 1: read the `content` column and emit a `document` column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("content")
    .setOutputCol("document")
)

# Stage 2: language detection on `document`, written to the `language` column.
# pretrained() is called without arguments.
languageDetector = (
    LanguageDetectorDL.pretrained()
    .setInputCols(["document"])
    .setOutputCol("language")
)

# Fit the two-stage pipeline on df2, then apply it to the same DataFrame.
pipe = Pipeline(stages=[documentAssembler, languageDetector])
fitted_pipe = pipe.fit(df2)
results = fitted_pipe.transform(df2)

## Results

In [None]:
# Keep only the `result` field of the `language` column and print it.
detected_languages = results.select("language.result")
detected_languages.show()