In [3]:
# Setup - Run only once per Kernel App
%conda install openjdk -y

# install PySpark
%pip install pyspark==3.4.0

# install spark-nlp
%pip install spark-nlp==5.1.3

# restart kernel
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 23.11.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.11.0



## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - openjdk


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2023.08.22 |       h06a4308_0         123 KB
    certifi-2023.11.17         |  py310h06a4308_0         158 KB
    openjdk-11.0.13            |       h87a67e3_0       341.0 MB
    ------------------------------------------------------------
                                           Total:       341.3 MB

The following NEW packages will be INSTALLED:

  openjdk            pkgs/main/linux-64::openjdk-11.0.13-h87a6

## Process the dataset using job

In [2]:
import sagemaker
session = sagemaker.Session()
bucket = session.default_bucket()
!wget -qO- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/jars/spark-nlp-assembly-5.1.3.jar | aws s3 cp - s3://{bucket}/lab8/spark-nlp-assembly-5.1.3.jar
!aws s3 ls s3://{bucket}/lab8/spark-nlp-assembly-5.1.3.jar

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
2023-11-29 01:39:46  708534094 spark-nlp-assembly-5.1.3.jar


## Process random forest regression with max_depth = 5, max_bins=10

In [5]:
%%writefile ../scripts/rf-5-10.py

import os
import sys
import logging
import argparse

# Import pyspark and build Spark session
from pyspark.sql.functions import *
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

import json
import sparknlp
import numpy as np
import pandas as pd
from sparknlp.base import *
from pyspark.ml import Pipeline
from sparknlp.annotator import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from sparknlp.pretrained import PretrainedPipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import *
from pyspark.ml.regression import RandomForestRegressor

logging.basicConfig(format='%(asctime)s,%(levelname)s,%(module)s,%(filename)s,%(lineno)d,%(message)s', level=logging.DEBUG)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))

def main():
    parser = argparse.ArgumentParser(description="app inputs and outputs")
    parser.add_argument("--s3_dataset_path", type=str, help="Path of dataset in S3")
    parser.add_argument("--s3_output_bucket", type=str, help="s3 output bucket")
    parser.add_argument("--s3_output_key_prefix", type=str, help="s3 output key prefix")
    args = parser.parse_args()
    logger.info(f"args={args}")
    
    spark = SparkSession.builder \
    .appName("Spark NLP")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3")\
    .getOrCreate()
    
    logger.info(f"Spark version: {spark.version}")
    logger.info(f"sparknlp version: {sparknlp.version()}")
    
    # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format
    sc = spark.sparkContext
    sc._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class", "org.apache.hadoop.mapred.FileOutputCommitter"
    )

    # Defining the schema corresponding to the input data. The input data does not contain the headers
    schema = StructType(
        [
            StructField('subreddit', StringType(), True),
            StructField('id', StringType(), True),
            StructField('body', StringType(), True),
            StructField('score', DoubleType(), True),
            StructField('gilded', DoubleType(), True),
            StructField('score', IntegerType(), True),
            StructField('sentiment_index', StringType(), True),
        ]
    )
    
    # Downloading the data from S3 into a Dataframe
    logger.info(f"going to read {args.s3_dataset_path}")
    df = spark.read.parquet(args.s3_dataset_path, header=True, schema=schema)
    df = df.repartition(64)
    logger.info(f"finished reading files...")
    
    # string indexer for subreddit
    string_indexer = StringIndexer(inputCol="subreddit", outputCol="subreddit_index")
    # fit and transform the DataFrame
    df = string_indexer.fit(df).transform(df)
    # string indexer for sentiment
    string_indexer = StringIndexer(inputCol="sentiment_index", outputCol="sentiments_index")
    df = string_indexer.fit(df).transform(df)

    # build pipeline for bert sentence embeddings
    documentAssembler = DocumentAssembler()\
      .setInputCol("body")\
      .setOutputCol("document")

    embeddings = BertSentenceEmbeddings.pretrained("sent_small_bert_L2_128")\
      .setInputCols("document")\
      .setOutputCol("sentence_bert_embeddings")

    embeddingsFinisher = EmbeddingsFinisher()\
      .setInputCols("sentence_bert_embeddings")\
      .setOutputCols("finished_embeddings")\
      .setOutputAsVector(True)

    pipeline = Pipeline().setStages([
        documentAssembler,
        embeddings,
        embeddingsFinisher,
    ])
    result = pipeline.fit(df).transform(df)
    # explode the result into a vector
    result = result.withColumn("finished_embeddings_vector", explode("finished_embeddings"))
    assembler = VectorAssembler(
        inputCols=["finished_embeddings_vector", "subreddit_index","sentiments_index",'gilded',"is_submission"],
        outputCol="features"
    )
    result = assembler.transform(result)
    result.persist()
    
    # split the data into train and test set
    train_df, test_df = result.randomSplit([0.7, 0.3], seed=232)
    
    # define the model
    rf = RandomForestRegressor(
        featuresCol="features",
        labelCol="score",
        maxDepth=5,
        maxBins=10
    )

    # fit the pipeline
    model = rf.fit(train_df)

    # make predictions on data
    predictions = model.transform(test_df)

    # get the predictions
    output = predictions.select("score", "prediction")
    
    s3_path = f"s3://{args.s3_output_bucket}/{args.s3_output_key_prefix}/rf-5-10"
    logger.info(f"going to save dataframe to {s3_path}")
    output.write.mode("overwrite").parquet(s3_path)
    logger.info("all done")
    
if __name__ == "__main__":
    main()

Overwriting ../scripts/rf-5-10.py


In [6]:
%%time
import boto3
import sagemaker
from sagemaker.spark.processing import PySparkProcessor

account_id = boto3.client('sts').get_caller_identity()['Account']

# Setup the PySpark processor to run the job. Note the instance type and instance count parameters. SageMaker will create these many instances of this type for the spark job.
role = sagemaker.get_execution_role()
spark_processor = PySparkProcessor(
    base_job_name="sm-spark-rf",
    image_uri=f"{account_id}.dkr.ecr.us-east-1.amazonaws.com/sagemaker-spark:latest",
    role=role,
    instance_count=8,
    instance_type="ml.m5.xlarge",
    max_runtime_in_seconds=3600,
)

# s3 paths
session = sagemaker.Session()
bucket = session.default_bucket()
s3_dataset_path = f"*"
s3_dataset_path_1 = f"*"
print(f"account_id={account_id}, s3_dataset_path={s3_dataset_path_1}")
output_prefix_data = f"result/rf"
output_prefix_logs = f"result/logs"


# run the job now, the arguments array is provided as command line to the Python script (Spark code in this case).
spark_processor.run(
    submit_app="../scripts/rf-5-10.py",
    submit_jars=[f"s3://{bucket}/lab8/spark-nlp-assembly-5.1.3.jar"],
    arguments=[
        "--s3_dataset_path",
        s3_dataset_path,
        "--s3_output_bucket",
        bucket,
        "--s3_output_key_prefix",
        output_prefix_data,
    ],
    spark_event_logs_s3_uri="s3://{}/{}/spark_event_logs".format(bucket, output_prefix_logs),
    logs=False,
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
account_id=260236516028, s3_dataset_path=s3://sagemaker-us-east-1-260236516028/6000_project/*.parquet


INFO:sagemaker:Creating processing-job with name sm-spark-rf-2023-11-29-02-14-57-295


.............................................................................................................................!CPU times: user 812 ms, sys: 75.6 ms, total: 888 ms
Wall time: 10min 36s


## Process random forest regression with max_depth = 10, max_bins=30 and SAVE the model

In [13]:
%%writefile ../scripts/rf-10-30-save.py

import os
import sys
import logging
import argparse

# Import pyspark and build Spark session
from pyspark.sql.functions import *
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

import json
import sparknlp
import numpy as np
import pandas as pd
from sparknlp.base import *
from pyspark.ml import Pipeline
from sparknlp.annotator import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from sparknlp.pretrained import PretrainedPipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import *
from pyspark.ml.regression import RandomForestRegressor

logging.basicConfig(format='%(asctime)s,%(levelname)s,%(module)s,%(filename)s,%(lineno)d,%(message)s', level=logging.DEBUG)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))

def main():
    parser = argparse.ArgumentParser(description="app inputs and outputs")
    parser.add_argument("--s3_dataset_path", type=str, help="Path of dataset in S3")
    parser.add_argument("--s3_output_bucket", type=str, help="s3 output bucket")
    parser.add_argument("--s3_output_key_prefix", type=str, help="s3 output key prefix")
    args = parser.parse_args()
    logger.info(f"args={args}")
    
    spark = SparkSession.builder \
    .appName("Spark NLP")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3")\
    .getOrCreate()
    
    logger.info(f"Spark version: {spark.version}")
    logger.info(f"sparknlp version: {sparknlp.version()}")
    
    # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format
    sc = spark.sparkContext
    sc._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class", "org.apache.hadoop.mapred.FileOutputCommitter"
    )
    
    # Downloading the data from S3 into a Dataframe
    logger.info(f"going to read {args.s3_dataset_path}")
    df = spark.read.parquet(args.s3_dataset_path, header=True)
    df = df.repartition(64)
    logger.info(f"finished reading files...")
    
    # string indexer for subreddit
    string_indexer = StringIndexer(inputCol="subreddit", outputCol="subreddit_index")
    # fit and transform the DataFrame
    df = string_indexer.fit(df).transform(df)
    # string indexer for sentiment
    string_indexer = StringIndexer(inputCol="sentiment_index", outputCol="sentiments_index")
    df = string_indexer.fit(df).transform(df)

    # build pipeline for bert sentence embeddings
    documentAssembler = DocumentAssembler()\
      .setInputCol("body")\
      .setOutputCol("document")

    embeddings = BertSentenceEmbeddings.pretrained("sent_small_bert_L2_128")\
      .setInputCols("document")\
      .setOutputCol("sentence_bert_embeddings")

    embeddingsFinisher = EmbeddingsFinisher()\
      .setInputCols("sentence_bert_embeddings")\
      .setOutputCols("finished_embeddings")\
      .setOutputAsVector(True)

    pipeline = Pipeline().setStages([
        documentAssembler,
        embeddings,
        embeddingsFinisher,
    ])
    result = pipeline.fit(df).transform(df)
    # explode the result into a vector
    result = result.withColumn("finished_embeddings_vector", explode("finished_embeddings"))
    assembler = VectorAssembler(
        inputCols=["finished_embeddings_vector", "subreddit_index","sentiments_index",'gilded',"is_submission"],
        outputCol="features"
    )
    result = assembler.transform(result)
    result.persist()
    
    # split the data into train and test set
    train_df, test_df = result.randomSplit([0.7, 0.3], seed=232)
    
    # define the model
    rf = RandomForestRegressor(
        featuresCol="features",
        labelCol="score",
        maxDepth=10,
        maxBins=30
    )

    # fit the pipeline
    model = rf.fit(train_df)

    # make predictions on data
    predictions = model.transform(test_df)

    # get the predictions
    output = predictions.select("score", "prediction")
    
    s3_path = f"s3://{args.s3_output_bucket}/{args.s3_output_key_prefix}/rf-10-30"
    model_path = f"s3://{args.s3_output_bucket}/{args.s3_output_key_prefix}/rfmodel"
    model.save(model_path)
    logger.info(f"going to save dataframe to {s3_path}")
    output.write.mode("overwrite").parquet(s3_path)
    logger.info("all done")
    
if __name__ == "__main__":
    main()

Overwriting ../scripts/rf-10-30-save.py


In [14]:
%%time
import boto3
import sagemaker
from sagemaker.spark.processing import PySparkProcessor

account_id = boto3.client('sts').get_caller_identity()['Account']

# Setup the PySpark processor to run the job. Note the instance type and instance count parameters. SageMaker will create these many instances of this type for the spark job.
role = sagemaker.get_execution_role()
spark_processor = PySparkProcessor(
    base_job_name="sm-spark-rf",
    image_uri=f"{account_id}.dkr.ecr.us-east-1.amazonaws.com/sagemaker-spark:latest",
    role=role,
    instance_count=8,
    instance_type="ml.m5.xlarge",
    max_runtime_in_seconds=3600,
)

# s3 paths
session = sagemaker.Session()
bucket = session.default_bucket()
output_bucket = "*"
s3_dataset_path = f"*"
print(f"account_id={account_id}, s3_dataset_path={s3_dataset_path}")
output_prefix_data = f"6000_project"
output_prefix_logs = f"result/logs"


# run the job now, the arguments array is provided as command line to the Python script (Spark code in this case).
spark_processor.run(
    submit_app="../scripts/rf-10-30-save.py",
    submit_jars=[f"s3://{bucket}/lab8/spark-nlp-assembly-5.1.3.jar"],
    arguments=[
        "--s3_dataset_path",
        s3_dataset_path,
        "--s3_output_bucket",
        output_bucket,
        "--s3_output_key_prefix",
        output_prefix_data,
    ],
    spark_event_logs_s3_uri="s3://{}/{}/spark_event_logs".format(bucket, output_prefix_logs),
    logs=False,
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
account_id=260236516028, s3_dataset_path=s3://sagemaker-us-east-1-640225923506/6000_project/ml_cleaned


INFO:sagemaker:Creating processing-job with name sm-spark-rf-2023-12-08-07-54-31-807


.....................................................................................................................................!CPU times: user 3.32 s, sys: 198 ms, total: 3.52 s
Wall time: 11min 18s


In [17]:
%%writefile ../scripts/rf-10-30.py

import os
import sys
import logging
import argparse

# Import pyspark and build Spark session
from pyspark.sql.functions import *
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

import json
import sparknlp
import numpy as np
import pandas as pd
from sparknlp.base import *
from pyspark.ml import Pipeline
from sparknlp.annotator import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from sparknlp.pretrained import PretrainedPipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import *
from pyspark.ml.regression import RandomForestRegressor

logging.basicConfig(format='%(asctime)s,%(levelname)s,%(module)s,%(filename)s,%(lineno)d,%(message)s', level=logging.DEBUG)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))

def main():
    parser = argparse.ArgumentParser(description="app inputs and outputs")
    parser.add_argument("--s3_dataset_path", type=str, help="Path of dataset in S3")
    parser.add_argument("--s3_output_bucket", type=str, help="s3 output bucket")
    parser.add_argument("--s3_output_key_prefix", type=str, help="s3 output key prefix")
    args = parser.parse_args()
    logger.info(f"args={args}")
    
    spark = SparkSession.builder \
    .appName("Spark NLP")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3")\
    .getOrCreate()
    
    logger.info(f"Spark version: {spark.version}")
    logger.info(f"sparknlp version: {sparknlp.version()}")
    
    # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format
    sc = spark.sparkContext
    sc._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class", "org.apache.hadoop.mapred.FileOutputCommitter"
    )

    # Defining the schema corresponding to the input data. The input data does not contain the headers
    schema = StructType(
        [
            StructField('subreddit', StringType(), True),
            StructField('id', StringType(), True),
            StructField('body', StringType(), True),
            StructField('score', DoubleType(), True),
            StructField('gilded', DoubleType(), True),
            StructField('score', IntegerType(), True),
            StructField('sentiment_index', StringType(), True),
        ]
    )
    
    # Downloading the data from S3 into a Dataframe
    logger.info(f"going to read {args.s3_dataset_path}")
    df = spark.read.parquet(args.s3_dataset_path, header=True, schema=schema)
    df = df.repartition(64)
    logger.info(f"finished reading files...")
    
    # string indexer for subreddit
    string_indexer = StringIndexer(inputCol="subreddit", outputCol="subreddit_index")
    # fit and transform the DataFrame
    df = string_indexer.fit(df).transform(df)
    # string indexer for sentiment
    string_indexer = StringIndexer(inputCol="sentiment_index", outputCol="sentiments_index")
    df = string_indexer.fit(df).transform(df)

    # build pipeline for bert sentence embeddings
    documentAssembler = DocumentAssembler()\
      .setInputCol("body")\
      .setOutputCol("document")

    embeddings = BertSentenceEmbeddings.pretrained("sent_small_bert_L2_128")\
      .setInputCols("document")\
      .setOutputCol("sentence_bert_embeddings")

    embeddingsFinisher = EmbeddingsFinisher()\
      .setInputCols("sentence_bert_embeddings")\
      .setOutputCols("finished_embeddings")\
      .setOutputAsVector(True)

    pipeline = Pipeline().setStages([
        documentAssembler,
        embeddings,
        embeddingsFinisher,
    ])
    result = pipeline.fit(df).transform(df)
    # explode the result into a vector
    result = result.withColumn("finished_embeddings_vector", explode("finished_embeddings"))
    assembler = VectorAssembler(
        inputCols=["finished_embeddings_vector", "subreddit_index","sentiments_index",'gilded',"is_submission"],
        outputCol="features"
    )
    result = assembler.transform(result)
    result.persist()
    
    # split the data into train and test set
    train_df, test_df = result.randomSplit([0.7, 0.3], seed=232)
    
    # define the model
    rf = RandomForestRegressor(
        featuresCol="features",
        labelCol="score",
        maxDepth=10,
        maxBins=30，
    )

    # fit the pipeline
    model = rf.fit(train_df)

    # make predictions on data
    predictions = model.transform(test_df)

    # get the predictions
    output = predictions.select("score", "prediction")
    
    s3_path = f"s3://{args.s3_output_bucket}/{args.s3_output_key_prefix}/rf-10-30"
    logger.info(f"going to save dataframe to {s3_path}")
    output.write.mode("overwrite").parquet(s3_path)
    logger.info("all done")
    
if __name__ == "__main__":
    main()

Overwriting ../scripts/rf-10-50.py


In [20]:
%%time
import boto3
import sagemaker
from sagemaker.spark.processing import PySparkProcessor

account_id = boto3.client('sts').get_caller_identity()['Account']

# Setup the PySpark processor to run the job. Note the instance type and instance count parameters. SageMaker will create these many instances of this type for the spark job.
role = sagemaker.get_execution_role()
spark_processor = PySparkProcessor(
    base_job_name="sm-spark-rf",
    image_uri=f"{account_id}.dkr.ecr.us-east-1.amazonaws.com/sagemaker-spark:latest",
    role=role,
    instance_count=8,
    instance_type="ml.m5.xlarge",
    max_runtime_in_seconds=3600,
)

# s3 paths
session = sagemaker.Session()
bucket = session.default_bucket()
s3_dataset_path = f"*"
s3_dataset_path_1 = f"*"
print(f"account_id={account_id}, s3_dataset_path={s3_dataset_path_1}")
output_prefix_data = f"result/rf"
output_prefix_logs = f"result/logs"


# run the job now, the arguments array is provided as command line to the Python script (Spark code in this case).
spark_processor.run(
    submit_app="../scripts/rf-10-30.py",
    submit_jars=[f"s3://{bucket}/lab8/spark-nlp-assembly-5.1.3.jar"],
    arguments=[
        "--s3_dataset_path",
        s3_dataset_path,
        "--s3_output_bucket",
        bucket,
        "--s3_output_key_prefix",
        output_prefix_data,
    ],
    spark_event_logs_s3_uri="s3://{}/{}/spark_event_logs".format(bucket, output_prefix_logs),
    logs=False,
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
account_id=260236516028, s3_dataset_path=s3://sagemaker-us-east-1-260236516028/6000_project/*.parquet


INFO:sagemaker:Creating processing-job with name sm-spark-rf-2023-11-29-03-06-36-641


...............................................................................................................................!CPU times: user 884 ms, sys: 106 ms, total: 990 ms
Wall time: 10min 46s


## Process linear regression maxIter=10, regParam=0.3, elasticNetParam=0.8

In [34]:
%%writefile ../scripts/lr-10-0.3-0.8.py

import os
import sys
import logging
import argparse

# Import pyspark and build Spark session
from pyspark.sql.functions import *
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

import json
import sparknlp
import numpy as np
import pandas as pd
from sparknlp.base import *
from pyspark.ml import Pipeline
from sparknlp.annotator import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from sparknlp.pretrained import PretrainedPipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import *
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import LinearRegression

logging.basicConfig(format='%(asctime)s,%(levelname)s,%(module)s,%(filename)s,%(lineno)d,%(message)s', level=logging.DEBUG)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))

def main():
    parser = argparse.ArgumentParser(description="app inputs and outputs")
    parser.add_argument("--s3_dataset_path", type=str, help="Path of dataset in S3")
    parser.add_argument("--s3_output_bucket", type=str, help="s3 output bucket")
    parser.add_argument("--s3_output_key_prefix", type=str, help="s3 output key prefix")
    args = parser.parse_args()
    logger.info(f"args={args}")
    
    spark = SparkSession.builder \
    .appName("Spark NLP")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3")\
    .getOrCreate()
    
    logger.info(f"Spark version: {spark.version}")
    logger.info(f"sparknlp version: {sparknlp.version()}")
    
    # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format
    sc = spark.sparkContext
    sc._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class", "org.apache.hadoop.mapred.FileOutputCommitter"
    )

    # Defining the schema corresponding to the input data. The input data does not contain the headers
    schema = StructType(
        [
            StructField('subreddit', StringType(), True),
            StructField('id', StringType(), True),
            StructField('body', StringType(), True),
            StructField('score', DoubleType(), True),
            StructField('gilded', DoubleType(), True),
            StructField('score', IntegerType(), True),
            StructField('sentiment_index', StringType(), True),

        ]
    )
    
    # Downloading the data from S3 into a Dataframe
    logger.info(f"going to read {args.s3_dataset_path}")
    df = spark.read.parquet(args.s3_dataset_path, header=True, schema=schema)
    df = df.repartition(64)
    logger.info(f"finished reading files...")
    
    # string indexer for subreddit
    string_indexer = StringIndexer(inputCol="subreddit", outputCol="subreddit_index")
    # fit and transform the DataFrame
    df = string_indexer.fit(df).transform(df)
    # string indexer for sentiment
    string_indexer = StringIndexer(inputCol="sentiment_index", outputCol="sentiments_index")
    df = string_indexer.fit(df).transform(df)

    # build pipeline for bert sentence embeddings
    documentAssembler = DocumentAssembler()\
      .setInputCol("body")\
      .setOutputCol("document")

    embeddings = BertSentenceEmbeddings.pretrained("sent_small_bert_L2_128")\
      .setInputCols("document")\
      .setOutputCol("sentence_bert_embeddings")

    embeddingsFinisher = EmbeddingsFinisher()\
      .setInputCols("sentence_bert_embeddings")\
      .setOutputCols("finished_embeddings")\
      .setOutputAsVector(True)

    pipeline = Pipeline().setStages([
        documentAssembler,
        embeddings,
        embeddingsFinisher,
    ])
    result = pipeline.fit(df).transform(df)
    # explode the result into a vector
    result = result.withColumn("finished_embeddings_vector", explode("finished_embeddings"))
    assembler = VectorAssembler(
        inputCols=["finished_embeddings_vector", "subreddit_index","sentiments_index",'gilded',"is_submission"],
        outputCol="features"
    )
    result = assembler.transform(result)
    result.persist()
    
    # split the data into train and test set
    train_df, test_df = result.randomSplit([0.7, 0.3], seed=232)
    
    # define the model
    lr = LinearRegression(maxIter=5, regParam=0.3, elasticNetParam=0.8,
        featuresCol="features",
        labelCol="score",
    )

    # fit the pipeline
    model = lr.fit(train_df)

    # make predictions on data
    predictions = model.transform(test_df)

    # get the predictions
    output = predictions.select("score", "prediction")
    
    s3_path = f"s3://{args.s3_output_bucket}/{args.s3_output_key_prefix}/lr-10-0.3-0.8"
    logger.info(f"going to save dataframe to {s3_path}")
    output.write.mode("overwrite").parquet(s3_path)
    logger.info("all done")
    
if __name__ == "__main__":
    main()

Overwriting ../scripts/lr-10-0.3-0.8.py


In [22]:
%%time
import boto3
import sagemaker
from sagemaker.spark.processing import PySparkProcessor

account_id = boto3.client('sts').get_caller_identity()['Account']

# Setup the PySpark processor to run the job. Note the instance type and instance count parameters. SageMaker will create these many instances of this type for the spark job.
role = sagemaker.get_execution_role()
spark_processor = PySparkProcessor(
    base_job_name="sm-spark-tree",
    image_uri=f"{account_id}.dkr.ecr.us-east-1.amazonaws.com/sagemaker-spark:latest",
    role=role,
    instance_count=8,
    instance_type="ml.m5.xlarge",
    max_runtime_in_seconds=3600,
)

# s3 paths
session = sagemaker.Session()
bucket = session.default_bucket()
s3_dataset_path = f"*"
s3_dataset_path_1 = f"*"
print(f"account_id={account_id}, s3_dataset_path={s3_dataset_path_1}")
output_prefix_data = f"result/lr"
output_prefix_logs = f"result/logs"


# run the job now, the arguments array is provided as command line to the Python script (Spark code in this case).
spark_processor.run(
    submit_app="../scripts/lr-10-0.3-0.8.py",
    submit_jars=[f"s3://{bucket}/lab8/spark-nlp-assembly-5.1.3.jar"],
    arguments=[
        "--s3_dataset_path",
        s3_dataset_path,
        "--s3_output_bucket",
        bucket,
        "--s3_output_key_prefix",
        output_prefix_data,
    ],
    spark_event_logs_s3_uri="s3://{}/{}/spark_event_logs".format(bucket, output_prefix_logs),
    logs=False,
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
account_id=260236516028, s3_dataset_path=s3://sagemaker-us-east-1-260236516028/6000_project/*.parquet


INFO:sagemaker:Creating processing-job with name sm-spark-tree-2023-11-29-03-19-07-162


......................................................................................................................................................!CPU times: user 1.01 s, sys: 87.9 ms, total: 1.09 s
Wall time: 12min 42s


## Process linear regression maxIter=10, regParam=0.3, elasticNetParam=0.2

In [35]:
%%writefile ../scripts/lr-10-0.3-0.2.py

import os
import sys
import logging
import argparse

# Import pyspark and build Spark session
from pyspark.sql.functions import *
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

import json
import sparknlp
import numpy as np
import pandas as pd
from sparknlp.base import *
from pyspark.ml import Pipeline
from sparknlp.annotator import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from sparknlp.pretrained import PretrainedPipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import *
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import LinearRegression

logging.basicConfig(format='%(asctime)s,%(levelname)s,%(module)s,%(filename)s,%(lineno)d,%(message)s', level=logging.DEBUG)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))

def main():
    parser = argparse.ArgumentParser(description="app inputs and outputs")
    parser.add_argument("--s3_dataset_path", type=str, help="Path of dataset in S3")
    parser.add_argument("--s3_output_bucket", type=str, help="s3 output bucket")
    parser.add_argument("--s3_output_key_prefix", type=str, help="s3 output key prefix")
    args = parser.parse_args()
    logger.info(f"args={args}")
    
    spark = SparkSession.builder \
    .appName("Spark NLP")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3")\
    .getOrCreate()
    
    logger.info(f"Spark version: {spark.version}")
    logger.info(f"sparknlp version: {sparknlp.version()}")
    
    # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format
    sc = spark.sparkContext
    sc._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class", "org.apache.hadoop.mapred.FileOutputCommitter"
    )

    # Defining the schema corresponding to the input data. The input data does not contain the headers
    schema = StructType(
        [
            StructField('subreddit', StringType(), True),
            StructField('id', StringType(), True),
            StructField('body', StringType(), True),
            StructField('score', DoubleType(), True),
            StructField('gilded', DoubleType(), True),
            StructField('score', IntegerType(), True),
            StructField('sentiment_index', StringType(), True),

        ]
    )
    
    # Downloading the data from S3 into a Dataframe
    logger.info(f"going to read {args.s3_dataset_path}")
    df = spark.read.parquet(args.s3_dataset_path, header=True, schema=schema)
    df = df.repartition(64)
    logger.info(f"finished reading files...")
    
    # string indexer for subreddit
    string_indexer = StringIndexer(inputCol="subreddit", outputCol="subreddit_index")
    # fit and transform the DataFrame
    df = string_indexer.fit(df).transform(df)
    # string indexer for sentiment
    string_indexer = StringIndexer(inputCol="sentiment_index", outputCol="sentiments_index")
    df = string_indexer.fit(df).transform(df)

    # build pipeline for bert sentence embeddings
    documentAssembler = DocumentAssembler()\
      .setInputCol("body")\
      .setOutputCol("document")

    embeddings = BertSentenceEmbeddings.pretrained("sent_small_bert_L2_128")\
      .setInputCols("document")\
      .setOutputCol("sentence_bert_embeddings")

    embeddingsFinisher = EmbeddingsFinisher()\
      .setInputCols("sentence_bert_embeddings")\
      .setOutputCols("finished_embeddings")\
      .setOutputAsVector(True)

    pipeline = Pipeline().setStages([
        documentAssembler,
        embeddings,
        embeddingsFinisher,
    ])
    result = pipeline.fit(df).transform(df)
    # explode the result into a vector
    result = result.withColumn("finished_embeddings_vector", explode("finished_embeddings"))
    assembler = VectorAssembler(
        inputCols=["finished_embeddings_vector", "subreddit_index","sentiments_index",'gilded',"is_submission"],
        outputCol="features"
    )
    result = assembler.transform(result)
    result.persist()
    
    # split the data into train and test set
    train_df, test_df = result.randomSplit([0.7, 0.3], seed=232)
    
    # define the model
    lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.2,
        featuresCol="features",
        labelCol="score",
    )

    # fit the pipeline
    model = lr.fit(train_df)

    # make predictions on data
    predictions = model.transform(test_df)

    # get the predictions
    output = predictions.select("score", "prediction")
    
    s3_path = f"s3://{args.s3_output_bucket}/{args.s3_output_key_prefix}/lr-10-0.3-0.2"
    logger.info(f"going to save dataframe to {s3_path}")
    output.write.mode("overwrite").parquet(s3_path)
    logger.info("all done")
    
if __name__ == "__main__":
    main()

Overwriting ../scripts/lr-10-0.3-0.2.py


In [31]:
%%time
import boto3
import sagemaker
from sagemaker.spark.processing import PySparkProcessor

account_id = boto3.client('sts').get_caller_identity()['Account']

# Setup the PySpark processor to run the job. Note the instance type and instance count parameters. SageMaker will create these many instances of this type for the spark job.
role = sagemaker.get_execution_role()
spark_processor = PySparkProcessor(
    base_job_name="sm-spark-lr",
    image_uri=f"{account_id}.dkr.ecr.us-east-1.amazonaws.com/sagemaker-spark:latest",
    role=role,
    instance_count=8,
    instance_type="ml.m5.xlarge",
    max_runtime_in_seconds=3600,
)

# s3 paths
session = sagemaker.Session()
bucket = session.default_bucket()
s3_dataset_path = f"*"
s3_dataset_path_1 = f"*"
print(f"account_id={account_id}, s3_dataset_path={s3_dataset_path_1}")
output_prefix_data = f"result/lr"
output_prefix_logs = f"result/logs"


# run the job now, the arguments array is provided as command line to the Python script (Spark code in this case).
spark_processor.run(
    submit_app="../scripts/lr-10-0.3-0.2.py",
    submit_jars=[f"s3://{bucket}/lab8/spark-nlp-assembly-5.1.3.jar"],
    arguments=[
        "--s3_dataset_path",
        s3_dataset_path,
        "--s3_output_bucket",
        bucket,
        "--s3_output_key_prefix",
        output_prefix_data,
    ],
    spark_event_logs_s3_uri="s3://{}/{}/spark_event_logs".format(bucket, output_prefix_logs),
    logs=False,
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
account_id=260236516028, s3_dataset_path=s3://sagemaker-us-east-1-260236516028/6000_project/*.parquet


INFO:sagemaker:Creating processing-job with name sm-spark-lr-2023-11-29-03-47-54-747


.....................................................................................................................!CPU times: user 852 ms, sys: 88.7 ms, total: 941 ms
Wall time: 9min 55s


In [4]:
# Import pyspark and build Spark session
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("PySparkApp")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.2")
    .config(
        "fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.ContainerCredentialsProvider",
    )
    .getOrCreate()
)

print(spark.version)



:: loading settings :: url = jar:file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2a57cd82-c304-40fc-8c83-7c5d0fc51e56;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.2.2 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.563 in central
:: resolution report :: resolve 311ms :: artifacts dl 18ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.563 from central in [default]
	org.apache.hadoop#hadoop-aws;3.2.2 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------------------

3.4.0


## RMSE for random forest maxdepth=5, maxbins=10

In [4]:
%%time
s3_path = f"*"
print(f"reading results from {s3_path}")
rf_1 = spark.read.parquet(s3_path, header=True)
print(f"shape of the comments dataframe is {rf_1.count():,}x{len(rf_1.columns)}")
# calculate the rmse
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(
    labelCol="score", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(rf_1)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

reading results from s3a://sagemaker-us-east-1-260236516028/result/rf/rf-5-10/*.parquet


23/11/29 05:07:41 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

shape of the comments dataframe is 88,233x2


[Stage 5:>                                                          (0 + 4) / 4]

Root Mean Squared Error (RMSE) on test data = 3.95315
CPU times: user 1.47 s, sys: 283 ms, total: 1.75 s
Wall time: 16.6 s


                                                                                

## RMSE for random forest maxdepth=10, maxbins=30

In [5]:
%%time
s3_path = f"*"
print(f"reading results from {s3_path}")
rf_2 = spark.read.parquet(s3_path, header=True)
print(f"shape of the comments dataframe is {rf_2.count():,}x{len(rf_2.columns)}")
# calculate the rmse
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(
    labelCol="score", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(rf_2)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

reading results from s3a://sagemaker-us-east-1-260236516028/result/rf/rf-10-30/*.parquet


                                                                                

shape of the comments dataframe is 88,233x2


[Stage 11:>                                                         (0 + 4) / 4]

Root Mean Squared Error (RMSE) on test data = 3.90691
CPU times: user 11.3 ms, sys: 7.87 ms, total: 19.2 ms
Wall time: 7.38 s


                                                                                

## RMSE for linear regression maxIter=10, regParam=0.3, elasticNetParam=0.8

In [6]:
%%time
s3_path = f"*"
print(f"reading results from {s3_path}")
lr_1 = spark.read.parquet(s3_path, header=True)
print(f"shape of the comments dataframe is {lr_1.count():,}x{len(lr_1.columns)}")
# calculate the rmse
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(
    labelCol="score", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(lr_1)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

reading results from s3a://sagemaker-us-east-1-260236516028/result/lr/lr-10-0.3-0.8/*.parquet


                                                                                

shape of the comments dataframe is 88,233x2




Root Mean Squared Error (RMSE) on test data = 4.03432
CPU times: user 16.9 ms, sys: 215 µs, total: 17.1 ms
Wall time: 6.63 s


                                                                                

## RMSE for linear regression maxIter=10, regParam=0.3, elasticNetParam=0.2

In [7]:
%%time
s3_path = f"*"
print(f"reading results from {s3_path}")
lr_2 = spark.read.parquet(s3_path, header=True)
print(f"shape of the comments dataframe is {lr_2.count():,}x{len(lr_2.columns)}")
# calculate the rmse
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(
    labelCol="score", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(lr_2)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

reading results from s3a://sagemaker-us-east-1-260236516028/result/lr/lr-10-0.3-0.2/*.parquet


                                                                                

shape of the comments dataframe is 88,233x2


[Stage 23:>                                                         (0 + 4) / 4]

Root Mean Squared Error (RMSE) on test data = 3.99234
CPU times: user 14.5 ms, sys: 42 µs, total: 14.6 ms
Wall time: 6.07 s


                                                                                

## Calculate r2

In [9]:
import numpy as np
def r2_calculate(observed, predicted):
    # calculate mean of observed values
    mean_observed = np.mean(observed)
    # calculate SSR
    ssr = np.sum((observed - predicted) ** 2)
    # calculate SST
    sst = np.sum((observed - mean_observed) ** 2)
    # calculate R-squared
    r2 = 1 - (ssr / sst)
    return r2

In [10]:
df_lr_1 = lr_1.toPandas()
observed = df_lr_1['score']
predicted = df_lr_1['prediction']
lr_1_r2 = r2_calculate(observed, predicted)
print("r2 for lr_1 is:", lr_1_r2)

                                                                                

r2 for lr_1 is: 0.0396494744260002


In [11]:
df_lr_2 = lr_2.toPandas()
observed = df_lr_2['score']
predicted = df_lr_2['prediction']
lr_2_r2 = r2_calculate(observed, predicted)
print("r2 for lr_2 is:", lr_2_r2)

                                                                                

r2 for lr_2 is: 0.059530254489551626


In [12]:
df_rf_1 = rf_1.toPandas()
observed = df_rf_1['score']
predicted = df_rf_1['prediction']
rf_1_r2 = r2_calculate(observed, predicted)
print("r2 for lr_1 is:", rf_1_r2)

                                                                                

r2 for lr_1 is: 0.07790613729149365


In [13]:
df_rf_2 = rf_2.toPandas()
observed = df_rf_2['score']
predicted = df_rf_2['prediction']
rf_2_r2 = r2_calculate(observed, predicted)
print("r2 for lr_1 is:", rf_2_r2)

                                                                                

r2 for lr_1 is: 0.09935082248779104


In [14]:
import pandas as pd
result = pd.concat([df_lr_1, df_lr_2, df_rf_1, df_rf_2], axis=1)
result.columns = ['score', 'lr-1', 'new_col3', 'lr-2', 'new_col5', 'rf-1', 'new_col7', 'rf-2']
columns_to_remove = ['new_col3', 'new_col5', 'new_col7']
result = result.drop(columns=columns_to_remove)

# calculate the residuals
result['re-lr-1'] = result['lr-1'] - result['score']
result['re-lr-2'] = result['lr-2'] - result['score']
result['re-rf-1'] = result['rf-1'] - result['score']
result['re-rf-2'] = result['rf-2'] - result['score']

result.to_csv('../../data/csv/score_residual.csv',index=False)