In [None]:
import os
import warnings

# Silence every warning for the remainder of the kernel session.
warnings.filterwarnings("ignore")

# If the kernel was started from a path containing "jbook", move the working
# directory two levels up so relative paths resolve from there.
if "jbook" in os.getcwd():
    os.chdir(os.path.abspath("../.."))

# Passed as the `force=` argument to every stage build call in this notebook.
FORCE = False

# AppVoCAI Dataset Enrichment and Aggregation
During data ingestion, several attributes and aggregations were conceived to add context, nuance, and richness to the AppVoCAI dataset. In this section, we implement the following data enrichment pipelines, leveraging the parallelism and computational efficiency of Apache Spark and the NLP capabilities of the spaCy framework.

## Data Enrichment Pipeline

1. **Sentiment Classification Pipeline**: Using the spaCy SentimentAnalyzer, we’ll efficiently incorporate review-level user sentiment into the dataset. 
2. **Review Enrichment Pipeline**: The following attributes add additional context to our review dataset. 
    1. **Review Age**: Calculated as the difference between the review date and the most recent review date in the dataset, this variable will illuminate trends over time, such as evolving user expectations or the impact of app updates.
    2. **Review Length**: By measuring the number of words in each review, we can analyze how review detail correlates with sentiment, rating, and user engagement. Lengthier reviews often reflect stronger user opinions and can provide richer qualitative insights.
    3. **App Rating Deviation**: This metric captures the difference between an individual review’s rating and both the app's average rating and the category average. Identifying significant deviations can spotlight unique user perspectives or areas of disagreement.
    4. **Review Length Deviation**: This variable compares each review’s length to the average for the app and the category, uncovering engagement patterns and indicating whether specific apps or categories elicit more detailed feedback.
    5. **Author Review Count**: Tracking the number of reviews submitted by each author allows us to differentiate between casual and frequent reviewers, shedding light on power users whose feedback might carry greater significance or reflect deeper engagement.
3. **Aggregation Pipeline**: Precomputing and materializing app-level and category-level aggregations promotes agile, efficient, and iterative exploratory analysis, enabling interactive desktop analysis, rapid feedback loops, and reduced cycle times.



## Import Libraries

In [2]:
from discover.container import DiscoverContainer
from discover.infra.config.flow import FlowConfigReader
from discover.flow.enrich.metadata.stage import MetadataStage
from discover.flow.enrich.sentiment.stage import SentimentClassificationStage
from discover.flow.enrich.quality.stage import QualityStage
from discover.flow.enrich.deviation.stage import DeviationStage
from discover.flow.aggregation.stage import AggregationStage
from discover.core.flow import PhaseDef, EnrichmentStageDef, AggregationStageDef

## Dependency Container

In [3]:
# Create the dependency container and initialise the resources it manages.
container = DiscoverContainer()
container.init_resources()
# Wire the container into the modules listed below.
# NOTE(review): the stage classes used in this notebook are imported from
# discover.flow.enrich.<name>.stage, whereas "discover.flow.enrich.stage" is
# what gets wired here — confirm this is the module that needs injection.
container.wire(
    modules=[
        "discover.flow.enrich.stage",
        "discover.flow.aggregation.stage",
    ],
)

## Metadata Pipeline

In [4]:
# Read the configuration for the ENRICHMENT phase / METADATA stage.
# `reader` is created here and reused by the later stage cells in this notebook.
reader = FlowConfigReader()
stage_config = reader.get_stage_config(
    phase=PhaseDef.ENRICHMENT, stage=EnrichmentStageDef.METADATA
)

# Build the metadata stage from that configuration and run it.
# NOTE(review): run() presumably returns the id of the dataset asset it produced — confirm.
stage = MetadataStage.build(stage_config=stage_config, force=FORCE)
asset_id = stage.run()

[11/08/2024 01:35:12 AM] [INFO] [discover.infra.persistence.repo.dataset.DatasetRepo] [_remove_dataset_file_by_filepath] : Removed dataset file at workspace/dev/dataset/02_enrichment/appvocai_discover-02_enrichment-00_metadata-review-dataset.parquet from repository.
[11/08/2024 01:35:12 AM] [INFO] [discover.infra.persistence.repo.dataset.DatasetRepo] [remove] : Removed dataset dataset-dev-enrichment-metadata-review from the repository.




#                             Review Metadata Stage                              #



your 131072x1 screen size is bogus. expect trouble


:: loading settings :: url = jar:file:/home/john/miniconda3/envs/appvocai/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/john/.ivy2/cache
The jars for the packages stored in: /home/john/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9affb78c-d0d5-4789-995b-5f707694ea93;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.3.3 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-s3;1.12.500 in central
	found com.amazonaws#aws-java-sdk-kms;1.12.500 in central
	found com.amazonaws#aws-java-sdk-core;1.12.500 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.15 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found software.amazon.ion#ion-java;1.0.2 in central
	found joda-time#joda-time;2.8.1 in central
	found com.amazonaws#jmespath-java;1.12.500 in central
	f



                              ComputeReviewAgeTask                              
                              --------------------                              
                          Start Datetime | Fri, 08 Nov 2024 01:35:36


                                                                                

                       Complete Datetime | Fri, 08 Nov 2024 01:35:40
                                 Runtime | 3.82 seconds


                            ComputeReviewLengthTask                             
                            -----------------------                             
                          Start Datetime | Fri, 08 Nov 2024 01:35:40
                       Complete Datetime | Fri, 08 Nov 2024 01:35:40
                                 Runtime | 0.05 seconds






                             Review Metadata Stage                              
                           Stage Started | Fri, 08 Nov 2024 01:35:12
                         Stage Completed | Fri, 08 Nov 2024 01:35:44
                           Stage Runtime | 31.39 seconds





                                                                                

## Sentiment Classification Pipeline

In [None]:
# Read the configuration for the ENRICHMENT phase / SENTIMENT stage.
# Relies on `reader` created in the metadata cell above.
stage_config = reader.get_stage_config(
    phase=PhaseDef.ENRICHMENT, stage=EnrichmentStageDef.SENTIMENT
)

# Build the sentiment classification stage from that configuration and run it.
# The saved output of this cell reports a stage runtime of about 3.4 minutes.
stage = SentimentClassificationStage.build(stage_config=stage_config, force=FORCE)
asset_id = stage.run()



#                         Sentiment Classification Stage                         #



                          SpacySentimentClassifierTask                          
                          ----------------------------                          
                          Start Datetime | Fri, 08 Nov 2024 01:35:53


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4533), Label(value='0 / 4533'))), …

                       Complete Datetime | Fri, 08 Nov 2024 01:39:17
                                 Runtime | 3.0 minutes and 23.98 seconds


                         Sentiment Classification Stage                         
                           Stage Started | Fri, 08 Nov 2024 01:35:53
                         Stage Completed | Fri, 08 Nov 2024 01:39:17
                           Stage Runtime | 3.0 minutes and 24.52 seconds





## Review Quality Pipeline

In [6]:
# Read the configuration for the ENRICHMENT phase / QUALITY stage.
# Relies on `reader` created in the metadata cell above.
stage_config = reader.get_stage_config(
    phase=PhaseDef.ENRICHMENT, stage=EnrichmentStageDef.QUALITY
)

# Build the review quality stage from that configuration and run it.
# FIXME(review): the saved output of this cell is
#   TypeError: TQATask1.__init__() got an unexpected keyword argument 'column'
# Presumably the stage configuration passes a `column` parameter that the task
# constructor does not accept — verify the config against TQATask1 and fix
# whichever side is stale before re-running the notebook.
stage = QualityStage.build(stage_config=stage_config, force=FORCE)
asset_id = stage.run()

TypeError: TQATask1.__init__() got an unexpected keyword argument 'column'

## Deviation Analysis Pipeline

In [None]:
# Read the configuration for the ENRICHMENT phase / DEVIATION stage.
# Relies on `reader` created in the metadata cell above.
stage_config = reader.get_stage_config(
    phase=PhaseDef.ENRICHMENT, stage=EnrichmentStageDef.DEVIATION
)

# Build the deviation analysis stage from that configuration and run it.
stage = DeviationStage.build(stage_config=stage_config, force=FORCE)
asset_id = stage.run()

## App Aggregation

In [None]:
# Read the configuration for the AGGREGATION phase / APP stage.
# Relies on `reader` created in the metadata cell above.
stage_config = reader.get_stage_config(
    phase=PhaseDef.AGGREGATION, stage=AggregationStageDef.APP
)

# Build the aggregation stage with the APP configuration and run it.
stage = AggregationStage.build(stage_config=stage_config, force=FORCE)
asset_id = stage.run()

## Category Aggregation

In [None]:
# Read the configuration for the AGGREGATION phase / CATEGORY stage.
# Relies on `reader` created in the metadata cell above.
stage_config = reader.get_stage_config(
    phase=PhaseDef.AGGREGATION, stage=AggregationStageDef.CATEGORY
)

# Build the aggregation stage with the CATEGORY configuration and run it.
stage = AggregationStage.build(stage_config=stage_config, force=FORCE)
asset_id = stage.run()