In [1]:
import os
import warnings

# Silence every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# When the kernel's working directory path contains "jbook", move the
# working directory two levels up so relative paths resolve from there.
if "jbook" in os.getcwd():
    two_levels_up = os.path.abspath("../..")
    os.chdir(two_levels_up)

# Passed as the `force` argument of `stage.run(...)` in the cells below.
FORCE = True

# Text Quality Analysis
---

### Import Libraries
---

In [2]:

# Project imports. Original order is kept (setup first); the second, redundant
# import of DatasetConfig has been removed.
from genailab.setup import auto_wire_container
from genailab.core.dtypes import DFType
from genailab.infra.utils.file.fileset import FileFormat
from genailab.asset.dataset.config import DatasetConfig
from genailab.flow.feature.tqa.builder import TQAStageBuilder
from genailab.core.flow import PhaseDef, StageDef


# Wire container
# NOTE(review): `container` is not referenced again in the visible cells;
# presumably the call is needed for its wiring side effects - confirm.
container = auto_wire_container()

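## Dask Analyst

Run the Text Quality Analysis (TQA) stage on the cleaned `review` dataset with the Dask option enabled, first with `batched=True` and then with `batched=False`, and preview the resulting dataframe after each run.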

In [3]:
# Settings that are identical for the source and the target dataset.
_shared_options = {
    "name": "review",
    "file_format": FileFormat.PARQUET,
    "asset_type": "dataset",
    "dftype": DFType.PANDAS,
}

# Source dataset: DATAPREP phase, CLEAN stage.
source_config = DatasetConfig(
    phase=PhaseDef.DATAPREP, stage=StageDef.CLEAN, **_shared_options
)

# Target dataset: FEATURE phase, TQA stage.
target_config = DatasetConfig(
    phase=PhaseDef.FEATURE, stage=StageDef.TQA, **_shared_options
)


In [4]:
# Configure the builder for a Dask run with normalized=True and batched=True.
builder = TQAStageBuilder().with_dask(normalized=True, batched=True)

# Build the stage that reads `source_config` and writes `target_config`.
stage = builder.build(
    source_config=source_config,
    target_config=target_config,
)

# Execute the stage, forwarding FORCE as the `force` flag.
dataset = stage.run(force=FORCE)



#             Text Quality Analysis Stage Sun, 02 Feb 2025 12:14:18              #

____________________________________________________________________________
Text Quality Analysis Stage             12:14:18    12:19:21    5.0 minutes and 3.81 seconds





In [5]:
# Preview the first rows of the dataframe produced by the stage run above.
dataset.dataframe.head()

Unnamed: 0,id,app_id,app_name,category_id,author,rating,content,vote_sum,vote_count,date,...,adjective_count,adverb_count,aspect_verb_pairs,noun_phrases,verb_phrases,adverbial_phrases,review_length,lexical_density,dependency_depth,tqa_score
0,10019409512,1380362212,GALATEA: Novels & Audiobooks,6018,c011c66aae3e668b150e,5,i love it but the chapter and waiting hours fo...,0,0,2023-06-10 15:09:00+00:00,...,0.0,0.0,0.0,0.693147,0.693147,0.0,2.639057,3.675326,2.70805,16.176484
1,10027124164,1380362212,GALATEA: Novels & Audiobooks,6018,5a2741393dd20358b609,5,i like the books that i have read so far if th...,0,0,2023-06-12 20:14:00+00:00,...,1.386294,1.098612,1.609438,2.079442,2.302585,0.693147,3.912023,3.683251,3.931826,37.360035
2,10036938913,1076402606,"Libby, by OverDrive",6018,46117640263dddac9294,5,i have read dozens upon dozens of books after ...,0,0,2023-06-15 17:01:00+00:00,...,1.94591,1.098612,1.386294,1.609438,2.079442,0.693147,3.850148,3.795001,3.850148,35.486515
3,10047764706,1076402606,"Libby, by OverDrive",6018,a0e95f8868233439444d,5,happy with the app i use it primarily for audi...,0,0,2023-06-18 19:40:00+00:00,...,1.609438,1.386294,1.098612,1.94591,1.386294,0.693147,3.555348,3.987894,3.583519,32.536492
4,10064456025,1535748732,Storyroom - Webnovel & Story,6018,bb43c451a876165c2abf,1,im going to be honest the books are really gre...,0,0,2023-06-23 15:23:00+00:00,...,2.197225,1.609438,1.609438,2.302585,2.639057,0.0,4.418841,3.804492,4.477337,41.925267


In [6]:
# Same stage as the previous run, except that batching is switched off.
dask_options = {"normalized": True, "batched": False}
configs = {"source_config": source_config, "target_config": target_config}

stage = TQAStageBuilder().with_dask(**dask_options).build(**configs)

# Execute the stage; `stage` and `dataset` from the earlier run are rebound here.
dataset = stage.run(force=FORCE)



#             Text Quality Analysis Stage Sun, 02 Feb 2025 12:19:22              #

____________________________________________________________________________
Text Quality Analysis Stage             12:19:22    12:24:39    5.0 minutes and 16.59 seconds





In [7]:
# Preview the first rows of the dataframe from the batched=False run above.
dataset.dataframe.head()

Unnamed: 0,id,app_id,app_name,category_id,author,rating,content,vote_sum,vote_count,date,...,adjective_count,adverb_count,aspect_verb_pairs,noun_phrases,verb_phrases,adverbial_phrases,review_length,lexical_density,dependency_depth,tqa_score
0,10019409512,1380362212,GALATEA: Novels & Audiobooks,6018,c011c66aae3e668b150e,5,i love it but the chapter and waiting hours fo...,0,0,2023-06-10 15:09:00+00:00,...,0.0,0.0,0.0,0.693147,0.693147,0.0,2.639057,3.675326,2.70805,16.176484
1,10027124164,1380362212,GALATEA: Novels & Audiobooks,6018,5a2741393dd20358b609,5,i like the books that i have read so far if th...,0,0,2023-06-12 20:14:00+00:00,...,1.386294,1.098612,1.609438,2.079442,2.302585,0.693147,3.912023,3.683251,3.931826,37.360035
2,10036938913,1076402606,"Libby, by OverDrive",6018,46117640263dddac9294,5,i have read dozens upon dozens of books after ...,0,0,2023-06-15 17:01:00+00:00,...,1.94591,1.098612,1.386294,1.609438,2.079442,0.693147,3.850148,3.795001,3.850148,35.486515
3,10047764706,1076402606,"Libby, by OverDrive",6018,a0e95f8868233439444d,5,happy with the app i use it primarily for audi...,0,0,2023-06-18 19:40:00+00:00,...,1.609438,1.386294,1.098612,1.94591,1.386294,0.693147,3.555348,3.987894,3.583519,32.536492
4,10064456025,1535748732,Storyroom - Webnovel & Story,6018,bb43c451a876165c2abf,1,im going to be honest the books are really gre...,0,0,2023-06-23 15:23:00+00:00,...,2.197225,1.609438,1.609438,2.302585,2.639057,0.0,4.418841,3.804492,4.477337,41.925267
