In [1]:
import os
import warnings

# Silence every warning category so library chatter stays out of cell output.
warnings.filterwarnings("ignore")

# When the current working directory path contains "jbook", move the process
# two directory levels up so relative paths resolve from there.
if "jbook" in os.getcwd():
    os.chdir(os.path.abspath(os.path.join(os.pardir, os.pardir)))

# Forwarded as the `force` argument of stage.run() later in the notebook.
FORCE = True

# Text Quality Analysis
---

## Import Libraries
---

In [2]:

from dask.distributed import Client
from genailab.setup import auto_wire_container
from genailab.core.dtypes import DFType
from genailab.infra.utils.file.fileset import FileFormat
from genailab.asset.dataset.config import DatasetConfig
from genailab.flow.feature.tqa.builder import TQAStageBuilder
from genailab.asset.dataset.config import DatasetConfig
from genailab.core.flow import PhaseDef, StageDef


# Wire container
# NOTE(review): presumably sets up the project's dependency wiring before any
# stage is built — confirm against genailab.setup.auto_wire_container.
# The return value is kept in `container`; no later visible cell reads it.
container = auto_wire_container()

## Datasets

In [3]:
# Settings that the source and target configurations have in common.
_shared_settings = {
    "name": "review",
    "file_format": FileFormat.PARQUET,
    "asset_type": "dataset",
    "dftype": DFType.PANDAS,
}

# Source: the "review" dataset from the DATAPREP phase, CLEAN stage.
source_config = DatasetConfig(
    phase=PhaseDef.DATAPREP, stage=StageDef.CLEAN, **_shared_settings
)

# Target: the "review" dataset for the FEATURE phase, TQADASK stage.
target_config = DatasetConfig(
    phase=PhaseDef.FEATURE, stage=StageDef.TQADASK, **_shared_settings
)


In [4]:
# Configure a builder with the Dask options used for this run.
builder = TQAStageBuilder().with_dask(normalized=True, batched=False)

# Build the stage from the source and target dataset configurations.
stage = builder.build(source_config=source_config, target_config=target_config)

# Execute the stage, passing the notebook-level FORCE flag as `force`.
dataset = stage.run(force=FORCE)

[01/30/2025 11:38:18 PM] [DEBUG] [genailab.infra.persist.repo.object.rao.RAO] [__init__] : RAO created at workspace/test/datasets/ral/
[01/30/2025 11:38:18 PM] [DEBUG] [genailab.infra.persist.repo.object.dao.DAO] [__init__] : DAO created at workspace/test/datasets/dal/
[01/30/2025 11:38:20 PM] [DEBUG] [asyncio] [__init__] : Using selector: EpollSelector
[01/30/2025 11:38:21 PM] [DEBUG] [genailab.flow.base.stage.TQAStage] [run] : Inside TQAStage: run
[01/30/2025 11:38:21 PM] [DEBUG] [genailab.flow.base.stage.TQAStage] [_dataset_exists] : Inside TQAStage: _dataset_exists
[01/30/2025 11:38:21 PM] [DEBUG] [genailab.flow.base.stage.TQAStage] [_fresh_cache_exists] : Inside TQAStage: _fresh_cache_exists
[01/30/2025 11:38:21 PM] [DEBUG] [genailab.flow.base.stage.TQAStage] [_get_dataset] : Inside TQAStage: _get_dataset
[01/30/2025 11:38:21 PM] [DEBUG] [genailab.flow.base.stage.TQAStage] [_dataset_exists] : Inside TQAStage: _dataset_exists
[01/30/2025 11:38:21 PM] [DEBUG] [genailab.flow.base.sta



#             Text Quality Analysis Stage Thu, 30 Jan 2025 23:38:21              #



[01/30/2025 11:38:21 PM] [DEBUG] [genailab.flow.feature.tqa.task.TQADaskTask] [run] : Inside TQADaskTask: run
[01/30/2025 11:38:21 PM] [DEBUG] [genailab.flow.feature.tqa.task.TQADaskTask] [run] : Shape of dataset: (50, 11)




                           TQADaskTask Configuration                            
                            n_partitions | 8
                               n_workers | 12
                            memory_limit | 6GiB
                      threads_per_worker | 1
                               n_process | 4


[01/30/2025 11:38:22 PM] [DEBUG] [genailab.flow.feature.tqa.task.TQADaskTask] [run] : Processing row-by-row...
[01/30/2025 11:38:24 PM] [DEBUG] [genailab.flow.feature.tqa.task.TQADaskTask] [run] : Starting computation in the background with results of type <class 'dask.dataframe.dask_expr._collection.DataFrame'>
[01/30/2025 11:38:24 PM] [DEBUG] [genailab.flow.feature.tqa.task.TQADaskTask] [run] : Beginning progress bar on result of type <class 'dask.dataframe.dask_expr._collection.DataFrame'>
[01/30/2025 11:38:24 PM] [DEBUG] [genailab.flow.feature.tqa.task.TQADaskTask] [run] : Beginning to materialize results.
[01/30/2025 11:38:31 PM] [DEBUG] [genailab.flow.feature.tqa.task.TQADaskTask] [run] : Converted the dask expression to a <class 'pandas.core.frame.DataFrame'> type.
Profile
None
[01/30/2025 11:38:31 PM] [DEBUG] [genailab.flow.feature.tqa.task.TQADaskTask] [run] : Results
               id      app_id                        app_name category_id                author  rating       

<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, 76 to 4705
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   id                 50 non-null     object             
 1   app_id             50 non-null     object             
 2   app_name           50 non-null     object             
 3   category_id        50 non-null     object             
 4   author             50 non-null     object             
 5   rating             50 non-null     int64              
 6   content            50 non-null     object             
 7   vote_sum           50 non-null     int64              
 8   vote_count         50 non-null     int64              
 9   date               50 non-null     datetime64[ns, UTC]
 10  category           50 non-null     object             
 11  noun_count         50 non-null     float64            
 12  verb_count         50 non-null     float64            

In [5]:
# Preview the leading rows of the dataframe held by the dataset returned from stage.run().
dataset.dataframe.head()

Unnamed: 0,id,app_id,app_name,category_id,author,rating,content,vote_sum,vote_count,date,category,noun_count,verb_count,adjective_count,adverb_count,aspect_verb_pairs,noun_phrases,verb_phrases,adverbial_phrases,review_length
76,8482455002,578836126,Tapas – Comics and Novels,6018,c26db4a42f5e3e10811e,3,very nice feel to the app easy to read that sa...,0,0,2022-03-22 08:09:00+00:00,Book,2.995732,2.484907,1.94591,1.94591,1.791759,2.639057,2.484907,0.693147,4.304065
307,8186210408,1123819773,Instawork: Be your own boss,6000,9a583a2ef4356af180c2,1,during setup when picking a location the map s...,0,0,2021-12-30 22:48:00+00:00,Business,2.079442,1.609438,0.693147,1.098612,1.386294,1.791759,1.609438,0.0,3.367296
336,8460094362,339597578,USPS Mobile®,6000,8e9f069b110ee6ee1f4f,1,changes to the app make it pointless previousl...,0,0,2022-03-16 02:18:00+00:00,Business,1.609438,1.609438,0.693147,0.693147,1.386294,1.609438,1.609438,0.0,2.944439
584,8481556587,1052961520,Du Chinese – Read Mandarin 中文,6017,7f0c20bfc389297fa93c,5,my only complaint is that they dont have a du ...,0,0,2022-03-22 01:19:50+00:00,Education,0.693147,0.693147,1.791759,0.0,0.0,0.693147,0.693147,0.0,2.70805
632,9131487549,1003820457,Animal Jam: Virtual Pet Design,6017,cfcb79da3518e77a699c,5,fun animal game great for introducing digital ...,0,0,2022-09-28 23:08:06+00:00,Education,1.94591,0.693147,1.386294,0.0,0.693147,1.609438,0.693147,0.0,2.70805
