In [1]:
import os

if "jbook" in os.getcwd():
    os.chdir(os.path.abspath(os.path.join("../..")))
import warnings

warnings.filterwarnings("ignore")

# Data Acquisition Stage
The AppVoCAI Dataset contains reviews dating back to 2008. As our aim is to surface current unmet needs and opportunities in the app market, we will limit the data to reviews written during the 32 months between January 2021 and September 2023.

In [2]:
from discover.container import DiscoverContainer
from discover.infra.config.orchestration import OrchestrationConfigReader
from discover.orchestration.data_prep.stage import DataPrepStage

## Dependency Container

In [3]:
# Create the dependency container and initialise the resources it manages.
container = DiscoverContainer()
container.init_resources()

# Modules that receive their dependencies from the container.
wired_modules = [
    "discover.orchestration.data_prep.stage",
    "discover.orchestration.data_prep.dqa",
]
container.wire(modules=wired_modules)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Data Acquisition Pipeline

In [4]:
# Read the "phases" section of the orchestration configuration.
# NOTE(review): namespace=False presumably returns plain mappings; the result
# is indexed like a nested dict below -- confirm.
reader = OrchestrationConfigReader()
config = reader.get_config("phases", namespace=False)

# The Data Ingestion Stage is the first entry in the dataprep stage list.
dataprep_stages = config["dataprep"]["stages"]
stage_config = dataprep_stages[0]

# Build the stage from its configuration and run it, keeping the returned
# asset id so the resulting dataset can be looked up afterwards.
stage = DataPrepStage.build(stage_config=stage_config, force=False)
asset_id = stage.run()



#                             Data Ingestion Stage                             #

Starting Data Ingestion Stage Fri, 18 Oct 2024 11:47:59


[10/18/2024 11:48:40 AM] [DEBUG] [root] [_read] : Read file using ParquetIO and returning a DataFrame


Starting FilterTask Fri, 18 Oct 2024 11:48:40
Completed FilterTask Fri, 18 Oct 2024 11:48:45. Runtime: 4.37 seconds


                              Data Ingestion Stage                              
                             Stage Start | Fri, 18 Oct 2024 11:47:59
                          Stage Complete | Fri, 18 Oct 2024 11:48:51
                                 Runtime | 52.16 seconds







## Data Overview

In [5]:
# Look up the dataset produced by the stage via the dataset repository.
repo = container.repo.dataset_repo()
ds = repo.get(asset_id=asset_id)

# Summarise the underlying DataFrame: every column, its non-null count and
# dtype, plus the memory footprint.
ds.content.info(
    verbose=True,
    show_counts=True,
    memory_usage=True,
)

<class 'pandas.core.frame.DataFrame'>
Index: 59021 entries, 558124 to 20770924
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           59021 non-null  string        
 1   app_id       59021 non-null  string        
 2   app_name     59021 non-null  string        
 3   category_id  59021 non-null  object        
 4   author       59021 non-null  object        
 5   rating       59021 non-null  float64       
 6   content      59021 non-null  string        
 7   vote_sum     59021 non-null  Int64         
 8   vote_count   59021 non-null  Int64         
 9   date         59021 non-null  datetime64[us]
 10  category     59021 non-null  category      
dtypes: Int64(2), category(1), datetime64[us](1), float64(1), object(2), string(4)
memory usage: 5.1+ MB


In [6]:
# Preview the first five reviews of the filtered dataset.
ds.content.head(n=5)

Unnamed: 0,id,app_id,app_name,category_id,author,rating,content,vote_sum,vote_count,date,category
558124,7889871751,580643740,hoopla Digital,6018,0a433aa553dfe6554826,5.0,Titles I didn’t think they would have and a ap...,0,0,2021-10-08 01:25:00,Book
414067,6995102689,1076402606,"Libby, by OverDrive",6018,dce2976b6ee0e0fa44a0,5.0,I have been getting ebooks from the library fo...,0,0,2021-02-15 00:01:00,Book
418064,8308068603,1076402606,"Libby, by OverDrive",6018,50091da0238254dbef31,5.0,With my busy life the e- library brought back ...,0,0,2022-02-01 19:29:00,Book
571213,7316684624,852497554,Golden Quran | المصحف الذهبي,6018,8d51996c99eee37e1f9b,5.0,شكرا جزيلا لمصمم البرنامج والذين ساهمو فيه وا...,0,0,2021-05-09 09:14:00,Book
149022,8287426523,903001147,Axis 360,6018,ccbc729a023bde903a1a,3.0,This app has a lot of content to offer but I h...,0,0,2022-01-27 06:53:00,Book


The filtered dataset contains **59,021** entries (reviews) and **11 columns** with the following details:

- **Columns**:
  1. `id`: Unique identifier for each review, stored as `string` (non-null for all entries).
  2. `app_id`: Unique identifier for each app, stored as `string` (non-null).
  3. `app_name`: Name of the app, stored as `string` (non-null).
  4. `category_id`: Identifier for the app category, stored as `object` (non-null).
  5. `author`: The author of the review, stored as `object` (non-null).
  6. `rating`: The rating given by the author, stored as `float64` (non-null).
  7. `content`: The text content of the review, stored as `string` (non-null).
  8. `vote_sum`: Sum of votes received by the review, stored as `Int64` (non-null).
  9. `vote_count`: Total count of votes, stored as `Int64` (non-null).
  10. `date`: Date of the review, stored as `datetime64[us]` (non-null).
  11. `category`: The category of the app, stored as a `category` type (non-null).

- **Memory Usage**: The dataset occupies **5.1+ MB** in memory.

These data appear to be complete with no missing values. In the next section, we'll normalize encoding, cast data types and remove formatting characters from review text.