In [1]:
import os
import warnings

# When the kernel starts somewhere under a "jbook" path, move the working
# directory up two levels (presumably to the project root -- confirm) so that
# relative paths used later resolve from there.
if "jbook" in os.getcwd():
    os.chdir(os.path.abspath("../.."))

# Suppress every warning so the rendered output stays uncluttered.
warnings.filterwarnings("ignore")

# Passed as `force=` when the DQA stage is built further down.
FORCE = False

# AppVoCAI Data Quality Assessment
It is well established that the performance, reliability, accuracy, and validity of AI and machine learning models are limited by the quality and integrity of the training data. Systematic analysis of data quality is therefore an essential precursor to downstream cleaning, analysis, and modeling. In this section, we evaluate the quality of the AppVoCAI dataset along six dimensions:

1. **Completeness**: The degree to which all required data values are present. The completeness metric is defined as:
$$X=\frac{N_c}{N}$$ 
where $N_c$ is the number of complete rows, and $N$ is the total number of rows. 

2. **Balance**: The degree to which categories are in balance across the dataset. Balance is computed as:
$$X=1-\frac{\sum_{i=1}^{K}|x_i-\bar{x}|}{N}$$
where $x_i$ is the number of observations in category $i$, $K$ is the number of categories, $\bar{x}$ is the mean category count, and $N$ is the total number of observations.
 
3. **Interpretability**: The degree to which data are in an appropriate language. Non-English app names and app reviews are flagged in the dataset, yielding two separate measures of interpretability, which are averaged as follows:
$$X=\frac{\frac{N_{e1}}{N}+\frac{N_{e2}}{N}}{2}$$ 

where $N_{e1}$ and $N_{e2}$ are the numbers of observations with English-language app names and English-language reviews, respectively, and $N$ is the total number of observations.

4. **Duplication**: The proportion of the data that is free of duplication, defined as:
$$X=\frac{N_u}{N}$$ 

where $N_u$ is the number of unique rows in the dataset and $N$ is the total number of rows in the dataset.

5. **Validity**: The degree to which data values fall within their permitted range. In this context, validity pertains to the five-point rating scale, with values in [1,5], and is defined as:
$$X=\frac{N_r}{N}$$ 
where $N_r$ is the number of observations for which the rating is valid, and $N$ is the total number of observations.

6. **Text Quality**: Our text quality metric combines syntactic complexity measures and a perplexity-based coherence score in a weighted sum, producing a quality score for each review {ref}`appendix:tqa`. 

Exposing data quality and enrichment issues before our exploratory efforts keeps the focus on the right questions and supports discoveries that yield deep, nuanced insights into mobile app consumer behavior and its implications for strategic opportunity discovery, selection, and new product design.

In [2]:
import pandas as pd
from discover.flow.data_prep.dqm.stage import DQAStage
from discover.container import DiscoverContainer
from discover.infra.config.flow import FlowConfigReader
from discover.core.flow import DataPrepStageDef, PhaseDef

# Do not truncate long cell values when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

In [3]:
# Instantiate the application container, initialize its resources, and wire it
# to the modules listed below.
# NOTE(review): the DQA stage used in this notebook is imported from
# `discover.flow.data_prep.dqm.stage`, while the module wired here is
# `discover.flow.data_prep.stage` -- confirm the wiring target is intended.
container = DiscoverContainer()
container.init_resources()
container.wire(
    modules=[
        "discover.flow.data_prep.stage",
        "discover.app.base",
    ],
)

## Data Quality Assessment Pipeline


In [4]:
# Obtain the configuration for the DQA stage of the data preparation phase
reader = FlowConfigReader()
stage_config = reader.get_stage_config(
    phase=PhaseDef.DATAPREP, stage=DataPrepStageDef.DQA
)

# Build and run the Data Quality Assessment stage. FORCE is forwarded as
# `force`; presumably False allows a cached result to be reused -- confirm
# against DQAStage.build.
stage = DQAStage.build(stage_config=stage_config, force=FORCE)
asset_id = stage.run()



#                            Data Quality Assessment                             #



                            Data Quality Assessment                             
                           Stage Started | Sun, 10 Nov 2024 22:03:32
                         Stage Completed | Sun, 10 Nov 2024 22:03:32
                           Stage Runtime | 0.0 seconds
                           Cached Result | True





In [5]:
# NOTE(review): this import sits mid-notebook; consider moving it into the
# imports cell near the top so all dependencies are visible in one place.
from discover.app.dqa import DQA

# Instantiate the DQA helper with name="review" and render its summary.
dqa = DQA(name="review")
dqa.summarize()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86705 entries, 0 to 86704
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   id                         86705 non-null  object        
 1   app_id                     86705 non-null  object        
 2   app_name                   86705 non-null  object        
 3   category_id                86705 non-null  object        
 4   author                     86705 non-null  object        
 5   rating                     86705 non-null  int16         
 6   content                    86705 non-null  object        
 7   vote_sum                   86705 non-null  int64         
 8   vote_count                 86705 non-null  int64         
 9   date                       86705 non-null  datetime64[us]
 10  review_length              86705 non-null  int64         
 11  dqa_url                    86705 non-null  bool          
 12  dqa_

Unnamed: 0,Defect,n,%
0,Contains Unicode Chars,39863,45.98
1,Contains Excess Whitespace,11114,12.82
2,Contains Excess Special Chars,9740,11.23
3,Contains Outlier Review Length,6018,6.94
4,Contains Outlier Vote Count,5379,6.2
5,Contains Outlier Vote Sum,4127,4.76
6,Contains Non English Text,3202,3.69
7,Contains Elongation,2993,3.45
8,Contains Non English App Name,2115,2.44
9,Contains Url,1855,2.14


In [6]:
# Retrieve the observations matching the "special" defect and display them
# as the cell output.
defects = dqa.get_defects(defect="special")
defects

Unnamed: 0,dqa_excess_special_chars,content
73085,True,#free palestine
1998,True,اسأل الله العلي العظيم رب العرش الكريم ان يجزاك خيرًا يامن سعيت في انشاء هذا التطبيق الجميل المفيد عسى ربي يوفقك ويسدد خطاك وين مارحت💜💜.
35096,True,i got the subscription but there’s still ads..
38468,True,I love this app! It keeps me on track. A big thumbs up 👍🏾.
8689,True,"Really glitchy, takes forever for a student to do work! Most of the work doesn’t update, and doesn’t submit! Awful, would not recommend"
53007,True,False advertising on the “free” trial
59868,True,Love it! Can’t start my day without it!🙏🏾❤️
82145,True,It's quite fun to play with!!
31855,True,❤️❤️
82552,True,Great keyboard! I love it! So many options!


In [7]:
# Display the completeness score reported by the DQA helper (metric defined
# in the introduction of this notebook).
dqa.completeness()

1.0