# Data Acquisition
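This notebook obtains the AppVoC dataset from AWS S3, extracts its contents, builds an anonymized review dataset, stages it by category and uploads the staged files to S3, and then creates production, development, and test versions of the dataset in their respective raw directories.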

In [1]:
import os
import pandas as pd
from hashlib import blake2b
from pandarallel import pandarallel
from tqdm import tqdm
from appvocai-genailabslm.utils.aws import S3Handler
from appvocai-genailabslm.utils.file import IOService, TarGzHandler

In [2]:
# Initialise pandarallel once, up front, so that Series.parallel_apply (used below to
# hash the author column) is available: 12 workers, progress bar disabled.
pandarallel.initialize(progress_bar=False, nb_workers=12, verbose=0)

In [3]:
# --- S3 locations -----------------------------------------------------------
RAW_BUCKET = "appstore-raw"  # bucket the raw archive is downloaded from
FINAL_BUCKET = "appvoc"  # bucket the staged per-category files are uploaded to
S3_KEY_RAW = "appstore_2023-09-17_T113629.tar.gz"  # object key of the raw archive
S3_FOLDER = "appvoc_reviews_clean_2024_06_20"  # destination folder inside FINAL_BUCKET

# --- Local paths ------------------------------------------------------------
LOCAL_PATH = "data/ext/appstore_2023-09-17_T113629.tar.gz"  # downloaded archive
EXTRACT_DIR = "data/ext/"  # where the archive is unpacked and the .tsv files are read from
STAGE_DIR = "data/stage/"  # per-category .tsv files awaiting upload
REVIEW_FILEPATH = "data/ext/reviews.pkl"  # cached, combined review dataset

# --- Column dtypes passed to pandas.read_csv for the review .tsv files --------
DTYPES = dict(
    id="string",
    app_id="string",
    app_name="string",
    category_id="category",
    category="category",
    author="string",
    rating="float64",
    title="string",
    content="string",
    vote_count="Int64",
    vote_sum="Int64",
)


## Download Data from AWS S3

In [4]:
# Download the raw archive only when it is not already cached locally.
if not os.path.exists(LOCAL_PATH):
    # The destination folder may not exist yet on a fresh machine, and the extract
    # step lists this directory right afterwards, so create it up front.
    # NOTE(review): S3Handler may already create it; doing so here is harmless either way.
    os.makedirs(os.path.dirname(LOCAL_PATH), exist_ok=True)
    s3 = S3Handler()
    s3.download_file(bucket_name=RAW_BUCKET, s3_key=S3_KEY_RAW, local_path=LOCAL_PATH)


## Extract Data

In [5]:
# Unpack the archive only when there is nothing to reuse: no extracted .tsv files in
# EXTRACT_DIR and no cached review pickle. Checking for the .tsv files explicitly
# (instead of counting directory entries) keeps a stray file such as .DS_Store from
# silently suppressing the extraction.
# NOTE(review): assumes the archive unpacks its .tsv files directly into EXTRACT_DIR,
# which is where build_review_dataset is later pointed — confirm.
already_extracted = any(name.endswith(".tsv") for name in os.listdir(EXTRACT_DIR))
if not already_extracted and not os.path.exists(REVIEW_FILEPATH):
    tgz = TarGzHandler()
    tgz.extract(tar_gz_path=LOCAL_PATH, extract_dir=EXTRACT_DIR)


## Build Review Dataset

In [6]:
def hash_column(value):
    """Anonymize a value by returning the hex form of its 10-byte BLAKE2b digest.

    Args:
        value (str): Text to hash; it is UTF-8 encoded before hashing.

    Returns:
        str: A 20-character hexadecimal digest.

    Raises:
        AttributeError: If ``value`` has no ``encode`` method (e.g. ``None``).
        Exception: Any other error is reported on stdout and re-raised unchanged.
    """
    try:
        # One-shot construction: equivalent to creating the hasher and calling update().
        return blake2b(value.encode("utf-8"), digest_size=10).hexdigest()
    except AttributeError as e:
        print(f"Atribute error occured in hash_column. \n{e}")
        raise
    except Exception as e:
        print(f"Exception occurred in hash_column. \n{e}")
        raise

In [7]:
def build_review_dataset(directory, dtypes: dict, exclude=("Shopping",)):
    """
    Reads the .tsv files in a directory, cleans each one, and concatenates them into a single DataFrame.

    For every file the ``title`` column is dropped, duplicate rows and rows with
    any missing value are removed, and the ``author`` column is replaced by its
    hash (via ``hash_column``) to anonymize it.

    Args:
        directory (str): The directory containing the .tsv files.
        dtypes (dict): Column name -> dtype mapping passed to ``pd.read_csv``.
        exclude (str | Iterable[str]): Files whose *file name* contains any of
            these substrings are skipped. Defaults to ``("Shopping",)`` because
            the Shopping category holds only 9 reviews.

    Returns:
        pd.DataFrame: The concatenated DataFrame, with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If the directory does not exist.
        ValueError: If no .tsv files are found in the directory, or every
            .tsv file is excluded.
    """
    if not os.path.exists(directory):
        raise FileNotFoundError(f"The directory {directory} does not exist.")

    # os.listdir returns entries in arbitrary order; sorting keeps the row order
    # of the combined dataset reproducible across machines and runs.
    tsv_files = sorted(
        os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.tsv')
    )

    if not tsv_files:
        raise ValueError(f"No .tsv files found in {directory}")

    # Accept a single substring as well as a collection of them.
    excluded = (exclude,) if isinstance(exclude, str) else tuple(exclude)

    # Match against the file name only: testing the full path would drop every
    # file whenever the directory path itself happens to contain the substring.
    selected = [
        path for path in tsv_files
        if not any(tag in os.path.basename(path) for tag in excluded)
    ]
    if not selected:
        raise ValueError(
            f"No .tsv files left to read in {directory} after excluding {excluded}"
        )

    # Read each .tsv file, clean it, and collect the frames for one final concat.
    dataframes = []
    for filepath in tqdm(selected):
        df = pd.read_csv(filepath, sep="\t", dtype=dtypes, parse_dates=["date"], lineterminator='\n')
        df = df.drop(columns=["title"]).drop_duplicates().dropna()
        # Anonymize author
        df["author"] = df["author"].parallel_apply(hash_column)
        dataframes.append(df)

    return pd.concat(dataframes, ignore_index=True, axis=0)

# Reuse the cached review pickle when one exists; otherwise rebuild the dataset
# from the extracted .tsv files.
if os.path.exists(REVIEW_FILEPATH):
    reviews = IOService.read(REVIEW_FILEPATH)
else:
    reviews = build_review_dataset(directory=EXTRACT_DIR, dtypes=DTYPES)

## Inspect Dataset

In [8]:
# Print the column dtypes and memory footprint of the combined review dataset.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22166591 entries, 0 to 22166590
Data columns (total 11 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           string        
 1   app_id       string        
 2   app_name     string        
 3   category_id  object        
 4   category     object        
 5   author       object        
 6   rating       float64       
 7   content      string        
 8   vote_sum     Int64         
 9   vote_count   Int64         
 10  date         datetime64[ns]
dtypes: Int64(2), datetime64[ns](1), float64(1), object(3), string(4)
memory usage: 1.9+ GB


In [9]:
# Show the first rows as a sanity check; author should appear as a hash, not a name.
reviews.head()

Unnamed: 0,id,app_id,app_name,category_id,category,author,rating,content,vote_sum,vote_count,date
0,9996920641,6446212408,Cookie Blocker,6002,Utilities,bddd71849aa3fa6ede22,5.0,I used to use other extensions until they star...,0,0,2023-06-04 03:13:04
1,8612588402,1577062674,NotifiNote: Notification Notes,6002,Utilities,03b87d640a42153c3292,5.0,When displayed in notifications it includes in...,3,4,2022-04-27 20:00:48
2,9731516851,1577062674,NotifiNote: Notification Notes,6002,Utilities,6c8dc65632e59026ce2d,3.0,DL'd & installed today. I can see where this w...,2,2,2023-03-19 23:04:51
3,9468795531,1577062674,NotifiNote: Notification Notes,6002,Utilities,84b42d8051adc33c1de0,5.0,Very convenient for reminding yourself for var...,0,0,2023-01-04 04:20:12
4,9865359594,1577062674,NotifiNote: Notification Notes,6002,Utilities,87f088639e0f11916816,3.0,I am very forgetful and have post it notes all...,0,0,2023-04-27 04:01:53


## Save Dataset

In [10]:
# Cache the combined dataset on first build only; an existing pickle is never overwritten.
review_cache_exists = os.path.exists(REVIEW_FILEPATH)
if not review_cache_exists:
    IOService.write(
        filepath=REVIEW_FILEPATH,
        data=reviews,
    )

## Stage Files by Category

In [11]:
def stage_files(df: pd.DataFrame, stage_dir: str, force: bool = False):
    """Write one tab-separated file per category into ``stage_dir``.

    Rows are grouped by the ``category`` column; each group is saved as
    ``<category>.tsv`` with spaces in the category name replaced by hyphens.

    Args:
        df (pd.DataFrame): Frame containing a ``category`` column.
        stage_dir (str): Directory receiving the files; created when missing.
        force (bool): When True, files that already exist are overwritten;
            otherwise they are left untouched.
    """
    for category, group in tqdm(df.groupby(by='category')):
        target = os.path.join(stage_dir, category.replace(' ', '-') + ".tsv")
        os.makedirs(os.path.dirname(target), exist_ok=True)
        if force or not os.path.exists(target):
            group.to_csv(target, sep="\t", index=False, header=True, lineterminator='\n')
#stage_files(df=reviews, stage_dir=STAGE_DIR, force=False)

## Summarize Staged Files

In [12]:
def summarize_staged_files(stage_dir: str, dtypes: dict):
    """Summarize the row/column counts and completeness of every staged .tsv file.

    Args:
        stage_dir (str): Directory containing the staged .tsv files.
        dtypes (dict): Column name -> dtype mapping passed to ``pd.read_csv``.

    Returns:
        pd.DataFrame: One row per file with the columns ``Category`` (file name
        without extension), ``Rows``, ``Columns``, ``Complete Rows`` (rows with
        no missing value) and ``Percent Complete Rows``, followed by a row
        labelled ``"Total"``. In the total row ``Category`` is empty, ``Columns``
        is the column count of the first file read, and the percentage is
        recomputed from the summed row counts.

    Raises:
        ValueError: If no .tsv files are found in ``stage_dir``.
    """
    tsv_files = [os.path.join(stage_dir, f) for f in os.listdir(stage_dir) if f.endswith('.tsv')]
    if not tsv_files:
        # Fail with a clear message instead of an IndexError further down.
        raise ValueError(f"No .tsv files found in {stage_dir}")

    def _percent(part, whole):
        # A header-only file (or an all-empty stage) has zero rows; report 0.0
        # rather than dividing by zero.
        return round(part / whole * 100, 2) if whole else 0.0

    records = []
    for filepath in tqdm(tsv_files):
        df = pd.read_csv(filepath, sep="\t", dtype=dtypes, parse_dates=["date"], lineterminator='\n')
        n_rows, n_cols = df.shape
        # Complete rows = rows without a missing value in any column.
        n_complete = n_rows - int(df.isna().any(axis=1).sum())
        records.append(
            {
                "Category": os.path.splitext(os.path.basename(filepath))[0],
                "Rows": n_rows,
                "Columns": n_cols,
                "Complete Rows": n_complete,
                "Percent Complete Rows": _percent(n_complete, n_rows),
            }
        )

    summary = pd.DataFrame(records)

    # Build the totals row explicitly: summing the whole frame would also
    # "sum" the Category strings and the per-file percentages.
    total_rows = int(summary["Rows"].sum())
    total_complete = int(summary["Complete Rows"].sum())
    totals = pd.DataFrame(
        [
            {
                "Category": "",
                "Rows": total_rows,
                "Columns": records[0]["Columns"],
                "Complete Rows": total_complete,
                "Percent Complete Rows": _percent(total_complete, total_rows),
            }
        ],
        index=["Total"],
    )
    return pd.concat([summary, totals])
#summary = summarize_staged_files(stage_dir=STAGE_DIR, dtypes=DTYPES)
# summary
        
    

## Upload to AWS

In [13]:
# Publish the staged per-category files to the final bucket, creating the bucket first
# when it does not exist yet.
# NOTE(review): the config keys look like botocore client options (read timeout and
# retry attempts) — confirm against S3Handler.
config = {"read_timeout": 120, "retries": {'max_attempts': 10}}
s3 = S3Handler(config=config)
bucket_missing = not s3.bucket_exists(bucket_name=FINAL_BUCKET)
if bucket_missing:
    s3.create_bucket(bucket_name=FINAL_BUCKET)
s3.upload_folder(
    local_folder=STAGE_DIR,
    bucket_name=FINAL_BUCKET,
    s3_folder=S3_FOLDER,
)


 21%|██▏       | 3/14 [00:00<00:00, 23.72it/s]

File appvoc_reviews_clean_2024_06_20/Social-Networking.tsv already exists in appvoc bucket.
File appvoc_reviews_clean_2024_06_20/Entertainment.tsv already exists in appvoc bucket.
File appvoc_reviews_clean_2024_06_20/Education.tsv already exists in appvoc bucket.
File appvoc_reviews_clean_2024_06_20/Lifestyle.tsv already exists in appvoc bucket.
File appvoc_reviews_clean_2024_06_20/Business.tsv already exists in appvoc bucket.
File appvoc_reviews_clean_2024_06_20/Utilities.tsv already exists in appvoc bucket.


 86%|████████▌ | 12/14 [00:00<00:00, 24.65it/s]

File appvoc_reviews_clean_2024_06_20/Medical.tsv already exists in appvoc bucket.
File appvoc_reviews_clean_2024_06_20/Finance.tsv already exists in appvoc bucket.
File appvoc_reviews_clean_2024_06_20/Health-&-Fitness.tsv already exists in appvoc bucket.
File appvoc_reviews_clean_2024_06_20/Productivity.tsv already exists in appvoc bucket.
File appvoc_reviews_clean_2024_06_20/Book.tsv already exists in appvoc bucket.
File appvoc_reviews_clean_2024_06_20/Photo-&-Video.tsv already exists in appvoc bucket.


100%|██████████| 14/14 [00:00<00:00, 24.55it/s]

File appvoc_reviews_clean_2024_06_20/Reference.tsv already exists in appvoc bucket.
File appvoc_reviews_clean_2024_06_20/Food-&-Drink.tsv already exists in appvoc bucket.





## Create Project Datasets

In [14]:
def create_dataset(dataset: pd.DataFrame, env: str, frac: float = 1.0, random_state=None):
    """Write the (optionally down-sampled) dataset to ``data/<env>/00_raw/reviews.pkl``.

    The target directory is created when missing. If the target file already
    exists nothing is sampled or written.

    Args:
        dataset (pd.DataFrame): The full dataset.
        env (str): Environment name, used as the sub-folder under ``data``.
        frac (float): Fraction of rows to keep. Values below 1 draw a random
            sample of that fraction; otherwise the dataset is written unchanged.
        random_state (int, optional): Seed for the sampling. When omitted, a
            seed is derived from ``env`` so that re-creating a dataset on a
            fresh machine yields the same sample, while different environments
            (e.g. dev and test) still receive different samples.
    """
    import zlib  # local import: only needed to derive the default per-env seed

    filepath = os.path.join("data", env, "00_raw/reviews.pkl")
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    if os.path.exists(filepath):
        return

    if frac < 1:
        if random_state is None:
            # crc32 is stable across runs and platforms (unlike hash()) and
            # yields a value in the 0..2**32-1 range accepted as a seed.
            random_state = zlib.crc32(env.encode("utf-8"))
        dataset = dataset.sample(frac=frac, random_state=random_state)
    IOService.write(filepath=filepath, data=dataset)

In [15]:
# Production receives the full dataset; dev and test each receive a 0.1% random sample.
for env, frac in (("prod", 1.0), ("dev", 0.001), ("test", 0.001)):
    create_dataset(dataset=reviews, env=env, frac=frac)