In [1]:
import pandas as pd
import numpy as np
import glob
import os

In [2]:
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

In [3]:
def cleanName(name):
    """Return the first line of a raw app name.

    Parameters
    ----------
    name : str
        Raw app-name text; everything from the first newline character
        onward is discarded.

    Returns
    -------
    str
        The text before the first newline, or ``name`` unchanged when it
        contains no newline at all.
    """
    # partition() never raises, unlike str.index(), so a single-line name
    # passes through instead of aborting the whole DataFrame.apply call.
    first_line, _, _ = name.partition('\n')
    return first_line

In [4]:
def clean_age_rating(age_rating):
    """Trim an age-rating string down to its leading ``N+`` part.

    Parameters
    ----------
    age_rating : str
        Raw age-rating text; anything after the first ``'+'`` is discarded.

    Returns
    -------
    str
        The text up to and including the first ``'+'``. When the text has
        no ``'+'`` it is returned unchanged.
    """
    # partition() yields (whole_string, '', '') when '+' is absent, so the
    # concatenation below degrades to the untouched input instead of the
    # ValueError that str.index() would raise.
    prefix, plus_sign, _ = age_rating.partition('+')
    return prefix + plus_sign

In [5]:
def cleanup_rating_count(rating_count):
    """Strip the 'Rating(s)' label from a rating-count value.

    Parameters
    ----------
    rating_count : object
        Raw value of the rating-count field; it is converted with ``str``
        before processing. ``None`` and values whose string form is
        ``'nan'`` are treated as missing.

    Returns
    -------
    int or str
        ``0`` (an int) when the value is missing; otherwise the remaining
        text with the 'Ratings'/'Rating' label and surrounding whitespace
        removed. The text is not converted to a number.
    """
    if rating_count is None:
        return 0
    text = str(rating_count)
    if text == 'nan':
        return 0
    # 'Ratings' must be removed before 'Rating', otherwise a stray 's' is left.
    # strip() drops the blank that separated the number from the label.
    return text.replace('Ratings', '').replace('Rating', '').strip()

In [6]:
def map_in_app_purchase(in_app_purchase):
    """Map a raw in-app-purchase value to a presence flag.

    Parameters
    ----------
    in_app_purchase : scalar
        Raw cell value; a missing value (NaN or None) means no in-app
        purchase information is present.

    Returns
    -------
    bool
        ``False`` when the value is missing, ``True`` for anything else.
    """
    # pd.isna accepts any scalar (float, str, None); np.isnan raises
    # TypeError on non-numeric input such as strings.
    return not pd.isna(in_app_purchase)

In [7]:
def clean_ios_df(df, app_type):
    """Clean a raw app-listing frame and tag every row with ``app_type``.

    Rows whose 'Size' is missing are dropped (their count is printed), the
    'App Name', 'Age Rating', 'InApp Purchase' and 'Rating Count' columns
    are normalised with ``cleanName``, ``clean_age_rating``,
    ``map_in_app_purchase`` and ``cleanup_rating_count`` respectively, and
    an 'App Type' column is added.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least the columns named above. It is left
        unmodified.
    app_type : object
        Value stored in the 'App Type' column of every returned row.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. Surviving rows keep their original index
        labels (the index is not renumbered).
    """
    # Build the mask once and reuse it for both the report and the filter.
    missing_size = df['Size'].isna()
    print("Number of applications with missing size", int(missing_size.sum()))

    # Work on a copy so the caller's frame is never mutated and the column
    # assignments below cannot trigger SettingWithCopyWarning.
    cleaned = df.loc[~missing_size].copy()
    cleaned['App Name'] = cleaned['App Name'].apply(cleanName)
    cleaned['Age Rating'] = cleaned['Age Rating'].apply(clean_age_rating)
    cleaned['InApp Purchase'] = cleaned['InApp Purchase'].apply(map_in_app_purchase)
    cleaned['Rating Count'] = cleaned['Rating Count'].apply(cleanup_rating_count)
    cleaned['App Type'] = app_type
    return cleaned

In [8]:
def load_csvs_in_folder_and_clean(path, app_type):
    """Read every ``*.csv`` file in a folder, combine them and clean the result.

    Parameters
    ----------
    path : str
        Folder to search (non-recursively) for ``*.csv`` files.
    app_type : object
        Category label forwarded to ``clean_ios_df``.

    Returns
    -------
    pandas.DataFrame
        The result of ``clean_ios_df`` applied to the row-wise
        concatenation of all files.

    Raises
    ------
    ValueError
        If the folder contains no ``*.csv`` files.
    """
    # glob returns paths in filesystem-dependent order; sorting makes the
    # row order of the combined frame reproducible across machines.
    csv_paths = sorted(glob.glob(os.path.join(path, "*.csv")))
    if not csv_paths:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # keep the exception type but say which folder was empty.
        raise ValueError(f"No CSV files found in {path!r}")

    frames = [pd.read_csv(csv_path) for csv_path in csv_paths]
    combined = pd.concat(frames, axis=0, ignore_index=True)
    return clean_ios_df(combined, app_type)

In [9]:
# Load and clean every CSV under ../ios/fitness, labelling the rows 'FITNESS AND HEALTH'.
fitness_df = load_csvs_in_folder_and_clean('./../ios/fitness', 'FITNESS AND HEALTH')

Number of applications with missing size 8


In [10]:
# Load and clean every CSV under ../ios/medical, labelling the rows 'MEDICAL'.
medical_df = load_csvs_in_folder_and_clean('./../ios/medical', 'MEDICAL')

Number of applications with missing size 0


In [11]:
# Stack the medical rows first, then the fitness rows; ignore_index renumbers rows from 0.
concatenated_df = pd.concat([medical_df, fitness_df], axis=0, ignore_index=True)

In [12]:
# Preview the first five rows of the combined frame.
concatenated_df.head()

Unnamed: 0,App Name,Size,Age Rating,Languages,Price,InApp Purchase,Average Rating,Rating Count,Privacy Data,App Link,Number of Versions,Last Version Date,First Version Date,App Type
0,H and W Drug Store,21.5 MB,12+,"English, Spanish",Free,False,4.0,4,,https://apps.apple.com/us/app/h-and-w-drug-sto...,1,,,MEDICAL
1,H&P-card™,10.1 MB,17+,"English, Spanish",$2.99,False,1.0,1,,https://apps.apple.com/us/app/h-p-card/id56971...,5,"Sep 28, 2020","Oct 20, 2012",MEDICAL
2,H-Book – osobní zdravotní záznamy,47.1 MB,12+,"English, Czech",Free,False,,0,,https://apps.apple.com/us/app/h-book-osobn%C3%...,21,"Jun 28, 2017","Jun 5, 2013",MEDICAL
3,H-FACT,4.1 MB,12+,English,Free,False,,0,,https://apps.apple.com/us/app/h-fact/id1568969345,5,"Nov 24, 2021","Jul 5, 2021",MEDICAL
4,H-Module,27.6 MB,17+,,Free,False,,0,,https://apps.apple.com/us/app/h-module/id15728...,2,"Jun 26, 2021","Jun 22, 2021",MEDICAL


In [13]:
# Print column dtypes, non-null counts and memory usage of the combined frame.
concatenated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107762 entries, 0 to 107761
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   App Name            107762 non-null  object 
 1   Size                107762 non-null  object 
 2   Age Rating          107762 non-null  object 
 3   Languages           104620 non-null  object 
 4   Price               107762 non-null  object 
 5   InApp Purchase      107762 non-null  bool   
 6   Average Rating      51413 non-null   float64
 7   Rating Count        107762 non-null  object 
 8   Privacy Data        43996 non-null   object 
 9   App Link            107762 non-null  object 
 10  Number of Versions  107762 non-null  int64  
 11  Last Version Date   94855 non-null   object 
 12  First Version Date  94855 non-null   object 
 13  App Type            107762 non-null  object 
dtypes: bool(1), float64(1), int64(1), object(11)
memory usage: 10.8+ MB


#### Write to a single file

In [14]:
# Persist the combined, cleaned frame to the current working directory.
# NOTE(review): to_csv writes the row index as a leading unnamed column by
# default; pass index=False if downstream readers do not expect it — confirm.
concatenated_df.to_csv('./ios-fitness-and-medical-cleaned-data.csv')