In [73]:
import pandas as pd
import numpy as np
import glob
import os

In [74]:
pd. set_option("display.max_columns", None) 

In [75]:
path = r'./../ios/fitness'
ios_fitness_files = glob.glob(os.path.join(path , "*.csv"))
fitness_dfs = []
for filename in ios_fitness_files:
    df = pd.read_csv(filename)
    fitness_dfs.append(df)

ios_fitness_df = pd.concat(fitness_dfs, axis=0, ignore_index=True)

In [76]:
ios_fitness_df.columns

Index(['App Name', 'Size', 'Age Rating', 'Languages', 'Price',
       'InApp Purchase', 'Average Rating', 'Rating Count', 'Privacy Data',
       'App Link', 'Number of Versions', 'Last Version Date',
       'First Version Date'],
      dtype='object')

In [77]:
ios_fitness_df[:5]

Unnamed: 0,App Name,Size,Age Rating,Languages,Price,InApp Purchase,Average Rating,Rating Count,Privacy Data,App Link,Number of Versions,Last Version Date,First Version Date
0,X CLINIC\n 4+,1.8 MB,4+,Japanese,Free,,,,Contact Info,https://apps.apple.com/us/app/x-clinic/id15668...,1,,
1,X Factor\n 4+,21.1 MB,4+,English,Free,,5.0,4 Ratings,,https://apps.apple.com/us/app/x-factor/id31273...,25,"Nov 21, 2017","May 27, 2009"
2,The Boxing Academy\n 4+,43.7 MB,4+,"English, Arabic, Czech, Danish, Dutch, French,...",Free,,,,"Purchases,Financial Info,Contact Info,User Con...",https://apps.apple.com/us/app/the-boxing-acade...,6,"Feb 9, 2022","May 24, 2019"
3,X Gym Xercise\n 17+,78.6 MB,17+\n\n Unrestricted Web Access\n...,English,Free,,5.0,1 Rating,,https://apps.apple.com/us/app/x-gym-xercise/id...,4,"Sep 15, 2020","Jun 1, 2018"
4,X Round Workout\n 4+,66 MB,4+,English,Free,,,,,https://apps.apple.com/us/app/x-round-workout/...,2,"Sep 28, 2021","Sep 27, 2021"


#### Clean name by removing the average rating appened to it

In [78]:
def cleanName(name):
    newline_index = name.index('\n')
    return name[:newline_index]

In [79]:
ios_fitness_df['App Name'] = ios_fitness_df['App Name'].apply(cleanName)

In [80]:
print("Number of applications with missing size", len(ios_fitness_df[ios_fitness_df['Size'].isna()]))

Number of applications with missing size 8


In [81]:
#### Drop application with no size information

In [82]:
indexes_to_drop = ios_fitness_df[ios_fitness_df['Size'].isna()].index
ios_fitness_df.drop(index=indexes_to_drop, axis=0, inplace=True)

In [83]:
#### Clean Age Rating

In [84]:
def clean_age_rating(age_rating):
    plus_index = age_rating.index('+')
    return age_rating[:plus_index+1]

In [85]:
ios_fitness_df['Age Rating'] = ios_fitness_df['Age Rating'].apply(clean_age_rating)

In [86]:
ios_fitness_df['Age Rating'].unique()

array(['4+', '17+', '12+', '9+'], dtype=object)

In [87]:
#### Price consists of Free and dollar price. Nothing to clean

In [88]:
#### In App consists of nan and True. Replace N/A with False.

In [89]:
def map_in_app_purchase(in_app_purchase):
    if np.isnan(in_app_purchase):
        return False
    else:
        return True

In [90]:
ios_fitness_df['InApp Purchase'] = ios_fitness_df['InApp Purchase'].apply(map_in_app_purchase);

In [91]:
ios_fitness_df['InApp Purchase'].unique()

array([False,  True])

In [92]:
#### Nothing to clean on average rating. Will be handles in feature engineering

In [93]:
ios_fitness_df['Average Rating'].unique()

array([nan, 5. , 1. , 3.7, 4.3, 3.5, 4.1, 3. , 3.9, 2.1, 4.7, 4.2, 4.6,
       4.5, 3.4, 3.8, 4.4, 4.8, 4.9, 4. , 2.7, 2.8, 1.9, 2.3, 3.6, 3.1,
       2. , 3.2, 1.8, 1.5, 3.3, 2.6, 1.6, 2.5, 2.9, 1.7, 2.2, 2.4, 1.4,
       1.3, 1.2, 1.1])

In [94]:
def cleanup_rating_count(rating_count):
    rating_count = str(rating_count)
    if rating_count == 'nan':
        return 0
    rating_count = rating_count.replace('Ratings', '').replace('Rating','')
    return rating_count

In [95]:
ios_fitness_df['Rating Count'] = ios_fitness_df['Rating Count'].apply(cleanup_rating_count);

In [96]:
ios_fitness_df

Unnamed: 0,App Name,Size,Age Rating,Languages,Price,InApp Purchase,Average Rating,Rating Count,Privacy Data,App Link,Number of Versions,Last Version Date,First Version Date
0,X CLINIC,1.8 MB,4+,Japanese,Free,False,,0,Contact Info,https://apps.apple.com/us/app/x-clinic/id15668...,1,,
1,X Factor,21.1 MB,4+,English,Free,False,5.0,4,,https://apps.apple.com/us/app/x-factor/id31273...,25,"Nov 21, 2017","May 27, 2009"
2,The Boxing Academy,43.7 MB,4+,"English, Arabic, Czech, Danish, Dutch, French,...",Free,False,,0,"Purchases,Financial Info,Contact Info,User Con...",https://apps.apple.com/us/app/the-boxing-acade...,6,"Feb 9, 2022","May 24, 2019"
3,X Gym Xercise,78.6 MB,17+,English,Free,False,5.0,1,,https://apps.apple.com/us/app/x-gym-xercise/id...,4,"Sep 15, 2020","Jun 1, 2018"
4,X Round Workout,66 MB,4+,English,Free,False,,0,,https://apps.apple.com/us/app/x-round-workout/...,2,"Sep 28, 2021","Sep 27, 2021"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
74885,Uzoma,37 MB,4+,English,Free,True,2.4,17,,https://apps.apple.com/us/app/uzoma/id1436570456,1,,
74886,Uzual,44.3 MB,4+,English,$4.99,False,,0,,https://apps.apple.com/us/app/uzual/id1459270235,1,,
74887,U医站(U-Doctors),131.2 MB,17+,Simplified Chinese,Free,False,,0,,https://apps.apple.com/us/app/u%E5%8C%BB%E7%AB...,25,"Dec 29, 2019","Sep 1, 2018"
74888,U居服,56.1 MB,4+,Traditional Chinese,Free,False,,0,,https://apps.apple.com/us/app/u%E5%B1%85%E6%9C...,21,"May 10, 2022","Mar 23, 2021"


In [36]:
ios_fitness_df[((ios_fitness_df['Price'].str.startswith('$')==True) & (ios_fitness_df['Price'] != 'Free') )]

Unnamed: 0,App Name,Size,Age Rating,Languages,Price,InApp Purchase,Average Rating,Rating Count,Privacy Data,App Link,Number of Versions,Last Version Date,First Version Date
44,X3 Log,93.8 MB,4+,English,$4.99,,4.3,92 Ratings,,https://apps.apple.com/us/app/x3-log/id1467752635,13,"Sep 25, 2019","Jun 23, 2019"
61,XBX and 5BX,25.2 MB,4+,English,$3.99,,,,"Identifiers,Usage Data,Diagnostics",https://apps.apple.com/us/app/xbx-and-5bx/id15...,11,"Aug 7, 2021","Dec 29, 2020"
62,XC Buddy Meet Manager,23.8 MB,4+,English,$1.99,,3.0,1 Rating,,https://apps.apple.com/us/app/xc-buddy-meet-ma...,5,"Nov 1, 2021","Aug 17, 2021"
63,XC Score,1.3 MB,4+,English,$0.99,,3.9,8 Ratings,,https://apps.apple.com/us/app/xc-score/id38887...,7,"Sep 22, 2017","Aug 29, 2010"
75,XclusiveCut,12 MB,4+,English,$0.99,,5.0,4 Ratings,,https://apps.apple.com/us/app/xclusivecut/id15...,2,"Apr 14, 2020","Apr 8, 2020"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
74805,USRK55,66.1 MB,4+,English,$24.99,,5.0,6 Ratings,,https://apps.apple.com/us/app/usrk55/id1225047720,5,"Jun 2, 2022","Apr 18, 2017"
74837,UV - Ultraviolet,5.9 MB,4+,"English, Danish, Dutch, Finnish, French, Germa...",$1.99,,3.7,3 Ratings,,https://apps.apple.com/us/app/uv-ultraviolet/i...,7,"Aug 8, 2020","Feb 12, 2015"
74843,UV Today Skin Safety,16 MB,4+,English,$0.99,,5.0,1 Rating,,https://apps.apple.com/us/app/uv-today-skin-sa...,11,"Mar 30, 2021","Feb 11, 2015"
74859,UVScan,40.4 MB,4+,English,$2.99,,1.0,3 Ratings,,https://apps.apple.com/us/app/uvscan/id409359852,3,"Nov 21, 2016","Dec 18, 2010"
