Analysis of App Data

Practice in cleaning and analyzing a dataset on App downloads and ratings.


In [6]:
from csv import reader

### The Google Play data set ###
opened_file = open('googleplaystore.csv')
read_file = reader(opened_file)
android = list(read_file)
android_header = android[0]
android = android[1:]

### The App Store data set ###
opened_file = open('AppleStore.csv')
read_file = reader(opened_file)
ios = list(read_file)
ios_header = ios[0]
ios = ios[1:]

def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n')
    if rows_and_columns:
        print('Number of rows: ', len(dataset))
        print('Number of columsn: ', len(dataset[0]))
        

In [26]:
print(ios_header)
explore_data(android, 0, 5, rows_and_columns=False)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite â€“ FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring 

In [8]:
android[10472]
del android[10472]


In [23]:
# Removing duplicate entries and keep apps with most amount of reviews
duplicates = []
unique_apps = []

for app in android:
    name = app[0]
    if name in unique_apps:
        duplicates.append(name)
    else:
        unique_apps.append(name)

num_of_duplicates = len(duplicates)
print(num_of_duplicates)

# Create dict that holds app name and max num of reviews 
android_max_reviews = {}

for app in android:
    name = app[0]
    num_reviews = float(app[3])
    if name in android_max_reviews and android_max_reviews[name] < num_reviews:
        android_max_reviews[name] = num_reviews
    elif name not in android_max_reviews:
        android_max_reviews[name] = num_reviews
        
        
ios_max_reviews = {}

for app in ios:
    name = app[0]
    num_reviews = float(app[5])
    if name in ios_max_reviews and ios_max_reviews[name] < num_reviews:
        ios_max_reviews[name] = num_reviews
    elif name not in ios_max_reviews:
        ios_max_reviews[name] = num_reviews
      
# Test to make sure duplicates were actually stored in dict

print('Expected length:', len(android) - 1181)
print('Actual length:', len(android_max_reviews))

print('Expected length:', len(ios))
print('Actual length:', len(ios_max_reviews))


1181
Expected length: 9659
Actual length: 9659
Expected length: 7197
Actual length: 7197


In [15]:
# Separate duplicates based on criterion and add to new non-duplicated list

android_clean = []
android_already_added = []


for app in android:
    name = app[0]
    num_reviews = float(app[3])
    
    if (android_max_reviews[name] == num_reviews) and (name not in android_already_added):
        android_clean.append(app)
        android_already_added.append(name)
        
ios_clean = []
ios_already_added = []

for app in ios:
    name = app[0]
    num_reviews = float(app[5])
    
    if (ios_max_reviews[name] == num_reviews) and (name not in ios_already_added):
        ios_clean.append(app)
        ios_already_added.append(name)

# Clarify android_clean does NOT contain duplicates (len(android) - 1181)
explore_data(ios_clean, 0, 5, True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


['284035177', 'Pandora - Music & Radio', '130242560', 'USD', '0.0', '1126879', '3594', '4.0', '4.5', '8.4.1', '12+', 'Music', '37', '4', '1', '1']


Number of rows:  7197
Number of columsn:  16


In [16]:
# Remove non-english apps 
# while allowing use of no more than 3 non-english characters

def is_english(string):
    greater_than_127 = 0
    for letter in string:
        if ord(letter) > 127:
            greater_than_127 += 1
    if greater_than_127 > 3:
        return False
    else:
        return True

android_english = []
ios_english = []

for app in ios_clean:
    name = app[0]
    if is_english(name):
        ios_english.append(app)
for app in android_clean:
    name = app[0]
    if is_english(name):
        android_english.append(app)

print(len(android_english))
print(len(ios_english))

9614
7197


In [27]:
# Find apps that are likely to be profitable on both Android and iOS

android_final = []
ios_final = []

for app in android_english:
    price = app[7]
    if price == '0':
        android_final.append(app)

for app in ios_english:
    price = app[4]
    if price == '0.0':
        ios_final.append(app)
        
print(len(android_final))
print(len(ios_final))

# Create frequency table function to examine app genres and % of downloads

def freq_table(dataset, index):
    table = {}
    total = 0
    
    for row in dataset:
        total += 1
        value = row[index]
        if value in table:
            table[value] += 1
        else:
            table[value] = 1
            
    table_percent = {}
    for key in table:
        percent = round((table[key] / total) * 100, 2)
        table_percent[key] = percent
    return table_percent

def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)
        
    table_sorted = sorted(table_display, reverse=True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])
    
print('Percentage of Android Apps by Genre')
display_table(android_final, 1)
print('\n')
print('Percentage of IOS Apps by Genre')
display_table(ios_final, -5)

# Based on results, practical apps are more popular on Android,
# while gaming/entertainment is more popular on IOS

8864
4056
Percentage of Android Apps by Genre
FAMILY : 18.91
GAME : 9.72
TOOLS : 8.46
BUSINESS : 4.59
LIFESTYLE : 3.9
PRODUCTIVITY : 3.89
FINANCE : 3.7
MEDICAL : 3.53
SPORTS : 3.4
PERSONALIZATION : 3.32
COMMUNICATION : 3.24
HEALTH_AND_FITNESS : 3.08
PHOTOGRAPHY : 2.94
NEWS_AND_MAGAZINES : 2.8
SOCIAL : 2.66
TRAVEL_AND_LOCAL : 2.34
SHOPPING : 2.25
BOOKS_AND_REFERENCE : 2.14
DATING : 1.86
VIDEO_PLAYERS : 1.79
MAPS_AND_NAVIGATION : 1.4
FOOD_AND_DRINK : 1.24
EDUCATION : 1.16
ENTERTAINMENT : 0.96
LIBRARIES_AND_DEMO : 0.94
AUTO_AND_VEHICLES : 0.93
HOUSE_AND_HOME : 0.82
WEATHER : 0.8
EVENTS : 0.71
PARENTING : 0.65
ART_AND_DESIGN : 0.64
COMICS : 0.62
BEAUTY : 0.6


Percentage of IOS Apps by Genre
Games : 55.65
Entertainment : 8.23
Photo & Video : 4.12
Social Networking : 3.53
Education : 3.25
Shopping : 2.98
Utilities : 2.69
Lifestyle : 2.32
Finance : 2.07
Sports : 1.95
Health & Fitness : 1.87
Music : 1.65
Book : 1.63
Productivity : 1.53
News : 1.43
Travel : 1.38
Food & Drink : 1.06
Weather : 0

In [32]:
# Measure popularity by genre using number of downloads or ratings

# IOS 
ios_genres = freq_table(ios_final, -5)

for genre in ios_genres:
    total = 0
    len_genre = 0
    for app in ios_final:
        genre_app = app[-5]
        if genre_app == genre:
            num_ratings = float(app[5])
            total += num_ratings
            len_genre += 1
    avg_num_ratings = total / len_genre
    print(genre, ":", avg_num_ratings)


Book : 8498.333333333334
Catalogs : 1779.5555555555557
Social Networking : 53078.195804195806
Business : 6367.8
Navigation : 25972.05
Health & Fitness : 19952.315789473683
Sports : 20128.974683544304
Finance : 13522.261904761905
Utilities : 14010.100917431193
Games : 18924.68896765618
Food & Drink : 20179.093023255813
Travel : 20216.01785714286
Shopping : 18746.677685950413
News : 15892.724137931034
Entertainment : 10822.961077844311
Photo & Video : 27249.892215568863
Education : 6266.333333333333
Lifestyle : 8978.308510638299
Reference : 67447.9
Music : 56482.02985074627
Medical : 459.75
Weather : 47220.93548387097
Productivity : 19053.887096774193
1,000,000+ : 15.73
100,000+ : 11.55
10,000,000+ : 10.55
10,000+ : 10.2
1,000+ : 8.39
100+ : 6.92
5,000,000+ : 6.83
500,000+ : 5.56
50,000+ : 4.77
5,000+ : 4.51
10+ : 3.54
500+ : 3.25
50,000,000+ : 2.3
100,000,000+ : 2.13
50+ : 1.92
5+ : 0.79
1+ : 0.51
500,000,000+ : 0.27
1,000,000,000+ : 0.23
0+ : 0.05
0 : 0.01


In [33]:
# Android
display_table(android_final, 5)

1,000,000+ : 15.73
100,000+ : 11.55
10,000,000+ : 10.55
10,000+ : 10.2
1,000+ : 8.39
100+ : 6.92
5,000,000+ : 6.83
500,000+ : 5.56
50,000+ : 4.77
5,000+ : 4.51
10+ : 3.54
500+ : 3.25
50,000,000+ : 2.3
100,000,000+ : 2.13
50+ : 1.92
5+ : 0.79
1+ : 0.51
500,000,000+ : 0.27
1,000,000,000+ : 0.23
0+ : 0.05
0 : 0.01


In [37]:
# Remove ',' and '+' to convert values to floats

categories_android = freq_table(android, 1)

for category in categories_android:
    total = 0
    len_cat = 0
    for app in android_final:
        category_app = app[1]
        if category_app == category:
            num_installs = app[5]
            num_installs = num_installs.replace(',', '')
            num_installs = num_installs.replace('+', '')
            total += float(num_installs)
            len_cat += 1
    avg_num_install = total / len_cat
    print(category, ":", avg_num_install)

ENTERTAINMENT : 11640705.88235294
PARENTING : 542603.6206896552
HEALTH_AND_FITNESS : 4188821.9853479853
TOOLS : 10801391.298666667
LIFESTYLE : 1437816.2687861272
SPORTS : 3638640.1428571427
LIBRARIES_AND_DEMO : 638503.734939759
MEDICAL : 120550.61980830671
NEWS_AND_MAGAZINES : 9549178.467741935
EDUCATION : 1833495.145631068
DATING : 854028.8303030303
BEAUTY : 513151.88679245283
HOUSE_AND_HOME : 1331540.5616438356
COMMUNICATION : 38456119.167247385
EVENTS : 253542.22222222222
TRAVEL_AND_LOCAL : 13984077.710144928
COMICS : 817657.2727272727
GAME : 15588015.603248259
SHOPPING : 7036877.311557789
ART_AND_DESIGN : 1986335.0877192982
PERSONALIZATION : 5201482.6122448975
SOCIAL : 23253652.127118643
WEATHER : 5074486.197183099
FOOD_AND_DRINK : 1924897.7363636363
VIDEO_PLAYERS : 24727872.452830188
FAMILY : 3695641.8198090694
AUTO_AND_VEHICLES : 647317.8170731707
BOOKS_AND_REFERENCE : 8767811.894736841
BUSINESS : 1712290.1474201474
FINANCE : 1387692.475609756
PRODUCTIVITY : 16787331.344927534
PH

In [39]:
# Examing communication apps

for app in android_final:
    if app[1] == 'COMMUNICATION' and (app[5] == '1,000,000,000+'
                                      or app[5] == '500,000,000+'
                                      or app[5] == '100,000,000+'):
        print(app[0], ':', app[5])

WhatsApp Messenger : 1,000,000,000+
imo beta free calls and text : 100,000,000+
Android Messages : 100,000,000+
Google Duo - High Quality Video Calls : 500,000,000+
Messenger â€“ Text and Video Chat for Free : 1,000,000,000+
imo free video calls and chat : 500,000,000+
Skype - free IM & video calls : 1,000,000,000+
Who : 100,000,000+
GO SMS Pro - Messenger, Free Themes, Emoji : 100,000,000+
LINE: Free Calls & Messages : 500,000,000+
Google Chrome: Fast & Secure : 1,000,000,000+
Firefox Browser fast & private : 100,000,000+
UC Browser - Fast Download Private & Secure : 500,000,000+
Gmail : 1,000,000,000+
Hangouts : 1,000,000,000+
Messenger Lite: Free Calls & Messages : 100,000,000+
Kik : 100,000,000+
KakaoTalk: Free Calls & Text : 100,000,000+
Opera Mini - fast web browser : 100,000,000+
Opera Browser: Fast and Secure : 100,000,000+
Telegram : 100,000,000+
Truecaller: Caller ID, SMS spam blocking & Dialer : 100,000,000+
UC Browser Mini -Tiny Fast Private & Secure : 100,000,000+
Viber Me

In [42]:
# Examing books category

for app in android_final:
    if app[1] == 'BOOKS_AND_REFERENCE':
        print(app[0], ':', app[5])

E-Book Read - Read Book for free : 50,000+
Download free book with green book : 100,000+
Wikipedia : 10,000,000+
Cool Reader : 10,000,000+
Free Panda Radio Music : 100,000+
Book store : 1,000,000+
FBReader: Favorite Book Reader : 10,000,000+
English Grammar Complete Handbook : 500,000+
Free Books - Spirit Fanfiction and Stories : 1,000,000+
Google Play Books : 1,000,000,000+
AlReader -any text book reader : 5,000,000+
Offline English Dictionary : 100,000+
Offline: English to Tagalog Dictionary : 500,000+
FamilySearch Tree : 1,000,000+
Cloud of Books : 1,000,000+
Recipes of Prophetic Medicine for free : 500,000+
ReadEra â€“ free ebook reader : 1,000,000+
Anonymous caller detection : 10,000+
Ebook Reader : 5,000,000+
Litnet - E-books : 100,000+
Read books online : 5,000,000+
English to Urdu Dictionary : 500,000+
eBoox: book reader fb2 epub zip : 1,000,000+
English Persian Dictionary : 500,000+
Flybook : 500,000+
All Maths Formulas : 1,000,000+
Ancestry : 5,000,000+
HTC Help : 10,000,000+

In [44]:
# High popularity in Android book apps

for app in android_final:
    if app[1] == 'BOOKS_AND_REFERENCE' and (app[5] == '1,000,000,000+' 
                                            or app[5] == '500,000,000+'
                                            or app[5] == '100,000,000+'):
        print(app[0], ':', app[5])

Google Play Books : 1,000,000,000+
Bible : 100,000,000+
Amazon Kindle : 100,000,000+
Wattpad ðŸ“– Free Books : 100,000,000+
Audiobooks from Audible : 100,000,000+


In [45]:
# Medium popularity

for app in android_final:
    if app[1] == 'BOOKS_AND_REFERENCE' and (app[5] == '1,000,000+' 
                                            or app[5] == '5,000,000+'
                                            or app[5] == '10,000,000+'
                                            or app[5] == '50,000,000+'):
        print(app[0], ':', app[5])

Wikipedia : 10,000,000+
Cool Reader : 10,000,000+
Book store : 1,000,000+
FBReader: Favorite Book Reader : 10,000,000+
Free Books - Spirit Fanfiction and Stories : 1,000,000+
AlReader -any text book reader : 5,000,000+
FamilySearch Tree : 1,000,000+
Cloud of Books : 1,000,000+
ReadEra â€“ free ebook reader : 1,000,000+
Ebook Reader : 5,000,000+
Read books online : 5,000,000+
eBoox: book reader fb2 epub zip : 1,000,000+
All Maths Formulas : 1,000,000+
Ancestry : 5,000,000+
HTC Help : 10,000,000+
Moon+ Reader : 10,000,000+
English-Myanmar Dictionary : 1,000,000+
Golden Dictionary (EN-AR) : 1,000,000+
All Language Translator Free : 1,000,000+
Aldiko Book Reader : 10,000,000+
Dictionary - WordWeb : 5,000,000+
50000 Free eBooks & Free AudioBooks : 5,000,000+
Al-Quran (Free) : 10,000,000+
Al Quran Indonesia : 10,000,000+
Al'Quran Bahasa Indonesia : 10,000,000+
Al Quran Al karim : 1,000,000+
Al Quran : EAlim - Translations & MP3 Offline : 5,000,000+
Koran Read &MP3 30 Juz Offline : 1,000,000+