# App Markets Analysis 

Here I will be attempting to answer a few questions about the Apple App Store and the Google Play Store in order to figure out which types of free apps are most in demand.

In [1]:
# Here we load the two datasets and convert them to list of lists

from csv import reader
# `with` guarantees both files are closed as soon as they have been read.
# The encoding is set explicitly because app names contain non-ASCII characters,
# and newline='' is what the csv module asks for on files handed to reader().
with open('AppleStore.csv', encoding='utf8', newline='') as opened_file_apple:
    apple_list = list(reader(opened_file_apple))
with open('googleplaystore.csv', encoding='utf8', newline='') as opened_file_google:
    google_list = list(reader(opened_file_google))

# The first row of each file is the header; every row after it is app data.
app_header = apple_list[0]
android_header = google_list[0]
app_store = apple_list[1:]
android = google_list[1:]

print(app_header)
print(android_header)
print(app_store[1])
print(android[1])

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']
['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


In [2]:

# I will use this to explore the dataset further
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows dataset[start:end], each followed by blank lines.

    Args:
        dataset: list of rows (each row is itself a list).
        start: index of the first row to print.
        end: index one past the last row to print.
        rows_and_columns: when true, also print the total number of rows and
            the number of columns, taken from the length of the first row.
    """
    for record in dataset[start:end]:
        print(record)
        print('\n')  # visual separator between consecutive rows

    if not rows_and_columns:
        return

    print('Number of rows:', len(dataset))
    print('Number of columns:', len(dataset[0]))

In [3]:
# Quick look at the first two rows of each dataset, plus its overall size.
# The header rows were split off when the files were loaded, so these are data rows.
explore_data(app_store,0,2,rows_and_columns=True)
explore_data(android,0,2,rows_and_columns=True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 7197
Number of columns: 16
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13


# Columns that we may need include:
- App name
- rating count
- genre
- size
- content rating

In [4]:
# Row 10472 of `android` contains an error, so it is dropped from the dataset.
# `android` starts out as every row of `google_list` except the header, so the
# length check below is true only while nothing has been removed yet. That makes
# this cell safe to re-run: a second run will not delete a valid row.
BAD_ROW_INDEX = 10472
if len(android) == len(google_list) - 1:
    del android[BAD_ROW_INDEX]

In [5]:
# Google Play lists some apps more than once.
# Count how many rows repeat an app name that has already been seen.
unique_names = []
duplicate_names = []
seen_names = set()  # same names as unique_names; avoids rescanning the list for every row
for row in android:
    name = row[0]
    if name in seen_names:
        duplicate_names.append(name)
    else:
        seen_names.add(name)
        unique_names.append(name)
print("There are " + str(len(duplicate_names)) + " duplicates")
print('\n')
print("Examples: ", duplicate_names[0:5])

There are 1181 duplicates


Examples:  ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings']


We do not want to remove duplicate rows arbitrarily, as some rows are more relevant than others. For this exercise we will keep the row with the largest number of reviews, since it should be the most recent record for that app.

In [6]:
# Map every app name to the highest review count found among its rows.
reviews_max = {}
for entry in android:
    app_name = entry[0]
    review_count = float(entry[3])
    # Store the count for a name seen for the first time, or when it beats the stored one.
    if app_name not in reviews_max or reviews_max[app_name] < review_count:
        reviews_max[app_name] = review_count
print(len(reviews_max))

9659


In [7]:
# Keep one row per app: the first row whose review count equals the maximum
# recorded for that app in reviews_max.
android_clean = []
already_added = []
added_names = set()  # same names as already_added, kept for constant-time membership checks

for app in android:
    name = app[0]
    n_reviews = float(app[3])

    # The second condition matters when several rows of one app tie on the
    # maximum review count: only the first of them is kept.
    if (reviews_max[name] == n_reviews) and (name not in added_names):
        android_clean.append(app)
        already_added.append(name)
        added_names.add(name)
explore_data(android_clean,0,2,True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 9659
Number of columns: 13


In [8]:
# Only English apps are wanted, so this helper flags names that look non-English.
def english_check(string, max_non_ascii=3):
    """Return True when `string` has at most `max_non_ascii` non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127. A few
    such characters are tolerated so that names containing a trademark sign or
    an emoji are not rejected.

    Args:
        string: the text to inspect (here, an app name).
        max_non_ascii: how many non-ASCII characters are still acceptable.
            Defaults to 3.

    Returns:
        bool: False when the non-ASCII count exceeds `max_non_ascii`, else True.
    """
    non_ascii = sum(1 for character in string if ord(character) > 127)
    return non_ascii <= max_non_ascii

print(english_check('Instagram'))
print(english_check('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(english_check('Docs To Go™ Free Office Suite'))
print(english_check('Instachat 😜'))

True
False
True
True


In [9]:
# Keep only the rows whose app name passes english_check.
# The name is column 0 in the Google Play rows and column 1 in the App Store rows.
eng_apps_goog = [entry for entry in android_clean if english_check(entry[0])]
eng_apps_apple = [entry for entry in app_store if english_check(entry[1])]
print(len(eng_apps_goog))
print(len(eng_apps_apple))

9614
6183


In [10]:
# Separate the free apps from the paid ones; the free apps form the final datasets.
# Prices are compared as the raw strings stored in the rows: '0' marks a free
# Google Play app (column 7) and "0.0" a free App Store app (column 4).
android_final = [entry for entry in eng_apps_goog if entry[7] == '0']
apple_final = [entry for entry in eng_apps_apple if entry[4] == "0.0"]

print(len(android_final))
print(len(apple_final))

8864
3222


Our goal for this analysis is to figure out which apps are most popular on both the Apple App Store and the Google Play Store. First, let's create frequency tables for the genre columns in both stores.

In [11]:
def freq_table(dataset, index):
    """Build a percentage frequency table for one column of a dataset.

    Args:
        dataset: iterable of rows (each row indexable by position).
        index: position of the column to tabulate.

    Returns:
        dict mapping each distinct value found in that column to the
        percentage of rows (0-100) holding it, in first-seen order.
    """
    occurrences = {}
    n_rows = 0
    for record in dataset:
        n_rows += 1
        value = record[index]
        occurrences[value] = occurrences.get(value, 0) + 1

    # Convert the raw counts into percentages of the total row count.
    return {value: (seen / n_rows) * 100 for value, seen in occurrences.items()}

def display_table(dataset, index):
    """Print the frequency table of one column, largest percentage first.

    Args:
        dataset: iterable of rows, passed straight to freq_table.
        index: position of the column to tabulate.

    Each printed line has the form "<value> : <percentage>".
    """
    percentages = freq_table(dataset, index)

    # Pair as (percentage, value) so tuple ordering sorts by percentage first
    # and falls back to the value itself on ties.
    ranked = sorted(((share, value) for value, share in percentages.items()), reverse=True)
    for share, value in ranked:
        print(value, ':', share)

In [12]:
# Percentage of the free English App Store apps in each `prime_genre`
# (column 11 of the App Store header), most common genre first.
display_table(apple_final,11) # Apple prime genre

Games : 58.16263190564867
Entertainment : 7.883302296710118
Photo & Video : 4.9658597144630665
Education : 3.662321539416512
Social Networking : 3.2898820608317814
Shopping : 2.60707635009311
Utilities : 2.5139664804469275
Sports : 2.1415270018621975
Music : 2.0484171322160147
Health & Fitness : 2.0173805090006205
Productivity : 1.7380509000620732
Lifestyle : 1.5828677839851024
News : 1.3345747982619491
Travel : 1.2414649286157666
Finance : 1.1173184357541899
Weather : 0.8690254500310366
Food & Drink : 0.8069522036002483
Reference : 0.5586592178770949
Business : 0.5276225946617008
Book : 0.4345127250155183
Navigation : 0.186219739292365
Medical : 0.186219739292365
Catalogs : 0.12414649286157665


In [13]:
# Percentage of the free English Google Play apps in each `Category`
# (column 1 of the Google Play header), most common category first.
display_table(android_final,1) # Android category

FAMILY : 18.907942238267147
GAME : 9.724729241877256
TOOLS : 8.461191335740072
BUSINESS : 4.591606498194946
LIFESTYLE : 3.9034296028880866
PRODUCTIVITY : 3.892148014440433
FINANCE : 3.7003610108303246
MEDICAL : 3.531137184115524
SPORTS : 3.395758122743682
PERSONALIZATION : 3.3167870036101084
COMMUNICATION : 3.2378158844765346
HEALTH_AND_FITNESS : 3.0798736462093865
PHOTOGRAPHY : 2.944494584837545
NEWS_AND_MAGAZINES : 2.7978339350180503
SOCIAL : 2.6624548736462095
TRAVEL_AND_LOCAL : 2.33528880866426
SHOPPING : 2.2450361010830324
BOOKS_AND_REFERENCE : 2.1435018050541514
DATING : 1.861462093862816
VIDEO_PLAYERS : 1.7937725631768955
MAPS_AND_NAVIGATION : 1.3989169675090252
FOOD_AND_DRINK : 1.2409747292418771
EDUCATION : 1.1620036101083033
ENTERTAINMENT : 0.9589350180505415
LIBRARIES_AND_DEMO : 0.9363718411552346
AUTO_AND_VEHICLES : 0.9250902527075812
HOUSE_AND_HOME : 0.8235559566787004
WEATHER : 0.8009927797833934
EVENTS : 0.7107400722021661
PARENTING : 0.6543321299638989
ART_AND_DESIGN : 

In [14]:
# Percentage of the free English Google Play apps in each `Genres` value
# (column 9 of the Google Play header), most common first.
display_table(android_final,9) # Android genre

Tools : 8.449909747292418
Entertainment : 6.069494584837545
Education : 5.347472924187725
Business : 4.591606498194946
Productivity : 3.892148014440433
Lifestyle : 3.892148014440433
Finance : 3.7003610108303246
Medical : 3.531137184115524
Sports : 3.463447653429603
Personalization : 3.3167870036101084
Communication : 3.2378158844765346
Action : 3.1024368231046933
Health & Fitness : 3.0798736462093865
Photography : 2.944494584837545
News & Magazines : 2.7978339350180503
Social : 2.6624548736462095
Travel & Local : 2.3240072202166067
Shopping : 2.2450361010830324
Books & Reference : 2.1435018050541514
Simulation : 2.0419675090252705
Dating : 1.861462093862816
Arcade : 1.8501805054151623
Video Players & Editors : 1.7712093862815883
Casual : 1.7599277978339352
Maps & Navigation : 1.3989169675090252
Food & Drink : 1.2409747292418771
Puzzle : 1.128158844765343
Racing : 0.9927797833935018
Role Playing : 0.9363718411552346
Libraries & Demo : 0.9363718411552346
Auto & Vehicles : 0.9250902527075

It looks as though the apps on the App Store are geared more toward games, entertainment, and social media. The Android store, however, is more of a mix of productive and non-productive apps.

In [17]:
# Average number of user ratings (`rating_count_tot`, column 5) per App Store
# genre, used here as a stand-in for how many users an app in that genre gets.
apple_freq = freq_table(apple_final,11)

# One pass over the data collects the rating-count sum and the app count of
# every genre, instead of rescanning the whole dataset once per genre.
rating_totals = {}
genre_sizes = {}
for row in apple_final:
    genre = row[11]
    rating_totals[genre] = rating_totals.get(genre, 0) + float(row[5])
    genre_sizes[genre] = genre_sizes.get(genre, 0) + 1

# Every genre in apple_freq comes from apple_final, so each has at least one app.
for genre in apple_freq:
    avg_n_ratings = rating_totals[genre] / genre_sizes[genre]
    print(genre, ":", avg_n_ratings)

Utilities : 18684.456790123455
Photo & Video : 28441.54375
Health & Fitness : 23298.015384615384
Music : 57326.530303030304
Social Networking : 71548.34905660378
News : 21248.023255813954
Book : 39758.5
Productivity : 21028.410714285714
Finance : 31467.944444444445
Sports : 23008.898550724636
Medical : 612.0
Business : 7491.117647058823
Shopping : 26919.690476190477
Food & Drink : 33333.92307692308
Reference : 74942.11111111111
Entertainment : 14029.830708661417
Games : 22788.6696905016
Travel : 28243.8
Navigation : 86090.33333333333
Weather : 52279.892857142855
Lifestyle : 16485.764705882353
Catalogs : 4004.0
Education : 7003.983050847458


According to the average rating count, we should be looking to make a social networking, navigation, or reference app.

In [21]:
# Average install figure per Google Play category.
google_cat = freq_table(android_final, 1)

# One pass over the data collects the install sum and the app count of every
# category, instead of rescanning the whole dataset once per category.
install_totals = {}
category_sizes = {}
for row in android_final:
    category = row[1]
    # Installs are stored as text such as '500,000+'; drop the '+' and the
    # thousands separators before converting. The '+' means the real figure is
    # at least this large, so the averages below are rough estimates.
    installs = float(row[5].replace("+","").replace(",",""))
    install_totals[category] = install_totals.get(category, 0) + installs
    category_sizes[category] = category_sizes.get(category, 0) + 1

# Every category in google_cat comes from android_final, so each has at least one app.
for category in google_cat:
    avg_installs = install_totals[category] / category_sizes[category]
    print(category, ":", avg_installs)

TOOLS : 10801391.298666667
NEWS_AND_MAGAZINES : 9549178.467741935
COMMUNICATION : 38456119.167247385
FOOD_AND_DRINK : 1924897.7363636363
EDUCATION : 1833495.145631068
BUSINESS : 1712290.1474201474
AUTO_AND_VEHICLES : 647317.8170731707
PRODUCTIVITY : 16787331.344927534
EVENTS : 253542.22222222222
PHOTOGRAPHY : 17840110.40229885
LIBRARIES_AND_DEMO : 638503.734939759
GAME : 15588015.603248259
LIFESTYLE : 1437816.2687861272
MAPS_AND_NAVIGATION : 4056941.7741935486
TRAVEL_AND_LOCAL : 13984077.710144928
ENTERTAINMENT : 11640705.88235294
BOOKS_AND_REFERENCE : 8767811.894736841
BEAUTY : 513151.88679245283
ART_AND_DESIGN : 1986335.0877192982
HEALTH_AND_FITNESS : 4188821.9853479853
VIDEO_PLAYERS : 24727872.452830188
SHOPPING : 7036877.311557789
FINANCE : 1387692.475609756
PERSONALIZATION : 5201482.6122448975
SPORTS : 3638640.1428571427
FAMILY : 3695641.8198090694
SOCIAL : 23253652.127118643
PARENTING : 542603.6206896552
COMICS : 817657.2727272727
MEDICAL : 120550.61980830671
DATING : 854028.8303