# Determining the most Profitable App Genre based on AppStore and GooglePlay datasets

**The following project aims to analyse data to help app developers understand which genres of FREE apps are likely to attract more users**

In [None]:
from csv import reader

In [None]:
#  Reading the AppStore CSV dataset and converting it into a list of lists.
#  The `with` block closes the file as soon as the rows are loaded, and
#  newline='' is what the csv module asks for when it is handed a file object.
#  NOTE(review): both files are assumed to be UTF-8 encoded - confirm. Making
#  the encoding explicit keeps non-English app names decoded the same way on
#  every platform instead of depending on the locale's default.

with open('AppleStore.csv', encoding='utf8', newline='') as open_ios:
    read_ios = reader(open_ios)
    ios_list = list(read_ios)
ios_header = ios_list[0]  # first row holds the column names
ios = ios_list[1:]        # remaining rows are the apps

#  Reading the GooglePlay CSV dataset and converting it into a list of lists

with open('googleplaystore.csv', encoding='utf8', newline='') as open_android:
    read_android = reader(open_android)
    android_list = list(read_android)
android_header = android_list[0]  # first row holds the column names
android = android_list[1:]        # remaining rows are the apps

In [None]:
#  Function to explore the data easily

def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a dataset and, optionally, its dimensions.

    Args:
        dataset: List of rows (each row itself a list of values).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows and
            the number of columns, measured on the first row.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # blank lines between rows keep long rows readable

    if rows_and_columns:
        n_rows = len(dataset)
        # An empty dataset has no first row to measure, so report 0 columns
        # instead of failing with an IndexError.
        n_columns = len(dataset[0]) if n_rows else 0
        print('Number of rows:', n_rows)
        print('Number of columns:', n_columns)

In [None]:
# Show the iOS column names, then the first four rows and the dataset's size.
print(ios_header, end='\n\n\n')
explore_data(ios, 0, 4, rows_and_columns=True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


Number of rows: 7197
Number of columns: 16


# Data Cleaning

First, we will check whether any rows in either dataset are missing columns.

In [None]:
#  Data cleaning for GooglePlay: report every row whose number of values
#  differs from the number of header columns.

expected_columns = len(android_header)
print(expected_columns)
# Scan from the very first row (the header was already split off when the file
# was loaded) and take the position from enumerate(): list.index() rescans the
# list on every hit and reports the first equal row rather than this one.
for position, row in enumerate(android):
    if len(row) != expected_columns:
        print(row)
        print('\n')
        print('Index position is:', position)

13
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


Index position is: 10472


The row at index 10472 has only 12 values instead of 13 (its Category value is missing). We will remove it from the dataset.

In [None]:
#  The row at index 10472 is missing a value, so we delete it.
#  The length check makes this cell safe to re-run: once the malformed row is
#  gone, a valid row sits at that index and must not be deleted as well.

print('Before: ', len(android))
if len(android[10472]) != len(android_header):
    del android[10472]
print('After: ', len(android))

Before:  10841
After:  10840


Now we have a total of 10840 rows in the Android dataset.

In [None]:
#  Data cleaning for iOS: report every row whose number of values differs
#  from the number of header columns.

expected_columns = len(ios_header)
print(expected_columns)
# Scan from the very first row (the header was already split off when the file
# was loaded) and take the position from enumerate() rather than list.index(),
# which rescans the list and reports the first equal row.
for position, row in enumerate(ios):
    if len(row) != expected_columns:
        print(row)
        print('\n')
        print('Index position is:', position)

16


As we can see from the output above, there are no rows with missing columns in the iOS data.

## Duplicate Values

Now, let's see if there are any duplicate values.

In [None]:
#  Finding duplicate apps (nothing is removed yet, the names are only counted)
dublicated_apps = []   # one entry per repeated occurrence of an app name
unique_apps = []       # each app name once, in first-seen order
seen_names = set()     # same names as unique_apps, for constant-time lookup

for app in android:
    name = app[0]
    if name in seen_names:
        dublicated_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

print('Number of dublicated apps: ', len(dublicated_apps))
print('Examples:', dublicated_apps[:5])

Number of dublicated apps:  1181
Examples: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings']


There are 1181 duplicate entries. As we can see from the examples above, an app like "Google My Business" appears several times in the dataset, each time with a different number of reviews. Our aim is to keep, for each duplicated app, only the entry with the highest number of reviews and to remove all the other (duplicate) entries.

In [None]:
# Map every app name to the largest review count found among its rows.
reviews_max = {}

for entry in android:
    app_name = entry[0]
    review_count = float(entry[3])
    # Record the count when the name is new or this row beats the stored one.
    if app_name not in reviews_max or reviews_max[app_name] < review_count:
        reviews_max[app_name] = review_count

Now that we have recorded the highest number of reviews for each app name, let's check whether the number of unique names matches the expected length.

In [None]:
# Derive the expected size from the duplicates counted earlier instead of a
# hard-coded number, so the check stays valid if the data or cleaning changes.
print('Expected length:', len(android) - len(dublicated_apps))
print('Actual length:', len(reviews_max))

Expected length: 9659
Actual length: 9659


Now we have to update our Android table. We will create a new list and iterate through the whole dataset, adding only the row with the highest number of reviews for each app to a new list of lists called "android_clean".

In [None]:
# Keep exactly one row per app: the one whose review count equals the maximum
# recorded for that name in reviews_max.
android_clean = []
already_added = []     # names already copied, in insertion order
added_names = set()    # same names, for constant-time membership tests

for app in android:
    name = app[0]
    n_reviews = float(app[3])

    # The membership test matters when several rows of one app share the
    # maximum review count: only the first such row is kept.
    if reviews_max[name] == n_reviews and name not in added_names:
        android_clean.append(app)
        already_added.append(name)
        added_names.add(name)

Let's use our "explore_data" function to get insights about the cleaned Android data.

In [None]:
# First four cleaned Android rows, followed by the row and column counts.
explore_data(android_clean, 0, 4, rows_and_columns=True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


Number of rows: 9659
Number of columns: 13


The same steps are applied to the iOS dataset.

In [None]:
#  Let's see if there are any duplicated apps in the iOS file.
#  Column 0 of the iOS rows is 'id', so apps are compared by their id.
dublicated_apps2 = []  # one entry per repeated occurrence of an id
unique_apps2 = []      # each id once, in first-seen order
seen_ids = set()       # same ids as unique_apps2, for constant-time lookup

for app in ios:
    name = app[0]
    if name in seen_ids:
        dublicated_apps2.append(name)
    else:
        seen_ids.add(name)
        unique_apps2.append(name)

print('Number of dublicated apps: ', len(dublicated_apps2))

Number of dublicated apps:  0


As can be seen from the output above, there are no duplicate apps in the iOS dataset.

# Non-English Named Apps

Some of the apps have non-English names, so let's try to remove them. We'll write a function that takes a string as an argument and checks whether all of its characters are ones commonly used in English text. Those characters all fall in the ASCII range from 0 to 127. Therefore, any character with a code above 127 is considered non-English.

In [None]:
def language_check(string):
    """Return True when every character of ``string`` is ASCII (code <= 127).

    The check covers the whole string: a single character above 127 anywhere
    in it makes the result False. An empty string contains no such character
    and therefore returns True.
    """
    # all() stops at the first non-ASCII character it meets.
    return all(ord(char) <= 127 for char in string)
#  Example: a Cyrillic name is rejected, plain ASCII names are accepted.
#  (The Cyrillic literal had been stored as mis-decoded bytes; it is restored
#  here. The printed results are the same.)
print(language_check('Обработка фото'))
print(language_check('Photoshop'))
print(language_check('Funny App :)'))

False
True
True


Some apps use characters that fall outside the ASCII range (0 - 127). To solve this problem we will allow up to 3 non-English characters in the app name.

In [1]:
#  Let's change the previous function little bit
def language_check(string):
    """Return True unless ``string`` has more than three non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127.
    """
    non_ascii_total = sum(1 for symbol in string if ord(symbol) > 127)
    return non_ascii_total <= 3
    
# Names with a trademark sign or a single emoji stay within the allowance of
# three non-ASCII characters; a fully Cyrillic name does not.
# (The literals had been stored as mis-decoded bytes; they are restored here.
# The printed results are the same.)
print(language_check('Docs To Go™ Free Office Suite'))
print(language_check('Instachat 😜'))
print(language_check('Название приложения'))

True
True
False


Let's use this function to filter our datasets.

In [None]:
# Keep only the apps whose names pass language_check.
# The app name is column 1 ('track_name') in the iOS rows and column 0 ('App')
# in the Android rows.
ios_updated = [app for app in ios if language_check(app[1])]
android_updated = [app for app in android_clean if language_check(app[0])]

# Preview three rows of each filtered dataset together with its dimensions.
explore_data(android_updated, 0, 3, rows_and_columns=True)
print('\n')
explore_data(ios_updated, 0, 3, rows_and_columns=True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9614
Number of columns: 13


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+'

We are left with 9614 Android apps and 6183 IOS apps.

  The following has been removed so far:

1.   Inaccurate data 
2.   Duplicate data 
3.   Non-English data


# Free Apps

Now we have to keep only the free apps in both datasets. To achieve this, we will iterate through each dataset and copy only the free apps into new lists, leaving the paid apps behind.

In [None]:
# iOS rows are free when column 4 ('price') is the string '0.0'.
ios_free = [app for app in ios_updated if app[4] == '0.0']

# Android rows are free when column 6 ('Type') is the string 'Free'.
android_free = [app for app in android_updated if app[6] == 'Free']

#  Let's see how many apps are left after keeping only the free ones
print(len(android_free))
print(len(ios_free))

8863
3222


In [None]:
#To minimize risks and overhead, our validation strategy for an app idea has three steps:

    #Build a minimal Android version of the app, and add it to Google Play.
    #If the app has a good response from users, we develop it further.
    #If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.

#Because our end goal is to add the app on both Google Play and the App Store, we need to find app profiles that are successful in both markets. For instance, a profile that works well for both markets might be a productivity app that makes use of gamification.

To minimize risks and overhead, our validation strategy for an app idea has three steps:

1.   Build a minimal Android version of the app, and add it to Google Play.
2.   If the app has a good response from users, we develop it further.
3.   If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.
    

Because our end goal is to add the app on both Google Play and the App Store, we need to find app profiles that are successful in both markets. For instance, a profile that works well for both markets might be a productivity app that makes use of gamification.

In [None]:
#  Let's check which columns we should focus on

print(ios_header, end='\n\n\n')
print(android_header)

#  For iOS it is: prime_genre
#  For Android it is: Category and Genres

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


Now we will build a frequency table to determine the most popular app genre in descending order.

In [None]:
def freq_table(dataset, index):
    """Return the percentage share of each distinct value in one column.

    Args:
        dataset: Iterable of rows (each row indexable).
        index: Position of the column to tabulate.

    Returns:
        A dict mapping every value found at ``row[index]`` to the percentage
        of rows holding that value, in first-seen order.
    """
    counts = {}
    for record in dataset:
        cell = record[index]
        counts[cell] = counts.get(cell, 0) + 1

    # Every row adds exactly one to some counter, so the counters sum to the
    # number of rows.
    n_rows = sum(counts.values())
    return {cell: (count / n_rows) * 100 for cell, count in counts.items()}

def display_table(dataset, index):
    """Print the frequency table of one column, largest share first.

    Each line has the form ``value : percentage``. Entries are ordered by
    descending (percentage, value) pairs, so equal percentages fall back to
    descending value order.
    """
    shares = freq_table(dataset, index)
    ranked = sorted(((share, value) for value, share in shares.items()),
                    reverse=True)
    for share, value in ranked:
        print(value, ':', share)

Let's run this function for IOS dataset.

In [None]:
# Column -5 of the 16-column iOS rows is 'prime_genre'.
display_table(dataset=ios_free, index=-5)

Games : 58.16263190564867
Entertainment : 7.883302296710118
Photo & Video : 4.9658597144630665
Education : 3.662321539416512
Social Networking : 3.2898820608317814
Shopping : 2.60707635009311
Utilities : 2.5139664804469275
Sports : 2.1415270018621975
Music : 2.0484171322160147
Health & Fitness : 2.0173805090006205
Productivity : 1.7380509000620732
Lifestyle : 1.5828677839851024
News : 1.3345747982619491
Travel : 1.2414649286157666
Finance : 1.1173184357541899
Weather : 0.8690254500310366
Food & Drink : 0.8069522036002483
Reference : 0.5586592178770949
Business : 0.5276225946617008
Book : 0.4345127250155183
Navigation : 0.186219739292365
Medical : 0.186219739292365
Catalogs : 0.12414649286157665


As we can see, among all free English apps, 58.16% are games, while entertainment apps are almost 8%. Photo and video apps are close to 5%. Education genre is 3.66%, while social networking apps are 3.29%.

The overall thought is that the App Store is influenced by apps designed for entertainment (games, entertainment, photo and video, social networking, etc).
Nevertheless, just because there are a lot of fun apps doesn't mean there are a lot of people using them – demand and supply may not be equal. 

Now let us check GooglePlay dataset, namely Genres and Category columns.

In [None]:
# Column 1 of the Android rows is 'Category'.
display_table(dataset=android_free, index=1)

FAMILY : 18.898792733837304
GAME : 9.725826469592688
TOOLS : 8.462146000225657
BUSINESS : 4.592124562789123
LIFESTYLE : 3.9038700214374367
PRODUCTIVITY : 3.8925871601038025
FINANCE : 3.7007785174320205
MEDICAL : 3.5315355974275078
SPORTS : 3.396141261423897
PERSONALIZATION : 3.317161232088458
COMMUNICATION : 3.2381812027530184
HEALTH_AND_FITNESS : 3.0802211440821394
PHOTOGRAPHY : 2.944826808078529
NEWS_AND_MAGAZINES : 2.798149610741284
SOCIAL : 2.6627552747376737
TRAVEL_AND_LOCAL : 2.335552296062281
SHOPPING : 2.245289405393208
BOOKS_AND_REFERENCE : 2.1437436533904997
DATING : 1.8616721200496444
VIDEO_PLAYERS : 1.7939749520478394
MAPS_AND_NAVIGATION : 1.399074805370642
FOOD_AND_DRINK : 1.241114746699763
EDUCATION : 1.1621347173643235
ENTERTAINMENT : 0.9590432133589079
LIBRARIES_AND_DEMO : 0.9364774906916393
AUTO_AND_VEHICLES : 0.9251946293580051
HOUSE_AND_HOME : 0.8236488773552973
WEATHER : 0.8010831546880289
EVENTS : 0.7108202640189552
PARENTING : 0.6544059573507841
ART_AND_DESIGN : 0

The landscape seems significantly different on Google Play: there are not that many apps designed for fun, and it seems that a good number of apps are designed for practical purposes (family, tools, business, lifestyle, productivity, etc.). 

Compared to the iOS data, Google Play looks significantly different. There are not that many apps designed for fun, and apps developed for practical purposes are popular (family, tools, business, lifestyle, finance, etc.).

However, a closer examination reveals that the family category (which accounts for almost 19 percent of all apps) primarily consists of children's games. Despite this, practical apps appear to have a stronger presence on Google Play than on the App Store.
The frequency table for the Genres column also supports this conclusion: 

In [None]:
# Column -4 of the 13-column Android rows is 'Genres'.
display_table(dataset=android_free, index=-4)

Tools : 8.450863138892023
Entertainment : 6.070179397495204
Education : 5.348076272142616
Business : 4.592124562789123
Productivity : 3.8925871601038025
Lifestyle : 3.8925871601038025
Finance : 3.7007785174320205
Medical : 3.5315355974275078
Sports : 3.463838429425702
Personalization : 3.317161232088458
Communication : 3.2381812027530184
Action : 3.102786866749408
Health & Fitness : 3.0802211440821394
Photography : 2.944826808078529
News & Magazines : 2.798149610741284
Social : 2.6627552747376737
Travel & Local : 2.324269434728647
Shopping : 2.245289405393208
Books & Reference : 2.1437436533904997
Simulation : 2.042197901387792
Dating : 1.8616721200496444
Arcade : 1.8503892587160102
Video Players & Editors : 1.771409229380571
Casual : 1.7601263680469368
Maps & Navigation : 1.399074805370642
Food & Drink : 1.241114746699763
Puzzle : 1.128286133363421
Racing : 0.9928917973598104
Role Playing : 0.9364774906916393
Libraries & Demo : 0.9364774906916393
Auto & Vehicles : 0.9251946293580051
S

The difference between the Genres and Category columns is that the Genres column is much more granular and contains more distinct values.
Since we are interested in the overall picture, we will use the Category column as a reference instead of Genres.

It was discovered that AppStore is dominated by games and fun apps, whereas Google Play has a more divided mix of practical and fun apps.

Let's find out what kind of apps have the largest number of users. 

Firstly, we will calculate the average number of user ratings per app genre in AppStore. 

Below are the steps:

1.   Segregate the apps of each genre
2.   Sum up the user ratings for the apps of that genre
3.   Divide the sum by the number of apps belonging to that genre

In [None]:
ios_genres = freq_table(ios_free, -5)

# genre -> [summed number of ratings, number of apps], in the table's order.
# The sum is over rating *counts* (column 5), not over the rating values.
genre_stats = {genre: [0, 0] for genre in ios_genres}
for app in ios_free:
    stats = genre_stats[app[-5]]
    stats[0] += float(app[5])
    stats[1] += 1

# Average number of ratings per app for each genre.
for genre, (ratings_sum, n_apps) in genre_stats.items():
    print(genre, ':', ratings_sum / n_apps)

Social Networking : 71548.34905660378
Photo & Video : 28441.54375
Games : 22788.6696905016
Music : 57326.530303030304
Reference : 74942.11111111111
Health & Fitness : 23298.015384615384
Weather : 52279.892857142855
Utilities : 18684.456790123455
Travel : 28243.8
Shopping : 26919.690476190477
News : 21248.023255813954
Navigation : 86090.33333333333
Lifestyle : 16485.764705882353
Entertainment : 14029.830708661417
Food & Drink : 33333.92307692308
Sports : 23008.898550724636
Book : 39758.5
Finance : 31467.944444444445
Education : 7003.983050847458
Productivity : 21028.410714285714
Business : 7491.117647058823
Catalogs : 4004.0
Medical : 612.0


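From the averages above, Navigation, Reference, and Social Networking apps have the highest average number of user ratings on the App Store, followed by Music and Weather. Photo & Video and Games sit in the middle of the range.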

Let's analyze GooglePlay now.

In [None]:
# Column 5 of the Android rows is 'Installs'.
display_table(dataset=android_free, index=5)

1,000,000+ : 15.728308699086089
100,000+ : 11.55365000564143
10,000,000+ : 10.549475346947986
10,000+ : 10.199706645605326
1,000+ : 8.394448832223853
100+ : 6.916393997517771
5,000,000+ : 6.826131106848697
500,000+ : 5.562450637481666
50,000+ : 4.772650344127271
5,000+ : 4.513144533453684
10+ : 3.542818458761142
500+ : 3.2494640640866526
50,000,000+ : 2.3017037120613786
100,000,000+ : 2.1324607920568655
50+ : 1.9180864267178157
5+ : 0.7898002933543946
1+ : 0.5077287600135394
500,000,000+ : 0.270788672007221
1,000,000,000+ : 0.2256572266726842
0+ : 0.045131445334536835


To avoid calculation errors, we need to convert each install number to float, by removing commas and plus characters.
We will achieve this result in the loop below, at the same time we will calculate the average number of installs for each app genre.

In [None]:
android_categories = freq_table(android_free, 1)

# category -> [summed installs, number of apps], in the table's order.
category_stats = {category: [0, 0] for category in android_categories}
for app in android_free:
    # Install counts look like '1,000,000+': drop the commas and the plus
    # sign before converting to a number.
    installs = float(app[5].replace(',', '').replace('+', ''))
    stats = category_stats[app[1]]
    stats[0] += installs
    stats[1] += 1

# Average number of installs per app for each category.
for category, (installs_sum, n_apps) in category_stats.items():
    print(category, ':', installs_sum / n_apps)

ART_AND_DESIGN : 1986335.0877192982
AUTO_AND_VEHICLES : 647317.8170731707
BEAUTY : 513151.88679245283
BOOKS_AND_REFERENCE : 8767811.894736841
BUSINESS : 1712290.1474201474
COMICS : 817657.2727272727
COMMUNICATION : 38456119.167247385
DATING : 854028.8303030303
EDUCATION : 1833495.145631068
ENTERTAINMENT : 11640705.88235294
EVENTS : 253542.22222222222
FINANCE : 1387692.475609756
FOOD_AND_DRINK : 1924897.7363636363
HEALTH_AND_FITNESS : 4188821.9853479853
HOUSE_AND_HOME : 1331540.5616438356
LIBRARIES_AND_DEMO : 638503.734939759
LIFESTYLE : 1437816.2687861272
GAME : 15588015.603248259
FAMILY : 3697848.1731343283
MEDICAL : 120550.61980830671
SOCIAL : 23253652.127118643
SHOPPING : 7036877.311557789
PHOTOGRAPHY : 17840110.40229885
SPORTS : 3638640.1428571427
TRAVEL_AND_LOCAL : 13984077.710144928
TOOLS : 10801391.298666667
PERSONALIZATION : 5201482.6122448975
PRODUCTIVITY : 16787331.344927534
PARENTING : 542603.6206896552
WEATHER : 5074486.197183099
VIDEO_PLAYERS : 24727872.452830188
NEWS_AND_

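From the results shown above, Communication, Video Players, and Social have the highest average number of installs on Google Play. (The categories are printed in dataset order, not ranked, so Art and Design, Auto and Vehicles, and Beauty merely appear first.)

Based on both datasets from Google Play and the App Store, we can conclude that the most promising categories to attract users are social and communication apps, which rank near the top in both stores (Communication and Social on Google Play, Social Networking on the App Store), with Photography on Google Play and Photo and Video on the App Store as further options. 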

By developing apps in these genres, there is a high probability of making a good profit from showing ads.

Summarizing, data from the App Store and Google Play was analyzed. The main goal was to determine the most popular genre (category) of free apps, so that developers can profit from displaying ads.