# Profitable App Profiles 
 We build apps that are free to download and install, and our main source of revenue is in-app ads.
 Our goal for this project is to analyze app-store data to help our developers understand which types of apps are likely to attract more users.

In [1]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows ``dataset[start:end]`` and, optionally, the dataset's size.

    Each row in the slice is printed followed by a blank separator.

    Args:
        dataset: A sequence of rows (each row itself a sequence of fields).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows and
            the number of columns, measured as the length of the first row.
            An empty dataset is reported as having 0 columns.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure; report 0 columns
        # instead of failing with IndexError on dataset[0].
        print('Number of columns:', len(dataset[0]) if dataset else 0)

In [2]:
from csv import reader

# Load both CSV files fully into memory as lists of rows; row 0 of each list
# is the header row with the column names.
# - `with` closes each file handle as soon as its rows have been read.
# - The encoding is given explicitly because app names contain non-ASCII
#   characters; the files are assumed to be UTF-8 encoded.
# - newline="" lets the csv module handle line endings inside quoted fields.
with open("AppleStore.csv", encoding="utf8", newline="") as open_AppleStore:
    read_AppleStore = reader(open_AppleStore)
    AppleStore = list(read_AppleStore)

with open("googleplaystore.csv", encoding="utf8", newline="") as open_GooglePlay:
    read_GooglePlay = reader(open_GooglePlay)
    GooglePlay = list(read_GooglePlay)

In [3]:
# Preview the first five data rows of each store. The [1:] slice skips the
# header row at index 0, so the reported row counts exclude the header.
explore_data(AppleStore[1:], 0, 5, rows_and_columns=True)

explore_data(GooglePlay[1:], 0, 5, rows_and_columns=True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


['284035177', 'Pandora - Music & Radio', '130242560', 'USD', '0.0', '1126879', '3594', '4.0', '4.5', '8.4.1', '12+', 'Music', '37', '4', '1', '1']


Number of rows: 7197
Number of columns: 16
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '

In [4]:
# Print the header row (column names) of each dataset, each followed by a
# blank separator, to identify the columns that could help with the analysis.

print(AppleStore[0])
print("\n")
print(GooglePlay[0])
print("\n")

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']




In [5]:
# Inspect the malformed Google Play row. Per the printed output it has 12
# fields instead of the header's 13 (the 'Category' value is missing), so the
# remaining values are shifted one column to the left.
print(GooglePlay[10473])

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [6]:
# Remove the malformed entry at index 10473, printing the dataset length
# before and after so the effect is visible.
print(len(GooglePlay))
print("\n")
# Delete only while the row's field count disagrees with the header. This
# keeps the cell safe to re-run: once the bad row is gone, a well-formed row
# occupies this index and must not be deleted.
if len(GooglePlay[10473]) != len(GooglePlay[0]):
    del GooglePlay[10473]
print(len(GooglePlay))

10842


10841


In [7]:
# Re-check index 10473: after the deletion this slot should hold the next
# row, which is expected to have the full set of columns.
print(GooglePlay[10473])

['osmino Wi-Fi: free WiFi', 'TOOLS', '4.2', '134203', '4.1M', '10,000,000+', 'Free', '0', 'Everyone', 'Tools', 'August 7, 2018', '6.06.14', '4.4 and up']


In [8]:
# Google Play has some duplicate entries: collect the name of every repeated
# occurrence of an app (a name appearing k times contributes k - 1 entries).
duplcate_googleApps = []
unique_googleApps = []
# Companion set used only for O(1) membership tests; the list above still
# records the unique names in first-seen order.
seen_googleApps = set()
for app in GooglePlay[1:]:  # skip the header row
    name = app[0]
    if name in seen_googleApps:
        duplcate_googleApps.append(name)
    else:
        seen_googleApps.add(name)
        unique_googleApps.append(name)

print("Number of duplicate apps:",len(duplcate_googleApps))
print("\n")
print("Examples of duplicate apps:",duplcate_googleApps[:15])

Number of duplicate apps: 1181


Examples of duplicate apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack', 'FreshBooks Classic', 'Insightly CRM', 'QuickBooks Accounting: Invoicing & Expenses', 'HipChat - Chat Built for Teams', 'Xero Accounting Software']


In [9]:
# Step 1 of duplicate removal: map each app name to the highest review count
# found among its entries (the header row is skipped).
reviews_max = {}
for app in GooglePlay[1:]:
    name, n_reviews = app[0], float(app[3])
    # Store the count for a name seen for the first time, or replace the
    # stored count when this entry has strictly more reviews.
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews

print("the updated length of GooglePlay: ",len(reviews_max))
        

the updated length of GooglePlay:  9659


In [10]:
# Step 2 of duplicate removal: keep exactly one row per app name, namely the
# first row whose review count equals that name's maximum in `reviews_max`.
android_clean = []
already_added = []
# Companion set used only for O(1) membership tests; `already_added` still
# records the kept names in order.
added_names = set()

for app in GooglePlay[1:]:  # skip the header row
    name = app[0]
    n_reviews = float(app[3])
    # The membership check guards against several duplicates sharing the same
    # maximum review count.
    if reviews_max[name] == n_reviews and name not in added_names:
        android_clean.append(app)
        already_added.append(name)
        added_names.add(name)
print("the cleaned GooglePlay length: ",len(android_clean))
print("\n")
print(android_clean[:15])
  

the cleaned GooglePlay length:  9659


[['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'], ['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'], ['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up'], ['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up'], ['Paper flowers instructions', 'ART_AND_DESIGN', '4.4', '167', '5.6M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'March 26, 2017', '1.0', '2.3 and up'], ['Smoke Effect Photo Maker - Smoke Editor', 'ART_AND_DES

In [11]:
#function test whether string has non-English chararcters
def test_EnglishOnly(String):
    Test = True
    for character in String:
        if ord(character) > 127:
            Test = False
    return Test

# Sanity checks: the plain ASCII name passes, while the names containing
# Chinese characters, a trademark sign, or an emoji are all rejected, which
# shows the strict test also drops apps that are really English.
print(test_EnglishOnly('Instagram'))
print(test_EnglishOnly('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(test_EnglishOnly('Docs To Go™ Free Office Suite'))
print(test_EnglishOnly('Instachat 😜'))

True
False
False
False


In [12]:
#function test whether string has more than three non-English chararcters

def test_EnglishOnly_updated(String):
    Test = 0
    for character in String:
        if ord(character) > 127:
            Test += 1
    if Test > 3:
        return False
    else:
        return True
# Sanity checks: names with a single trademark sign or emoji now pass, while
# the name made up mostly of Chinese characters is still rejected.
print(test_EnglishOnly_updated('Docs To Go™ Free Office Suite'))
print(test_EnglishOnly_updated("Instachat 😜"))
print(test_EnglishOnly_updated('爱奇艺PPS -《欢乐颂2》电视剧热播'))

True
True
False


In [13]:
# Keep only the apps whose name passes the lenient English-name test.
# The app name is column 0 in the Google Play rows and column 1 in the
# App Store rows.
ios = AppleStore[1:]  # App Store data without its header row
android_english = [
    app for app in android_clean if test_EnglishOnly_updated(app[0])
]
ios_english = [app for app in ios if test_EnglishOnly_updated(app[1])]

explore_data(android_english, 0, 3, True)
print("\n")
explore_data(ios_english, 0, 3, True)


    

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9614
Number of columns: 13


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 

In [15]:
# Isolate the free apps. The price is column 7 in the Google Play rows, where
# free is the string "0", and column 4 in the App Store rows, where free is
# the string "0.0".
android_free = [app for app in android_english if app[7] == "0"]
ios_free = [app for app in ios_english if app[4] == "0.0"]

explore_data(android_free, 0, 3, True)
print("\n")
explore_data(ios_free, 0, 3, True)



['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 8864
Number of columns: 13


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 

In [16]:
def frequency_table(dataset, index):
    """Return the percentage share of each distinct value in one column.

    Args:
        dataset: An iterable of rows (indexable sequences).
        index: Position of the column to tabulate within each row.

    Returns:
        A dict mapping each distinct value found at ``row[index]`` to the
        percentage (0-100) of rows holding it, in first-seen order. An empty
        dataset yields an empty dict.
    """
    counts = {}
    for row in dataset:
        value = row[index]
        counts[value] = counts.get(value, 0) + 1
    # Every row adds one to exactly one bucket, so the buckets sum to the
    # number of rows.
    n_rows = sum(counts.values())
    return {value: (count / n_rows) * 100 for value, count in counts.items()}

def display_table(dataset, index):
    """Print the frequency table of one column, largest percentage first.

    Args:
        dataset: An iterable of rows, passed on to ``frequency_table``.
        index: Position of the column to tabulate within each row.

    Each output line has the form ``value :  percentage``.
    """
    percentages = frequency_table(dataset, index)
    # Pair each value as (percentage, value) so the sort orders by percentage
    # first and falls back to the value itself on ties.
    ranked = [(percentages[value], value) for value in percentages]
    ranked.sort(reverse=True)
    for share, value in ranked:
        print(value, ": ", share)

# Percentage of free English iOS apps per value of column -5, which is
# 'prime_genre' in the 16-column App Store header.
display_table(ios_free, -5)
        
                            

Games :  58.16263190564867
Entertainment :  7.883302296710118
Photo & Video :  4.9658597144630665
Education :  3.662321539416512
Social Networking :  3.2898820608317814
Shopping :  2.60707635009311
Utilities :  2.5139664804469275
Sports :  2.1415270018621975
Music :  2.0484171322160147
Health & Fitness :  2.0173805090006205
Productivity :  1.7380509000620732
Lifestyle :  1.5828677839851024
News :  1.3345747982619491
Travel :  1.2414649286157666
Finance :  1.1173184357541899
Weather :  0.8690254500310366
Food & Drink :  0.8069522036002483
Reference :  0.5586592178770949
Business :  0.5276225946617008
Book :  0.4345127250155183
Navigation :  0.186219739292365
Medical :  0.186219739292365
Catalogs :  0.12414649286157665


In [18]:
# Percentage of free English Android apps per value of column 1, which is
# 'Category' in the Google Play header.
display_table(android_free, 1)

FAMILY :  18.907942238267147
GAME :  9.724729241877256
TOOLS :  8.461191335740072
BUSINESS :  4.591606498194946
LIFESTYLE :  3.9034296028880866
PRODUCTIVITY :  3.892148014440433
FINANCE :  3.7003610108303246
MEDICAL :  3.531137184115524
SPORTS :  3.395758122743682
PERSONALIZATION :  3.3167870036101084
COMMUNICATION :  3.2378158844765346
HEALTH_AND_FITNESS :  3.0798736462093865
PHOTOGRAPHY :  2.944494584837545
NEWS_AND_MAGAZINES :  2.7978339350180503
SOCIAL :  2.6624548736462095
TRAVEL_AND_LOCAL :  2.33528880866426
SHOPPING :  2.2450361010830324
BOOKS_AND_REFERENCE :  2.1435018050541514
DATING :  1.861462093862816
VIDEO_PLAYERS :  1.7937725631768955
MAPS_AND_NAVIGATION :  1.3989169675090252
FOOD_AND_DRINK :  1.2409747292418771
EDUCATION :  1.1620036101083033
ENTERTAINMENT :  0.9589350180505415
LIBRARIES_AND_DEMO :  0.9363718411552346
AUTO_AND_VEHICLES :  0.9250902527075812
HOUSE_AND_HOME :  0.8235559566787004
WEATHER :  0.8009927797833934
EVENTS :  0.7107400722021661
PARENTING :  0.6543

In [21]:
# Average number of user ratings (column 5, 'rating_count_tot') per iOS genre
# (column -5), collected as (average, genre) tuples and then printed.
ios_rate_table = []
ios_frequency = frequency_table(ios_free, -5)  # used only for its genre keys
for genre in ios_frequency:
    genre_rows = [row for row in ios_free if row[-5] == genre]
    len_genre = len(genre_rows)
    total = 0
    for row in genre_rows:
        total += float(row[5])
    ios_rate_table.append((total / len_genre, genre))
for average, genre in ios_rate_table:
    print(genre, ": ", average)

Catalogs :  4004.0
Navigation :  86090.33333333333
Utilities :  18684.456790123455
Photo & Video :  28441.54375
Book :  39758.5
Productivity :  21028.410714285714
Shopping :  26919.690476190477
Food & Drink :  33333.92307692308
Reference :  74942.11111111111
Social Networking :  71548.34905660378
Sports :  23008.898550724636
News :  21248.023255813954
Finance :  31467.944444444445
Games :  22788.6696905016
Business :  7491.117647058823
Health & Fitness :  23298.015384615384
Lifestyle :  16485.764705882353
Education :  7003.983050847458
Travel :  28243.8
Entertainment :  14029.830708661417
Music :  57326.530303030304
Weather :  52279.892857142855
Medical :  612.0


In [29]:
#Now let's calculate the average number of installs per app genre for the Google Play data set.
def remove_comma_plus(string):
    """Return *string* with every '+' and ',' character removed.

    For example, ``"10,000+"`` becomes ``"10000"``.
    """
    for unwanted in ("+", ","):
        string = string.replace(unwanted, "")
    return string

# Average number of installs (column 5, 'Installs') per Google Play category
# (column 1), collected as (average, category) tuples and then printed.
# Install counts such as '10,000+' are cleaned with remove_comma_plus before
# being converted to float.
android_install_table = []
android_frequency = frequency_table(android_free, 1)  # used only for its keys
for genre in android_frequency:
    genre_rows = [row for row in android_free if row[1] == genre]
    len_genre = len(genre_rows)
    total = 0
    for row in genre_rows:
        total += float(remove_comma_plus(row[5]))
    android_install_table.append((total / len_genre, genre))
for average, genre in android_install_table:
    print(genre, ": ", average)

ART_AND_DESIGN :  1986335.0877192982
PERSONALIZATION :  5201482.6122448975
FAMILY :  3695641.8198090694
MEDICAL :  120550.61980830671
COMMUNICATION :  38456119.167247385
ENTERTAINMENT :  11640705.88235294
VIDEO_PLAYERS :  24727872.452830188
BOOKS_AND_REFERENCE :  8767811.894736841
GAME :  15588015.603248259
HOUSE_AND_HOME :  1331540.5616438356
AUTO_AND_VEHICLES :  647317.8170731707
BEAUTY :  513151.88679245283
WEATHER :  5074486.197183099
BUSINESS :  1712290.1474201474
FINANCE :  1387692.475609756
SHOPPING :  7036877.311557789
LIBRARIES_AND_DEMO :  638503.734939759
SOCIAL :  23253652.127118643
MAPS_AND_NAVIGATION :  4056941.7741935486
DATING :  854028.8303030303
FOOD_AND_DRINK :  1924897.7363636363
NEWS_AND_MAGAZINES :  9549178.467741935
COMICS :  817657.2727272727
SPORTS :  3638640.1428571427
PRODUCTIVITY :  16787331.344927534
HEALTH_AND_FITNESS :  4188821.9853479853
PARENTING :  542603.6206896552
TRAVEL_AND_LOCAL :  13984077.710144928
EDUCATION :  1833495.145631068
PHOTOGRAPHY :  178