# Guided Project: Profitable App Profiles for the App Store and Google Play Markets

1. Open `AppleStore.csv` and `GooglePlayStore.csv`
2. Convert dataset files to list

In [11]:
from csv import reader
app_store = list(reader(open('../datasets/AppleStore.csv')))
google_play_store = list(reader(open('../datasets/GooglePlayStore.csv')))

print(app_store[:5])
print(google_play_store[:5])

[['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'], ['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1'], ['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1'], ['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1'], ['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']]
[['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver'], ['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_A

1. `get_size()`: Return a dictionary that contains the number of rows and columns of a dataset
2. `print_slice()`: Perform list slicing to a dataset and return the result

In [12]:
def get_size(dataset, isHeader = False):
    return {
        'rows': len(dataset) - 1 * int(isHeader),
        'columns': len(dataset[0])
    }

print(get_size(google_play_store, isHeader = True)['rows'])
print(get_size(google_play_store, isHeader = True)['columns'])

def print_slice(dataset, start, end):
    for i in range(start, end):
        print(i, "th row:", dataset[i])

print_slice(app_store, 0, 5)

10841
13
0 th row: ['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']
1 th row: ['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']
2 th row: ['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']
3 th row: ['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']
4 th row: ['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


Clean incorrect entry in `GooglePlayStore.csv`

In [13]:
print(google_play_store[10473])

del google_play_store[10473]

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


`remove_duplicate()`: Remove duplicate entries from dataset

In [14]:
def remove_duplicate(dataset, is_header = False):
    unique_entries = {}
    duplicate_names = []
    start = int(is_header)
    for row in dataset[start:]:
        row_name = row[0]
        if row_name not in unique_entries:
            unique_entries[row_name] = [row]
        else:
            unique_entries[row_name].append(row)
            if row_name not in duplicate_names:
                duplicate_names.append(row_name)

    keep_entries = {}

    for name in duplicate_names:
        duplicate_entries = unique_entries[name]
        previous_reviews = int(duplicate_entries[0][3])
        for entry in duplicate_entries[1:]:
            entry_reviews = int(entry[3])
            if entry_reviews > previous_reviews:
                previous_reviews = entry_reviews
        keep_entries[name] = previous_reviews

    clean_dataset = []
    clean_dataset_names = []

    for row in dataset[start:]:
        row_name = row[0]
        row_reviews = int(row[3])
        if row_name not in duplicate_names:
            clean_dataset.append(row)
            clean_dataset_names.append(row[0])
        else:
            if (row_reviews == keep_entries[row_name]) and (row_name not in clean_dataset_names):
                clean_dataset.append(row)
                clean_dataset_names.append(row[0])
    return clean_dataset

print(len(google_play_store))

google_play_store_clean = remove_duplicate(google_play_store[1:])

print(len(google_play_store_clean))

10841
9659


1. `is_english()`: Decide whether a string contains non-ASCII character
    - Return `True` when string is only comprised of ASCII characters
    - Return `False` when string contains more than 1 non-ASCII character
2. `is_english_modified()`:
    - Return `True` when string contains less than 3 non-ASCII characters
    - Return `False` when string contains more than 3 non-ASCII characters

In [15]:
def is_english(str):
    for ch in str:
        if ord(ch) > 127:
            return False
    return True

print(is_english('Instagram'))
print(is_english('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(is_english('Docs To Go™ Free Office Suite'))
print(is_english('Instachat 😜'))

def is_english_modified(str):
    non_ascii_count = 0
    for ch in str:
        if ord(ch) > 127:
            non_ascii_count += 1
            if non_ascii_count > 3:
                return False
    return True

print(is_english_modified('Instagram'))
print(is_english_modified('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(is_english_modified('Docs To Go™ Free Office Suite'))
print(is_english_modified('Instachat 😜'))

True
False
False
False
True
False
True
True


`filter_english()`: Remove entries which has non-English name using `non-english-modified()`

In [16]:
def filter_english(dataset, name_index, is_header = False):
    english_dataset = []
    start = int(is_header)
    for row in dataset[start:]:
        row_name = row[name_index]
        if is_english_modified(row_name):
            english_dataset.append(row)
    return english_dataset

print(len(app_store))
print(len(filter_english(app_store[1:], 1)))

print(len(google_play_store))
print(len(filter_english(google_play_store_clean, 0)))

7198
6183
10841
9614


Store free apps and non-free apps of `AppleStore.csv` and `GooglePlayStore.csv` in separate lists

In [17]:
google_play_store_free = []
google_play_store_non_free = []

for row in google_play_store_clean:
    row_price = row[7]
    if row_price == '0':
        google_play_store_free.append(row)
    else:
        google_play_store_non_free.append(row)

for row in google_play_store_free[:5]:
    print([row[0], row[7]])
for row in google_play_store_non_free[:5]:
    print([row[0], row[7]])

app_store_free = []
app_store_non_free = []

for row in app_store[1:]:
    row_price = row[4]
    if row_price == '0.0':
        app_store_free.append(row)
    else:
        app_store_non_free.append(row)

for row in app_store_free[:5]:
    print([row[0], row[4]])
for row in app_store_non_free[:5]:
    print([row[0], row[4]])

['Photo Editor & Candy Camera & Grid & ScrapBook', '0']
['U Launcher Lite – FREE Live Cool Themes, Hide Apps', '0']
['Sketch - Draw & Paint', '0']
['Pixel Draw - Number Art Coloring Book', '0']
['Paper flowers instructions', '0']
['TurboScan: scan documents and receipts in PDF', '$4.99']
['Tiny Scanner Pro: PDF Doc Scan', '$4.99']
['Puffin Browser Pro', '$3.99']
['Truth or Dare Pro', '$1.49']
['Private Dating, Hide App- Blue for PrivacyHider', '$2.99']
['284882215', '0.0']
['389801252', '0.0']
['529479190', '0.0']
['420009108', '0.0']
['284035177', '0.0']
['362949845', '1.99']
['500116670', '0.99']
['479516143', '6.99']
['350642635', '0.99']
['307727765', '0.99']


`frequency_table()`: Generate a frequency table from a column of a dataset

In [18]:
def frequency_table(dataset, column_index, is_header = False):
    freq_table = {}
    start = int(is_header)
    for row in dataset[start:]:
        row_column = row[column_index]
        if row_column not in freq_table:
            freq_table[row_column] = 1
        else:
            freq_table[row_column] += 1
    return freq_table

def percentage_table(dataset, column_index, is_header = False):
    freq_table = frequency_table(dataset, column_index, is_header = is_header)
    percent_table = {}
    total_frequency = len(dataset) - int(is_header)
    for unique_value in freq_table:
        frequency = freq_table[unique_value]
        percent = frequency / total_frequency * 100
        percent_table[unique_value] = percent
    return percent_table

def get_table(table, reverse = True):
    display_table = []
    for key in table:
        value = table[key]
        table_tuple = (value, key)
        display_table.append(table_tuple)
    sorted_table = sorted(display_table, reverse = reverse)
    return sorted_table

Most Common Apps by Genre

In [19]:
google_play_store_target = filter_english(google_play_store_free, 0)
app_store_target = filter_english(app_store_free, 1)

print(len(google_play_store_target))
print(len(app_store_target))

google_play_store_category = get_table(frequency_table(google_play_store_target, 1, is_header = True))
print(google_play_store_category)

google_play_store_genres = get_table(frequency_table(google_play_store_target, 9, is_header = True))
print(google_play_store_genres)

app_store_prime_genre = get_table(frequency_table(app_store_target, 11, is_header = True))
print(app_store_prime_genre)

8864
3222
[(1676, 'FAMILY'), (862, 'GAME'), (750, 'TOOLS'), (407, 'BUSINESS'), (346, 'LIFESTYLE'), (345, 'PRODUCTIVITY'), (328, 'FINANCE'), (313, 'MEDICAL'), (301, 'SPORTS'), (294, 'PERSONALIZATION'), (287, 'COMMUNICATION'), (273, 'HEALTH_AND_FITNESS'), (261, 'PHOTOGRAPHY'), (248, 'NEWS_AND_MAGAZINES'), (236, 'SOCIAL'), (207, 'TRAVEL_AND_LOCAL'), (199, 'SHOPPING'), (190, 'BOOKS_AND_REFERENCE'), (165, 'DATING'), (159, 'VIDEO_PLAYERS'), (124, 'MAPS_AND_NAVIGATION'), (110, 'FOOD_AND_DRINK'), (103, 'EDUCATION'), (85, 'ENTERTAINMENT'), (83, 'LIBRARIES_AND_DEMO'), (82, 'AUTO_AND_VEHICLES'), (73, 'HOUSE_AND_HOME'), (71, 'WEATHER'), (63, 'EVENTS'), (58, 'PARENTING'), (56, 'ART_AND_DESIGN'), (55, 'COMICS'), (53, 'BEAUTY')]
[(749, 'Tools'), (538, 'Entertainment'), (474, 'Education'), (407, 'Business'), (345, 'Productivity'), (345, 'Lifestyle'), (328, 'Finance'), (313, 'Medical'), (307, 'Sports'), (294, 'Personalization'), (287, 'Communication'), (275, 'Action'), (273, 'Health & Fitness'), (261, 

Most Popular Apps by Genre on the App Store

In [20]:
app_store_average_rating_count = []

for cell in app_store_prime_genre:
    prime_genre = cell[1]
    app_count = cell[0]
    total_rating_count = 0
    for row in app_store_target:
        row_prime_genre = row[11]
        row_total_rating_count = int(row[5])
        if row_prime_genre == prime_genre:
            total_rating_count += row_total_rating_count
    app_store_average_rating_count.append((total_rating_count / app_count, prime_genre))

print(sorted(app_store_average_rating_count, reverse = True))

[(86090.33333333333, 'Navigation'), (74942.11111111111, 'Reference'), (72229.76190476191, 'Social Networking'), (57326.530303030304, 'Music'), (52279.892857142855, 'Weather'), (39758.5, 'Book'), (33333.92307692308, 'Food & Drink'), (31467.944444444445, 'Finance'), (28441.54375, 'Photo & Video'), (28243.8, 'Travel'), (26919.690476190477, 'Shopping'), (23298.015384615384, 'Health & Fitness'), (23008.898550724636, 'Sports'), (22788.6696905016, 'Games'), (21248.023255813954, 'News'), (21028.410714285714, 'Productivity'), (18684.456790123455, 'Utilities'), (16485.764705882353, 'Lifestyle'), (14029.830708661417, 'Entertainment'), (7491.117647058823, 'Business'), (7003.983050847458, 'Education'), (4004.0, 'Catalogs'), (612.0, 'Medical')]


Most Popular Apps by Genre on Google Play

In [35]:
google_play_store_average_installs = []

for cell in google_play_store_category:
    category = cell[1]
    app_count = cell[0]
    total_installs = 0
    for row in google_play_store_target:
        row_category = row[1]
        row_installs_temp = row[5]
        row_installs = int(row_installs_temp.replace('+', '').replace(',', ''))
        if category == row_category:
            total_installs += row_installs
    google_play_store_average_installs.append((total_installs / app_count,category))

print(sorted(google_play_store_average_installs, reverse = True))



[(38456119.167247385, 'COMMUNICATION'), (24727872.452830188, 'VIDEO_PLAYERS'), (23253652.127118643, 'SOCIAL'), (17840110.40229885, 'PHOTOGRAPHY'), (16787331.344927534, 'PRODUCTIVITY'), (15588015.603248259, 'GAME'), (13984077.710144928, 'TRAVEL_AND_LOCAL'), (11640705.88235294, 'ENTERTAINMENT'), (10801391.298666667, 'TOOLS'), (9549178.467741935, 'NEWS_AND_MAGAZINES'), (8767811.894736841, 'BOOKS_AND_REFERENCE'), (7036877.311557789, 'SHOPPING'), (5201482.6122448975, 'PERSONALIZATION'), (5074486.197183099, 'WEATHER'), (4188821.9853479853, 'HEALTH_AND_FITNESS'), (4056941.7741935486, 'MAPS_AND_NAVIGATION'), (3695641.8198090694, 'FAMILY'), (3638640.1428571427, 'SPORTS'), (2021805.357142857, 'ART_AND_DESIGN'), (1924897.7363636363, 'FOOD_AND_DRINK'), (1833495.145631068, 'EDUCATION'), (1712290.1474201474, 'BUSINESS'), (1437816.2687861272, 'LIFESTYLE'), (1387692.475609756, 'FINANCE'), (1331540.5616438356, 'HOUSE_AND_HOME'), (854028.8303030303, 'DATING'), (817657.2727272727, 'COMICS'), (647317.8170

Most Liked Apps by Genre on Google Play

In [45]:
google_play_store_average_rating = []

for cell in google_play_store_category:
    category = cell[1]
    app_count = cell[0]
    total_rating = 0
    for row in google_play_store_target:
        row_rating_temp = row[2]
        if row_rating_temp == 'NaN':
            row_rating = 0
        else:
            row_rating = float(row_rating_temp)
        row_category = row[1]
        if category == row_category:
            total_rating += row_rating
    google_play_store_average_rating.append((total_rating / app_count, category))

print(sorted(google_play_store_average_rating, reverse = True))
    

[(4.298058252427182, 'EDUCATION'), (4.260714285714285, 'ART_AND_DESIGN'), (4.118823529411763, 'ENTERTAINMENT'), (4.030742459396756, 'GAME'), (4.025454545454546, 'COMICS'), (3.957088122605364, 'PHOTOGRAPHY'), (3.871830985915492, 'WEATHER'), (3.781407035175881, 'SHOPPING'), (3.6934964200477376, 'FAMILY'), (3.6874213836477985, 'VIDEO_PLAYERS'), (3.674390243902439, 'AUTO_AND_VEHICLES'), (3.648387096774193, 'MAPS_AND_NAVIGATION'), (3.638421052631579, 'BOOKS_AND_REFERENCE'), (3.6375000000000006, 'FINANCE'), (3.6220338983050833, 'SOCIAL'), (3.615384615384615, 'HEALTH_AND_FITNESS'), (3.5913793103448284, 'PARENTING'), (3.5284000000000004, 'TOOLS'), (3.517874396135265, 'TRAVEL_AND_LOCAL'), (3.4854545454545454, 'FOOD_AND_DRINK'), (3.4602739726027405, 'HOUSE_AND_HOME'), (3.4182608695652217, 'PRODUCTIVITY'), (3.4078231292517014, 'PERSONALIZATION'), (3.3905660377358484, 'BEAUTY'), (3.364808362369337, 'COMMUNICATION'), (3.3308970099667774, 'SPORTS'), (3.291618497109824, 'LIFESTYLE'), (3.2770161290322