# Guided Project: Profitable App Profiles for the App Store and Google Play Markets

1. Open `AppleStore.csv` and `GooglePlayStore.csv`
2. Convert dataset files to list

In [2]:
from csv import reader
app_store = list(reader(open('../datasets/AppleStore.csv')))
google_play_store = list(reader(open('../datasets/GooglePlayStore.csv')))

print(app_store[:5])
print(google_play_store[:5])

[['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'], ['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1'], ['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1'], ['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1'], ['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']]
[['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver'], ['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_A

1. `get_size()`: Return a dictionary that contains the number of rows and columns of a dataset
2. `print_slice()`: Perform list slicing to a dataset and return the result

In [4]:
def get_size(dataset, isHeader = False):
    return {
        'rows': len(dataset) - 1 * int(isHeader),
        'columns': len(dataset[0])
    }

print(get_size(google_play_store, isHeader = True)['rows'])
print(get_size(google_play_store, isHeader = True)['columns'])

def print_slice(dataset, start, end):
    for i in range(start, end):
        print(i, "th row:", dataset[i])

print_slice(app_store, 0, 5)

10841
13
0 th row: ['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']
1 th row: ['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']
2 th row: ['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']
3 th row: ['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']
4 th row: ['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


Clean incorrect entry in `GooglePlayStore.csv`

In [5]:
print(google_play_store[10473])

del google_play_store[10473]

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


`remove_duplicate()`: Remove duplicate entries from dataset

In [6]:
def remove_duplicate(dataset, is_header = False):
    unique_entries = {}
    duplicate_names = []
    start = int(is_header)
    for row in dataset[start:]:
        row_name = row[0]
        if row_name not in unique_entries:
            unique_entries[row_name] = [row]
        else:
            unique_entries[row_name].append(row)
            if row_name not in duplicate_names:
                duplicate_names.append(row_name)

    keep_entries = {}

    for name in duplicate_names:
        duplicate_entries = unique_entries[name]
        previous_reviews = int(duplicate_entries[0][3])
        for entry in duplicate_entries[1:]:
            entry_reviews = int(entry[3])
            if entry_reviews > previous_reviews:
                previous_reviews = entry_reviews
        keep_entries[name] = previous_reviews

    clean_dataset = []
    clean_dataset_names = []

    for row in dataset[start:]:
        row_name = row[0]
        row_reviews = int(row[3])
        if row_name not in duplicate_names:
            clean_dataset.append(row)
            clean_dataset_names.append(row[0])
        else:
            if (row_reviews == keep_entries[row_name]) and (row_name not in clean_dataset_names):
                clean_dataset.append(row)
                clean_dataset_names.append(row[0])
    return clean_dataset

print(len(google_play_store[1:]))
print(len(remove_duplicate(google_play_store[1:], is_header = False)))

10840
9659


1. `is_english()`: Decide whether a string contains non-ASCII character
    - Return `True` when string is only comprised of ASCII characters
    - Return `False` when string contains more than 1 non-ASCII character
2. `is_english_modified()`:
    - Return `True` when string contains less than 3 non-ASCII characters
    - Return `False` when string contains more than 3 non-ASCII characters

In [7]:
def is_english(str):
    for ch in str:
        if ord(ch) > 127:
            return False
    return True

print(is_english('Instagram'))
print(is_english('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(is_english('Docs To Go™ Free Office Suite'))
print(is_english('Instachat 😜'))

def is_english_modified(str):
    non_ascii_count = 0
    for ch in str:
        if ord(ch) > 127:
            non_ascii_count += 1
            if non_ascii_count > 3:
                return False
    return True

print(is_english_modified('Instagram'))
print(is_english_modified('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(is_english_modified('Docs To Go™ Free Office Suite'))
print(is_english_modified('Instachat 😜'))

True
False
False
False
True
False
True
True


`filter_english()`: Remove entries which has non-English name using `non-english-modified()`

In [8]:
def filter_english(dataset, is_header = False):
    english_dataset = []
    start = int(is_header)
    for row in dataset:
        row_name = row[0]
        if is_english_modified(row_name):
            english_dataset.append(row)
    return english_dataset

print(len(app_store))
print(len(filter_english(app_store)))

print(len(google_play_store))
print(len(filter_english(google_play_store)))

7198
7198
10841
10796


Store free apps and non-free apps of `AppleStore.csv` and `GooglePlayStore.csv` in separate lists

In [13]:
google_play_store_free = []
google_play_store_non_free = []

for row in google_play_store[1:]:
    row_price = row[7]
    if row_price == '0':
        google_play_store_free.append(row)
    else:
        google_play_store_non_free.append(row)

for row in google_play_store_free[:5]:
    print([row[0], row[7]])
for row in google_play_store_non_free[:5]:
    print([row[0], row[7]])

app_store_free = []
app_store_non_free = []

for row in app_store[1:]:
    row_price = row[4]
    if row_price == '0.0':
        app_store_free.append(row)
    else:
        app_store_non_free.append(row)

for row in app_store_free[:5]:
    print([row[0], row[4]])
for row in app_store_non_free[:5]:
    print([row[0], row[4]])

['Photo Editor & Candy Camera & Grid & ScrapBook', '0']
['Coloring book moana', '0']
['U Launcher Lite – FREE Live Cool Themes, Hide Apps', '0']
['Sketch - Draw & Paint', '0']
['Pixel Draw - Number Art Coloring Book', '0']
['TurboScan: scan documents and receipts in PDF', '$4.99']
['Tiny Scanner Pro: PDF Doc Scan', '$4.99']
['TurboScan: scan documents and receipts in PDF', '$4.99']
['Tiny Scanner Pro: PDF Doc Scan', '$4.99']
['Puffin Browser Pro', '$3.99']
['284882215', '0.0']
['389801252', '0.0']
['529479190', '0.0']
['420009108', '0.0']
['284035177', '0.0']
['362949845', '1.99']
['500116670', '0.99']
['479516143', '6.99']
['350642635', '0.99']
['307727765', '0.99']


`frequency_table()`: Generate a frequency table from a column of a dataset

In [27]:
def frequency_table(dataset, column_index, is_header = False):
    frequency_table = {}
    start = int(is_header)
    total_frequency = len(dataset) - int(is_header)
    for row in dataset[start:]:
        row_column = row[column_index]
        if row_column not in frequency_table:
            frequency_table[row_column] = 1
        else:
            frequency_table[row_column] += 1
    return frequency_table

def percentage_table(dataset, column_index, is_header = False):
    frequency_table = frequency_table(dataset, column_index, is_header = is_header)
    percentage_table = {}
    total_frequency = len(dataset) - int(is_header)
    for unique_value in frequency_table:
        frequency = frequency_table[unique_value]
        percentage = frequency / total_frequency
        percentage_table[unique_value] = percentage
    return percentage_table

def display_table(table, reverse = False):
    display_table = []
    for key in table:
        value = table[key]
        table_tuple = (value, key)
        display_table.append(table_tuple)
    sorted_table = sorted(display_table, reverse = reverse)
    return sorted_table

google_play_store_category = frequency_table(google_play_store, 1, is_header = True)
print(google_play_store_category)
print(display_table(google_play_store_category, reverse = False))
print(display_table(google_play_store_category, reverse  = True))

google_play_store_genre = frequency_table(google_play_store, 9, is_header = True)
print(google_play_store_genre)

app_store_prime_genre = frequency_table(app_store, 11, is_header = True)
print(app_store_prime_genre)
    

{'ART_AND_DESIGN': 65, 'AUTO_AND_VEHICLES': 85, 'BEAUTY': 53, 'BOOKS_AND_REFERENCE': 231, 'BUSINESS': 460, 'COMICS': 60, 'COMMUNICATION': 387, 'DATING': 234, 'EDUCATION': 156, 'ENTERTAINMENT': 149, 'EVENTS': 64, 'FINANCE': 366, 'FOOD_AND_DRINK': 127, 'HEALTH_AND_FITNESS': 341, 'HOUSE_AND_HOME': 88, 'LIBRARIES_AND_DEMO': 85, 'LIFESTYLE': 382, 'GAME': 1144, 'FAMILY': 1972, 'MEDICAL': 463, 'SOCIAL': 295, 'SHOPPING': 260, 'PHOTOGRAPHY': 335, 'SPORTS': 384, 'TRAVEL_AND_LOCAL': 258, 'TOOLS': 843, 'PERSONALIZATION': 392, 'PRODUCTIVITY': 424, 'PARENTING': 60, 'WEATHER': 82, 'VIDEO_PLAYERS': 175, 'NEWS_AND_MAGAZINES': 283, 'MAPS_AND_NAVIGATION': 137}
[(53, 'BEAUTY'), (60, 'COMICS'), (60, 'PARENTING'), (64, 'EVENTS'), (65, 'ART_AND_DESIGN'), (82, 'WEATHER'), (85, 'AUTO_AND_VEHICLES'), (85, 'LIBRARIES_AND_DEMO'), (88, 'HOUSE_AND_HOME'), (127, 'FOOD_AND_DRINK'), (137, 'MAPS_AND_NAVIGATION'), (149, 'ENTERTAINMENT'), (156, 'EDUCATION'), (175, 'VIDEO_PLAYERS'), (231, 'BOOKS_AND_REFERENCE'), (234, 'DA