# Working with App Store data

This project is about data from the Apple App Store and the Google Play Store.\
The goal is to explore the data sets and gather meaningful information from the data provided.

In [1]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of ``dataset`` and, optionally, its dimensions.

    Args:
        dataset: A sequence of rows; it is sliced with ``dataset[start:end]``
            and the length of its first row is reported as the column count.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When True, also print the number of rows and the
            number of columns.

    An empty ``dataset`` reports zero columns instead of raising IndexError.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # newline literal plus print's own newline: two blank lines

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # Guard the first-row lookup so an empty dataset does not crash.
        column_count = len(dataset[0]) if len(dataset) > 0 else 0
        print('Number of columns:', column_count)

The following code reads in the CSV files. \
*Without an explicit encoding, an error can occur when the list() function is used to turn the file contents into a list* \
*Passing the encoding='utf-8' parameter to open() resolves the error*

In [27]:
from csv import reader

# Load every row of the App Store file into memory (ios[0] is the header row).
# The `with` block closes the file handle as soon as the rows have been read;
# newline='' leaves line-ending handling inside quoted fields to the csv module.
with open('AppleStore.csv', encoding='utf-8', newline='') as opened_file:
    read_file = reader(opened_file)
    ios = list(read_file)

In [28]:
from csv import reader

# Load every row of the Google Play file into memory (andr[0] is the header row).
# The `with` block closes the file handle as soon as the rows have been read;
# newline='' leaves line-ending handling inside quoted fields to the csv module.
with open('googleplaystore.csv', encoding='utf-8', newline='') as opened_file:
    read_file = reader(opened_file)
    andr = list(read_file)

In [29]:
# Preview the first three rows of each data set (the header row plus two apps),
# followed by the row and column counts.
explore_data(dataset=ios, start=0, end=3, rows_and_columns=True)
explore_data(dataset=andr, start=0, end=3, rows_and_columns=True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 7198
Number of columns: 16
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone

In [30]:
# Show the header row (index 0) of each data set next to its label.
for label, rows in (
    ('Column Names for iOS App Store: ', ios),
    ('Column Names for Android App Store, Google Play: ', andr),
):
    print(label, rows[0])

Column Names for iOS App Store:  ['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']
Column Names for Android App Store, Google Play:  ['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


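An error in this data set has been discussed in a forum: the row at index 10473 ('Life Made WI-Fi Touchscreen Photo Frame') is missing its category value, so it has fewer fields than the other rows. We can remove this entry.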

In [31]:
# Inspect the rows around index 10473, the entry reported as erroneous.
# NOTE(review): this slice yields three rows (indices 10471-10473), but the
# recorded output lists four — presumably the cell was last run with a wider
# slice; re-run to confirm.
andr[10471:10474]

[['Jazz Wi-Fi',
  'COMMUNICATION',
  '3.4',
  '49',
  '4.0M',
  '10,000+',
  'Free',
  '0',
  'Everyone',
  'Communication',
  'February 10, 2017',
  '0.1',
  '2.3 and up'],
 ['Xposed Wi-Fi-Pwd',
  'PERSONALIZATION',
  '3.5',
  '1042',
  '404k',
  '100,000+',
  'Free',
  '0',
  'Everyone',
  'Personalization',
  'August 5, 2014',
  '3.0.0',
  '4.0.3 and up'],
 ['Life Made WI-Fi Touchscreen Photo Frame',
  '1.9',
  '19',
  '3.0M',
  '1,000+',
  'Free',
  '0',
  'Everyone',
  '',
  'February 11, 2018',
  '1.0.19',
  '4.0 and up'],
 ['osmino Wi-Fi: free WiFi',
  'TOOLS',
  '4.2',
  '134203',
  '4.1M',
  '10,000,000+',
  'Free',
  '0',
  'Everyone',
  'Tools',
  'August 7, 2018',
  '6.06.14',
  '4.4 and up']]

In [33]:
# Remove the 'Life Made WI-Fi Touchscreen Photo Frame' entry at index 10473.
# That row is missing a value, so it has fewer fields than the header row.
# Checking the field count before deleting makes this cell safe to re-run:
# once the bad row is gone, the row now at this index has the full number of
# fields and is left alone.
malformed_row_index = 10473
if (len(andr) > malformed_row_index
        and len(andr[malformed_row_index]) != len(andr[0])):
    del andr[malformed_row_index]

In [35]:
# Display the same slice again to check which rows now sit at indices 10471-10473.
andr[10471:10474]

[['Jazz Wi-Fi',
  'COMMUNICATION',
  '3.4',
  '49',
  '4.0M',
  '10,000+',
  'Free',
  '0',
  'Everyone',
  'Communication',
  'February 10, 2017',
  '0.1',
  '2.3 and up'],
 ['Xposed Wi-Fi-Pwd',
  'PERSONALIZATION',
  '3.5',
  '1042',
  '404k',
  '100,000+',
  'Free',
  '0',
  'Everyone',
  'Personalization',
  'August 5, 2014',
  '3.0.0',
  '4.0.3 and up'],
 ['osmino Wi-Fi: free WiFi',
  'TOOLS',
  '4.2',
  '134203',
  '4.1M',
  '10,000,000+',
  'Free',
  '0',
  'Everyone',
  'Tools',
  'August 7, 2018',
  '6.06.14',
  '4.4 and up']]

We can see that the malformed row has been removed.

In [37]:
# Print every Google Play row whose first field (the app name) is exactly
# 'Instagram'. The generator filters lazily, so rows are printed in file order
# as they are encountered.
for app in (row for row in andr if row[0] == 'Instagram'):
    print(app)

['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']


In [39]:
# Split the app names (first field of each row) into first occurrences and repeats.
duplicate_apps = []  # every repeated occurrence of a name, in file order
unique_apps = []     # each distinct name once, in first-seen order
seen_names = set()   # same names as unique_apps, for constant-time membership tests

for app in andr:
    name = app[0]
    if name in seen_names:
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

print('Number of duplicate apps:', len(duplicate_apps))
print('\n')
print('Examples of duplicate apps:', duplicate_apps[:15])

Number of duplicate apps: 1181


Examples of duplicate apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack', 'FreshBooks Classic', 'Insightly CRM', 'QuickBooks Accounting: Invoicing & Expenses', 'HipChat - Chat Built for Teams', 'Xero Accounting Software']
