# Data Cleaning - Apple Store

In [1]:
# Importing the libraries
import pandas as pd

In [2]:
# Loading the Apple Store dataset into a DataFrame.
# NOTE: this is an absolute path on the author's machine; adjust it to run elsewhere.
apple_csv_path = '/Users/mattymrc/Desktop/Ironhack/Datasets/AppleStore.csv'
apple = pd.read_csv(apple_csv_path)

# Previewing the first five rows
apple.head()

Unnamed: 0.1,Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
0,1,281656475,PAC-MAN Premium,100788224,USD,3.99,21292,26,4.0,4.5,6.3.5,4+,Games,38,5,10,1
1,2,281796108,Evernote - stay organized,158578688,USD,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity,37,5,23,1
2,3,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,USD,0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather,37,5,3,1
3,4,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,USD,0.0,262241,649,4.0,4.5,5.10.0,12+,Shopping,37,5,9,1
4,5,282935706,Bible,92774400,USD,0.0,985920,5320,4.5,5.0,7.5.1,4+,Reference,37,5,45,1


In [3]:
# Checking the datatype of every column (numeric columns vs. text stored as 'object')
apple.dtypes

Unnamed: 0            int64
id                    int64
track_name           object
size_bytes            int64
currency             object
price               float64
rating_count_tot      int64
rating_count_ver      int64
user_rating         float64
user_rating_ver     float64
ver                  object
cont_rating          object
prime_genre          object
sup_devices.num       int64
ipadSc_urls.num       int64
lang.num              int64
vpp_lic               int64
dtype: object

In [4]:
# Counting the missing values (nulls) in each column
apple.isna().sum()

Unnamed: 0          0
id                  0
track_name          0
size_bytes          0
currency            0
price               0
rating_count_tot    0
rating_count_ver    0
user_rating         0
user_rating_ver     0
ver                 0
cont_rating         0
prime_genre         0
sup_devices.num     0
ipadSc_urls.num     0
lang.num            0
vpp_lic             0
dtype: int64

In [5]:
# Checking which currencies appear in the data. In this run every price is in USD, so the
# 'currency' column carries no information: I will drop it and rename 'price' to say so instead
apple['currency'].value_counts()

USD    7197
Name: currency, dtype: int64

In [6]:
# Selecting just the columns I want, in the order I want them
kept_columns = ['id', 'track_name', 'price', 'prime_genre', 'size_bytes', 'user_rating',
                'rating_count_tot', 'cont_rating', 'sup_devices.num', 'lang.num', 'ipadSc_urls.num']

# New, more intuitive names for some of the kept columns
new_column_names = {'user_rating': 'avg_user_rating', 'rating_count_tot': 'tot_user_rating',
                    'cont_rating': 'pg_rating', 'track_name': 'app_name', 'price': 'price_usd',
                    'sup_devices.num': 'numb_supp_devices', 'lang.num': 'numb_supp_langs',
                    'ipadSc_urls.num': 'numb_screenshots'}

# The rename is chained instead of using inplace=True: renaming a column selection in place
# can trigger pandas' SettingWithCopyWarning (here and on later column assignments), whereas
# the chained call returns a brand-new, independent DataFrame.
apple = apple[kept_columns].rename(columns=new_column_names)

# Checking the results
apple.head()

Unnamed: 0,id,app_name,price_usd,prime_genre,size_bytes,avg_user_rating,tot_user_rating,pg_rating,numb_supp_devices,numb_supp_langs,numb_screenshots
0,281656475,PAC-MAN Premium,3.99,Games,100788224,4.0,21292,4+,38,10,5
1,281796108,Evernote - stay organized,0.0,Productivity,158578688,4.0,161065,4+,37,23,5
2,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",0.0,Weather,100524032,3.5,188583,4+,37,3,5
3,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",0.0,Shopping,128512000,4.0,262241,12+,37,9,5
4,282935706,Bible,0.0,Reference,92774400,4.5,985920,4+,37,45,5


In [7]:
# Looking at the app names before they are split (shortened) below
apple['app_name']

0                                         PAC-MAN Premium
1                               Evernote - stay organized
2         WeatherBug - Local Weather, Radar, Maps, Alerts
3       eBay: Best App to Buy, Sell, Save! Online Shop...
4                                                   Bible
                              ...                        
7192                                                Kubik
7193                                    VR Roller-Coaster
7194                Bret Michaels Emojis + Lyric Keyboard
7195            VR Roller Coaster World - Virtual Reality
7196                         Escape the Sweet Shop Series
Name: app_name, Length: 7197, dtype: object

In [8]:
# Shortening each name: keep only the text before the first ':' and then, of what is
# left, only the text before the first ' -' (names without those separators are unchanged)
apple['app_name'] = apple['app_name'].map(
    lambda name: name.partition(':')[0].partition(' -')[0]
)

# Checking results
apple['app_name']

0                             PAC-MAN Premium
1                                    Evernote
2                                  WeatherBug
3                                        eBay
4                                       Bible
                        ...                  
7192                                    Kubik
7193                        VR Roller-Coaster
7194    Bret Michaels Emojis + Lyric Keyboard
7195                  VR Roller Coaster World
7196             Escape the Sweet Shop Series
Name: app_name, Length: 7197, dtype: object

In [9]:
# Removing non-ASCII characters (Chinese, etc.) from the app names.
# regex=True is required: since pandas 2.0, Series.str.replace treats the pattern as a
# literal string by default, which would silently leave every name unchanged.
apple['app_name'] = apple['app_name'].str.replace(r'[^\x00-\x7F]+', '', regex=True)

# Names that consisted only of non-ASCII characters are now empty strings.
# Had 374 rows that matched the condition when this was run.
# NOTE(review): names reduced to whitespace only are not caught by this comparison.
blank_names = apple['app_name'] == ''

# Dropping those blank app names (numb of rows is now 6823 from the initial 7197) and
# resetting the index, which would otherwise keep its gaps and still end at 7196
apple = apple[~blank_names].reset_index(drop=True)

# For future reference, could also open the file in ascii format and ignore errors, to
# remove those special characters:
#     with open('bla/bla/file.csv', encoding = 'ascii', errors = 'ignore') as infile:
#         df = pd.read_csv(infile)

# Making the index start at 1 so that it matches the number of elements in the table
apple.index += 1

In [10]:
# Counting the apps per genre (sorted from most to fewest apps) to get an idea of how many
# categories there are and which one has the most apps
apple['prime_genre'].value_counts()

Games                3653
Entertainment         506
Education             443
Photo & Video         346
Utilities             244
Health & Fitness      177
Productivity          176
Social Networking     157
Music                 138
Lifestyle             127
Shopping              112
Sports                111
Book                   89
Finance                86
Travel                 76
Weather                70
News                   69
Reference              63
Business               55
Food & Drink           55
Navigation             39
Medical                23
Catalogs                8
Name: prime_genre, dtype: int64

In [11]:
# Exporting the cleaned dataset to a CSV file in the current working directory
# (the 1-based index is written as the first, unnamed column)
cleaned_csv_name = 'Cleaned_Apple.csv'
apple.to_csv(cleaned_csv_name)

# Looking at the final dataset
apple

Unnamed: 0,id,app_name,price_usd,prime_genre,size_bytes,avg_user_rating,tot_user_rating,pg_rating,numb_supp_devices,numb_supp_langs,numb_screenshots
1,281656475,PAC-MAN Premium,3.99,Games,100788224,4.0,21292,4+,38,10,5
2,281796108,Evernote,0.00,Productivity,158578688,4.0,161065,4+,37,23,5
3,281940292,WeatherBug,0.00,Weather,100524032,3.5,188583,4+,37,3,5
4,282614216,eBay,0.00,Shopping,128512000,4.0,262241,12+,37,9,5
5,282935706,Bible,0.00,Reference,92774400,4.5,985920,4+,37,45,5
...,...,...,...,...,...,...,...,...,...,...,...
6819,1187617475,Kubik,0.00,Games,126644224,4.5,142,4+,38,1,5
6820,1187682390,VR Roller-Coaster,0.00,Games,120760320,4.5,30,4+,38,1,0
6821,1187779532,Bret Michaels Emojis + Lyric Keyboard,1.99,Utilities,111322112,4.5,15,9+,37,1,1
6822,1187838770,VR Roller Coaster World,0.00,Games,97235968,4.5,85,12+,38,2,0
