## Questions:
- Apps by genre (count)
- Genre and total number of ratings (sum)
- Genre and average user rating (mean)
- Genre and price (number of free vs. paid apps)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the App Store dataset; the first CSV column is used as the row index.
data = pd.read_csv(
    'AppleStore.csv',
    index_col=0,
)
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
1,281656475,PAC-MAN Premium,100788224,USD,3.99,21292,26,4.0,4.5,6.3.5,4+,Games,38,5,10,1
2,281796108,Evernote - stay organized,158578688,USD,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity,37,5,23,1
3,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,USD,0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather,37,5,3,1
4,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,USD,0.0,262241,649,4.0,4.5,5.10.0,12+,Shopping,37,5,9,1
5,282935706,Bible,92774400,USD,0.0,985920,5320,4.5,5.0,7.5.1,4+,Reference,37,5,45,1


In [3]:
# Size of the loaded frame as (rows, columns).
data.shape

(7197, 16)

In [4]:
# Number of missing values per column (isnull is pandas' alias of isna).
data.isnull().sum()

id                  0
track_name          0
size_bytes          0
currency            0
price               0
rating_count_tot    0
rating_count_ver    0
user_rating         0
user_rating_ver     0
ver                 0
cont_rating         0
prime_genre         0
sup_devices.num     0
ipadSc_urls.num     0
lang.num            0
vpp_lic             0
dtype: int64

In [5]:
# Storage dtype pandas inferred for each column.
data.dtypes

id                    int64
track_name           object
size_bytes            int64
currency             object
price               float64
rating_count_tot      int64
rating_count_ver      int64
user_rating         float64
user_rating_ver     float64
ver                  object
cont_rating          object
prime_genre          object
sup_devices.num       int64
ipadSc_urls.num       int64
lang.num              int64
vpp_lic               int64
dtype: object

In [6]:
# Printed summary: index range, per-column non-null counts and dtypes, memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7197 entries, 1 to 11097
Data columns (total 16 columns):
id                  7197 non-null int64
track_name          7197 non-null object
size_bytes          7197 non-null int64
currency            7197 non-null object
price               7197 non-null float64
rating_count_tot    7197 non-null int64
rating_count_ver    7197 non-null int64
user_rating         7197 non-null float64
user_rating_ver     7197 non-null float64
ver                 7197 non-null object
cont_rating         7197 non-null object
prime_genre         7197 non-null object
sup_devices.num     7197 non-null int64
ipadSc_urls.num     7197 non-null int64
lang.num            7197 non-null int64
vpp_lic             7197 non-null int64
dtypes: float64(3), int64(8), object(5)
memory usage: 955.9+ KB


In [7]:
# Print the distinct values of every column to get a feel for each field.
for col, values in data.items():
    print(col, values.unique())

id [ 281656475  281796108  281940292 ... 1187779532 1187838770 1188375727]
track_name ['PAC-MAN Premium' 'Evernote - stay organized'
 'WeatherBug - Local Weather, Radar, Maps, Alerts' ...
 'Bret Michaels Emojis + Lyric Keyboard'
 'VR Roller Coaster World - Virtual Reality'
 'Escape the Sweet Shop Series']
size_bytes [100788224 158578688 100524032 ... 111322112  97235968  90898432]
currency ['USD']
price [  3.99   0.     0.99   9.99   4.99   7.99   2.99   1.99   5.99  12.99
  21.99 249.99   6.99  74.99  19.99   8.99  24.99  13.99  14.99  16.99
  47.99  11.99  59.99  15.99  27.99  17.99 299.99  49.99  23.99  20.99
  39.99  99.99  29.99  34.99  18.99  22.99]
rating_count_tot [ 21292 161065 188583 ...    283   3384   1441]
rating_count_ver [  26 2822  649 ... 1142 3124 1441]
user_rating [4.  3.5 4.5 5.  3.  2.  2.5 0.  1.5 1. ]
user_rating_ver [4.5 3.5 5.  4.  2.5 0.  3.  2.  1.  1.5]
ver ['6.3.5' '8.2.2' '5.0.0' ... '6.1.13' '0.6.41' '2.0.20.1']
cont_rating ['4+' '12+' '17+' '9+']
prime_g

#### Apps by genre (count)

In [8]:
# Count the apps in each genre and expose the result as a two-column table
# ('Genre', 'Count') with a fresh 0..n-1 row index.
genre = (
    data.groupby('prime_genre')['prime_genre']
    .count()
    .rename_axis('Genre')
    .reset_index(name='Count')
)
genre.head()

Unnamed: 0,Genre,Count
0,Book,112
1,Business,57
2,Catalogs,10
3,Education,453
4,Entertainment,535


#### Genre and total number of ratings (sum)

In [9]:
# Sum the rating counts (rating_count_tot) of all apps within each genre.
total_ratings = (
    data.groupby('prime_genre')['rating_count_tot']
    .sum()
    .rename_axis('Genre')
    .reset_index(name='Total ratings')
)
total_ratings.head()

Unnamed: 0,Genre,Total ratings
0,Book,574049
1,Business,272921
2,Catalogs,17325
3,Education,1014371
4,Entertainment,4030518


#### Genre and user rating (mean)

In [10]:
# Average user rating per genre, rounded to one decimal place.
user_ratings = (
    data.groupby('prime_genre')['user_rating']
    .mean()
    .round(1)
    .rename_axis('Genre')
    .reset_index(name='User rating')
)
user_ratings.head()

Unnamed: 0,Genre,User rating
0,Book,2.5
1,Business,3.7
2,Catalogs,2.1
3,Education,3.4
4,Entertainment,3.2


#### Genre and price (free or not)

In [11]:
# Label every app 'Free' when its price is exactly 0 and 'Payed' otherwise.
# The 'Payed' spelling is kept as-is because the crosstab cell below uses the same label.
is_free = data['price'].eq(0)
data['Price'] = np.where(is_free, 'Free', 'Payed')
data.head()

Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic,Price
1,281656475,PAC-MAN Premium,100788224,USD,3.99,21292,26,4.0,4.5,6.3.5,4+,Games,38,5,10,1,Payed
2,281796108,Evernote - stay organized,158578688,USD,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity,37,5,23,1,Free
3,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,USD,0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather,37,5,3,1,Free
4,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,USD,0.0,262241,649,4.0,4.5,5.10.0,12+,Shopping,37,5,9,1,Free
5,282935706,Bible,92774400,USD,0.0,985920,5320,4.5,5.0,7.5.1,4+,Reference,37,5,45,1,Free


In [12]:
# Cross-tabulate genre against the Free/Payed label:
# one row per genre, one app-count column per label.
price = pd.crosstab(data['prime_genre'], data['Price'])
# Select the count columns by label rather than overwriting the header by position.
# A positional rename would silently mislabel the counts if the label order changed,
# and would raise if one label were absent. A missing label is reported as 0 apps.
price = price.reindex(columns=['Free', 'Payed'], fill_value=0)
# Name the genre axis and drop the leftover 'Price' header name so that
# reset_index yields plain columns: 'Genre', 'Free', 'Payed'.
price.index.name = 'Genre'
price.columns.name = None
price = price.reset_index()
price.head()

Unnamed: 0,Genre,Free,Payed
0,Book,66,46
1,Business,20,37
2,Catalogs,9,1
3,Education,132,321
4,Entertainment,334,201


#### Put it all together

In [13]:
# Compare the shapes of the four per-genre tables before merging them.
tuple(table.shape for table in (genre, total_ratings, user_ratings, price))

((23, 2), (23, 2), (23, 2), (23, 3))

In [14]:
# Join the four per-genre tables on their shared 'Genre' column
# (pandas' default inner join), one merge at a time.
data_clean = (
    genre
    .merge(total_ratings, on='Genre')
    .merge(user_ratings, on='Genre')
    .merge(price, on='Genre')
)

In [15]:
# Order genres from most to fewest apps, then renumber the rows 0..n-1.
data_clean = (
    data_clean
    .sort_values(by='Count', ascending=False)
    .reset_index(drop=True)
)
data_clean

Unnamed: 0,Genre,Count,Total ratings,User rating,Free,Payed
0,Games,3862,52878491,3.7,2257,1605
1,Entertainment,535,4030518,3.2,334,201
2,Education,453,1014371,3.4,132,321
3,Photo & Video,349,5008946,3.8,167,182
4,Utilities,248,1702228,3.3,109,139
5,Health & Fitness,180,1784371,3.7,76,104
6,Productivity,178,1433136,4.0,62,116
7,Social Networking,167,7598316,3.0,143,24
8,Lifestyle,144,887294,2.8,94,50
9,Music,138,3980199,4.0,67,71


In [16]:
# Persist the summary table; the row index is written as the first (unnamed) CSV column.
data_clean.to_csv('data_clean.csv', index=True)