In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

In [6]:
# Location of the preprocessed app export. The default is the original local
# path; set the APP_DATA_CSV environment variable to point at the file on
# another machine instead of editing this cell.
DATA_PATH = os.environ.get(
    'APP_DATA_CSV',
    '/Users/tawneykirkland/GitStuff/05-google-play/04-data/preprocessed_app_data.csv',
)
df = pd.read_csv(DATA_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22000 entries, 0 to 21999
Data columns (total 56 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   title                     22000 non-null  object 
 1   description               22000 non-null  object 
 2   summary                   21999 non-null  object 
 3   installs                  22000 non-null  object 
 4   minInstalls               22000 non-null  float64
 5   score                     22000 non-null  float64
 6   ratings                   22000 non-null  float64
 7   reviews                   22000 non-null  float64
 8   histogram                 22000 non-null  object 
 9   price                     22000 non-null  float64
 10  free                      22000 non-null  int64  
 11  currency                  22000 non-null  object 
 12  sale                      22000 non-null  bool   
 13  offersIAP                 22000 non-null  bool   
 14  inAppP

In [7]:
# Columns kept for the exploratory analysis below; 'score' is the target.
EDA_COLUMNS = [
    'title', 'reviews', 'ratings', 'free', 'containsAds', 'genre',
    'editorsChoice', 'contentRating', 'year', 'top_developer', 'has_video',
    'installs_day', 'updated_days', 'score',
]
# .copy() makes df1 an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning or depend on df's internal state.
df1 = df[EDA_COLUMNS].copy()

In [None]:
# df1 also holds text columns (e.g. 'title', 'genre'); calling .corr() on the
# whole frame raises on pandas >= 2.0. Restrict to numeric and boolean columns,
# which is what older pandas versions silently did.
corr_input = df1.select_dtypes(include=['number', 'bool'])

fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr_input.corr(), cmap="seismic", vmin=-1, vmax=1, ax=ax)
ax.set_title('Pairwise correlation of numeric app features');

#### Observations

- Strong correlations between year, days since last update, and installs per day. Remove one at a time during regression analysis.

In [None]:
SCATTER_COLUMNS = ['minInstalls', 'ratings', 'year', 'installs_day', 'updated_days', 'score']

# 'minInstalls' is not one of the columns selected into df1, so indexing df1
# with it raises KeyError. Select from the full frame df, which has every
# column listed here. figsize is passed directly so pandas builds its own grid
# of axes rather than clearing a pre-made single Axes.
pd.plotting.scatter_matrix(df[SCATTER_COLUMNS], figsize=(15, 15));

#### Observations

* Score (the target) has a left skew, which is further demonstrated in the images below
* Ratings appears exponential - need to transform

In [None]:
# Central tendency of the target column, printed to three decimals.
score_col = df1['score']
overall_mean = score_col.mean()
overall_median = score_col.median()

print('Average app score overall: {:.3f}'.format(overall_mean))
print('Median app score overall: {:.3f}'.format(overall_median))

In [None]:
# sns.distplot is deprecated, so the same picture (density histogram, KDE and a
# fitted normal curve) is drawn here with matplotlib and scipy directly.
scores = df1['score'].dropna()
score_mean = np.mean(scores)
score_std = np.std(scores)  # population standard deviation (ddof=0)

fig, ax = plt.subplots()
ax.hist(scores, bins='auto', density=True, alpha=0.4)
grid = np.linspace(scores.min(), scores.max(), 200)
ax.plot(grid, stats.gaussian_kde(scores)(grid), label='KDE')
ax.plot(grid, stats.norm.pdf(grid, score_mean, score_std), color='black', label='Normal fit')
ax.set(xlabel='score', ylabel='Density', title='Distribution of app scores')
ax.legend()

# The count is the number of score rows; 'ratings' is a different column, so
# the labels refer to scores to avoid confusing the two.
print('- Total number of scores:', len(df1['score']))
print('- Mean of distribution of score :', np.mean(df1['score']))
print('- Standard deviation:', np.std(df1['score']))

In [None]:
# Normal probability (Q-Q) plot of the score column, drawn on an explicit Axes.
fig, qq_ax = plt.subplots()
prob = stats.probplot(df['score'], plot=qq_ax)

### By content rating

In [None]:
# Row count for each contentRating value.
content_rating = df1['contentRating']
content_rating.groupby(content_rating).count()

In [None]:
# Mean score within each contentRating value.
df1['score'].groupby(df1['contentRating']).mean()

### By app genre

In [None]:
# Mean score per genre. groupby() returns genres in sorted order, so the bar
# labels must come from this Series' own index. Labelling the bars with
# df1['genre'].unique() (order of first appearance) pairs genre names with the
# wrong bars.
app_genre_score = df1.groupby('genre')['score'].mean()
genre_list = app_genre_score.index.to_numpy()
mean_score = df1['score'].mean()

fig, ax = plt.subplots(figsize=(15, 5))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.bar(genre_list, app_genre_score.to_numpy())
ax.axhline(mean_score, label='Mean score', color='blue', linewidth=2)
ax.tick_params(axis='x', rotation=90)
ax.set_ylabel('Average score')
ax.set_title('Average score per genre', family='arial', fontsize=16)
ax.legend()  # without this the 'Mean score' label on the line is never shown
fig.tight_layout()

plt.show()  # the bare attribute `plt.show` does nothing; it has to be called

There do not appear to be significant differences in mean user score across genres.

In [None]:
# Row count for each genre value.
genre_col = df1['genre']
genre_col.groupby(genre_col).count()

### By release year

In [None]:
# Row count for each value of 'year' in the full frame.
year_col = df['year']
year_col.groupby(year_col).count()

In [None]:
# Mean score for each value of 'year' in the full frame.
df['score'].groupby(df['year']).mean()

### By whether the app contains ads

In [None]:
# Mean score split by the containsAds flag.
df1['score'].groupby(df1['containsAds']).mean()

There does not appear to be a large difference in average score between apps that do and don't contain ads.

### By whether the app is an Editor's Choice

In [None]:
# Row count for each editorsChoice value.
editors_choice = df1['editorsChoice']
editors_choice.groupby(editors_choice).count()

In [None]:
# Mean score split by the editorsChoice flag.
df1['score'].groupby(df1['editorsChoice']).mean()

Apps marked as 'Editor's Choice' appear to have a higher average score. This makes sense, given that editors are likely to promote higher-quality apps.

### By whether the app is free

In [None]:
# Row count for each value of the 'free' flag.
free_flag = df1['free']
free_flag.groupby(free_flag).count()

In [None]:
# Mean score split by the 'free' flag.
df1['score'].groupby(df1['free']).mean()

Paid-for apps appear to have a higher average score. This makes sense, given that you expect higher quality when you are paying for access.

### By top developer in terms of average app score

In [None]:
# Mean score split by the top_developer flag.
df1['score'].groupby(df1['top_developer']).mean()

### By whether the app posting has a video

In [None]:
# Row count for each has_video value.
video_flag = df1['has_video']
video_flag.groupby(video_flag).count()

In [None]:
# Mean score split by the has_video flag.
df1['score'].groupby(df1['has_video']).mean()

## Check top developer

In [None]:
# Mean of the top_developer column within each genre.
df1['top_developer'].groupby(df1['genre']).mean()

## Create new genre column 

In [None]:
# Coarser genre buckets: each new label lists the original 'genre' values it
# absorbs. One table replaces the chain of .loc assignments, which repeated the
# 'Action & Adventure' and 'Dating & Social' statements.
#
# NOTE(review): 'Parenting' was first given its own 'Parenting' bucket and then
# overwritten by the final 'Other' assignment. The effective result ('Other')
# is kept here; confirm that is the intended grouping.
NEW_GENRE_MEMBERS = {
    'Action & Adventure': ['Action', 'Adventure'],
    'Card & Casino': ['Card', 'Casino'],
    'Business & Finance': ['Business', 'Finance'],
    'Learning': ['Books & Reference', 'Education', 'Libraries & Demo', 'Word'],
    'Dating & Social': ['Dating', 'Social'],
    'Entertainment': ['Entertainment'],
    'Explore': ['Maps & Navigation', 'Weather', 'Travel & Local'],
    'Wellness': ['Medical', 'Health & Fitness', 'Beauty'],
    'Cultural Reading': ['Comics', 'News & Magazines'],
    'Lifestyle & Casual': ['Lifestyle', 'Casual', 'House & Home', 'Food & Drink'],
    'Audio & Video': ['Music & Audio', 'Video Players & Editors'],
    'Art & Photography': ['Art & Design', 'Photography'],
    'Tools': ['Personalization', 'Productivity', 'Tools', 'Communication'],
    'Puzzle Board & Trivia': ['Puzzle', 'Board', 'Trivia'],
    'Simulation games': ['Racing', 'Simulation', 'Arcade', 'Role Playing', 'Strategy'],
    'Other': ['Sports', 'Auto & Vehicles', 'Events', 'Shopping', 'Parenting'],
}

# Invert to {original genre: new bucket} for a single vectorised lookup.
GENRE_TO_NEW_GENRE = {
    old_genre: bucket
    for bucket, members in NEW_GENRE_MEMBERS.items()
    for old_genre in members
}
# Guard against an original genre being listed under two buckets.
assert len(GENRE_TO_NEW_GENRE) == sum(len(m) for m in NEW_GENRE_MEMBERS.values())

# assign() builds a new frame instead of writing into a slice of df, so the
# cell can be re-run safely and raises no SettingWithCopyWarning. Genres absent
# from the table are left as NaN, as the .loc assignments left them.
df1 = df1.assign(new_genre=df1['genre'].map(GENRE_TO_NEW_GENRE))

In [None]:
# Export of df1 (without the index) is left disabled; uncomment to write the CSV.
#df1.to_csv('04-data/preprocessed_app_data_with_newgenre.csv',index=False)