# Import Statements

In [1]:
import pandas as pd
import plotly.express as px


# Notebook Presentation

In [2]:
# Render floats with thousands separators and two decimals, e.g. 1,234.57
pd.set_option('display.float_format', '{:,.2f}'.format)

# Read the Dataset

In [3]:
# Load the raw app data; 'apps.csv' is resolved relative to the working directory.
df_apps = pd.read_csv('apps.csv')

# Data Cleaning

In [None]:
# (rows, columns) of the raw data.
df_apps.shape

In [None]:
# Preview the first five rows.
df_apps.head()

In [None]:
# Five randomly chosen rows; no random_state is set, so the rows differ on every run.
df_apps.sample(5)

### Drop Unused Columns

In [None]:
# Remove the columns this analysis never uses.
# The result is rebound instead of using inplace=True, and errors='ignore'
# skips labels that are already gone, so re-running this cell is a no-op
# rather than a KeyError.
df_apps = df_apps.drop(columns=['Last_Updated', 'Android_Ver'], errors='ignore')
df_apps.head()

### Find and Remove NaN values in Ratings

In [None]:
# Select every row whose Rating is missing, report the count, preview a few.
nan_rows = df_apps.loc[df_apps['Rating'].isna()]
print(nan_rows.shape)
nan_rows.head()

In [None]:
# NOTE: dropna() with no arguments removes a row when ANY column is NaN,
# not only Rating. Use dropna(subset=['Rating']) if only missing ratings
# should be discarded.
df_apps_clean = df_apps.dropna()
df_apps_clean.shape

### Find and Remove Duplicates


In [None]:
# Rows that repeat an earlier row in every column (the first occurrence is not flagged).
duplicated_rows = df_apps_clean.loc[df_apps_clean.duplicated()]
print(duplicated_rows.shape)
duplicated_rows.head()

In [None]:
# Spot-check one app: list every row whose App is 'Instagram'.
df_apps_clean[df_apps_clean.App == 'Instagram']

In [12]:
# Drop rows that are identical across all columns, keeping the first occurrence.
df_apps_clean = df_apps_clean.drop_duplicates()

In [None]:
# Re-check the 'Instagram' rows that remain after removing exact duplicates.
df_apps_clean[df_apps_clean.App == 'Instagram']

In [None]:
# Treat rows as duplicates when App, Type and Price all match; keep the first of each.
df_apps_clean = df_apps_clean.loc[
    ~df_apps_clean.duplicated(subset=['App', 'Type', 'Price'])
]
df_apps_clean.loc[df_apps_clean['App'] == 'Instagram']

In [15]:
# (rows, columns) after cleaning.
df_apps_clean.shape

(8199, 10)

# Find Highest Rated Apps

In [None]:
# First five rows when sorted by Rating, highest first.
df_apps_clean.sort_values('Rating', ascending=False).head()

# Find 5 Largest Apps in terms of Size (MBs)

In [None]:
# First five rows when sorted by Size_MBs, largest first.
df_apps_clean.sort_values('Size_MBs', ascending=False).head()

# Find the 5 Apps with Most Reviews

In [None]:
# Sort by Reviews, highest first.
# NOTE(review): this displays the top 50 rows, not 5 as the section title says — confirm which is intended.
df_apps_clean.sort_values('Reviews', ascending=False).head(50)

# Plotly Pie and Donut Charts - Visualise Categorical Data: Content Ratings

In [None]:
# Number of apps per content rating, most frequent first.
ratings = df_apps_clean['Content_Rating'].value_counts()
ratings

In [None]:
# Pie chart of the number of apps per content rating.
# `names` is the argument that labels the slices. px's `labels` argument is a
# dict of {column name: display name} overrides, so the category index must
# not be passed there.
fig = px.pie(names=ratings.index, values=ratings.values)

fig.show()

In [None]:
# Same pie chart with a title and the text drawn outside each slice.
# Slice names come from `names`; `labels` (a dict of display-name overrides
# in plotly express) is no longer passed the category index.
fig = px.pie(
    names=ratings.index,
    values=ratings.values,
    title="Content Rating",
)
# Show "percent + category" next to each slice instead of inside it.
fig.update_traces(textposition='outside', textinfo='percent+label')

fig.show()

In [None]:
# Donut variant: `hole` sets the fraction of the radius that is cut out.
# Slice names come from `names`; `labels` (a dict of display-name overrides
# in plotly express) is no longer passed the category index.
fig = px.pie(
    names=ratings.index,
    values=ratings.values,
    title="Content Rating",
    hole=0.6,
)
# Percentages only, drawn inside the ring in a larger font.
fig.update_traces(textposition='inside', textfont_size=15, textinfo='percent')

fig.show()

# Numeric Type Conversion: Examine the Number of Installs

In [None]:
# Summary statistics of the Installs column as currently stored.
df_apps_clean.Installs.describe()

In [None]:
# Column dtypes and non-null counts.
df_apps_clean.info()

In [None]:
# Number of apps for each distinct Installs value.
df_apps_clean.groupby('Installs')[['App']].count()

In [None]:
# Strip the thousands separators from Installs and convert the column to numbers,
# then recount the apps per install bracket.
df_apps_clean['Installs'] = pd.to_numeric(
    df_apps_clean['Installs'].astype(str).str.replace(',', "")
)
df_apps_clean.groupby('Installs')[['App']].count()

# Find the Most Expensive Apps, Filter out the Junk, and Calculate a (ballpark) Sales Revenue Estimate


In [None]:
# Strip the currency symbol from Price and convert the column to numbers.
# regex=False is explicit because '$' is the end-of-string anchor when read as
# a regular expression, and the default of `regex` differs between pandas
# versions; a literal replacement behaves the same everywhere.
df_apps_clean['Price'] = pd.to_numeric(
    df_apps_clean['Price'].astype(str).str.replace('$', '', regex=False)
)

# The 20 most expensive apps.
df_apps_clean.sort_values('Price', ascending=False).head(20)

### The most expensive apps sub $250

In [None]:
# Keep only apps priced below 250, then list the five most expensive that remain.
df_apps_clean = df_apps_clean.loc[df_apps_clean['Price'].lt(250)]
df_apps_clean.sort_values(by='Price', ascending=False).head(5)

### Highest Grossing Paid Apps (ballpark estimate)

In [None]:
# Ballpark revenue = installs x price.
# assign() returns a new DataFrame, so the column is not written into a
# filtered selection of an earlier frame (which raises SettingWithCopyWarning),
# and the cell can be re-run safely.
df_apps_clean = df_apps_clean.assign(
    Revenue_Estimate=df_apps_clean['Installs'].mul(df_apps_clean['Price'])
)
# Ten highest estimated grossers.
df_apps_clean.sort_values('Revenue_Estimate', ascending=False).head(10)

# Plotly Bar Charts & Scatter Plots: Analysing App Categories

In [None]:
# Number of distinct categories.
df_apps_clean.Category.nunique()

In [None]:
# The ten categories containing the most apps.
top10_category = df_apps_clean['Category'].value_counts().head(10)
top10_category

### Vertical Bar Chart - Highest Competition (Number of Apps)

In [None]:
# Vertical bars: one per top-10 category, height = number of apps.
bar = px.bar(
    x=top10_category.index,
    y=top10_category.values,
)

bar.show()

### Horizontal Bar Chart - Most Popular Categories (Highest Downloads)

In [33]:
# Total installs per category, ordered from fewest to most.
category_installs = (
    df_apps_clean
    .groupby('Category')
    .agg({'Installs': pd.Series.sum})
    .sort_values('Installs', ascending=True)
)

In [None]:
# Horizontal bars: one per category, length = total installs.
h_bar = px.bar(
    x=category_installs['Installs'],
    y=category_installs.index,
    orientation='h',
    title='Category Popularity',
)
h_bar.update_layout(
    xaxis_title='Number of Downloads',
    yaxis_title='Category',
)

h_bar.show()

### Category Concentration - Downloads vs. Competition

In [None]:
# Apps per category, joined with total installs per category.
cat_number = df_apps_clean.groupby('Category').agg({'App': pd.Series.count})
cat_merged_df = cat_number.merge(category_installs, on='Category', how="inner")
print(f'The dimensions of the DataFrame are: {cat_merged_df.shape}')
cat_merged_df.sort_values(by='Installs', ascending=False)

In [None]:
# One bubble per category: x = number of apps, y = total installs (log axis).
# Bubble size follows the app count, colour follows installs.
scatter = px.scatter(
    cat_merged_df,
    x='App',
    y='Installs',
    title='Category Concentration',
    size='App',
    hover_name=cat_merged_df.index,
    color='Installs',
)
scatter.update_layout(
    xaxis_title="Number of Apps (Lower=More Concentrated)",
    yaxis_title="Installs",
    yaxis={'type': 'log'},
)

scatter.show()

# Extracting Nested Data from a Column

In [None]:
# Number of distinct values in Genres, counting each raw string as-is.
len(df_apps_clean.Genres.unique())

In [None]:
# The five least frequent Genres values.
df_apps_clean['Genres'].value_counts().sort_values(ascending=True).head(5)

In [None]:
# Split each Genres string on ';' into separate columns, then stack those
# columns into one long Series so every individual genre is counted once.
stack = (
    df_apps_clean['Genres']
    .str.split(';', expand=True)
    .stack()
)
print(f'We now have a single column with shape: {stack.shape}')
num_genres = stack.value_counts()
print(f'Number of genres: {len(num_genres)}')

# Colour Scales in Plotly Charts - Competition in Genres

In [None]:
# Bar chart of the 15 most common genres, coloured by app count on the
# 'Agsunset' scale; the colour bar itself is hidden.
bar = px.bar(
    x=num_genres.index[:15],
    y=num_genres.values[:15],
    title='Top Genres',
    hover_name=num_genres.index[:15],
    color=num_genres.values[:15],
    color_continuous_scale='Agsunset',
)
bar.update_layout(
    xaxis_title='Genre',
    yaxis_title='Number of Apps',
    coloraxis_showscale=False,
)

bar.show()

# Grouped Bar Charts: Free vs. Paid Apps per Category

In [None]:
# Number of apps for each value of Type.
df_apps_clean.Type.value_counts()

In [None]:
# Number of apps for every (Category, Type) pair, kept as ordinary columns.
df_free_vs_paid = (
    df_apps_clean
    .groupby(["Category", "Type"], as_index=False)
    .agg({'App': pd.Series.count})
)
df_free_vs_paid.sort_values(by='App')

In [None]:
# Side-by-side bars per category, one bar for each Type, on a log y-axis.
# Categories are ordered by their total app count, largest first.
g_bar = px.bar(
    df_free_vs_paid,
    x='Category',
    y='App',
    title='Free vs Paid Apps by Category',
    color='Type',
    barmode='group',
)
g_bar.update_layout(
    xaxis_title='Category',
    yaxis_title='Number of Apps',
    xaxis=dict(categoryorder='total descending'),
    yaxis={'type': 'log'},
)

g_bar.show()

# Plotly Box Plots: Lost Downloads for Paid Apps

In [None]:
# Notched box plot of Installs for each Type, with every data point drawn,
# on a log y-axis.
box = px.box(
    df_apps_clean,
    x='Type',
    y='Installs',
    color='Type',
    notched=True,
    points='all',
    title='How Many Downloads are Paid Apps Giving Up?',
)
box.update_layout(yaxis={'type': 'log'})

box.show()


# Plotly Box Plots: Revenue by App Category

In [46]:
# Only the rows whose Type is 'Paid'.
df_paid_apps = df_apps_clean.loc[df_apps_clean['Type'].eq('Paid')]

In [None]:
# Distribution of estimated revenue per category for paid apps, on a log
# y-axis; categories are ordered by their minimum value, ascending.
box = px.box(
    df_paid_apps,
    x='Category',
    y='Revenue_Estimate',
    title='How Much Can Paid Apps Earn?',
)
box.update_layout(
    xaxis_title='Category',
    yaxis_title='Paid App Ballpark Revenue',
    xaxis=dict(categoryorder='min ascending'),
    yaxis={'type': 'log'},
)


box.show()

# How Much Can You Charge? Examine Paid App Pricing Strategies by Category

**Challenge**: What is the median price for a paid app? Then compare pricing by category by creating another box plot. But this time examine the prices (instead of the revenue estimates) of the paid apps. I recommend using `{'categoryorder':'max descending'}` to sort the categories.

In [None]:
# Median price across the paid apps.
df_paid_apps.Price.median()

In [None]:
# Distribution of paid-app prices per category, on a log y-axis;
# categories are ordered by their maximum price, highest first.
box = px.box(
    df_paid_apps,
    x='Category',
    y="Price",
    title='Price per Category',
)
box.update_layout(
    xaxis_title='Category',
    yaxis_title='Paid App Price',
    xaxis=dict(categoryorder='max descending'),
    yaxis={'type': 'log'},
)

box.show()