<a href="https://colab.research.google.com/github/wiiloebis/bangkit_machine_learning_assigment/blob/master/playstore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [126]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder
sns.set(style="darkgrid")
import plotly
plotly.offline.init_notebook_mode(connected=True)
import plotly.graph_objs as go

In [127]:
# Load the Google Play Store export and keep only the columns used as features.
# Dropped columns:
#   - "App Name":       a free-text label, not useful as a feature.
#   - "Last Updated":   release date of the latest version; judged low-impact.
#   - "Latest Version": version string of the latest release; judged low-impact.
# The two remaining multi-word column names are renamed to use underscores.
apps = (
    pd.read_csv("Google-Playstore-32K.csv")
    .drop(columns=['App Name', 'Last Updated', 'Latest Version'])
    .rename(columns={'Minimum Version': 'Minimum_Version', 'Content Rating': 'Content_Rating'})
)
apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8777 entries, 0 to 8776
Data columns (total 8 columns):
Category           8777 non-null object
Rating             8777 non-null object
Reviews            8775 non-null float64
Installs           8776 non-null object
Size               8776 non-null object
Price              8776 non-null object
Content_Rating     8776 non-null object
Minimum_Version    8776 non-null object
dtypes: float64(1), object(7)
memory usage: 548.7+ KB


## Cleaning the data
Remove the characters `+`, `,`, `M`, `$` and `k` from the Installs, Size, Price and Rating columns, then convert those columns to numeric. Values that still cannot be converted to a number become NaN.

In [0]:
# Characters that keep the raw text columns from parsing as numbers
# (e.g. "+" and "," in Installs, "$" in Price, the "M"/"k" suffix in Size).
string_to_remove = ["+", ",", "M", "$", "k"]
columns = ["Installs", "Size", "Price", "Rating"]

# Stripping both "M" and "k" would otherwise put "201k" and "201M" on the same
# scale. Remember which sizes carry the "k" suffix before it is removed so they
# can be rescaled afterwards.
# NOTE(review): assumes "k" means kilobytes and "M" megabytes, with 1024 kB per
# MB -- confirm against the dataset description.
if pd.api.types.is_numeric_dtype(apps["Size"]):
    # Size was already cleaned by a previous run of this cell: nothing to rescale.
    size_in_kilobytes = pd.Series(False, index=apps.index)
else:
    size_in_kilobytes = apps["Size"].str.endswith("k", na=False)

# loop over columns that we want to remove a substring from
for col in columns:
    # A numeric column has no .str accessor; skipping it makes the cell safe
    # to re-run without raising AttributeError.
    if pd.api.types.is_numeric_dtype(apps[col]):
        continue
    # remove the strings one by one; regex=False because "+" and "$" are regex
    # metacharacters and the default of `regex` differs between pandas versions
    for string in string_to_remove:
        apps[col] = apps[col].str.replace(string, '', regex=False)
    # convert the column to numeric; anything still unparseable becomes NaN
    apps[col] = pd.to_numeric(apps[col], errors="coerce")

# Express every size in the same unit (the one used by the "M" values).
apps.loc[size_in_kilobytes, "Size"] = apps.loc[size_in_kilobytes, "Size"] / 1024

In [0]:
# Inspect dtypes and non-null counts after the string clean-up: the cleaned
# columns should now be numeric, and values that could not be parsed show up
# as a lower non-null count.
apps.info()

In [129]:
# Print the shape before and after removing exact duplicate rows (the first
# occurrence of each duplicate is kept), then discard every row that still
# contains a NaN in any column.
print(apps.shape)
apps = apps.drop_duplicates()
print(apps.shape)
apps = apps.dropna(axis=0, how="any")

(8777, 8)
(8774, 8)


In [0]:
# Confirm the result of the de-duplication / NaN removal: every column should
# report the same non-null count, equal to the number of rows printed below.
apps.info()
print(apps.shape)

## Feature Selection Process

In [0]:
# Feature-selection experiment: rank the predictors of "Installs" in three ways
# (chi-squared scores, tree-based feature importances, pairwise correlation).

# Select columns by name rather than by position so the cell keeps working when
# columns are added to `apps` or reordered.
feature_columns = ["Category", "Rating", "Reviews", "Size", "Price",
                   "Content_Rating", "Minimum_Version"]
target_column = "Installs"

dataframeX = apps[feature_columns].copy()
dataframeY = apps[[target_column]]

# Replace each text column with integer codes. One encoder per column is kept
# so the codes can be mapped back with `inverse_transform`.
labelEncoder_Category = LabelEncoder()
dataframeX["Category"] = labelEncoder_Category.fit_transform(dataframeX["Category"])

labelEncoder_MinimumVersion = LabelEncoder()
dataframeX["Minimum_Version"] = labelEncoder_MinimumVersion.fit_transform(dataframeX["Minimum_Version"])

labelEncoder_Content_Rating = LabelEncoder()
dataframeX["Content_Rating"] = labelEncoder_Content_Rating.fit_transform(dataframeX["Content_Rating"])

# Plain arrays of the encoded features and of the target. Y is 1-D, which is
# the shape scikit-learn estimators expect for `y`.
X = dataframeX.values
Y = apps[target_column].values

print(Y.shape)
print(X.shape)
print(X[0, :])
# info() prints its report itself; wrapping it in print() would add a stray "None"
dataframeX.info()

# --- 1. Univariate chi-squared score of every feature against the target ---
bestfeatures = SelectKBest(score_func=chi2, k=len(feature_columns))
fit = bestfeatures.fit(dataframeX, Y)
appsScores = pd.DataFrame(fit.scores_)
appscolumns = pd.DataFrame(dataframeX.columns)
# concat the two dataframes for better visualization
featureScores = pd.concat([appscolumns, appsScores], axis=1)
featureScores.columns = ['Specs', 'Score']  # naming the dataframe columns
print(featureScores.nlargest(len(feature_columns), 'Score'))  # all features, best first

# --- 2. Feature importances from an extra-trees classifier ---
# A fixed random_state makes the importances reproducible between runs.
model = ExtraTreesClassifier(random_state=42)
model.fit(dataframeX, Y)
print(model.feature_importances_)  # use inbuilt feature_importances_ of tree based classifiers
# plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=dataframeX.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()

# --- 3. Correlation heat map of the numeric columns ---
# Restrict to numeric columns explicitly: recent pandas versions raise when
# corr() meets text columns instead of silently dropping them.
corrmat = apps.select_dtypes(include="number").corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20, 20))
# plot heat map
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

## Write preprocessed data to CSV
Run the cell below to save the preprocessed dataframe to a CSV file.

In [0]:
# Save the cleaned frame to the working directory. No `index` argument is
# given, so pandas also writes the row index as the first (unnamed) column.
apps.to_csv("googleplaystore_processed.csv")

In [0]:
# Eyeball 20 randomly chosen rows of the cleaned data. No random_state is
# passed, so a different sample is shown on every run.
apps.sample(20)

## Check Distribution of apps across Categories

In [0]:
# Count the apps in every category, order the counts from largest to smallest,
# report how many distinct categories there are and draw the counts as bars.
category_counts = apps["Category"].value_counts()
no_apps_by_category = category_counts.sort_values(ascending=False)
print("Number of category:", len(no_apps_by_category))
no_apps_by_category.plot(kind="bar", figsize=(20, 10))

## App Rating Distribution
Plot the distribution of app ratings as a histogram, with the average rating marked by a vertical line.

In [0]:
# Mean rating over all remaining apps
avg_app_rating = apps['Rating'].mean()
print('Average app rating = ', avg_app_rating)

# Histogram trace of the individual app ratings
data = [go.Histogram(x=apps['Rating'])]

# Vertical dash-dotted line at the mean rating, drawn from y=0 up to y=1000
mean_rating_line = dict(
    type='line',
    x0=avg_app_rating,
    y0=0,
    x1=avg_app_rating,
    y1=1000,
    line=dict(dash='dashdot'),
)
layout = dict(shapes=[mean_rating_line])

plotly.offline.iplot(dict(data=data, layout=layout))

## Size vs Rating
It seems that most of the highly rated apps are small in size.

In [0]:
# Size vs Rating, shown twice: a hexbin joint plot (density plus marginal
# histograms) and a plain scatter plot of the same two columns.
plot_size_vs_rating = sns.jointplot(data=apps, x="Size", y="Rating", kind="hex")
plot_scatter = apps.plot(kind="scatter", x="Size", y="Rating")

## Paid vs Rating
Cheaper apps mostly have higher ratings. There are two apps whose price is so high that they are treated as outliers.

In [0]:
# Price vs Rating, shown twice: a hexbin joint plot (density plus marginal
# histograms) and a plain scatter plot of the same two columns.
plot_price_vs_Rating = sns.jointplot(data=apps, x="Price", y="Rating", kind="hex")
plot_scatter = apps.plot(kind="scatter", x="Price", y="Rating")

In [0]:
# List the apps whose price is above 250; these are treated as price outliers.
apps.loc[apps["Price"] > 250]

In [0]:
# Remove the price outliers (price above 250) from `apps` in place.
price_outlier_index = apps[apps["Price"] > 250].index
apps.drop(price_outlier_index, inplace=True)

## Price vs Category

In [0]:
# Strip plot of price for every category on one tall figure.
fig, ax = plt.subplots(figsize=(20, 25))

# With all categories shown the plot is crowded; the number of categories
# needs to be reduced.
sns.stripplot(data=apps, x="Price", y="Category", linewidth=1, ax=ax)

In [0]:
fig, ax = plt.subplots(figsize=(12, 10))

# Rank the categories by how many apps they contain, largest first
sorted_apps_by_category = apps["Category"].value_counts().sort_values(ascending=False)
# Keep only the 10 categories with the most apps
category_list = sorted_apps_by_category.head(10).index.tolist()
apps_to_plot = apps.loc[apps["Category"].isin(category_list)]
sns.stripplot(data=apps_to_plot, x="Price", y="Category", linewidth=1, ax=ax)

## Paid downloads vs Free downloads

In [0]:
# Label every app "Paid" when its price is above zero and "Free" otherwise,
# then compare the install counts of the two groups with a box plot.
is_paid = apps["Price"] > 0
apps["Type"] = np.where(is_paid, "Paid", "Free")
apps.boxplot(column=["Installs"], by="Type", grid=True)

# Install counts span many orders of magnitude, so use a log axis with
# human-readable tick labels.
plt.yscale('log')
install_ticks = [10, 1000, 100000, 10000000, 1000000000]
install_tick_labels = ["10", "1k", "100k", "10M", "1B"]
plt.yticks(install_ticks, install_tick_labels)