In [2]:
import re
import sys

import time
import datetime

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import random


In [None]:
df = pd.read_csv('googleplaystore.csv')

In [None]:
# The best way to fill missing values might be using the median instead of mean.
df['Rating'] = df['Rating'].fillna(df['Rating'].median())

# Before filling null values we have to clean all non numerical values & unicode charachters 
replaces = [u'\u00AE', u'\u2013', u'\u00C3', u'\u00E3', u'\u00B3', '[', ']', "'"]
for i in replaces:
    df['Current Ver'] = df['Current Ver'].astype(str).apply(lambda x : x.replace(i, ''))

regex = [r'[-+|/:/;(_)@]', r'\s+', r'[A-Za-z]+']
for j in regex:
    df['Current Ver'] = df['Current Ver'].astype(str).apply(lambda x : re.sub(j, '0', x))

df['Current Ver'] = df['Current Ver'].astype(str).apply(lambda x : x.replace('.', ',',1).replace('.', '').replace(',', '.',1)).astype(float)
df['Current Ver'] = df['Current Ver'].fillna(df['Current Ver'].median())

In [None]:
i = df[df['Category'] == '1.9'].index
df.loc[i]
df = df.drop(i)
# Removing NaN values
df = df[pd.notnull(df['Last Updated'])]
df = df[pd.notnull(df['Content Rating'])]

In [None]:
# App values encoding
le = preprocessing.LabelEncoder()
df['App'] = le.fit_transform(df['App'])
# This encoder converts the values into numeric values

# Category features encoding
category_list = df['Category'].unique().tolist() 
category_list = ['cat_' + word for word in category_list]
df = pd.concat([df, pd.get_dummies(df['Category'], prefix='cat')], axis=1)

# Genres features encoding
le = preprocessing.LabelEncoder()
df['Genres'] = le.fit_transform(df['Genres'])

# Encode Content Rating features
le = preprocessing.LabelEncoder()
df['Content Rating'] = le.fit_transform(df['Content Rating'])

# Price cealning
df['Price'] = df['Price'].apply(lambda x : x.strip('$'))

# Installs cealning
df['Installs'] = df['Installs'].apply(lambda x : x.strip('+').replace(',', ''))

# Type encoding
df['Type'] = pd.get_dummies(df['Type'])

# Last Updated encoding
df['Last Updated'] = df['Last Updated'].apply(lambda x : time.mktime(datetime.datetime.strptime(x, '%B %d, %Y').timetuple()))

# Convert kbytes to Mbytes 
k_indices = df['Size'].loc[df['Size'].str.contains('k')].index.tolist()
converter = pd.DataFrame(df.loc[k_indices, 'Size'].apply(lambda x: x.strip('k')).astype(float).apply(lambda x: x / 1024).apply(lambda x: round(x, 3)).astype(str))
df.loc[k_indices,'Size'] = converter

# Size cleaning
df['Size'] = df['Size'].apply(lambda x: x.strip('M'))
df[df['Size'] == 'Varies with device'] = 0
df['Size'] = df['Size'].astype(float)

In [None]:
# Split data into training and testing sets
features = ['App', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver']
features.extend(category_list)
X = df[features]
y = df['Rating']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 10)


xgbModel = KNeighborsRegressor(n_neighbors=15)
xgbModel.fit(X_train, y_train)
accuracy = xgbModel.score(X_test,y_test)
'Accuracy: ' + str(np.round(accuracy*100, 2)) + '%'

---
# NEW DATA

---

In [51]:
data = pd.read_csv('Google-Playstore-32K.csv', delimiter=',')
data.dataframeName = 'Google-Playstore-32K.csv'
nRow, nCol = data.shape
print(f'There are {nRow} rows and {nCol} columns')


There are 32000 rows and 11 columns


In [52]:
data = data.sort_values('Installs', ascending=False)
data = data.drop_duplicates(subset='App Name', keep='first')
maxVal = 10000

#detect null cols and null rate
nulls = [i for i in data.isna().any().index if data.isna().any()[i]==True]
rates = []
counts = []
for i in nulls:    
    rates.append((data[i].isna().sum()/data.shape[0])*100)
    counts.append(data[i].isna().sum())



In [53]:
null_df = pd.DataFrame.from_dict({"Col":nulls,"Count":counts,"Null_Rates":rates})


In [54]:

df_train = data.copy()
#delete Reviews, Latest Version, null values row
for i in ['Reviews','Latest Version']:
    df_train = df_train.drop(df_train.loc[df_train[i].isnull()].index,0)

In [55]:

# PreProcessing Rating Attribute
df_train['Rating'].replace('Lessons', np.nan, inplace=True)
df_train['Rating'].replace('GAME_STRATEGY', np.nan, inplace=True)
df_train['Rating'].replace('NEWS_AND_MAGAZINES', np.nan, inplace=True)
df_train['Rating'] = df_train['Rating'].fillna(df_train['Rating'].median())


In [56]:
df_train['Rating'].astype(float).describe()


count    30843.000000
mean         4.268535
std          0.501046
min          1.000000
25%          4.086957
50%          4.373173
75%          4.581750
max          5.000000
Name: Rating, dtype: float64

In [57]:
# Remove '+' from end of each value
df_train['Installs'] = df_train['Installs'].apply(lambda x : x.strip('+').replace(',', ''))


df_train['Category'].replace(' Channel 2 News', np.nan, inplace=True)
df_train['Category'].replace(')', np.nan, inplace=True)
df_train = df_train[pd.notnull(df_train['Category'])]
regex = [r'GAME_[A-Za-z]+.*']
for j in regex:
    df_train['Category_Consolidated'] = df_train['Category'].astype(str).apply(lambda x : re.sub(j, 'GAME', x))
df_train

Unnamed: 0,App Name,Category,Rating,Reviews,Installs,Size,Price,Content Rating,Last Updated,Minimum Version,Latest Version,Category_Consolidated
20981,Samsung Health,HEALTH_AND_FITNESS,4.320483208,667452.0,500000000,79M,0,Everyone,"March 27, 2019",5.0 and up,6.2.0.075,HEALTH_AND_FITNESS
6747,Super-Bright LED Flashlight,PRODUCTIVITY,4.596871376,9123436.0,500000000,Varies with device,0,Everyone,"March 27, 2019",Varies with device,Varies with device,PRODUCTIVITY
420,Google Docs,PRODUCTIVITY,4.351906776,987137.0,500000000,Varies with device,0,Everyone,"April 1, 2019",Varies with device,Varies with device,PRODUCTIVITY
820,Pou,GAME_CASUAL,4.330340385,10752323.0,500000000,24M,0,Everyone,"May 25, 2018",4.0 and up,1.4.77,GAME
304,Netflix,ENTERTAINMENT,4.448670864,6385467.0,500000000,Varies with device,0,Teen,"March 29, 2019",Varies with device,Varies with device,ENTERTAINMENT
...,...,...,...,...,...,...,...,...,...,...,...,...
11503,Spell & Play: Vehicles,GAME_EDUCATIONAL,5,1.0,1,44M,$0.99,Everyone,"March 24, 2019",4.0 and up,1.0.2,GAME
4854,Grow your childrens intelligence,GAME_WORD,5,1.0,1,23M,0,Everyone,"March 6, 2019",4.1 and up,14,GAME
2916,Spell & Play: Fish Friends,GAME_EDUCATIONAL,5,2.0,1,45M,$0.99,Everyone,"March 24, 2019",4.0 and up,1.0.3,GAME
25708,Magic Room,ENTERTAINMENT,5,1.0,1,27M,$0.99,Everyone,"March 6, 2019",5.0 and up,1.9,ENTERTAINMENT


In [58]:
# Price 
df_train['Price'] = df_train['Price'].apply(lambda x : x.strip('$'))
df_train['Price'].replace('Varies with device', np.nan, inplace=True)
df_train['Price'] = df_train['Price'].fillna(df_train['Price'].median())

In [59]:

regex = [r'[-+|/:/;(_)@\[\]#�,>]', r'\s+', r'[A-Za-z]+']
for j in regex:
    df_train['Latest Version'] = df_train['Latest Version'].astype(str).apply(lambda x : re.sub(j, '0', x))
df_train['Latest Version'].replace('?.?', np.nan, inplace=True)

df_train['Latest Version'] = df_train['Latest Version'].astype(str).apply(lambda x : x.replace('.', ',',1).replace('.', '').replace(',', '.',1)[:3]).astype(float)
df_train['Latest Version'] = df_train['Latest Version'].fillna(df_train['Latest Version'].median())

df_train['Type'] = np.where(df_train['Price'].astype(float) > 0, 'Paid', 'Free')

df_train


Unnamed: 0,App Name,Category,Rating,Reviews,Installs,Size,Price,Content Rating,Last Updated,Minimum Version,Latest Version,Category_Consolidated,Type
20981,Samsung Health,HEALTH_AND_FITNESS,4.320483208,667452.0,500000000,79M,0,Everyone,"March 27, 2019",5.0 and up,6.2,HEALTH_AND_FITNESS,Free
6747,Super-Bright LED Flashlight,PRODUCTIVITY,4.596871376,9123436.0,500000000,Varies with device,0,Everyone,"March 27, 2019",Varies with device,0.0,PRODUCTIVITY,Free
420,Google Docs,PRODUCTIVITY,4.351906776,987137.0,500000000,Varies with device,0,Everyone,"April 1, 2019",Varies with device,0.0,PRODUCTIVITY,Free
820,Pou,GAME_CASUAL,4.330340385,10752323.0,500000000,24M,0,Everyone,"May 25, 2018",4.0 and up,1.4,GAME,Free
304,Netflix,ENTERTAINMENT,4.448670864,6385467.0,500000000,Varies with device,0,Teen,"March 29, 2019",Varies with device,0.0,ENTERTAINMENT,Free
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11503,Spell & Play: Vehicles,GAME_EDUCATIONAL,5,1.0,1,44M,0.99,Everyone,"March 24, 2019",4.0 and up,1.0,GAME,Paid
4854,Grow your childrens intelligence,GAME_WORD,5,1.0,1,23M,0,Everyone,"March 6, 2019",4.1 and up,14.0,GAME,Free
2916,Spell & Play: Fish Friends,GAME_EDUCATIONAL,5,2.0,1,45M,0.99,Everyone,"March 24, 2019",4.0 and up,1.0,GAME,Paid
25708,Magic Room,ENTERTAINMENT,5,1.0,1,27M,0.99,Everyone,"March 6, 2019",5.0 and up,1.9,ENTERTAINMENT,Paid


In [60]:
regex = [r',']
for j in regex:
    df_train['Size'] = df_train['Size'].astype(str).apply(lambda x : re.sub(j, '', x))
    
# Convert kbytes to Mbytes 
k_indices = df_train['Size'].loc[df_train['Size'].str.contains('k')].index.tolist()
converter = pd.DataFrame(df_train.loc[k_indices, 'Size'].apply(lambda x: x.strip('k')).astype(float).apply(lambda x: x / 1024).apply(lambda x: round(x, 3)).astype(str))
df_train.loc[k_indices,'Size'] = converter


df_train['Size'] = df_train['Size'].apply(lambda x: x.strip('M'))
df_train.loc[df_train['Size'] == 'Varies with device', 'Size'] = 0
# df_train[df_train['Size'] == 'Varies with device','Size'] = 0
df_train['Size'] = df_train['Size'].astype(float)

df_train

Unnamed: 0,App Name,Category,Rating,Reviews,Installs,Size,Price,Content Rating,Last Updated,Minimum Version,Latest Version,Category_Consolidated,Type
20981,Samsung Health,HEALTH_AND_FITNESS,4.320483208,667452.0,500000000,79.0,0,Everyone,"March 27, 2019",5.0 and up,6.2,HEALTH_AND_FITNESS,Free
6747,Super-Bright LED Flashlight,PRODUCTIVITY,4.596871376,9123436.0,500000000,0.0,0,Everyone,"March 27, 2019",Varies with device,0.0,PRODUCTIVITY,Free
420,Google Docs,PRODUCTIVITY,4.351906776,987137.0,500000000,0.0,0,Everyone,"April 1, 2019",Varies with device,0.0,PRODUCTIVITY,Free
820,Pou,GAME_CASUAL,4.330340385,10752323.0,500000000,24.0,0,Everyone,"May 25, 2018",4.0 and up,1.4,GAME,Free
304,Netflix,ENTERTAINMENT,4.448670864,6385467.0,500000000,0.0,0,Teen,"March 29, 2019",Varies with device,0.0,ENTERTAINMENT,Free
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11503,Spell & Play: Vehicles,GAME_EDUCATIONAL,5,1.0,1,44.0,0.99,Everyone,"March 24, 2019",4.0 and up,1.0,GAME,Paid
4854,Grow your childrens intelligence,GAME_WORD,5,1.0,1,23.0,0,Everyone,"March 6, 2019",4.1 and up,14.0,GAME,Free
2916,Spell & Play: Fish Friends,GAME_EDUCATIONAL,5,2.0,1,45.0,0.99,Everyone,"March 24, 2019",4.0 and up,1.0,GAME,Paid
25708,Magic Room,ENTERTAINMENT,5,1.0,1,27.0,0.99,Everyone,"March 6, 2019",5.0 and up,1.9,ENTERTAINMENT,Paid


In [61]:
df_train['Last Updated'] = df_train['Last Updated'].apply(lambda x : time.mktime(datetime.datetime.strptime(x, '%B %d, %Y').timetuple()))



In [68]:

regex = [r'[-+|/:/;(_)@\[\]#�,>]', r'\s+', r'[A-Za-z]+']
for j in regex:
    df_train['Minimum Version'] = df_train['Minimum Version'].astype(str).apply(lambda x : re.sub(j, '0', x))

df_train['Minimum Version'] = df_train['Minimum Version'].astype(str).apply(lambda x : x.replace('.', ',',1).replace('.', '').replace(',', '.',1)[:3]).astype(float)
df_train['Minimum Version'] = df_train['Minimum Version'].fillna(df_train['Minimum Version'].median())


df_train.to_csv("cleaned.csv")


In [63]:




# for i in ['Rating']:
#     df_train = df_train.drop(df_train.loc[df_train[i].isnull()].index,0)




# App values encoding
le = preprocessing.LabelEncoder()
df_train['App Name'] = le.fit_transform(df_train['App Name'])

# Category features encoding
category_list = df_train['Category_Consolidated'].unique().tolist() 
category_list = ['cat_' + word for word in category_list]
df_train = pd.concat([df_train, pd.get_dummies(df_train['Category_Consolidated'], prefix='cat')], axis=1)

# Encode Content Rating features
le = preprocessing.LabelEncoder()
df_train['Content Rating'] = le.fit_transform(df_train['Content Rating'])


# Type encoding
df_train['Type'] = pd.get_dummies(df_train['Type'])



# df_train['Installs'] = pd.get_dummies(df_train['Installs'])

# df_train['Last Updated'].replace('Everyone 10+', np.nan, inplace=True)
# df_train = df_train[pd.notnull(df_train['Last Updated'])]




df_train['Installs'] = df_train['Installs'].astype(int)
df_train['Price'] = df_train['Price'].astype(float)
df_train['Rating'] = df_train['Rating'].astype(float)
df_train['Reviews'] = df_train['Reviews'].astype(int)
# df_train['Last Updated'] = df_train['Last Updated'].astype(float)

# df_train['Rating'] =df_train['Rating'].apply(lambda x: round(x, 2))


In [64]:
# Split data into training and testing sets
features = ['App Name', 'Reviews', 'Size',
            'Rating', 'Type', 'Price', 
            'Content Rating', 'Last Updated',
            'Latest Version']

features.extend(category_list)
X = df_train[features][:maxVal]
y = df_train['Installs'][:maxVal]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 10)



import xgboost as xgb

xgbModel = xgb.XGBClassifier()
xgbModel.fit(X_train, y_train)
accuracy = xgbModel.score(X_test,y_test)
'Accuracy: ' + str(np.round(accuracy*100, 2)) + '%'

'Accuracy: 80.24%'

In [None]:
# Split data into training and testing sets
features = ['Reviews', 'Size',
            'Rating', 'Type', 'Price', 
            'Content Rating']

features.extend(category_list)
X = df_train[features][:maxVal]
y = df_train['Installs'][:maxVal]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 10)

xgbModel = xgb.XGBClassifier()
xgbModel.fit(X_train, y_train)
accuracy = xgbModel.score(X_test,y_test)
'Accuracy: ' + str(np.round(accuracy*100, 2)) + '%'

In [None]:
from sklearn.model_selection import validation_curve

# define range of n_estimators
paramRange = np.linspace(1, 800, 20).astype('int')

# get list of training and validation scores for cv = 5
(train_scores, test_scores) = validation_curve(xgb.XGBClassifier(), X_train,
                                               y_train, param_name='n_estimators',
                                               param_range= paramRange,
                                               scoring='accuracy', n_jobs=-1, verbose=True)


In [None]:
from pylab import rcParams
rcParams['figure.figsize'] = 15, 8

# Code Referred from https://chrisalbon.com/machine_learning/model_evaluation/plot_the_validation_curve/
# find mean value of training and validation f1-score
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)

plt.plot(paramRange, train_mean, 
         label='Training score',
         color='green')

plt.plot(paramRange, test_mean,
         label='Cross-validation score', 
         color='blue')

plt.axhline(y=1, color='k', ls='dashed')

# plot validation graph
plt.xlabel('n_estimators', fontsize=12)
plt.ylabel('f1 Score', fontsize=12)
plt.title('Validation Curve using XgBoost Classifier')
plt.ylim(0.7, 1.05)
plt.legend(loc='best')
plt.show()
i = np.argmax(test_mean)
print("Best cross-validation result ({0:.2f}) obtained for {1} trees".format(test_mean[i], paramRange[i]))


In [None]:
df_train["Installs"]

In [65]:
# Split data into training and testing sets
features = ['App Name', 'Reviews', 'Size',
            'Installs', 'Type', 'Price', 
            'Content Rating', 'Last Updated',
            'Latest Version']

features.extend(category_list)
X = df_train[features]
y = df_train['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 10)

xgbModel = KNeighborsRegressor(n_neighbors=15)
xgbModel.fit(X_train, y_train)
accuracy = xgbModel.score(X_test,y_test)
'Accuracy: ' + str(np.round(accuracy*100, 2)) + '%'

'Accuracy: -1.89%'

In [None]:
df_train

In [48]:
X_train

Unnamed: 0,App Name,Reviews,Size,Rating,Type,Price,Content Rating,Last Updated,Latest Version,cat_HEALTH_AND_FITNESS,...,cat_COMICS,cat_COMICS.1,cat_EVENTS,cat_EVENTS.1,cat_DATING,cat_DATING.1,cat_LIBRARIES_AND_DEMO,cat_LIBRARIES_AND_DEMO.1,cat_ART_AND_DESIGN,cat_ART_AND_DESIGN.1
29084,14559,5680,13.0,4.139965,1,0.00,1,1.513915e+09,4.1,0,...,0,0,0,0,0,0,0,0,0,0
13768,20381,9856,40.0,4.275568,1,0.00,1,1.551326e+09,4.8,0,...,0,0,0,0,0,0,0,0,0,0
6381,30584,12,8.7,4.583333,1,0.00,1,1.538363e+09,1.0,0,...,0,0,0,0,0,0,0,0,0,0
1881,21766,1255,34.0,4.137848,0,1.99,2,1.523848e+09,3.0,0,...,0,0,0,0,0,0,0,0,0,0
31709,28922,4686,0.0,4.502134,1,0.00,1,1.550462e+09,0.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14264,27078,20714,30.0,4.561649,1,0.00,1,1.550030e+09,0.2,0,...,0,0,0,0,0,0,0,0,0,0
6652,30095,143571,13.0,3.935628,1,0.00,1,1.550117e+09,1.5,0,...,0,0,0,0,0,0,0,0,0,0
24156,1087,110,4.4,4.918182,1,0.00,1,1.552360e+09,0.4,0,...,0,0,0,0,0,0,0,0,0,0
19776,28388,104406,3.2,4.187911,1,0.00,1,1.552619e+09,2.2,0,...,0,0,0,0,0,0,0,0,0,0


In [66]:
y_train.unique()

array([4.13996458, 4.27556801, 4.58333349, ..., 4.9181819 , 4.18791056,
       4.22880697])