In [1]:
import re
import sys

import time
import datetime

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import random


In [2]:
data = pd.read_csv('Google-Playstore-32K.csv', delimiter=',')
data.dataframeName = 'Google-Playstore-32K.csv'
nRow, nCol = data.shape
print(f'There are {nRow} rows and {nCol} columns')


There are 32000 rows and 11 columns


In [3]:
data = data.sort_values('Installs', ascending=False)
data = data.drop_duplicates(subset='App Name', keep='first')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30845 entries, 6941 to 24784
Data columns (total 11 columns):
App Name           30845 non-null object
Category           30845 non-null object
Rating             30845 non-null object
Reviews            30844 non-null float64
Installs           30845 non-null object
Size               30845 non-null object
Price              30845 non-null object
Content Rating     30845 non-null object
Last Updated       30845 non-null object
Minimum Version    30845 non-null object
Latest Version     30844 non-null object
dtypes: float64(1), object(10)
memory usage: 2.8+ MB


In [4]:
#detect null cols and null rate
nulls = [i for i in data.isna().any().index if data.isna().any()[i]==True]
rates = []
counts = []
for i in nulls:    
    rates.append((data[i].isna().sum()/data.shape[0])*100)
    counts.append(data[i].isna().sum())
null_df = pd.DataFrame.from_dict({"Col":nulls,"Count":counts,"Null_Rates":rates})



df_train = data.copy()
#delete Reviews, Latest Version, null values row
for i in ['Reviews','Latest Version']:
    df_train = df_train.drop(df_train.loc[df_train[i].isnull()].index,0)
df_train.info()

Unnamed: 0,Col,Count,Null_Rates
0,Reviews,1,0.003242
1,Latest Version,1,0.003242


<class 'pandas.core.frame.DataFrame'>
Int64Index: 30843 entries, 13504 to 24784
Data columns (total 11 columns):
App Name           30843 non-null object
Category           30843 non-null object
Rating             30843 non-null object
Reviews            30843 non-null float64
Installs           30843 non-null object
Size               30843 non-null object
Price              30843 non-null object
Content Rating     30843 non-null object
Last Updated       30843 non-null object
Minimum Version    30843 non-null object
Latest Version     30843 non-null object
dtypes: float64(1), object(10)
memory usage: 2.8+ MB


In [6]:
df_train['Rating'].replace('Lessons', np.nan, inplace=True)
df_train['Rating'].replace('GAME_STRATEGY', np.nan, inplace=True)
df_train['Rating'].replace('NEWS_AND_MAGAZINES', np.nan, inplace=True)
df_train['Rating'] = df_train['Rating'].fillna(df_train['Rating'].median())
# for i in ['Rating']:
#     df_train = df_train.drop(df_train.loc[df_train[i].isnull()].index,0)
df_train['Rating'].astype(float).describe()


count    30843.000000
mean         4.268535
std          0.501046
min          1.000000
25%          4.086957
50%          4.373173
75%          4.581750
max          5.000000
Name: Rating, dtype: float64

In [7]:
df_train['Installs'] = df_train['Installs'].apply(lambda x : x.strip('+').replace(',', ''))
df_train.Installs.unique()


array(['6', '500000000', '500000', '500', '50000000', '50000', '50',
       '5000000000', '5000000', '5000', '5', '11976', '100000000',
       '100000', '100', '10000000', '10000', '10', '1000000000',
       '1000000', '1000', '1', '0'], dtype=object)

In [8]:
len(df_train.Category.unique())

51

In [9]:

regex = [r'GAME_[A-Za-z]+.*']
for j in regex:
    df_train['Category'] = df_train['Category'].astype(str).apply(lambda x : re.sub(j, 'GAME', x))

In [10]:
df_train.Category.unique()

array([')', 'HEALTH_AND_FITNESS', 'PRODUCTIVITY', 'GAME', 'ENTERTAINMENT',
       'TOOLS', 'COMMUNICATION', 'NEWS_AND_MAGAZINES', 'VIDEO_PLAYERS',
       'SOCIAL', 'WEATHER', 'LIFESTYLE', 'MAPS_AND_NAVIGATION',
       'EDUCATION', 'PERSONALIZATION', 'SHOPPING', 'TRAVEL_AND_LOCAL',
       'FINANCE', 'SPORTS', 'BOOKS_AND_REFERENCE', 'MUSIC_AND_AUDIO',
       'FOOD_AND_DRINK', 'MEDICAL', 'BUSINESS', 'AUTO_AND_VEHICLES',
       'PHOTOGRAPHY', 'BEAUTY', 'HOUSE_AND_HOME', 'PARENTING', 'COMICS',
       'EVENTS', 'DATING', 'LIBRARIES_AND_DEMO', 'ART_AND_DESIGN',
       ' Channel 2 News'], dtype=object)

In [11]:
regex = [r'[-+|/:/;(_)@\[\]#�,>]', r'\s+', r'[A-Za-z]+']
for j in regex:
    df_train['Latest Version'] = df_train['Latest Version'].astype(str).apply(lambda x : re.sub(j, '0', x))
df_train['Latest Version'].replace('?.?', np.nan, inplace=True)

df_train['Latest Version'] = df_train['Latest Version'].astype(str).apply(lambda x : x.replace('.', ',',1).replace('.', '').replace(',', '.',1)[:3]).astype(float)
df_train['Latest Version'] = df_train['Latest Version'].fillna(df_train['Latest Version'].median())


In [12]:
df_train['Latest Version'].unique()

array([4.00e+00, 6.20e+00, 0.00e+00, 1.40e+00, 5.20e+00, 2.10e+00,
       1.36e+02, 1.00e+01, 3.10e+00, 4.90e+00, 1.20e+01, 1.10e+00,
       9.40e+00, 1.50e+00, 4.10e+00, 1.10e+01, 1.50e+01, 1.00e+00,
       4.70e+00, 4.30e+00, 1.60e+00, 1.30e+00, 5.10e+00, 1.20e+00,
       3.20e+00, 7.20e+00, 1.70e+00, 1.00e-01, 3.90e+00, 2.30e+00,
       9.20e+00, 3.30e+00, 1.90e+00, 2.40e+00, 1.80e+00, 4.20e+00,
       2.70e+00, 2.20e+00, 6.10e+00, 2.00e+00, 1.30e+01, 2.01e+02,
       5.50e+00, 4.00e+01, 4.60e+00, 6.30e+00, 3.70e+00, 3.00e+00,
       2.50e+00, 2.90e+00, 3.40e+00, 2.80e+00, 5.80e+00, 3.50e+00,
       5.00e-01, 2.60e+00, 2.50e+01, 4.50e+00, 4.60e+01, 2.10e+01,
       1.90e+01, 8.30e+00, 7.00e+00, 8.93e+02, 5.00e+00, 4.40e+00,
       3.80e+00, 7.60e+00, 2.00e-01, 3.60e+00, 9.90e+00, 3.10e+02,
       8.10e+00, 6.00e+00, 5.70e+00, 5.90e+00, 8.00e+00, 7.10e+00,
       6.60e+00, 7.30e+00, 6.80e+00, 7.50e+00, 2.73e+02, 5.30e+00,
       7.00e-01, 1.40e+01, 9.10e+00, 1.70e+01, 5.60e+00, 6.70e

In [13]:
df_train['Category'].replace(' Channel 2 News', np.nan, inplace=True)
df_train['Category'].replace(')', np.nan, inplace=True)
df_train = df_train[pd.notnull(df_train['Category'])]


In [14]:
# App values encoding
le = preprocessing.LabelEncoder()
df_train['App Name'] = le.fit_transform(df_train['App Name'])

In [15]:
# Category features encoding
category_list = df_train['Category'].unique().tolist() 
category_list = ['cat_' + word for word in category_list]
df_train = pd.concat([df_train, pd.get_dummies(df_train['Category'], prefix='cat')], axis=1)


In [16]:
# Encode Content Rating features
le = preprocessing.LabelEncoder()
df_train['Content Rating'] = le.fit_transform(df_train['Content Rating'])

In [17]:
# Price cealning
df_train['Price'] = df_train['Price'].apply(lambda x : x.strip('$'))

In [18]:
df_train.Price.unique()

array(['0', '1.99', '0.99', '4.99', '6.99', '2.99', '9.99', '3.75',
       '3.99', '2.49', '2.79', '7.99', '5.99', '14.99', '4.49', '4.95',
       '1.49', '3.85', '15.99', '12.99', '24.99', '6.49', '5.49', '3.00',
       '29.99', '3.49', '7.49', '84.99', '9.49', '39.99', '6.00', '8.99',
       '1.00', '12.00', '2.00', '2.50', '5.00', '5.74', '4.69', '11.99',
       '8.80', '1.20', '8.49', '54.99', '10.75', '1.40', '18.60', '5.78',
       '19.99', '2.40', '5.33', '3.22', '4.89', '10.99', '1.19', '4.00',
       '1.97', '3.29', '9.00', '2.20', '3.95', '1.29', '13.99', '14.01',
       '1.26', '14.93', '32.99', '1.09', '22.99', '9.79', '1.50', '1.80',
       '5.76', '14.73', '45.99', '9.95', '3.81', '10.00', '3.55', '1.25',
       '6.71', '1.90', '27.99', '1.59', '74.99', '7.74', '6.30', '23.99',
       '2.90', '1.05', '399.99', '6.29', '31.99', '79.99', '21.00',
       '34.99', '99.99', '2.59', '17.99', '4.50', '299.99'], dtype=object)

In [19]:
df_train['Price'].replace('Varies with device', np.nan, inplace=True)
df_train['Price'] = df_train['Price'].fillna(df_train['Price'].median())


In [20]:
df_train['Type'] = np.where(df_train['Price'].astype(float) > 0, 'Paid', 'Free')


In [21]:
# Type encoding
df_train['Type'] = pd.get_dummies(df_train['Type'])


In [22]:
df_train['Last Updated'].replace('Everyone 10+', np.nan, inplace=True)
df_train = df_train[pd.notnull(df_train['Last Updated'])]

df_train['Last Updated'] = df_train['Last Updated'].apply(lambda x : time.mktime(datetime.datetime.strptime(x, '%B %d, %Y').timetuple()))


In [23]:
# from datetime import datetime,date
# temp=pd.to_datetime(df_train['Last Updated'])
# temp.head()
# df_train['Last_Updated_Days'] = temp.apply(lambda x:date.today()-datetime.date(x))
# df_train['Last_Updated_Days'] = df_train['Last_Updated_Days'].astype(str)

# regex = [r'days.*']
# for j in regex:
#     df_train['Last_Updated_Days'] = df_train['Last_Updated_Days'].astype(str).apply(lambda x : re.sub(j, '', x))
# df_train['Last_Updated_Days'] = df_train['Last_Updated_Days'].astype(int)


In [24]:
regex = [r',']
for j in regex:
    df_train['Size'] = df_train['Size'].astype(str).apply(lambda x : re.sub(j, '', x))
    
# Convert kbytes to Mbytes 
k_indices = df_train['Size'].loc[df_train['Size'].str.contains('k')].index.tolist()
converter = pd.DataFrame(df_train.loc[k_indices, 'Size'].apply(lambda x: x.strip('k')).astype(float).apply(lambda x: x / 1024).apply(lambda x: round(x, 3)).astype(str))
df_train.loc[k_indices,'Size'] = converter


df_train['Size'] = df_train['Size'].apply(lambda x: x.strip('M'))
df_train.loc[df_train['Size'] == 'Varies with device', 'Size'] = 0
# df_train[df_train['Size'] == 'Varies with device','Size'] = 0
df_train['Size'] = df_train['Size'].astype(float)



In [25]:
regex = [r'[-+|/:/;(_)@\[\]#�,>]', r'\s+', r'[A-Za-z]+']
for j in regex:
    df_train['Minimum Version'] = df_train['Minimum Version'].astype(str).apply(lambda x : re.sub(j, '0', x))

df_train['Minimum Version'] = df_train['Minimum Version'].astype(str).apply(lambda x : x.replace('.', ',',1).replace('.', '').replace(',', '.',1)[:3]).astype(float)
df_train['Minimum Version'] = df_train['Minimum Version'].fillna(df_train['Minimum Version'].median())


In [26]:
df_train['Installs'] = df_train['Installs'].astype(int)
df_train['Price'] = df_train['Price'].astype(float)
df_train['Rating'] = df_train['Rating'].astype(float)
df_train['Reviews'] = df_train['Reviews'].astype(int)
# df_train['Last Updated'] = df_train['Last Updated'].astype(float)

In [27]:
df_train['Rating'] =df_train['Rating'].apply(lambda x: round(x, 2))


In [65]:
# Split data into training and testing sets
features = ['App Name', 'Reviews', 'Size', 'Rating', 'Type', 'Price', 'Content Rating', 'Last Updated', 'Latest Version']
features.extend(category_list)
X = df_train[features]
y = df_train['Installs']

# Takin
X_train, X_test, y_train, y_test = train_test_split(X[:10000], y[:10000], test_size = 0.25, random_state = 10)


In [66]:
import xgboost as xgb

xgbModel = xgb.XGBClassifier()
xgbModel.fit(X_train, y_train)
accuracy = xgbModel.score(X_test,y_test)
'Accuracy: ' + str(np.round(accuracy*100, 2)) + '%'

'Accuracy: 80.2%'