In [1]:
import re
import sys

import time
import datetime

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import random


In [2]:
data = pd.read_csv('Google-Playstore-32K.csv', delimiter=',')
data.dataframeName = 'Google-Playstore-32K.csv'
nRow, nCol = data.shape
print(f'There are {nRow} rows and {nCol} columns')


There are 32000 rows and 11 columns


In [3]:
data = data.sort_values('Installs', ascending=False)
data = data.drop_duplicates(subset='App Name', keep='first')
maxVal = 10000

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30845 entries, 6941 to 24784
Data columns (total 11 columns):
App Name           30845 non-null object
Category           30845 non-null object
Rating             30845 non-null object
Reviews            30844 non-null float64
Installs           30845 non-null object
Size               30845 non-null object
Price              30845 non-null object
Content Rating     30845 non-null object
Last Updated       30845 non-null object
Minimum Version    30845 non-null object
Latest Version     30844 non-null object
dtypes: float64(1), object(10)
memory usage: 2.8+ MB


In [4]:
#detect null cols and null rate
nulls = [i for i in data.isna().any().index if data.isna().any()[i]==True]
rates = []
counts = []
for i in nulls:    
    rates.append((data[i].isna().sum()/data.shape[0])*100)
    counts.append(data[i].isna().sum())
null_df = pd.DataFrame.from_dict({"Col":nulls,"Count":counts,"Null_Rates":rates})



df_train = data.copy()
#delete Reviews, Latest Version, null values row
for i in ['Reviews','Latest Version']:
    df_train = df_train.drop(df_train.loc[df_train[i].isnull()].index,0)

# PreProcessing Rating Attribute
df_train['Rating'].replace('Lessons', np.nan, inplace=True)
df_train['Rating'].replace('GAME_STRATEGY', np.nan, inplace=True)
df_train['Rating'].replace('NEWS_AND_MAGAZINES', np.nan, inplace=True)
df_train['Rating'] = df_train['Rating'].fillna(df_train['Rating'].median())
# for i in ['Rating']:
#     df_train = df_train.drop(df_train.loc[df_train[i].isnull()].index,0)
df_train['Rating'].astype(float).describe()


# Remove '+' from end of each value
df_train['Installs'] = df_train['Installs'].apply(lambda x : x.strip('+').replace(',', ''))

regex = [r'GAME_[A-Za-z]+.*']
for j in regex:
    df_train['Category'] = df_train['Category'].astype(str).apply(lambda x : re.sub(j, 'GAME', x))
    
regex = [r'[-+|/:/;(_)@\[\]#�,>]', r'\s+', r'[A-Za-z]+']
for j in regex:
    df_train['Latest Version'] = df_train['Latest Version'].astype(str).apply(lambda x : re.sub(j, '0', x))
df_train['Latest Version'].replace('?.?', np.nan, inplace=True)

df_train['Latest Version'] = df_train['Latest Version'].astype(str).apply(lambda x : x.replace('.', ',',1).replace('.', '').replace(',', '.',1)[:3]).astype(float)
df_train['Latest Version'] = df_train['Latest Version'].fillna(df_train['Latest Version'].median())

df_train['Category'].replace(' Channel 2 News', np.nan, inplace=True)
df_train['Category'].replace(')', np.nan, inplace=True)
df_train = df_train[pd.notnull(df_train['Category'])]

# App values encoding
le = preprocessing.LabelEncoder()
df_train['App Name'] = le.fit_transform(df_train['App Name'])

# Category features encoding
category_list = df_train['Category'].unique().tolist() 
category_list = ['cat_' + word for word in category_list]
df_train = pd.concat([df_train, pd.get_dummies(df_train['Category'], prefix='cat')], axis=1)

# Encode Content Rating features
le = preprocessing.LabelEncoder()
df_train['Content Rating'] = le.fit_transform(df_train['Content Rating'])


# Price 
df_train['Price'] = df_train['Price'].apply(lambda x : x.strip('$'))
df_train['Price'].replace('Varies with device', np.nan, inplace=True)
df_train['Price'] = df_train['Price'].fillna(df_train['Price'].median())

df_train['Type'] = np.where(df_train['Price'].astype(float) > 0, 'Paid', 'Free')
# Type encoding
df_train['Type'] = pd.get_dummies(df_train['Type'])

df_train['Last Updated'].replace('Everyone 10+', np.nan, inplace=True)
df_train = df_train[pd.notnull(df_train['Last Updated'])]

df_train['Last Updated'] = df_train['Last Updated'].apply(lambda x : time.mktime(datetime.datetime.strptime(x, '%B %d, %Y').timetuple()))


regex = [r',']
for j in regex:
    df_train['Size'] = df_train['Size'].astype(str).apply(lambda x : re.sub(j, '', x))
    
# Convert kbytes to Mbytes 
k_indices = df_train['Size'].loc[df_train['Size'].str.contains('k')].index.tolist()
converter = pd.DataFrame(df_train.loc[k_indices, 'Size'].apply(lambda x: x.strip('k')).astype(float).apply(lambda x: x / 1024).apply(lambda x: round(x, 3)).astype(str))
df_train.loc[k_indices,'Size'] = converter


df_train['Size'] = df_train['Size'].apply(lambda x: x.strip('M'))
df_train.loc[df_train['Size'] == 'Varies with device', 'Size'] = 0
# df_train[df_train['Size'] == 'Varies with device','Size'] = 0
df_train['Size'] = df_train['Size'].astype(float)

regex = [r'[-+|/:/;(_)@\[\]#�,>]', r'\s+', r'[A-Za-z]+']
for j in regex:
    df_train['Minimum Version'] = df_train['Minimum Version'].astype(str).apply(lambda x : re.sub(j, '0', x))

df_train['Minimum Version'] = df_train['Minimum Version'].astype(str).apply(lambda x : x.replace('.', ',',1).replace('.', '').replace(',', '.',1)[:3]).astype(float)
df_train['Minimum Version'] = df_train['Minimum Version'].fillna(df_train['Minimum Version'].median())


df_train['Installs'] = df_train['Installs'].astype(int)
df_train['Price'] = df_train['Price'].astype(float)
df_train['Rating'] = df_train['Rating'].astype(float)
df_train['Reviews'] = df_train['Reviews'].astype(int)
# df_train['Last Updated'] = df_train['Last Updated'].astype(float)

df_train['Rating'] =df_train['Rating'].apply(lambda x: round(x, 2))


In [5]:
# Split data into training and testing sets
features = ['App Name', 'Reviews', 'Size', 'Rating', 'Type', 'Price', 'Content Rating', 'Last Updated', 'Latest Version']
features.extend(category_list)
X = df_train[features][:maxVal]
y = df_train['Installs'][:maxVal]

# Takin
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 10)


In [6]:
import xgboost as xgb

xgbModel = xgb.XGBClassifier()
xgbModel.fit(X_train, y_train)
accuracy = xgbModel.score(X_test,y_test)
'Accuracy: ' + str(np.round(accuracy*100, 2)) + '%'

'Accuracy: 80.2%'