In [31]:
import os
import re
from datetime import datetime

import pandas as pd
import numpy as np

# When True, the data-selection cell drops rows that have no 'binarySize' value;
# when False, it drops the 'ranking', 'binarySize' and 'age' columns instead.
requireAppBrainData = False
# An author is flagged as bigCompany when its number of apps in the data
# is strictly greater than this threshold.
bigCompanyThreshold = 10

def dropRowsWithMissingFeature(data, feature, missingValue):
    """Remove, in place, every row of `data` whose `feature` value is missing.

    The column is first cast to float. Entries equal to `missingValue` are then
    treated like NaN, the number of affected rows is printed, and those rows are
    dropped from `data`.

    Parameters:
        data: DataFrame to modify in place.
        feature: name of the column to inspect.
        missingValue: sentinel value that marks a missing entry.

    Returns:
        None.
    """
    # Work on a uniform float column so the sentinel comparison is well defined.
    data[feature] = data[feature].astype(float)

    # Fold the sentinel into NaN so a single null check covers both cases.
    isSentinel = data[feature] == missingValue
    data.loc[isSentinel, feature] = np.nan

    missingCount = data[feature].isnull().sum()
    print("{0} \t\t\t samples dropped due to missing {1}".format(missingCount, feature))

    data.dropna(subset=[feature], how='any', inplace=True)

    # Sanity check: nothing missing may survive the drop.
    assert not data[feature].isnull().any()


def dropFeatures(data, featureNames):
    """Drop the given columns from `data` in place and log how many were requested.

    All columns are removed in a single operation, so if any name is missing
    a KeyError is raised and `data` is left completely unchanged (no partially
    dropped frame to clean up before re-running the cell).

    Parameters:
        data: DataFrame to modify in place.
        featureNames: sized collection of column names to remove.

    Returns:
        None.

    Raises:
        KeyError: if at least one of the names is not a column of `data`.
    """
    data.drop(list(featureNames), axis=1, inplace=True)

    print("{0} \t\t\t\t features dropped".format(len(featureNames)))


def dropRowsWithErrorsUsingAppId(data, appIds):
    """Drop, in place, every row of `data` whose 'appId' is listed in `appIds`.

    The printed count is the number of rows actually removed, not the number
    of ids passed in; the two differ when an id is absent from `data` or
    occurs on several rows.

    Parameters:
        data: DataFrame with an 'appId' column; modified in place.
        appIds: iterable of appId values identifying the rows to remove.

    Returns:
        None.
    """
    rowsBefore = len(data)

    # One membership test over the column instead of one scan per id.
    matches = data['appId'].isin(list(appIds))
    data.drop(data.index[matches], inplace=True)

    droppedCount = rowsBefore - len(data)
    print("{0} \t\t\t\t samples dropped due to problems with importing".format(droppedCount))


# Load the raw export; malformed rows are skipped (with a warning) instead of aborting.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour of
# `on_bad_lines`, so try the current keyword first and fall back for older pandas,
# where the unknown keyword raises TypeError.
try:
    data = pd.read_csv('data.csv', on_bad_lines='warn')
except TypeError:
    data = pd.read_csv('data.csv', error_bad_lines=False)

print("{0} \t after read_csv".format(data.shape))


b'Skipping line 9694: expected 33 fields, saw 37\nSkipping line 11178: expected 33 fields, saw 34\n'


(28448, 33) 	 after read_csv


  interactivity=interactivity, compiler=compiler, result=result)


In [36]:
# Summary statistics and per-author app counts for the 'author' column.
# Only the value_counts() result is rendered, as it is the cell's last expression.
data['author'].describe()

data['author'].value_counts()

Oceanhouse Media, Inc.        177
Big Fish Games                170
Hit Songs Ringtones           137
Awesome Ringtones             124
GabySoft                      101
SaintBerlin                    81
Gluten Free Games              78
WatchMaster                    78
2Thumbz, Inc                   75
The Fool's Dog                 71
FLYTOMAP INC                   61
Diviniti Publishing Ltd        60
Monotype Imaging Inc.          60
PuzzleBoss Inc                 58
Nickelodeon                    57
KEMCO                          57
Smartwatch Bureaux             56
Upward Mobility                55
Authentic Ringtones            54
Noodlecake Studios Inc         51
Tecarta, Inc.                  50
Ringtone App                   48
Alawar Entertainment, Inc.     44
Ringtone Lord                  44
Artifex Mundi                  42
Kairosoft Co.,Ltd              42
Tapanifinal                    40
Teoti Graphix, LLC             39
memscape                       37
Ruslan Sokolov

In [22]:
    # Header line followed by the number of null entries per column.
    print("Missing Value Summary:", data.isnull().sum(), sep="\n")

Missing Value Summary:
_id                               0
appId                             0
name                            322
linkName                        322
price                           322
starRating                      322
category                        325
badge                         28420
author                          322
totalNrOfReviews                322
reviewsPerStarRating            322
description                     323
whatsNew                       8530
lastUpdated                     325
size                          26766
installs                        416
currentVersion                  920
requiredAndroidVersion          323
contentRating                   323
permissions                     348
inAppProducts                 26480
appUrl                          322
similarApps                     323
ranking                       11585
binarySize                    11873
libraries                      5241
age                           11584
comme

In [2]:
    #
    #
    # DATA SELECTION
    #

    # Rows that are known to be broken, identified by their appId.
    appIdsWithProblems = [
        # contains problem causing name "/ ESCAPE \"
        'air.com.kongregate.mobile.games.incredibleape.escape',
        # contains problem with installs (1.0.8 or 1.0) - again some export/read_csv problem
        'com.dotemu.thelastexpress',
        'rogerfgm.frameextractorex',
        # too many missing values
        'com.maiko.xscanpet'
    ]
    dropRowsWithErrorsUsingAppId(data, appIdsWithProblems)


    # drop rows with incomplete info from AppBrain
    #
    if not requireAppBrainData:
        # drop non-text features
        nonTextFeatures = ['ranking', 'binarySize', 'age']
        dropFeatures(data, nonTextFeatures)
    else:
        # drop rows which have no binarySize information
        # TODO: report how many
        data.dropna(subset=['binarySize'], how='any', inplace=True)


    # Columns that are not used as features in the rest of the notebook.
    unusedFeatures = [
        '_id',
        'lastAppInfoCrawlTimestamp',
        'appId',
        'name',
        'linkName',
        'badge',
        'size',
        'appUrl',
        'similarApps',
        'userComments',
        'commentsTag',
        'lastAppBrainCrawlTimestamp',
        'currentVersion',
        # TODO: inAppPurchase dropping: too much missing at this point
        'inAppProducts',
        'offersInAppPurchases',
        # TODO: 'permissions' & 'resourcePermissions', must be categorical or how many permissions
        'permissions',
        'resourcePermissions'
    ]
    dropFeatures(data, unusedFeatures)


18 				 samples dropped due to problems with importing
3 				 features dropped
17 				 features dropped


In [3]:
    #
    #
    # DATA CLEANING & FEATURE EXTRACTION
    #

    # price: drop rows with missing
    #
    # NOTE: 0.0 is the missing-value marker here, so rows whose price equals 0.0 are dropped.
    dropRowsWithMissingFeature(data, 'price', 0.0)


    # text features: concat
    #
    # Missing parts are skipped instead of being stringified; otherwise every row
    # lacking e.g. 'whatsNew' would carry the literal token "nan" in its text.
    TEXT_FEATURES = ['description', 'whatsNew']

    data['text'] = data.loc[:, TEXT_FEATURES].apply(
        lambda parts: ' '.join(str(part) for part in parts if pd.notnull(part)), axis=1)
    data.drop(TEXT_FEATURES, axis=1, inplace=True)


    # author: extract bigCompany
    #
    # Authors with strictly more apps than bigCompanyThreshold are flagged with 1.0.
    author_counts = data['author'].value_counts()
    bigCompanies = author_counts[author_counts > bigCompanyThreshold]

    # Membership is tested against the author names, i.e. the index of the counts.
    data['bigCompany'] = data['author'].isin(bigCompanies.index).astype(float)
    data.drop('author', axis=1, inplace=True)


    # starRating: drop rows with missing
    #
    dropRowsWithMissingFeature(data, 'starRating', 0.0)


    # totalNrOfReviews: drop rows with missing
    #
    dropRowsWithMissingFeature(data, 'totalNrOfReviews', 0.0)


415 			 samples dropped due to missing price
3261 			 samples dropped due to missing starRating
0 			 samples dropped due to missing totalNrOfReviews


In [4]:
    # TODO: reviewsPerStarRating: make 5 separate features
    # (not implemented yet; the column is currently left as it is)
    #


In [5]:
    # lastUpdated: transform into days since lastUpdated
    #
    # The reference time is taken once so every row is measured against the same
    # instant, rather than calling datetime.now() again for each row.
    referenceTime = datetime.now()

    # Values are parsed with the '%B %d, %Y' format; a value that does not match it
    # makes strptime raise ValueError.
    data['days_since_lastUpdated'] = data['lastUpdated']\
        .apply(lambda x: (referenceTime - datetime.strptime(str(x), '%B %d, %Y')).days).astype(float)

    data.drop('lastUpdated', axis=1, inplace=True)


    # installs: transform into 14 unique categories
    #
    # This step only imputes: missing entries receive the most frequent value.
    # TODO: pipeline for imputing
    installsMissing = data['installs'].isnull()
    installsTop = data['installs'].describe()['top']

    print("{0} \t\t\t\t missing 'installs' replaced with their top value '{1}'".format(
        installsMissing.sum(), installsTop))

    data.loc[installsMissing, 'installs'] = installsTop
    assert not data['installs'].isnull().any()


    # requiredAndroidVersion: create two features representing the major and minor version
    #
    # Handle missing values first
    # TODO: pipeline for imputing
    # The patterns are raw strings with an escaped dot: '\d' in a plain string literal is
    # an invalid escape sequence, and an unescaped '.' would accept any separator character.
    versionTop = data['requiredAndroidVersion'].str.extractall(r'^(\d\.\d)').describe().loc['top', 0]

    # 'Varies with device' carries no version number, so it is treated as missing.
    data.loc[data['requiredAndroidVersion'] == 'Varies with device', 'requiredAndroidVersion'] = np.nan

    print("{0} \t\t\t missing 'requiredAndroidVersion' replaced with their top value '{1}'".format(data['requiredAndroidVersion'].isnull().sum(), versionTop))

    data.loc[data['requiredAndroidVersion'].isnull(), 'requiredAndroidVersion'] = versionTop
    assert data['requiredAndroidVersion'].isnull().sum() == 0

    # Now we can parse out the numbers; one pass yields both capture groups
    versionParts = data['requiredAndroidVersion'].str.extract(r'^(\d)\.(\d)', expand=True)
    # Fail loudly on values that do not start with "<digit>.<digit>" instead of
    # silently producing NaN features.
    assert versionParts.notnull().all().all(), "unparseable requiredAndroidVersion value"

    data['requiredAndroidVersion_major'] = versionParts[0].astype(float)
    data['requiredAndroidVersion_minor'] = versionParts[1].astype(float)

    data.drop('requiredAndroidVersion', axis=1, inplace=True)


    # contentRating: transform into 5 unique categories
    #
    # Categorizing happens later, so missing values get an explicit placeholder label first.
    # TODO: pipeline for imputing
    missingRating = 'Unrated'
    contentRatingMissing = data['contentRating'].isnull()

    print("{0} \t\t\t\t missing 'contentRating' replaced with '{1}'".format(
        contentRatingMissing.sum(), missingRating))

    data.loc[contentRatingMissing, 'contentRating'] = missingRating
    assert not data['contentRating'].isnull().any()


    # libraries: fill missing with the median
    #
    # TODO: pipeline for imputing
    # Convert to float first so the median is computed on numeric values and can be
    # written back directly (no positional lookup on the result, no str round-trip).
    data['libraries'] = data['libraries'].astype(float)
    librariesMedian = data['libraries'].median()

    print("{0} \t\t\t missing 'libraries' replaced with their median '{1}'".format(data['libraries'].isnull().sum(), librariesMedian))

    data.loc[data['libraries'].isnull(), 'libraries'] = librariesMedian
    assert data['libraries'].isnull().sum() == 0


21 				 missing 'installs' replaced with their top value '100 - 500'
1588 			 missing 'requiredAndroidVersion' replaced with their top value '4.0'
0 				 missing 'contentRating' replaced with 'Unrated'
3902 			 missing 'libraries' replaced with their median '0.0'


In [6]:
# Null count per remaining column; any non-zero entry still has to be worked on.
data.isnull().sum(axis=0)

price                           0
starRating                      0
category                        0
totalNrOfReviews                0
reviewsPerStarRating            0
installs                        0
contentRating                   0
libraries                       0
text                            0
bigCompany                      0
days_since_lastUpdated          0
requiredAndroidVersion_major    0
requiredAndroidVersion_minor    0
dtype: int64

In [7]:
    # category: just cleanup
    #
    # Upper-case the labels, then give any missing entry the most frequent category.
    # TODO: pipeline for imputing
    data['category'] = data['category'].str.upper()
    categoryMissing = data['category'].isnull()
    categoryTop = data['category'].describe()['top']

    print("{0} \t\t\t\t missing 'category' replaced with their top value '{1}'".format(
        categoryMissing.sum(), categoryTop))

    data.loc[categoryMissing, 'category'] = categoryTop
    assert not data['category'].isnull().any()


0 				 missing 'category' replaced with their top value 'PERSONALIZATION'


In [8]:
# Scratch inspection of the 'category' column. These expressions only read from
# `data`; just the value of the final expression is rendered as the cell output.
data.loc[data['category'] == 1, ['category']]
data.describe()
data.loc[data['category'].isnull(), :]

data['category'].describe()['top']

data['category'].unique()

data['category'].value_counts()
data['category'].describe()
data.loc[data['category'] == '[{"1":0},{"2":0},{"3":0},{"4":0},{"5":0}]', :].index

data.loc[data['category'].isnull(), :]
data['category'].describe()['top']

'PERSONALIZATION'

In [9]:
    # Final shape of the cleaned frame, followed by the null count per column.
    print(
        "{0} \t done cleaning".format(data.shape),
        "Missing Value Summary:",
        data.isnull().sum(),
        sep="\n",
    )

(24768, 13) 	 done cleaning
Missing Value Summary:
price                           0
starRating                      0
category                        0
totalNrOfReviews                0
reviewsPerStarRating            0
installs                        0
contentRating                   0
libraries                       0
text                            0
bigCompany                      0
days_since_lastUpdated          0
requiredAndroidVersion_major    0
requiredAndroidVersion_minor    0
dtype: int64
