In [1]:
import pickle, json, collections, itertools
from zipfile import ZipFile
from datetime import datetime
from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd
from scipy import stats
from matplotlib import pyplot as plt
import seaborn
%matplotlib inline

# Read Data

In [2]:
df = pd.read_csv('android_bids_us.csv')
df

Unnamed: 0,bidid,utc_time,app_id,user_state,user_isp,device_maker,device_model,device_osv,device_height,device_width,marketplace,click
0,87b0108c-6e9f-4783-8b80-8dd5aa3529a8,1536444331894,com.OppanaGames.CarSim,TX,AT&T Wireless,,,7.0,720,1280,chartboost,0
1,9284441f-a6ea-4698-9017-86436b92d416,1536444363932,com.mobilityware.CrownSolitaire,TX,T-Mobile USA,,,7.0,1280,720,chartboost,0
2,7e4779c2-f757-4324-8c2a-138b558b5a54,1536444386908,com.kuttigames.tenkyurollball,LA,AT&T Wireless,,,7.1.1,720,1280,chartboost,0
3,b64ea208-38ca-42ac-895e-0842d7352cc3,1536444405023,com.trendactionfree.call_of_sniper_duty_frontl...,TN,Comcast Cable,,,6.0.1,720,1280,chartboost,0
4,fd0c07cb-31f6-408c-9315-1cb652e76abc,1536444443458,com.landslab.my3rdgrademathgames,NC,AT&T U-verse,,,7.1.1,1024,768,chartboost,0
5,234541bb-9183-493e-8ad7-0d86fe619b15,1536444487429,com.kick.trucks.manual.shift.driving,FL,T-Mobile USA,,,7.0,720,1280,chartboost,0
6,31d4270d-32e8-441a-95e4-6a27ca21a356,1536444496213,com.slots.realvegas2,FL,Spectrum,,,7.1.1,720,1280,chartboost,0
7,3259e135-743b-48d9-b2e6-71707a2a9013,1536444316772,bitcoin.blockchain.game,GA,Sprint PCS,,,8.0.0,1920,1080,chartboost,0
8,9d897068-12a8-4a77-b12f-3150ffa9a6dd,1536444329288,com.feelingtouch.zf3d,ME,Spectrum,,,8.0.0,918,1887,chartboost,0
9,eba79a5a-a33b-4e54-af28-cad9b21d0739,1536444355834,com.ffgames.driftstar,UT,Comcast Cable,,,7.0,1440,2672,chartboost,0


# Enrich with app store details
I've scraped the app details from Google play store on your behalf, and stored it in to a zip.

In [5]:
app_details = ZipFile("play_apps.zip")
app_file = 'play_apps/a008.com.fc2.blog.androidkaihatu.datecamera2'
app = pickle.loads(app_details.read(app_file))
app

{'title': 'DateCamera2 (Auto timestamp)',
 'icon': 'https://lh3.googleusercontent.com/BvVD8_9aN_-wrqP7WTeF4u40MocWdbNoxxlU_HR1GnRT9SGxZmV3JEQbjCaRVj5880e3',
 'screenshots': ['https://lh3.googleusercontent.com/_CJBocPzM9CF2CPrnnvbEX5-9NllpY2AsqB9e8hyoDDwUIC_dv95Q7dehq8Tu7KKX1c=w720-h310-rw',
  'https://lh3.googleusercontent.com/zQUynoLG5VJd2dV4leUNCvqehyDMVbBVEiqWdKpL35CqkbxDsatCeGOuEviAGTEvHMI=w720-h310-rw',
  'https://lh3.googleusercontent.com/u7Yg9yiCLx9Jou2CqfUr0I1w1I08km_n0I-VCC7Tc8IE-lmqUtHfCemPChNnaO5n6dk=w720-h310-rw',
  'https://lh3.googleusercontent.com/KPcyVSjtX3dvVy8WP2G67U8G8xK2IQHhqWa0nnlPevCr-BaQm31t46StfEs96UTFtZBt=w720-h310-rw',
  'https://lh3.googleusercontent.com/dgUVp1C2nbIcj5XOKLGidzPkvBmWKwXzCxwuHwhhMkn7s8rve6gF-NkTHuuInr0GBA=w720-h310-rw',
  'https://lh3.googleusercontent.com/M0YDP301kQu6WsQxt9fZfsxGRumFrWZnTyPGNlcjeWDbLufl5CC4vDm1Z5F9SUiCR8I=w720-h310-rw',
  'https://lh3.googleusercontent.com/nWRsvYhsHsdmP3W7YC9L9J-vA8prbPIbWVi7mERMU_0jzI_92lU7mYd6gCajORcHBw=w720

## Get all apps categories

In [7]:
app2cat = {}
for app_file in tqdm(app_details.filelist):
    app_name = app_file.filename.split('/')[1]
    app = pickle.loads(app_details.read(app_file))
    app2cat[app_name] = app["category"]
app2cat

HBox(children=(IntProgress(value=0, max=29463), HTML(value='')))




{'a008.com.fc2.blog.androidkaihatu.datecamera2': ['PHOTOGRAPHY'],
 'a201706011153.xsky.txvpn': ['TOOLS'],
 'a201706021616.vpn.turbovpn': ['PRODUCTIVITY'],
 'a201707.grmo.a8bit.jp.beautytimer': ['TOOLS'],
 'a2x.studio.fast.charging.battery.supercharging': ['TOOLS'],
 'aasuited.net.word': ['GAME_TRIVIA'],
 'abc.kids.preschool.learning.phonics.songs.videos': ['EDUCATION',
  'FAMILY_EDUCATION'],
 'abraj.xalwan.com': ['LIFESTYLE'],
 'absworkout.bellyfatworkout.waistworkout.abdominalworkout': ['HEALTH_AND_FITNESS'],
 'abs.workout.fitness.tabata.hiit.stomach': ['HEALTH_AND_FITNESS'],
 'abs.workout.lose.belly': ['HEALTH_AND_FITNESS'],
 'abs.workout.women.fitness.tabata': ['HEALTH_AND_FITNESS'],
 'abuttha.android.project': ['BOOKS_AND_REFERENCE'],
 'aceviral.dragoncraft': ['GAME_SIMULATION'],
 'acrweb.fxnews': ['FINANCE'],
 'actiongames.games.cbb': ['GAME_CASUAL'],
 'addons.cars.for.minecraft': ['ENTERTAINMENT'],
 'addons.girlfriend.mod.formcpe': ['ENTERTAINMENT'],
 'addons.granny.map.forminecr

# Exercise:
1. Enrich the dataframe with `reviews`. `installs`, `size` and `category`
1. How would you choose categories for apps that have more than one ?
1. Which feature is most informative ?