In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objs as go
import plotly.plotly as py
import seaborn as sns

In [2]:
# Load the pre-pickled event log from the project-relative data directory.
# NOTE(review): unpickling can execute arbitrary code embedded in the file, so
# this should only ever be pointed at a trusted, locally produced pickle.
events = pd.read_pickle('data/events.pkl')

# Events

In [3]:
# Preview the first rows to see which columns the event log carries.
events.head()

Unnamed: 0,session_id,event,event_timestamp,event_value,user_id_hash
0,5558845121177764917,45,1542215397132,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
1,5558845121177764917,45,1542215484895,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
2,7689508378645584666,.m5100869650219008,1541124410372,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
3,2201961907282901522,4,1543713091129,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
4,2201961907282901522,6,1543713093116,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...


In [4]:
# Five most frequent event codes with their row counts. value_counts() sorts
# in descending order by default, so the first five entries are the largest.
events.event.value_counts().head(5)

45    75493931
1      5363926
5      4887922
6      4068474
14     3491117
Name: event, dtype: int64

In [5]:
# Number of rows whose event code is '8'; the cells below treat these rows
# as purchases.
events.loc[events.event == '8', 'event'].count()

265034

In [6]:
# Per-user count of event-'8' rows. The row mask and the column subset are
# applied in a single .loc call rather than two chained selections.
# The preview may list users with a count of 0; those are removed further down.
purchases = (
    events.loc[events.event == '8', ['user_id_hash', 'event']]
    .groupby('user_id_hash')
    .count()
)
purchases.head()

Unnamed: 0_level_0,event
user_id_hash,Unnamed: 1_level_1
002e447aed33ed4c51a68743cc293ef2148058b6a6239e37d013cc6dc49bdfdf,0
005c9d79cf18efc6c8b5fa767964b1def9b8e2f8abd23f453310f60d69584a9f,0
0061b4d30e8a9935b2ebeec954ff509b4f0cb500cd725c1aa7145841c82d907c,0
00fbbf507c7d3c2f259cd3329d241c29d35712e2d4699f5c6a51d5149219ecee,0
0107a0017873efc2dea9a9155832363ceacf6fe97bd428a73efa67b639962ac0,0


In [7]:
# The five users with the highest number of event-'8' rows.
purchases.nlargest(n=5, columns='event')

Unnamed: 0_level_0,event
user_id_hash,Unnamed: 1_level_1
286bdab39041ccbbc2d602d3581260f9373c9bcf198f193930fdb1b0006eeba9,445
a96b9b72fca9ed20a3e004b9b9a91321fa40d4069a354b0c7a05260803836bec,236
6bea4e3ad45f764cfbd0c57be198a4f949a177a520424e05bbdcee926152b4e1,235
84008b429d0c07a5a2389775e7811e9065039749b376cb6f0e26ed6ad1212583,233
180da4c89f6d5f1a12da04c6a2310063a86e27cd52c75c670a8fe81ba3c19fac,218


In [8]:
# Keep only users that have at least one counted purchase.
purchases = purchases.loc[purchases['event'] > 0]

In [9]:
# Box trace of the per-user purchase counts, and the chart title that goes
# with it. Both names are consumed by the plotting cell that follows.
data = go.Box(name='Purchase Frequency', y=purchases['event'])
layout = go.Layout(title="# of In-App Purchases per User over Entire Time Period")

In [10]:
# Assemble the figure as a plain dict and render it to a standalone HTML file.
fig = {'data': [data], 'layout': layout}
plotly.offline.plot(fig, filename='purchases.html')

'file:///home/wtq920828/ML-project/purchases.html'

In [11]:
# Mean purchase value per user, computed over event-'8' rows only.
# Only `event_value` is aggregated: the `event` code column holds string codes,
# and pandas >= 2.0 raises a TypeError for such columns in a groupby mean
# instead of silently dropping them. The result is still a one-column frame
# (`event_value`) indexed by `user_id_hash`.
avg_purch_value = (
    events.loc[events.event == '8', ['user_id_hash', 'event_value']]
    .groupby('user_id_hash')
    .mean()
)
# Drop users whose mean is NaN, i.e. users without a usable purchase value.
avg_purch_value = avg_purch_value[avg_purch_value.event_value.notnull()]
avg_purch_value.head()

Unnamed: 0_level_0,event_value
user_id_hash,Unnamed: 1_level_1
0c7c07196340726ed212576754e4c2348e89df786e3ebeb3733b9b3535ab3487,1.392578
0e02d992cae31bf3cc09708fc70db2cf935288bf715aa73607851d8e32de4f37,4.921875
1859118795458356332a9e416c958850282b3168edab93846976f9d91d45324c,1.392578
23099a3bd2a6a9a9ef55707d1b45e935f759e31f27d427bc024f6301722643b1,8.25
233d517a59c9dcaac0291cff13b46921aca35d332149449f7fecaf558c63457a,3.414062


In [12]:
# Box trace of the per-user mean purchase value, and its chart title.
# Both names are consumed by the plotting cell that follows.
data = go.Box(name='Avg. Purchase Value (in USD)', y=avg_purch_value['event_value'])
layout = go.Layout(title="Avg. In-App Purchase Value per User over Entire Time Period")

In [13]:
# Assemble the figure as a plain dict and render it to a standalone HTML file.
fig = {'data': [data], 'layout': layout}
plotly.offline.plot(fig, filename='purchases_value.html')

'file:///home/wtq920828/ML-project/purchases_value.html'

In [14]:
# Total purchase value per user, computed over event-'8' rows only.
# Only `event_value` is summed; the string-coded `event` column is left out so
# the aggregation does not depend on pandas dropping non-numeric columns.
total_purch_value = (
    events.loc[events.event == '8', ['user_id_hash', 'event_value']]
    .groupby('user_id_hash')
    .sum()
)
# Keep users with a non-null, strictly positive total. The comparison needs
# its own parentheses: `&` binds tighter than `>`, so the unparenthesised form
# `notnull() & event_value > 0` is parsed as `(notnull() & event_value) > 0`.
total_purch_value = total_purch_value[
    total_purch_value.event_value.notnull() & (total_purch_value.event_value > 0)
]
total_purch_value.head()

Unnamed: 0_level_0,event_value
user_id_hash,Unnamed: 1_level_1
0c7c07196340726ed212576754e4c2348e89df786e3ebeb3733b9b3535ab3487,8.359375
0e02d992cae31bf3cc09708fc70db2cf935288bf715aa73607851d8e32de4f37,241.125
1859118795458356332a9e416c958850282b3168edab93846976f9d91d45324c,8.359375
23099a3bd2a6a9a9ef55707d1b45e935f759e31f27d427bc024f6301722643b1,41.28125
233d517a59c9dcaac0291cff13b46921aca35d332149449f7fecaf558c63457a,30.734375


In [15]:
# Attach the per-user value aggregates to the purchase counts. Both are left
# joins keyed on user_id_hash, so every row of `purchases` is kept and users
# missing from the value frame get NaN for event_value.
total_purch = purchases.join(total_purch_value, on='user_id_hash', how='left')
avg_purch = purchases.join(avg_purch_value, on='user_id_hash', how='left')

In [16]:
# One marker-only scatter trace per aggregate: purchase count on x,
# average (data1) or total (data2) purchase value on y.
data1 = go.Scatter(
    name='Average',
    mode='markers',
    x=avg_purch['event'],
    y=avg_purch['event_value'],
)
data2 = go.Scatter(
    name='Total',
    mode='markers',
    x=total_purch['event'],
    y=total_purch['event_value'],
)

# Two stacked panels sharing the x axis: average value on top, total below.
fig = plotly.tools.make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.1,
    specs=[[{}], [{}]],
    subplot_titles=(
        'Avg. In-App Purchase Value vs. # of Purchases per User',
        'Total In-App Purchase Value vs. # of Purchases per User',
    ),
)

# Place each trace in its own row of the single column.
fig.append_trace(data1, row=1, col=1)
fig.append_trace(data2, row=2, col=1)

# Axis titles: one shared x axis, and a separate y axis per panel.
fig['layout'].update(
    xaxis={'title': 'Purchases per User'},
    yaxis={'title': 'Avg. Purchase Value (in USD)'},
    yaxis2={'title': 'Total Purchase Value (in USD)'},
)

This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x1,y2 ]



Layout({
    'annotations': [{'font': {'size': 16},
                     'showarrow': False,
                     'text': 'Avg. In-App Purchase Value vs. # of Purchases per User',
                     'x': 0.5,
                     'xanchor': 'center',
                     'xref': 'paper',
                     'y': 1.0,
                     'yanchor': 'bottom',
                     'yref': 'paper'},
                    {'font': {'size': 16},
                     'showarrow': False,
                     'text': 'Total In-App Purchase Value vs. # of Purchases per User',
                     'x': 0.5,
                     'xanchor': 'center',
                     'xref': 'paper',
                     'y': 0.45,
                     'yanchor': 'bottom',
                     'yref': 'paper'}],
    'xaxis': {'anchor': 'y2', 'domain': [0.0, 1.0], 'title': {'text': 'Purchases per User'}},
    'yaxis': {'anchor': 'free', 'domain': [0.55, 1.0], 'position': 0.0, 'title': {'text': 'Avg. Purchase V

In [17]:
# Render the two-panel scatter figure to a standalone HTML file.
plotly.offline.plot(figure_or_data=fig, filename='purchases_relationship.html')

'file:///home/wtq920828/ML-project/purchases_relationship.html'

In [18]:
# Drop the notebook's reference to the event log; nothing below this cell
# uses it. Re-running any earlier cell that reads `events` requires loading
# it again first.
del events

# Sessions

In [19]:
# Load the pre-pickled session table from the project-relative data directory.
# NOTE(review): unpickling can execute arbitrary code embedded in the file, so
# this should only ever be pointed at a trusted, locally produced pickle.
session = pd.read_pickle('data/session.pkl')

In [20]:
# Preview the first rows of the session table.
# NOTE(review): the rendered preview includes device_id, city and
# latitude/longitude columns; consider clearing this output before sharing.
session.head()

Unnamed: 0,session_id,start_timestamp,timezone,timezone_offset,previous_sessions_duration,user_created_timestamp,is_user_first_session,country,region,city,latitude,longitude,locale,os_name,session_index,device_id,user_id_hash
0,5558845121177764917,1542215364580,Asia/Manila,28800000.0,25837591,1538874289458,False,PH,0,makati,14.554729,121.024445,en_GB,Android OS,30,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...
1,18781111175537580,1539215568666,Asia/Manila,28800000.0,11343848,1538874289458,False,PH,0,makati,14.554729,121.024445,en_GB,Android OS,10,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...
2,1477540082628742048,1540120743010,Asia/Manila,28800000.0,13499724,1538874289458,False,PH,11,davao city,7.190708,125.455338,en_GB,Android OS,13,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...
3,8184875317380844086,1542671625528,Asia/Manila,28800000.0,32788010,1538874289458,False,PH,0,makati,14.554729,121.024445,en_GB,Android OS,41,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...
4,4706180700083856343,1538997913013,Asia/Manila,28800000.0,5872534,1538874289458,False,PH,11,davao city,7.190708,125.455338,en_GB,Android OS,4,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...


In [21]:
# Number of logged sessions per country: a one-column frame (`session_id`)
# indexed by the country code.
activity = session.groupby('country')[['session_id']].count()
activity.head()

Unnamed: 0_level_0,session_id
country,Unnamed: 1_level_1
AE,12657
AG,1158
AL,2584
AR,12511
AT,13641


In [22]:
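# One-off export of the country codes in `activity`, kept commented out for
# provenance (presumably the input used to build alph3.csv; confirm):
# np.savetxt(r'./countries.txt', activity.index.values, fmt='%s')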

In [23]:
# Read the country-code list as a single unnamed column (column label 0).
# The preview shows one three-letter code per line, so the default comma
# separator yields the same one-column frame; `sep='\n'` is avoided because
# current pandas versions reject a line terminator as the separator.
# NOTE(review): this assumes no line in alph3.csv contains a comma.
alph3 = pd.read_csv('alph3.csv', header=None)
alph3.head()

Unnamed: 0,0
0,ARE
1,ATG
2,ALB
3,ARG
4,AUT


In [24]:
# Attach the three-letter codes as an extra column (labelled 0) next to the
# per-country session counts. The pairing is purely positional: row i of
# alph3 is assumed to describe row i of `activity`. pd.concat(axis=1) would
# pad a shorter frame with NaN instead of failing, so a length mismatch is
# rejected up front rather than producing silently misaligned codes.
if len(alph3) != len(activity):
    raise ValueError(
        'alph3.csv has {} rows but activity has {} countries; '
        'the positional pairing would be misaligned'.format(len(alph3), len(activity))
    )
activity = pd.concat([activity.reset_index(), alph3], axis=1)
activity.head()

Unnamed: 0,country,session_id,0
0,AE,12657,ARE
1,AG,1158,ATG
2,AL,2584,ALB
3,AR,12511,ARG
4,AT,13641,AUT


In [25]:
# Choropleth trace: countries are located by the code in column 0 and coloured
# by their session count. The colour scale is given explicitly (auto scale
# off) and is applied reversed.
data = [
    go.Choropleth(
        locations=activity[0],
        z=activity['session_id'],
        text=activity['session_id'],
        autocolorscale=False,
        reversescale=True,
        colorscale=[
            [0, "rgb(172, 10, 5)"],
            [0.35, "rgb(190, 60, 40)"],
            [0.5, "rgb(245, 100, 70)"],
            [0.6, "rgb(245, 120, 90)"],
            [0.7, "rgb(247, 137, 106)"],
            [1, "rgb(220, 220, 220)"],
        ],
        # Thin grey outline around each country.
        marker=dict(line=dict(color='rgb(180,180,180)', width=0.5)),
        colorbar=dict(title='Logged Sessions'),
    )
]

# Map layout: titled, no frame or coastlines, equirectangular projection.
layout = go.Layout(
    title=dict(text='Session Activity by Country'),
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection=dict(type='equirectangular'),
    ),
)

In [26]:
# Build the map figure and render it to a standalone HTML file. Validation is
# switched off for the plot call, as in the original cell.
fig = go.Figure(layout=layout, data=data)
plotly.offline.plot(fig, filename='activity.html', validate=False)

'file:///home/wtq920828/ML-project/activity.html'