In [142]:
import pandas as pd
import numpy as np
from statistics import mean
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta 
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, log_loss
import chart_studio.plotly as py
import cufflinks as cf
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Initialise plotly's notebook mode and switch cufflinks to offline mode.
# NOTE(review): presumably this is what lets interactive figures render inline
# without a Chart Studio account — confirm against the plotting cells.
init_notebook_mode(connected=True)
cf.go_offline()

## takehome_users
- `name`: the user's name
- `object_id`: the user's id
- `email`: email address
- `creation_source`: how their account was created. This takes on one of 5 values:
 - PERSONAL_PROJECTS: invited to join another user's personal workspace
 - GUEST_INVITE: invited to an organization as a guest (limited permissions)
 - ORG_INVITE: invited to an organization (as a full member)
 - SIGNUP: signed up via the website
 - SIGNUP_GOOGLE_AUTH: signed up using Google Authentication (using a Google email account for their login id)
- `creation_time`: when they created their account
- `last_session_creation_time`: unix timestamp of last login
- `opted_in_to_mailing_list`: whether they have opted into receiving marketing emails
- `enabled_for_marketing_drip`: whether they are on the regular marketing email drip
- `org_id`: the organization (group of users) they belong to
- `invited_by_user_id`: which user invited them to join (if applicable)

## takehome_user_engagement 
Has a row for each day that a user logged into the product.

An `adopted_user` is defined as a user who has logged into the product on <ins>three separate days</ins> within at least <ins>one seven-day period</ins>.

Goal:
<ins>__Identify which factors predict future user adoption.__</ins>

In [9]:
# pd.read_csv('takehome_users.csv', encoding='ISO-8859-1')

In [12]:
# Read the user table and discard the 'Unnamed: 0' column
# (presumably a row index that was written out when the CSV was saved).
users = pd.read_csv('takehome_users.csv').drop(columns='Unnamed: 0')
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [13]:
# Summarise column dtypes and non-null counts of the users table.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [38]:
# Read the engagement log, convert `time_stamp` to datetimes and order the
# rows by user and then chronologically.
logins = (
    pd.read_csv('takehome_user_engagement.csv')
    .assign(time_stamp=lambda frame: pd.to_datetime(frame['time_stamp']))
    .sort_values(by=['user_id', 'time_stamp'])
)
logins.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [22]:
# Summarise column dtypes and non-null counts of the login table.
logins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   time_stamp  207917 non-null  datetime64[ns]
 1   user_id     207917 non-null  int64         
 2   visited     207917 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 4.8 MB


In [122]:
# Number of distinct users that appear in the login table at all; only these
# users can possibly qualify as adopted.
logins['user_id'].nunique()

8823

In [129]:
# Total `visited` per user, then split the user ids into those below three
# visits (`unadopt`) and those with at least three (`candidates`).
id_sums = logins.groupby('user_id')[['visited']].sum()
has_three_visits = id_sums['visited'] >= 3
unadopt = id_sums.index[~has_three_visits].values
candidates = id_sums.index[has_three_visits].values

In [135]:
def adoption(user):
    if user in unadopt:
        return 0
    else:
        dailys = logins[logins.user_id==user].set_index('time_stamp')[['visited']].resample('1d').sum()
        rolling7s = dailys.rolling(7).sum().dropna()
        greaterthan3 = rolling7s[rolling7s.visited>=3]
        if greaterthan3.size == 0:
            return 0
        else:
            for time_idx in range(len(greaterthan3)):
                start = greaterthan3.index[time_idx]-timedelta(days=7)
                end = greaterthan3.index[time_idx]
                span = dailys.loc[(dailys.index>start) & (dailys.index<=end)]
                sums = len(set(span.visited.cumsum().values))
                if sums<3:
                    return 0
                else: 
                    return 1

In [138]:
# Label every user with the 0/1 result of adoption(), stored in the
# 'adoption' column of `users`. adoption() is called once per object_id.
users['adoption'] = users.object_id.apply(adoption)

In [141]:
# List the ids of the users whose adoption flag is set.
adopted_mask = users['adoption'] >= 1
users.loc[adopted_mask, ['object_id', 'adoption']]

Unnamed: 0,object_id,adoption
1,2,1
9,10,1
19,20,1
32,33,1
41,42,1
...,...,...
11964,11965,1
11966,11967,1
11968,11969,1
11974,11975,1


In [80]:
# Scratch inspection: list the attributes available on a single index entry.
# NOTE(review): in the cells shown here `greaterthan3` is only assigned inside
# adoption(), so this cell appears to rely on leftover kernel state and would
# raise NameError after Restart & Run All — confirm, then remove or define it.
dir(greaterthan3.index[0])

['__add__',
 '__array_priority__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__radd__',
 '__reduce__',
 '__reduce_cython__',
 '__reduce_ex__',
 '__repr__',
 '__rsub__',
 '__setattr__',
 '__setstate__',
 '__setstate_cython__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__weakref__',
 '_date_repr',
 '_freq',
 '_freqstr',
 '_repr_base',
 '_round',
 '_set_freq',
 '_short_repr',
 '_time_repr',
 'asm8',
 'astimezone',
 'ceil',
 'combine',
 'ctime',
 'date',
 'day',
 'day_name',
 'day_of_week',
 'day_of_year',
 'dayofweek',
 'dayofyear',
 'days_in_month',
 'daysinmonth',
 'dst',
 'floor',
 'fold',
 'freq',
 'freqstr',
 'fromisocalendar',
 'fromisoformat',
 'fromordinal',
 'fromtimestamp',
 'hour',
 'is_leap_year',
 'is_month_end',
 'is_month_start',
 'is_

In [72]:
# Number of elements (.size) in the rows of `testing` where visited >= 3.
# NOTE(review): `testing` is not defined in any cell visible here; it looks
# like a leftover scratch variable — confirm where it comes from.
testing[testing.visited>=3].size

3

In [62]:
# Daily `visited` totals for user 2, printed without row truncation.
user2_daily = (
    logins.loc[logins['user_id'] == 2]
    .set_index('time_stamp')[['visited']]
    .resample('1d')
    .sum()
)
with pd.option_context('display.max_rows', None):
    print(user2_daily)

            visited
time_stamp         
2013-11-15        1
2013-11-16        0
2013-11-17        0
2013-11-18        0
2013-11-19        0
2013-11-20        0
2013-11-21        0
2013-11-22        0
2013-11-23        0
2013-11-24        0
2013-11-25        0
2013-11-26        0
2013-11-27        0
2013-11-28        0
2013-11-29        1
2013-11-30        0
2013-12-01        0
2013-12-02        0
2013-12-03        0
2013-12-04        0
2013-12-05        0
2013-12-06        0
2013-12-07        0
2013-12-08        0
2013-12-09        1
2013-12-10        0
2013-12-11        0
2013-12-12        0
2013-12-13        0
2013-12-14        0
2013-12-15        0
2013-12-16        0
2013-12-17        0
2013-12-18        0
2013-12-19        0
2013-12-20        0
2013-12-21        0
2013-12-22        0
2013-12-23        0
2013-12-24        0
2013-12-25        1
2013-12-26        0
2013-12-27        0
2013-12-28        0
2013-12-29        0
2013-12-30        0
2013-12-31        1
2014-01-01        0


In [61]:
# Print every row of `testing` with pandas' row-count display limit lifted.
# NOTE(review): `testing` is not defined in any cell visible here; it looks
# like a leftover scratch variable — confirm where it comes from.
with pd.option_context('display.max_rows', None,):
    print(testing)

            visited
time_stamp         
2013-11-21      1.0
2013-11-22      0.0
2013-11-23      0.0
2013-11-24      0.0
2013-11-25      0.0
2013-11-26      0.0
2013-11-27      0.0
2013-11-28      0.0
2013-11-29      1.0
2013-11-30      1.0
2013-12-01      1.0
2013-12-02      1.0
2013-12-03      1.0
2013-12-04      1.0
2013-12-05      1.0
2013-12-06      0.0
2013-12-07      0.0
2013-12-08      0.0
2013-12-09      1.0
2013-12-10      1.0
2013-12-11      1.0
2013-12-12      1.0
2013-12-13      1.0
2013-12-14      1.0
2013-12-15      1.0
2013-12-16      0.0
2013-12-17      0.0
2013-12-18      0.0
2013-12-19      0.0
2013-12-20      0.0
2013-12-21      0.0
2013-12-22      0.0
2013-12-23      0.0
2013-12-24      0.0
2013-12-25      1.0
2013-12-26      1.0
2013-12-27      1.0
2013-12-28      1.0
2013-12-29      1.0
2013-12-30      1.0
2013-12-31      2.0
2014-01-01      1.0
2014-01-02      1.0
2014-01-03      1.0
2014-01-04      1.0
2014-01-05      1.0
2014-01-06      1.0
2014-01-07      0.0
