In [51]:
import pandas as pd
import numpy as np
from statistics import mean
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta 
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
import chart_studio.plotly as py
import cufflinks as cf
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
cf.go_offline()

## takehome_users
- `name`: the user's name
- `object_id`: the user's id
- `email`: email address
- `creation_source`: how their account was created. This takes on one of 5 values:
 - PERSONAL_PROJECTS: invited to join another user's personal workspace
 - GUEST_INVITE: invited to an organization as a guest (limited permissions)
 - ORG_INVITE: invited to an organization (as a full member)
 - SIGNUP: signed up via the website
 - SIGNUP_GOOGLE_AUTH: signed up using Google Authentication (using a Google email account for their login id)
- `creation_time`: when they created their account
- `last_session_creation_time`: unix timestamp of last login (total seconds since Jan, 1 1970)
- `opted_in_to_mailing_list`: whether they have opted into receiving marketing emails
- `enabled_for_marketing_drip`: whether they are on the regular marketing email drip
- `org_id`: the organization (group of users) they belong to
- `invited_by_user_id`: which user invited them to join (if applicable)

## takehome_user_engagement 
Has a row for each day that a user logged into the product.

Defining an `adopted_user` as a user who has logged into the product on <ins>three separate days</ins> in at least <ins>one sevenday period</ins> 

Goal:
<ins>__Identify which factors predict future user adoption.__</ins>

## Comments
- I'm assuming that 3 seperate days refers to seperate weekdays and not atleast 24hrs gaps in betweenn each e.g. logging in at 11:59pm and then at 12:01am are logins on 2 seperate days. The seven day period applies in the same manner. 

In [2]:
# pd.read_csv('takehome_users.csv', encoding='ISO-8859-1')

In [3]:
users = pd.read_csv('takehome_users.csv').drop(columns=['Unnamed: 0'], axis=1)
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [4]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [5]:
logins = pd.read_csv('takehome_user_engagement.csv')
logins.time_stamp = pd.to_datetime(logins.time_stamp)
logins = logins.sort_values(['user_id', 'time_stamp'] )
logins.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [6]:
logins.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   time_stamp  207917 non-null  datetime64[ns]
 1   user_id     207917 non-null  int64         
 2   visited     207917 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 6.3 MB


In [7]:
# check number of candidates for adoption
len(set(logins.user_id))

8823

In [8]:
dailys = logins[logins.user_id==2].set_index('time_stamp')[['visited']].resample('1d').mean().dropna()
dailys.index[8]-dailys.index[6]

Timedelta('6 days 00:00:00')

In [9]:
# Seperate users who don't have atleast 3 logins
id_sums = logins[['user_id', 'visited']].groupby('user_id').sum()
unadopt = id_sums.loc[id_sums.visited<3].index.values #users who don't have atleast 3 logins
candidates = id_sums.loc[id_sums.visited>=3].index.values

In [145]:
def adoption(user):
    if user in unadopt:
        return 0
    else:
        dailys = logins[logins.user_id==user].set_index('time_stamp')[['visited']].resample('1d').sum()
        rolling7s = dailys.rolling(7).sum().dropna()
        greaterthan3 = rolling7s[rolling7s.visited>=3]
        if greaterthan3.size == 0:
            return 0
        else:
            for time_idx in range(len(greaterthan3)):
                start = greaterthan3.index[time_idx]-timedelta(days=7)
                end = greaterthan3.index[time_idx]
                span = dailys.loc[(dailys.index>start) & (dailys.index<=end)]
                sums = len(set(span.visited.cumsum().values))
                if sums>=3:
                    return 1
                else: 
                    continue
            return 0 

In [65]:
users['adoption'] = users.object_id.apply(adoption)
len(users[users.adoption==1])

1597

In [144]:
# users.to_csv('adoption.csv')

In [17]:
# dir(greaterthan3.index[0])

In [72]:
testing[testing.visited>=3].size

3

In [None]:
# with pd.option_context('display.max_rows', None,):
#     print(logins[logins.user_id==2].set_index('time_stamp')[['visited']].resample('1d').sum())

In [None]:
# with pd.option_context('display.max_rows', None,):
#     print(testing)

In [15]:
data = pd.read_csv('adoption.csv').drop(columns=['Unnamed: 0'], axis=1)
data.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adoption
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
 10  adoption                    12000 non-null  int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 1.0+ MB


In [24]:
data.invited_by_user_id=data.invited_by_user_id.fillna(0).astype('int64')

In [27]:
data['invited']= np.where(data.invited_by_user_id==0, 0, 1)

In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          12000 non-null  int64  
 10  adoption                    12000 non-null  int64  
 11  invited                     12000 non-null  int32  
dtypes: float64(1), int32(1), int64(6), object(4)
memory usage: 1.1+ MB


In [30]:
data.describe()

Unnamed: 0,object_id,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adoption,invited
count,12000.0,8823.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0
mean,6000.5,1379279000.0,0.2495,0.149333,141.884583,3188.691333,0.133083,0.53475
std,3464.24595,19531160.0,0.432742,0.356432,124.056723,3869.027693,0.339679,0.498812
min,1.0,1338452000.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3000.75,1363195000.0,0.0,0.0,29.0,0.0,0.0,0.0
50%,6000.5,1382888000.0,0.0,0.0,108.0,875.0,0.0,1.0
75%,9000.25,1398443000.0,0.0,0.0,238.25,6317.0,0.0,1.0
max,12000.0,1402067000.0,1.0,1.0,416.0,11999.0,1.0,1.0


Nothing suspicious 

In [33]:
px.imshow(data.drop(columns=['last_session_creation_time', 'invited_by_user_id']).corr())

In [48]:
# independent var
X = data[['opted_in_to_mailing_list','enabled_for_marketing_drip', 'org_id', 'invited']]

# target
y = data.adoption

In [49]:
needs_encoding = data[['creation_source']]
encode = OneHotEncoder()
array_encode = encode.fit_transform(needs_encoding).toarray()
df_encode = pd.DataFrame(array_encode, columns = encode.get_feature_names())
df_encode.columns= ['GUEST_INVITE', 'ORG_INVITE', 'PERSONAL_PROJECTS', 'SIGNUP', 'SIGNUP_GOOGLE_AUTH']

# Join new features 
X = X.join(df_encode)

In [50]:
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

x_train.shape, y_train.shape, x_test.shape, y_test.shape

((9000, 9), (9000,), (3000, 9), (3000,))

In [54]:
# Random Forest
steps = [('scaler', StandardScaler()), 
         ('rf', RandomForestClassifier(random_state = 0))]

pipe = Pipeline(steps)

params = {'rf__n_estimators':[int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]}

gs = GridSearchCV(pipe, params, cv=5).fit(x_train, y_train)

y_pred = gs.predict(x_test)

labels = ['Not-adopted', 'adopted']
print(classification_report(y_test, y_pred, target_names = labels, digits = 5))
pd.DataFrame(confusion_matrix(y_test, y_pred), index=labels, columns=labels)

              precision    recall  f1-score   support

 Not-adopted    0.87264   0.93752   0.90392      2609
     adopted    0.17259   0.08696   0.11565       391

    accuracy                        0.82667      3000
   macro avg    0.52261   0.51224   0.50978      3000
weighted avg    0.78140   0.82667   0.80118      3000



Unnamed: 0,Not-adopted,adopted
Not-adopted,2446,163
adopted,357,34


In [58]:
coefficients = pd.DataFrame()
coefficients['features'] = X.columns
coefficients['coefficients'] = np.transpose(gs.best_estimator_[1].feature_importances_).round(3)
fig = px.bar(coefficients, x='features', y='coefficients', text = 'coefficients')
fig.update_traces(textposition='outside')

Well...There you have it haha