In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline
plt.style.use('ggplot')

In [None]:
# Read both input CSVs: the user table (decoded with the latin codec instead of
# the default one) and the engagement log.
df = pd.read_csv('takehome_users.csv', encoding='latin')
engagement = pd.read_csv('takehome_user_engagement.csv')

In [None]:
# Preview the first rows of the engagement log.
engagement.head()

In [None]:
# Preview the first rows of the user table.
df.head()

In [None]:
# Summary statistics for the numeric columns of the user table.
df.describe()

The main useful information here is that relatively few members opted in to the mailing list (about 25%) or are enabled for the marketing drip (about 15%).

In [None]:
# Column dtypes and non-null counts for the engagement log.
engagement.info()

In [None]:
# Column dtypes and non-null counts for the user table (shows which columns have gaps).
df.info()

There are missing values for last_session_creation_time and for invited_by_user_id (who the user was invited by). There are no missing values in the engagement dataframe. It is also worth noting that last_session_creation_time appears to be in the wrong format.

### The plan
I will use the engagement data to identify users who log on at least 3 times in a seven-day period. Then, using those user_ids, I will use the main dataframe to identify features that predict whether a user will be active. Initial thoughts are that creation_source, org_id, opted_in_to_mailing_list, enabled_for_marketing_drip, and perhaps whether they were invited (not by whom, just whether they were) could be good predictors.

In [None]:
# A user with fewer than three logins in total can never have three logins in
# one week, so keep only users with at least MIN_LOGINS rows.
MIN_LOGINS = 3


def _has_enough_logins(user_rows):
    """Return True when a user's group of rows has at least MIN_LOGINS entries."""
    return len(user_rows) >= MIN_LOGINS


engage = engagement.groupby('user_id').filter(_has_enough_logins)


In [None]:
# Filtering left gaps in the row labels; renumber them 0..n-1 and discard the old labels.
engage = engage.reset_index(drop=True)

In [None]:
# Collect every user who has three logins inside some seven-day window.
#
# Rows are sorted by user and then by time, so the result does not depend on
# the row order of the CSV. For each login we look at the same user's next two
# logins; when fewer than two remain the shifted value is NaT, and comparisons
# against NaT are False. A login starts a qualifying window when the next login
# is strictly later than it and the one after that is strictly earlier than
# (login time + 7 days). Because the rows are time-sorted, those two checks
# also bound the other login on both sides.
logins = (
    engage.assign(time_stamp=pd.to_datetime(engage['time_stamp']))
          .sort_values(['user_id', 'time_stamp'])
)
per_user_times = logins.groupby('user_id')['time_stamp']
window_start = logins['time_stamp']
window_end = window_start + pd.Timedelta('7D')
second_login = per_user_times.shift(-1)
third_login = per_user_times.shift(-2)
in_window = (second_login > window_start) & (third_login < window_end)

# One entry per active user, in ascending user_id order.
active_users = logins.loc[in_window, 'user_id'].drop_duplicates().tolist()


In [None]:
# Number of users that met the active condition.
len(active_users)

In [None]:
# Binary target aligned row-for-row with the user table: 1.0 when the user's
# object_id appears in active_users, 0.0 otherwise.
# A membership test does not require df and active_users to share an ordering,
# and it still works when active_users is empty or contains ids absent from df.
# The Series is built from a plain array so it carries no name and can be given
# a column name when it is wrapped in a DataFrame.
y = pd.Series(
    df['object_id'].isin(active_users).to_numpy(dtype=float),
    index=df.index,
)
y.head()

In [None]:
# Wrap the label Series in a one-column frame named active_users.
df1 = y.to_frame(name='active_users')

In [None]:
# Append the active_users column to the user table, aligned on the row index.
df = pd.concat((df, df1), axis='columns')

In [None]:
# Preview the user table again now that the active_users column has been added.
df.head()

In [None]:
# Collapse invited_by_user_id into a binary "was invited" flag: 1.0 when an
# inviter id (>= 1) is present, 0.0 otherwise. Missing values compare False,
# so users without an inviter get 0.0.
# The flag is computed in one vectorized comparison and keeps df's own index,
# so the concat below lines up row-for-row whatever the index labels are.
inv = pd.Series(
    (df['invited_by_user_id'] >= 1).to_numpy(dtype=float),
    index=df.index,
)
df2 = pd.DataFrame(inv, columns=['invited'])
df = pd.concat([df, df2], axis=1)

In [None]:
# How many users fall into each value of the invited flag.
inv.value_counts()

Alright, we have identified the active users and flagged them in the dataframe in the column active_users, so we can now use that column to discover which features predict active users.

One thing to note here is that users with early last_session_creation_time values were active less frequently (see the plot below).

In [None]:
# Scatter the activity label against last_session_creation_time and overlay a fitted regression line.
sns.regplot(data=df, x='last_session_creation_time', y='active_users', fit_reg=True)

Last step is to turn the creation_source information into numbers so that sklearn/xgboost are happy.

In [None]:
# Number of users per creation_source category.
df['creation_source'].value_counts()

In [None]:
from sklearn import preprocessing

# Fit a label encoder on the fixed list of creation sources so each category
# string maps to an integer code.
creation_source_labels = [
    'ORG_INVITE',
    'GUEST_INVITE',
    'PERSONAL_PROJECTS',
    'SIGNUP',
    'SIGNUP_GOOGLE_AUTH',
]
le = preprocessing.LabelEncoder().fit(creation_source_labels)
le

In [None]:
# Encode each user's creation_source string as the integer code learned by the fitted encoder.
creation = le.transform(df['creation_source'])

In [None]:
# Put the encoded creation source into a one-column frame named creation.
df3 = pd.DataFrame({'creation': creation})

In [None]:
# Append the encoded creation column to the user table, aligned on the row index.
df = pd.concat((df, df3), axis='columns')

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Model inputs: the listed feature columns; target: the active_users label.
feature_columns = [
    'creation',
    'last_session_creation_time',
    'opted_in_to_mailing_list',
    'enabled_for_marketing_drip',
    'org_id',
]
X = df[feature_columns]
y = df['active_users']

In [None]:
# Hold out a third of the rows for evaluation; the fixed seed makes the split repeatable.
seed, test_size = 7, 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

# Train a gradient-boosted classifier with default settings on the training rows.
model = XGBClassifier().fit(X_train, y_train)
model

In [None]:
# Rank features by the booster's f-score and draw them as a bar chart, highest first.
fscores = model.get_booster().get_fscore()
feat_imp = pd.Series(fscores).sort_values(ascending=False)
ax = feat_imp.plot(kind='bar', title='Feature Importances')
ax.set_ylabel('Feature Importance Score')

In [None]:
# Predict labels for the held-out rows.
y_pred = model.predict(X_test)
# Round each prediction to the nearest integer label before scoring.
predictions = [round(value) for value in y_pred]

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Accuracy is hard to interpret on its own when one class dominates, so also
# report what always predicting the most common class would score.
baseline = y_test.value_counts(normalize=True).max()
print("Majority-class baseline: %.2f%%" % (baseline * 100.0))

## Conclusion
With little optimization this model can predict an active user with 92% accuracy. With that in mind, it appears that the most important features for making this prediction are the "last session creation time", the organization the user is from, the "creation source", and if they are in a marketing drip.