In [1]:
"""
There are six files in all: train.csv, test.csv, users.csv, 
user_friends.csv, events.csv, and event_attendees.csv.

train.csv has six columns:  user, event, invited, timestamp, 
interested, and not_interested.  test.csv contains the same 
columns as train.csv, except for interested and not_interested. 
Each row corresponds to an event that was shown to a user in 
our application.  event is an id identifying an event in our system.  
user is an id representing a user in our system.  invited is a 
binary variable indicating whether the user has been invited to 
the event. timestamp is an ISO-8601 UTC time string representing 
the approximate time (+/- 2 hours) when the user saw the event in 
our application. interested is a binary variable indicating whether 
a user clicked on the "Interested" button for this event; it is 1 
if the user clicked Interested and 0 if the user did not click the 
button.  Similarly, not_interested is a binary variable indicating 
whether a user clicked on the "Not Interested" button for this event; 
it is 1 if the user clicked the button and 0 if not.  It is possible 
that the user saw an event and clicked neither Interested nor Not 
Interested, and hence there are rows that contain 0,0 as values for 
interested,not_interested.

users.csv contains demographic data about some of our users 
(including all of the users appearing in the train and test files), 
and it has the following columns: user_id, locale, birthyear, 
gender, joinedAt, location, and timezone. user_id is the id of 
the user in our system.  locale is a string representing the 
user's locale, which should be of the form language_territory. 
birthyear is a 4-digit integer representing the year when the user 
was born. gender is either male or female, depending on the user's 
gender.  joinedAt is an ISO-8601 UTC time string representing when 
the user first used our application.  location is a string 
representing the user's location (if known).  timezone is a 
signed integer representing the user's UTC offset (in minutes).

user_friends.csv contains social data about each user, and contains 
two columns:  user and friends.  user is the user's id in our system, 
and friends is a space-delimited list of the user's friends' ids.

events.csv contains data about events in our system, and has 110 
columns.  The first nine columns are event_id, user_id, start_time, 
city, state, zip, country, lat, and lng.  event_id is the id of 
the event, and user_id is the id of the user who created the event.  
city, state, zip, and country represent more details about the 
location of the venue (if known).  lat and lng are floats 
representing the latitude and longitude coordinates of the venue, 
rounded to three decimal places.  start_time is the ISO-8601 UTC 
time string representing when the event is scheduled to begin.  
The last 101 columns require a bit more explanation; first, we 
determined the 100 most common word stems (obtained via Porter 
Stemming) occurring in the name or description of a large random 
subset of our events.  The last 101 columns are count_1, count_2, 
..., count_100, count_other, where count_N is an integer representing 
the number of times the Nth most common word stem appears in the 
name or description of this event.  count_other is a count of the 
rest of the words whose stem wasn't one of the 100 most common stems.

event_attendees.csv contains information about which users attended 
various events, and has the following columns: event_id, yes, maybe, 
invited, and no. event_id identifies the event. yes, maybe, invited, 
and no are space-delimited lists of user ids representing users who 
indicated that they were going, maybe going, invited to, or not going 
to the event.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import datetime as dt

In [2]:
# Load the six competition CSVs from one dated snapshot directory.
# The directory is named once so switching snapshots is a one-line change.
DATA_DIR = './data/30Nov2020'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
users_df = pd.read_csv(f'{DATA_DIR}/users.csv')
user_friends_df = pd.read_csv(f'{DATA_DIR}/user_friends.csv')
events_df = pd.read_csv(f'{DATA_DIR}/events.csv')
event_attendees_df = pd.read_csv(f'{DATA_DIR}/event_attendees.csv')

In [None]:
# Show three randomly chosen rows of the training frame (unseeded, so they differ per run).
train_df.sample(3)

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Per-column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Show three randomly chosen rows of the test frame (unseeded, so they differ per run).
test_df.sample(3)

In [None]:
# (rows, columns) of the test frame.
test_df.shape

In [None]:
# Show three randomly chosen rows of the users frame (unseeded, so they differ per run).
users_df.sample(3)

In [None]:
# (rows, columns) of the users frame.
users_df.shape

In [None]:
# Show three randomly chosen rows of the user-friends frame (unseeded, so they differ per run).
user_friends_df.sample(3)

In [None]:
# (rows, columns) of the user-friends frame.
user_friends_df.shape

In [None]:
# Show three randomly chosen rows of the events frame (unseeded, so they differ per run).
events_df.sample(3)

In [None]:
# (rows, columns) of the events frame.
events_df.shape

In [None]:
# Show three randomly chosen rows of the event-attendees frame (unseeded, so they differ per run).
event_attendees_df.sample(3)

In [None]:
# (rows, columns) of the event-attendees frame.
event_attendees_df.shape

### Random Forest Classifier

#### Balance Data Set

In [3]:
# Count rows per class of the `interested` target to see how imbalanced it is.
train_df.interested.value_counts()

0    11267
1     4131
Name: interested, dtype: int64

In [4]:
# Count rows per class of the `not_interested` target to see how imbalanced it is.
train_df.not_interested.value_counts()

0    14884
1      514
Name: not_interested, dtype: int64

In [5]:
# learning from: https://elitedatascience.com/imbalanced-classes
from sklearn.utils import resample

In [28]:
# Split the training rows by class for each target so the minority class
# (value 1) can be oversampled up to the size of the majority class (value 0).
X1_major = train_df[train_df.interested == 0]
X1_minor = train_df[train_df.interested == 1]

X2_major = train_df[train_df.not_interested == 0]
X2_minor = train_df[train_df.not_interested == 1]

# Draw minority rows with replacement until both classes have the same size.
# The target size is read from the majority frame instead of being hard-coded,
# so the cell stays correct if the training data changes.
X1_minor_upsampled = resample(X1_minor
                              , replace = True
                              , n_samples = len(X1_major)
                              , random_state = 42
                             )

X2_minor_upsampled = resample(X2_minor
                              , replace = True
                              , n_samples = len(X2_major)
                              , random_state = 42
                             )

# NOTE(review): the upsampling is done before any train/test split, so copies
# of the same minority row can end up on both sides of a later split --
# confirm that is acceptable for the evaluation.
X1_upsampled = pd.concat([X1_major, X1_minor_upsampled])
X2_upsampled = pd.concat([X2_major, X2_minor_upsampled])

In [29]:
# Display the column index of both upsampled frames side by side.
X1_upsampled.columns, X2_upsampled.columns

(Index(['user', 'event', 'invited', 'timestamp', 'interested',
        'not_interested'],
       dtype='object'),
 Index(['user', 'event', 'invited', 'timestamp', 'interested',
        'not_interested'],
       dtype='object'))

In [30]:
# Class counts of `interested` after upsampling.
X1_upsampled.interested.value_counts()

1    11267
0    11267
Name: interested, dtype: int64

In [31]:
# Class counts of `not_interested` after upsampling.
X2_upsampled.not_interested.value_counts()

1    14884
0    14884
Name: not_interested, dtype: int64

In [32]:
# Shuffle using sample(frac=1), which returns every row in random order.
# Seeded so a rerun produces the same row order; the fresh 0..n-1 index
# replaces the shuffled original labels.
X1_upsampled = X1_upsampled.sample(frac=1, random_state=42).reset_index(drop=True)
X2_upsampled = X2_upsampled.sample(frac=1, random_state=42).reset_index(drop=True)

In [33]:
# shuffle using sklearn.utils.shuffle()
from sklearn.utils import shuffle

# Seeded so a rerun produces the same row order. The index is rebuilt by
# reassignment rather than inplace=True, so each name is bound exactly once.
X1_upsampled = shuffle(X1_upsampled, random_state=42).reset_index(drop=True)
X2_upsampled = shuffle(X2_upsampled, random_state=42).reset_index(drop=True)

In [34]:
# First five rows of the shuffled, upsampled `interested` frame.
X1_upsampled.head()

Unnamed: 0,user,event,invited,timestamp,interested,not_interested
0,2033574675,952519340,0,2012-10-30 14:02:19.873000+00:00,1,0
1,1107615001,173949238,1,2012-11-19 08:11:35.202000+00:00,0,0
2,988160405,1203717384,0,2012-11-01 10:22:21.125000+00:00,1,0
3,2643058163,2790657295,0,2012-11-12 05:22:27.681000+00:00,1,0
4,2116203935,2867959207,0,2012-11-10 02:49:12.032000+00:00,1,0


In [35]:
from sklearn.model_selection import train_test_split

In [36]:
# Hold out 10% of each upsampled frame. Both target columns are removed from
# the features; the label is the target that the frame was balanced on.
_label_cols = ['interested', 'not_interested']
_split_opts = dict(test_size=0.1, random_state=42)

X1_features = X1_upsampled.drop(columns=_label_cols)
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1_features, X1_upsampled.interested, **_split_opts
)

X2_features = X2_upsampled.drop(columns=_label_cols)
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2_features, X2_upsampled.not_interested, **_split_opts
)

In [None]:
# Per-column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Replace the timestamp strings with integer day ordinals
# (datetime.toordinal), which drops the time of day.
# The dtype guard makes the cell safe to re-run: feeding the integer ordinals
# back through pd.to_datetime would reinterpret them as epoch offsets.
# Plain column assignment is used because `.loc[:, col] = ...` writes in place
# on recent pandas and would keep the old object dtype.
if not pd.api.types.is_numeric_dtype(train_df['timestamp']):
    train_df['timestamp'] = pd.to_datetime(train_df['timestamp']).map(dt.datetime.toordinal)

In [None]:
# Per-column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Distinct values present in each of the two target columns.
train_df.interested.unique(), train_df.not_interested.unique()

In [None]:
# Feature matrix: every training column except the two targets.
# Independent copies are taken so later edits to X / y1 / y2 do not touch train_df.
_target_cols = ['interested', 'not_interested']
X = train_df.drop(columns=_target_cols).copy()
y1, y2 = (train_df[col].copy() for col in _target_cols)

In [None]:
# Show three randomly chosen rows of the feature matrix (unseeded, so they differ per run).
X.sample(3)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# One forest per target: rfc1 predicts `interested`, rfc2 predicts `not_interested`.
# Seeded so refitting on the same data reproduces the same model.
# NOTE(review): both models are fitted on X built from train_df, not on the
# upsampled / split frames prepared earlier -- confirm that is intended.
rfc1 = RandomForestClassifier(random_state=42)
rfc1.fit(X, y1)
rfc2 = RandomForestClassifier(random_state=42)
rfc2.fit(X, y2)

### Random Forest Classifier Metrics

In [None]:
# Column 1 of predict_proba for each fitted forest, evaluated on X.
y1_probs, y2_probs = (clf.predict_proba(X)[:, 1] for clf in (rfc1, rfc2))

In [None]:
# Print the first ten probabilities from each model, one array per line.
for _probs in (y1_probs, y2_probs):
    print(_probs[:10])

In [None]:
from sklearn.model_selection import cross_val_score

# Mean 7-fold cross-validated accuracy for each target.
# NOTE(review): accuracy can look high on a heavily imbalanced target simply
# by predicting the majority class -- consider a second metric.
cval1 = cross_val_score(rfc1
                       , X
                       , y1
                       , scoring='accuracy'
                       , cv=7
                      )

print(f'Cross-Validation Score for interested: {np.mean(cval1)}')

cval2 = cross_val_score(rfc2
                       , X
                       , y2
                       , scoring='accuracy'
                       , cv=7
                      )

# cval2 scores rfc2 against y2, so it is labelled as the not_interested target.
print(f'Cross-Validation Score for not_interested: {np.mean(cval2)}')

In [None]:
from sklearn.metrics import (roc_auc_score
                             , precision_score
                             , recall_score
                             , roc_curve
                             , confusion_matrix
                             , plot_confusion_matrix
                             , precision_recall_curve
                             , auc
                            )

In [None]:
# Area under the ROC curve for the `interested` model.
# NOTE(review): y1_probs come from rfc1 scoring the same X it was fitted on,
# so this is an in-sample score -- confirm whether a held-out score is wanted.
roc_score = roc_auc_score(y1, y1_probs)
roc_score

### Make Prediction

In [None]:
# Column names of the test frame.
test_df.columns

In [None]:
# Replace the timestamp strings with integer day ordinals
# (datetime.toordinal), which drops the time of day.
# The dtype guard makes the cell safe to re-run: feeding the integer ordinals
# back through pd.to_datetime would reinterpret them as epoch offsets.
# Plain column assignment is used because `.loc[:, col] = ...` writes in place
# on recent pandas and would keep the old object dtype.
if not pd.api.types.is_numeric_dtype(test_df['timestamp']):
    test_df['timestamp'] = pd.to_datetime(test_df['timestamp']).map(dt.datetime.toordinal)

In [None]:
# Snapshot the feature columns first: the prediction columns are written back
# onto test_df, and neither model should receive them as input.
test_df_rfc1 = test_df.copy()
test_df_rfc2 = test_df.copy()

# rfc1 is the model fitted on `interested`, rfc2 the one fitted on `not_interested`.
test_df['interested'] = rfc1.predict(test_df_rfc1)
test_df['not_interested'] = rfc2.predict(test_df_rfc2)

In [None]:
# Show five randomly chosen rows of the test frame (unseeded, so they differ per run).
test_df.sample(5)