# Imports

In [1]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load in Citizen Science Data

In [2]:
# Load the click log from the notebook's working directory. The file has no
# header row, so the two column names are supplied explicitly.
# (The former empty '' path segment was a no-op in os.path.join and is dropped.)
data_path = os.path.join(os.getcwd(), 'citizenscience.csv')
citsci = pd.read_csv(data_path, delimiter=',', header=None, names=['user_id', 'timestamp'])

In [3]:
# Parse the timestamp column into pandas datetimes.
parsed_timestamps = pd.to_datetime(citsci['timestamp'])
citsci['timestamp'] = parsed_timestamps

In [4]:
# Summary of both columns: row counts, distinct values and most frequent entries.
citsci.describe()

Unnamed: 0,user_id,timestamp
count,99999,99999
unique,943,13370
top,5050a6760454e27aae0026db,2014-12-29 21:38:00
freq,5531,69
first,,2014-12-23 16:53:00
last,,2015-01-05 17:33:00


In [5]:
# Check the column dtypes after the datetime conversion.
citsci.dtypes

user_id              object
timestamp    datetime64[ns]
dtype: object

In [6]:
# Peek at the first five rows.
citsci.head()

Unnamed: 0,user_id,timestamp
0,54999c8d9cd118282b001784,2014-12-23 16:53:00
1,54999c91bb7b56040d0011e8,2014-12-23 16:53:00
2,54999c8fbb7b565d7e000a7c,2014-12-23 16:53:00
3,54999cb29cd1184d77000539,2014-12-23 16:53:00
4,54068fab91ad6b597e000f97,2014-12-23 16:53:00


In [7]:
# Order the rows by user, then chronologically within each user.
sort_keys = ['user_id', 'timestamp']
citsci = citsci.sort_values(by=sort_keys)

# Group Records by User_Id

In [8]:
# For each user, keep the first and last timestamp in the frame's current row order.
timestamps_by_user = citsci.groupby('user_id')['timestamp']
grouped_users = timestamps_by_user.agg(['first', 'last'])

In [9]:
# Preview the first rows of the per-user first/last table.
grouped_users.head()

Unnamed: 0_level_0,first,last
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
4d5835cc2a82e278e6000028,2015-01-05 09:07:00,2015-01-05 15:52:00
4fcaa6daae60e44a1900000b,2015-01-01 03:36:00,2015-01-01 03:38:00
50085cfc44dff81137000020,2014-12-28 22:32:00,2015-01-01 20:55:00
5013adf30454e27ae4000002,2015-01-02 22:12:00,2015-01-02 22:12:00
503e2b5eba40af242e000052,2014-12-26 13:12:00,2014-12-29 17:24:00


In [10]:
# Time between each user's first and last recorded timestamp.
grouped_users['length'] = grouped_users['last'].sub(grouped_users['first'])

In [11]:
# Pick the user with the longest span between first and last activity.
# grouped_users is indexed by user_id, so idxmax() returns the id itself.
# The previous code stored a whole row via .iloc[0] and then overwrote it
# with a hard-coded id string.
longest_user_id = grouped_users['length'].idxmax()

# All rows belonging to that user.
longest_user = citsci[citsci['user_id'] == longest_user_id]
longest_user

Unnamed: 0,user_id,timestamp
12,508920379177d041ea003fd8,2014-12-23 16:53:00
16,508920379177d041ea003fd8,2014-12-23 16:53:00
21,508920379177d041ea003fd8,2014-12-23 16:53:00
44,508920379177d041ea003fd8,2014-12-23 16:54:00
612,508920379177d041ea003fd8,2014-12-23 17:10:00
698,508920379177d041ea003fd8,2014-12-23 17:25:00
7210,508920379177d041ea003fd8,2014-12-24 16:01:00
7211,508920379177d041ea003fd8,2014-12-24 16:01:00
7212,508920379177d041ea003fd8,2014-12-24 16:02:00
7213,508920379177d041ea003fd8,2014-12-24 16:02:00


## Group by User_Id, Timestamp

In [12]:
# Number of rows per (user, timestamp) pair.
user_timestamp_groups = citsci.groupby(by=['user_id', 'timestamp'])
click_freq = user_timestamp_groups.size()

In [13]:
# Per-timestamp counts for a single user (selects on the first index level).
click_freq.loc['508920379177d041ea003fd8']

timestamp
2014-12-23 16:53:00    3
2014-12-23 16:54:00    1
2014-12-23 17:10:00    1
2014-12-23 17:25:00    1
2014-12-24 16:01:00    2
2014-12-24 16:02:00    3
2014-12-26 15:52:00    2
2014-12-26 15:53:00    1
2014-12-26 15:54:00    4
2014-12-26 15:55:00    3
2014-12-26 15:57:00    1
2014-12-27 15:36:00    1
2014-12-27 15:37:00    2
2014-12-27 15:38:00    1
2014-12-27 15:55:00    3
2014-12-27 15:56:00    5
2014-12-27 15:57:00    1
2014-12-27 15:58:00    1
2014-12-27 16:01:00    2
2014-12-27 16:11:00    2
2014-12-27 16:12:00    1
2014-12-27 16:13:00    1
2014-12-27 16:14:00    4
2014-12-27 16:15:00    5
2014-12-27 16:24:00    4
2014-12-27 16:25:00    1
2014-12-27 16:34:00    3
2014-12-27 16:36:00    1
2014-12-29 15:22:00    5
2014-12-29 15:24:00    2
                      ..
2015-01-05 15:48:00    1
2015-01-05 16:00:00    1
2015-01-05 16:01:00    4
2015-01-05 16:04:00    1
2015-01-05 16:05:00    5
2015-01-05 16:06:00    3
2015-01-05 16:07:00    3
2015-01-05 16:09:00    1
2015-01-05 16:1

# Group into Session Data

In [14]:
# Sorted copy (user first, then time) so that consecutive rows can be compared.
session_sort_keys = ['user_id', 'timestamp']
sorted_citsci = citsci.sort_values(by=session_sort_keys)

In [15]:
# Flag rows that arrive more than 30 minutes after the previous row.
# pd.Timedelta replaces the deprecated pd.datetools.timedelta alias, which
# emitted a warning here and is not available in current pandas.
# The diff runs over the whole sorted frame, so it also compares across user
# boundaries; user changes are flagged separately by diff_user.
gt_30min = sorted_citsci['timestamp'].diff() > pd.Timedelta(minutes=30)

  """Entry point for launching an IPython kernel.


In [16]:
# True wherever a row's user differs from the user on the previous row.
previous_user = sorted_citsci['user_id'].shift()
diff_user = sorted_citsci['user_id'].ne(previous_user)

In [17]:
# A new session starts on a user change or on a gap of more than 30 minutes;
# the running count of those starts is the session number of each row.
starts_new_session = diff_user | gt_30min
session_id = starts_new_session.cumsum()

In [18]:
from string import ascii_uppercase

In [19]:
# Attach the session numbers as a new column; values are matched to rows by index label.
citsci = citsci.assign(session_id=session_id)

In [20]:
# Show only the first rows instead of rendering the entire frame.
citsci.head(10)

Unnamed: 0,user_id,timestamp,session_id
95970,4d5835cc2a82e278e6000028,2015-01-05 09:07:00,1
95971,4d5835cc2a82e278e6000028,2015-01-05 09:07:00,1
95972,4d5835cc2a82e278e6000028,2015-01-05 09:08:00,1
96058,4d5835cc2a82e278e6000028,2015-01-05 09:33:00,1
98712,4d5835cc2a82e278e6000028,2015-01-05 15:04:00,2
98728,4d5835cc2a82e278e6000028,2015-01-05 15:04:00,2
98735,4d5835cc2a82e278e6000028,2015-01-05 15:05:00,2
98745,4d5835cc2a82e278e6000028,2015-01-05 15:06:00,2
98747,4d5835cc2a82e278e6000028,2015-01-05 15:06:00,2
98762,4d5835cc2a82e278e6000028,2015-01-05 15:07:00,2


In [21]:
# Rows (with their session ids) for one specific user.
is_target_user = citsci['user_id'].eq('508920379177d041ea003fd8')
citsci[is_target_user]

Unnamed: 0,user_id,timestamp,session_id
12,508920379177d041ea003fd8,2014-12-23 16:53:00,301
16,508920379177d041ea003fd8,2014-12-23 16:53:00,301
21,508920379177d041ea003fd8,2014-12-23 16:53:00,301
44,508920379177d041ea003fd8,2014-12-23 16:54:00,301
612,508920379177d041ea003fd8,2014-12-23 17:10:00,301
698,508920379177d041ea003fd8,2014-12-23 17:25:00,301
7210,508920379177d041ea003fd8,2014-12-24 16:01:00,302
7211,508920379177d041ea003fd8,2014-12-24 16:01:00,302
7212,508920379177d041ea003fd8,2014-12-24 16:02:00,302
7213,508920379177d041ea003fd8,2014-12-24 16:02:00,302


# Add new columns to each HIT

### Add elapsed session duration field

In [23]:
# Elapsed time of each row measured from the first row of the same session.
session_start = citsci.groupby('session_id')['timestamp'].transform('first')
session_duration = citsci['timestamp'] - session_start
citsci['session_duration'] = session_duration

### Add time since last HIT field

In [25]:
# Gap between each row and the previous row of the same session.
# The first row of every session has no predecessor (NaT); fill it with a
# zero-length Timedelta rather than the integer 0 so the column stays a
# proper timedelta dtype on every pandas version.
idle_time = citsci.groupby('session_id')['timestamp'].diff().fillna(pd.Timedelta(0))
citsci['idle_time'] = idle_time