# Rec Sys 2015 Challenge's Data Exploration

## Reading the data

In [1]:
%matplotlib inline
%pylab inline
import pandas as pd
import numpy as np
import ggplot

Populating the interactive namespace from numpy and matplotlib


In [2]:
buys = pd.read_csv("/Users/ksankar/Data-Science/recSys/recsysChallenge2015/yoochoose-data/yoochoose-buys.dat", 
                   names=["session", "timestamp", "item", "price", "qty"], 
                   parse_dates=["timestamp"])

In [6]:
buys[:5]

Unnamed: 0,session,timestamp,item,price,qty
0,420374,2014-04-06 18:44:58.314000,214537888,12462,1
1,420374,2014-04-06 18:44:58.325000,214537850,10471,1
2,281626,2014-04-06 09:40:13.032000,214535653,1883,1
3,420368,2014-04-04 06:13:28.848000,214530572,6073,1
4,420368,2014-04-04 06:13:28.858000,214835025,2617,1


In [7]:
def read_clicks(filename):
    return pd.read_csv(filename, 
                     names=["session", "timestamp", "item", "category"], 
                     parse_dates=["timestamp"],
                     converters={"category": lambda c: -1 if c == "S" else c})
clicks = read_clicks("/Users/ksankar/Data-Science/recSys/recsysChallenge2015/yoochoose-data/yoochoose-clicks.dat")
test = read_clicks("/Users/ksankar/Data-Science/recSys/recsysChallenge2015/yoochoose-data/yoochoose-test.dat")

In [5]:
clicks[:5]

Unnamed: 0,session,timestamp,item,category
0,1,2014-04-07 10:51:09.277000,214536502,0
1,1,2014-04-07 10:54:09.868000,214536500,0
2,1,2014-04-07 10:54:46.998000,214536506,0
3,1,2014-04-07 10:57:00.306000,214577561,0
4,2,2014-04-07 13:56:37.614000,214662742,0


## Data analysis

### Confirm that all datasets are from same time range (2014-04-01 to 2014-09-30)

In [9]:
clicks["timestamp"].min(), clicks["timestamp"].max()

(Timestamp('2014-04-01 03:00:00.124000'),
 Timestamp('2014-09-30 02:59:59.430000'))

In [10]:
buys["timestamp"].min(), buys["timestamp"].max()

(Timestamp('2014-04-01 03:05:31.743000'),
 Timestamp('2014-09-30 02:35:12.859000'))

In [11]:
test["timestamp"].min(), test["timestamp"].max()

(Timestamp('2014-04-01 03:00:08.250000'),
 Timestamp('2014-09-30 02:59:23.866000'))

In [12]:
# Test size percentage
test.shape[0]/float(clicks.shape[0])

0.25002439102429697

### Basic statistics

In [13]:
# Number of clicks
clicks.shape[0]

33003944

In [14]:
# Number of items bought
buys.shape[0]

1150753

In [15]:
# Number of unique sessions
clicks["session"].unique().size

9249729

In [16]:
# Number of sessions with buys
buys["session"].unique().size

509696

In [17]:
# Percentage of sessions with buys
buys["session"].unique().size / float(clicks["session"].unique().size)

0.05510388466516154

In [18]:
# Number of unique items
clicks["item"].unique().size

52739

In [19]:
# Number of unique items bought
buys["item"].unique().size

19949

In [20]:
# Number of categories
clicks["category"].unique().size

339

In [21]:
# Avg. number of categories per item
clicks.groupby(["item","category"]).count()["session"].size / float(buys["item"].unique().size)

5.018296656474009

In [22]:
# Items bought per session (only sessions with items bought)
buys[["session","item"]].groupby("session").count().describe()

Unnamed: 0,item
count,509696.0
mean,2.257724
std,1.933342
min,1.0
25%,1.0
50%,2.0
75%,3.0
max,144.0


In [23]:
# Clicks per session
clicks[["session","item"]].groupby("session").count().describe()

Unnamed: 0,item
count,9249729.0
mean,3.568098
std,3.78752
min,1.0
25%,2.0
50%,2.0
75%,4.0
max,200.0


In [24]:
# Session duration
timestamp_by_session = clicks[["session", "timestamp"]].groupby("session")
mints = timestamp_by_session.min()
maxts = timestamp_by_session.max()
duration = maxts - mints
duration.describe()

Unnamed: 0,timestamp
count,9249729
mean,0 days 00:06:21.633378
std,0 days 00:12:36.645009
min,0 days 00:00:00
25%,0 days 00:00:32.214000
50%,0 days 00:02:07.939000
75%,0 days 00:06:24.132000
max,2 days 20:18:57.996000


## Clicks x Buys

In [25]:
session_item_buys = buys[["session", "item", "qty"]].groupby(["session", "item"]).sum()
session_item_buys = session_item_buys["qty"]
session_item_buys = session_item_buys.to_frame("bought")

In [26]:
session_item_clicks = clicks[["session", "item", "timestamp"]].groupby(["session", "item"]).count()
session_item_clicks = session_item_clicks["timestamp"]
session_item_clicks = session_item_clicks.to_frame(name="clicks")

In [27]:
session_items = pd.merge(session_item_clicks, session_item_buys, how='outer', left_index=True, right_index=True)
session_items.fillna(0, inplace=True)
session_items[:5]

Unnamed: 0_level_0,Unnamed: 1_level_0,clicks,bought
session,item,Unnamed: 2_level_1,Unnamed: 3_level_1
1,214536500,1,0
1,214536502,1,0
1,214536506,1,0
1,214577561,1,0
2,214551617,1,0


In [28]:
# All items bought had at least one click?
session_items[session_items["clicks"] == 0].count()

clicks    0
bought    0
dtype: int64

In [29]:
session_items.describe()

Unnamed: 0,clicks,bought
count,26615576.0,26615576.0
mean,1.240024,0.027934
std,0.661383,0.31007
min,1.0,0.0
25%,1.0,0.0
50%,1.0,0.0
75%,1.0,0.0
max,200.0,140.0


In [30]:
# More clicks result on more buys?
session_items["clicks"].corr(session_items["bought"])

0.10215779997428075

## Plots

In [36]:
# some display options to make figures bigger
pd.set_option('display.max_columns', 15)
pd.set_option('display.width', 400)
pd.set_option('display.mpl_style', 'default')
#rcParams['figure.figsize'] = (14, 7)
import matplotlib
font = {'family' : 'Helvetica',
        'weight' : 'bold',
        'size'   : 22}

matplotlib.rc('font', **font)

In [38]:
#Buys and clicks per day
buys_grouped_dates = buys["timestamp"].groupby(buys["timestamp"].apply(lambda x : x.date()))
buys_by_date = buys_grouped_dates.count()
buys_by_date = buys_by_date.to_frame(name="buys")

clicks_grouped_dates = clicks["timestamp"].groupby(clicks["timestamp"].apply(lambda x : x.date()))
clicks_by_date = clicks_grouped_dates.count()
clicks_by_date = clicks_by_date.to_frame(name="clicks")

buys_and_click_by_date = pd.merge(buys_by_date, clicks_by_date, how='outer', left_index=True, right_index=True)
buys_and_click_by_date.fillna(0, inplace=True)


#buys_and_click_by_date.plot()

NameError: name 'qplot' is not defined

In [41]:
import ggplot2
qplot(buys_and_click_by_date)

ImportError: No module named ggplot2