In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import re

## Data ingestion and pre-processing

In [2]:
response = pd.read_csv('response.csv')
page = pd.read_csv('pagelist.csv', header = 1)
record = pd.read_csv('recording.csv', header = 1)

In [3]:
# take useful cols
response.columns = ['Time', 'uniqueid', 'age', 'gender', 'education', 'freq']

In [4]:
response.drop(columns = 'Time', inplace=True)

In [5]:
response.sample(3)

Unnamed: 0,uniqueid,age,gender,education,freq
375,0.9102548047,25 ~ 34 years old,Male,Master’s degree,At least once a week
58,0.6002320745,18 ~ 24 years old,Female,Master’s degree,At least once a week
6,0.5810475076,18 ~ 24 years old,Female,Master’s degree,At least once a month


In [6]:
response.shape

(523, 5)

In [7]:
page

Unnamed: 0,displayUrl,uri,starred,title,views,visitTime,engagementTime,clicks,friction,renderTime,scroll,fold,height,size,totalSessions,entry,exit
0,/experiment/ab-login_button/end.html#onloadvar...,https://weijia.io/Experiment/AB-Login_Button/e...,False,End,4,5871,8055,1,0.500000,8,100,625,625,3612,0,0,0
1,/experiment/ab-login_button/a.html#onloadvaria...,https://weijia.io/Experiment/AB-Login_Button/a...,False,A,3,24572,26374,2,0.333333,120,100,625,766,10555,0,0,0
2,/experiment/ab-login_button/b.html#onloadvaria...,https://weijia.io/Experiment/AB-Login_Button/b...,False,B,3,27231,27233,37,3.666667,533,100,625,779,9051,0,0,0
3,/experiment/ab-login_button/b.html#onloadvaria...,https://weijia.io/Experiment/AB-Login_Button/b...,False,B,3,8292,8326,3,0.000000,958,86,609,765,5210,0,0,0
4,/experiment/ab-login_button/c.html#onloadvaria...,https://weijia.io/Experiment/AB-Login_Button/c...,False,C,3,15316,15180,79,1.666667,239,99,757,766,11059,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,/experiment/ab-login_button/end.html#onloadvar...,https://weijia.io/Experiment/AB-Login_Button/e...,False,End,1,20002,13564,2,0.000000,18,100,1716,1716,3539,0,0,0
377,/experiment/ab-login_button/end.html#onloadvar...,https://weijia.io/Experiment/AB-Login_Button/e...,False,End,1,61988,31537,1,1.000000,25,100,911,911,3612,0,0,0
378,/experiment/ab-login_button/end.html#onloadvar...,https://weijia.io/Experiment/AB-Login_Button/e...,False,End,1,104845,33962,0,1.000000,5,100,671,671,3539,0,0,0
379,/experiment/ab-login_button/end.html#onloadvar...,https://weijia.io/Experiment/AB-Login_Button/e...,False,End,1,50547,10500,1,1.000000,16,100,657,657,3612,0,0,0


In [8]:
page = page[['displayUrl', 'title', 'views', 'visitTime', 'engagementTime', 'clicks', 'friction', 
            'renderTime', 'scroll', 'fold', 'height', 'size']]

In [9]:
page.shape

(381, 12)

In [10]:
page.sample(3)

Unnamed: 0,displayUrl,title,views,visitTime,engagementTime,clicks,friction,renderTime,scroll,fold,height,size
288,/experiment/ab-login_button/end.html#onloadvar...,End,1,58987,17505,2,1.0,26,100,937,937,3612
331,/experiment/ab-login_button/end.html#onloadvar...,End,1,14092,9254,0,0.0,34,100,1857,1857,3539
203,/experiment/ab-login_button/d.html#onloadvaria...,D,1,60768,30023,2,1.0,4262,100,657,766,10530


In [11]:
record = record[['pageViews', 'pages', 'duration', 'country', 'browser', 'os', 'lng', 'lat']]

In [12]:
record.shape

(243, 8)

In [13]:
record.sample(3)

Unnamed: 0,pageViews,pages,duration,country,browser,os,lng,lat
34,/experiment/ab-login_button/b.html#onloadvaria...,1,4976,in,Chrome,Windows,80.2209,12.8996
85,/experiment/ab-login_button/b.html#onloadvaria...,2,61687,us,Chrome,Windows,-96.805,32.7767
129,/experiment/ab-login_button/d.html#onloadvaria...,2,121054,us,Firefox,Windows,-81.6492,30.3298


## Extract unique ID

In [14]:
page['displayUrl'].sample(3)
page.rename(columns = {'displayUrl': 'uniqueid'}, inplace= True )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [15]:
# extract unique id from the URL
page['uniqueid'] = page['uniqueid'].str.split('=', 1).str[1]
page['uniqueid'] = page['uniqueid'].str.split('&', 1).str[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  page['uniqueid'] = page['uniqueid'].str.split('=', 1).str[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  page['uniqueid'] = page['uniqueid'].str.split('&', 1).str[0]


In [16]:
page.uniqueid

0        0.548908298878352
1      0.04412466234962431
2       0.3753349811633586
3      0.43052179002676416
4       0.5306845270199696
              ...         
376     0.9737659669854349
377     0.9766367046439439
378     0.9792503676698106
379      0.981891829233072
380     0.9820805168612354
Name: uniqueid, Length: 381, dtype: object

In [17]:
record['pageViews'].sample(3)

150    /experiment/ab-login_button/d.html#onloadvaria...
108    /experiment/ab-login_button/d.html#onloadvaria...
76     /experiment/ab-login_button/d.html#onloadvaria...
Name: pageViews, dtype: object

In [18]:
# extract records unique id from URL
record['pageViews'] = record['pageViews'].str.split('=', 1).str[1]
record['pageViews'] = record['pageViews'].str.split('&', 1).str[0]

In [19]:
record.rename(columns = {'pageViews': 'uniqueid'}, inplace= True )
record['uniqueid'].sample(3)

217    0.13937094090353797
46     0.25773791551994374
95       0.981891829233072
Name: uniqueid, dtype: object

In [20]:
response.rename(columns = {'What is your unique id in this page (the red number)?': 'uniqueid'}, inplace= True )

In [21]:
response['uniqueid']

0      0.1942976512
1       0.590570241
2       0.441330743
3      0.2285685669
4      0.3671576493
           ...     
518    0.2631959072
519    0.2312231593
520    0.1953285861
521    0.5690302475
522    0.6059561764
Name: uniqueid, Length: 523, dtype: object

## Merge Dataframe on unique ID

In [22]:
response.sample(3)

Unnamed: 0,uniqueid,age,gender,education,freq
237,0.3795034403,25 ~ 34 years old,Male,Bachelor’s degree,At least once a week
473,0.9788482935,18 ~ 24 years old,Male,Bachelor’s degree,At least once a week
47,0.02503611548,18 ~ 24 years old,Male,Bachelor’s degree,At least once a month


In [23]:
page.sample(3)

Unnamed: 0,uniqueid,title,views,visitTime,engagementTime,clicks,friction,renderTime,scroll,fold,height,size
143,0.5095627197644916,C,1,30240,25050,9,1.0,849,100,657,766,6623
298,0.3259801876578776,End,1,31429,8805,0,1.0,3,100,852,852,6564
73,0.2312231592644822,A,1,2633,2633,1,0.0,9,85,657,766,0


In [24]:
record.head(3)

Unnamed: 0,uniqueid,pages,duration,country,browser,os,lng,lat
0,0.3799766860403608,1,10016,us,Chrome,Windows,-73.9183,40.8205
1,0.4754048878111701,2,26938,in,Chrome,Windows,76.2157,10.516
2,0.1786541608534397,1,11486,us,WeChat mobile App,Android,-122.0289,37.5337


In [25]:
for i in range(len(record)):
    record.iloc[i,0] = record.iloc[i,0][0:11]

In [26]:
for i in range(len(page)):
    page.iloc[i,0] = page.iloc[i,0][0:11]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [32]:
for i in range(len(response)):
    response.iloc[i,0] = response.iloc[i,0][0:11]

In [33]:
df = pd.merge(record, page, how = 'inner', on = 'uniqueid', copy = False)

In [38]:
df.shape

(361, 19)

In [35]:
df2 = pd.merge(df, response, how = 'inner', on = 'uniqueid', copy = False)

In [39]:
df2.shape

(282, 23)

In [41]:
df2.sample(10)

Unnamed: 0,uniqueid,pages,duration,country,browser,os,lng,lat,title,views,...,friction,renderTime,scroll,fold,height,size,age,gender,education,freq
152,0.326262847,1,5133,us,Chrome,Windows,-124.2131,43.4323,B,1,...,1.0,211,94,722,766,5628,25 ~ 34 years old,Male,Master’s degree,At least once a week
217,0.821549658,2,47321,us,Chrome,Windows,-118.2529,34.0485,End,1,...,1.0,107,100,625,625,3621,25 ~ 34 years old,Female,Bachelor’s degree,At least once a week
63,0.613269392,2,17854,us,Chrome Mobile,Android,-71.0922,42.3427,End,1,...,0.0,14,100,1786,1786,3539,18 ~ 24 years old,Male,Bachelor’s degree,At least once a week
3,0.839316985,1,20468,us,Chrome,Windows,-97.822,37.751,End,1,...,1.0,45,100,625,625,3612,18 ~ 24 years old,Female,Bachelor’s degree,At least once a week
102,0.982080516,2,17433,us,Chrome,Windows,-75.0363,40.09,End,1,...,1.0,32,100,625,625,3728,25 ~ 34 years old,Male,Bachelor’s degree,At least once a week
54,0.311090118,3,40722,us,Safari,macOS,-73.8341,40.9352,End,1,...,0.0,10,100,760,760,3539,18 ~ 24 years old,Female,Bachelor’s degree,At least once a week
106,0.76938154,1,4376,us,Chrome,Windows,-115.1446,36.0383,D,1,...,1.0,0,0,625,0,6169,25 ~ 34 years old,Male,Bachelor’s degree,At least once a week
91,0.686664348,3,28624,us,Chrome,Windows,-97.6689,30.5154,C,3,...,0.333333,55,95,657,766,6550,25 ~ 34 years old,Male,Bachelor’s degree,At least once a day
71,0.625803829,2,26611,us,Chrome,Windows,-80.1886,25.7634,C,1,...,0.0,13,100,657,766,6623,35 ~ 44 years old,Male,Bachelor’s degree,At least once a week
211,0.845790541,1,30003,us,WeChat mobile App,iOS,-75.1565,39.9195,D,1,...,0.0,2436,100,1707,1707,10530,25 ~ 34 years old,Female,Master’s degree,At least once a month


In [43]:
df2.to_csv('Cleaned_data.csv')