In [1]:
import os
import json
from datetime import datetime

import pandas as pd

#### Initial Load

In [None]:
SESSION_FOLDER_PATH = '/home/uchan/Documents/me/browsing-sessions'
FILES = list(map(lambda p: os.path.join(SESSION_FOLDER_PATH, p), os.listdir(SESSION_FOLDER_PATH)))

In [2]:
FILES

['/home/uchan/Documents/me/browsing-sessions/Browsing-Sessions - 2020-08-16 22-22-41.json',
 '/home/uchan/Documents/me/browsing-sessions/884 Sessions - 2022-04-18 06-36-16.json',
 '/home/uchan/Documents/me/browsing-sessions/Sessions - 2020-12-05 15-59-23.json',
 '/home/uchan/Documents/me/browsing-sessions/883 Sessions - 2022-03-25 09-19-12.json',
 '/home/uchan/Documents/me/browsing-sessions/Sessions-Feb2020.json']

In [3]:
def timestamp_to_date(some_date, as_string=True):
    converted_date = datetime.utcfromtimestamp(some_date/1000)
    if as_string:
        return converted_date.strftime('%d-%m-%Y %H:%M:%S')
    return converted_date


def _load_file(file):
    with open(file, 'r') as f:
        data = json.load(f)
    return data


def check_browsing_periods(file):
    print(f'loading {file}')
    data = _load_file(file)
    
    total_record = len(data)
    earliest_record = timestamp_to_date(data[0]['date'])
    latest_record   = timestamp_to_date(data[-1]['date'])
    
    print(f'Total record   : {total_record}')
    print(f'Earliest record: {earliest_record}')
    print(f'Latest record  : {latest_record}')
    print('=' * 15)

In [4]:
"""There are 5 files, and their records have overlapping timestamps"""
for file in FILES:
    check_browsing_periods(file)

loading /home/uchan/Documents/me/browsing-sessions/Browsing-Sessions - 2020-08-16 22-22-41.json
Total record   : 166
Earliest record: 26-10-2019 15:08:59
Latest record  : 23-06-2019 15:19:00
loading /home/uchan/Documents/me/browsing-sessions/884 Sessions - 2022-04-18 06-36-16.json
Total record   : 884
Earliest record: 19-08-2021 00:01:33
Latest record  : 16-01-2022 03:51:30
loading /home/uchan/Documents/me/browsing-sessions/Sessions - 2020-12-05 15-59-23.json
Total record   : 324
Earliest record: 26-10-2019 15:08:59
Latest record  : 13-10-2020 14:46:29
loading /home/uchan/Documents/me/browsing-sessions/883 Sessions - 2022-03-25 09-19-12.json
Total record   : 883
Earliest record: 19-08-2021 00:01:33
Latest record  : 16-01-2022 03:51:30
loading /home/uchan/Documents/me/browsing-sessions/Sessions-Feb2020.json
Total record   : 468
Earliest record: 26-10-2019 15:08:59
Latest record  : 13-10-2020 14:46:29


#### FOR EXPLORATION PURPOSES

Data Structure:

Session: 
    ___ attr: windows, tabsNumber, name, date, tag, sessionStartTime, id
    
Windows:
    ___ attr: id, name, url, lastAccessed

Session: {
    'windows': {
        '1': {
            {
                '1': Windows...,
                '2': Windows...,
                '3': Windows...,
            }
        }
    }
}

In [155]:
sample = _load_file(FILES[0])
sample[0].keys()

sample[0]['windows']['1']['2'].keys()
sample[0]['windows']['1']['2']['lastAccessed']

1572102539132

In [156]:
# Windows ID
sample[5]['windows'].keys()

dict_keys(['1', '79', '261'])

In [157]:
sample[5]['windows']['79'].keys()

dict_keys(['33', '34', '51', '53', '60', '61', '62', '63', '64', '67', '69', '70', '71'])

In [158]:
sample[5]['windows']['261'].keys()

dict_keys(['55', '56', '58', '59'])

In [159]:
# Tabs ID
sample[5]['windows']['1'].keys()

dict_keys(['1', '4', '5', '6', '7', '8', '9', '10', '12', '13', '21', '22', '24', '25'])

In [160]:
sample[5]['windows']['79'].keys()

dict_keys(['33', '34', '51', '53', '60', '61', '62', '63', '64', '67', '69', '70', '71'])

In [161]:
sample[5]['windows']['261'].keys()

dict_keys(['55', '56', '58', '59'])

In [162]:
# Tabs properties
sample[5]['windows']['1']['1'].keys()

dict_keys(['id', 'index', 'windowId', 'highlighted', 'active', 'attention', 'pinned', 'status', 'hidden', 'discarded', 'incognito', 'width', 'height', 'lastAccessed', 'audible', 'mutedInfo', 'isArticle', 'isInReaderMode', 'sharingState', 'successorTabId', 'cookieStoreId', 'url', 'title', 'favIconUrl'])

In [163]:
# Print out tabs properties
print(
    sample[5]['windows']['1']['1']['title'], '\n',
    sample[5]['windows']['1']['1']['url'], '\n',
    datetime.fromtimestamp(
        int(sample[5]['windows']['1']['1']['lastAccessed'])/1000
    ).strftime('%Y-%m-%d %H:%M:%S')
)

Google Translate 
 https://translate.google.com/#view=home&op=translate&sl=en&tl=id&text=weave 
 2020-04-08 21:39:26


In [164]:
# Try out other tabs
print(
    sample[5]['windows']['1']['4']['title'], '\n',
    sample[5]['windows']['1']['4']['url'], '\n',
    datetime.fromtimestamp(
        int(sample[5]['windows']['1']['1']['lastAccessed'])/1000
    ).strftime('%Y-%m-%d %H:%M:%S')
)

10 Must-Read Software Development Blogs – The Phrase Blog | Software Localization Experts 
 https://phrase.com/blog/posts/10-must-read-blogs-for-software-developer/ 
 2020-04-08 21:39:26


In [165]:
# Try out other tabs
print(
    sample[5]['windows']['1']['5']['title'], '\n',
    sample[5]['windows']['1']['5']['url'], '\n',
    datetime.fromtimestamp(
        int(sample[5]['windows']['1']['1']['lastAccessed'])/1000
    ).strftime('%Y-%m-%d %H:%M:%S')
)

tech-interview-handbook/preparing at master · yangshun/tech-interview-handbook 
 https://github.com/yangshun/tech-interview-handbook/tree/master/preparing 
 2020-04-08 21:39:26


In [66]:
# Try out other windows
print(
    sample[5]['windows']['79']['33']['title'], '\n',
    sample[5]['windows']['79']['33']['url'], '\n',
    datetime.fromtimestamp(
        int(sample[5]['windows']['79']['33']['lastAccessed'])/1000
    ).strftime('%Y-%m-%d %H:%M:%S')
)

(71) etcd - YouTube 
 https://www.youtube.com/results?search_query=etcd 
 2020-04-08 22:24:39


In [67]:
# Try out other windows
print(
    sample[5]['windows']['79']['34']['title'], '\n',
    sample[5]['windows']['79']['34']['url'], '\n',
    datetime.fromtimestamp(
        int(sample[5]['windows']['79']['33']['lastAccessed'])/1000
    ).strftime('%Y-%m-%d %H:%M:%S')
)

(71) The Twelve-Factor Container — Casey West - YouTube 
 https://www.youtube.com/watch?v=69UlcL5DTao 
 2020-04-08 22:24:39


In [69]:
# Try out other windows
print(
    sample[5]['windows']['261']['55']['title'], '\n',
    sample[5]['windows']['261']['55']['url'], '\n',
    datetime.fromtimestamp(
        int(sample[5]['windows']['261']['55']['lastAccessed'])/1000
    ).strftime('%Y-%m-%d %H:%M:%S')
)

Google Translate 
 https://translate.google.com/#view=home&op=translate&sl=en&tl=id&text=sophisticated 
 2020-04-08 22:32:34


In [70]:
# Try out other windows
print(
    sample[5]['windows']['261']['56']['title'], '\n',
    sample[5]['windows']['261']['56']['url'], '\n',
    datetime.fromtimestamp(
        int(sample[5]['windows']['261']['56']['lastAccessed'])/1000
    ).strftime('%Y-%m-%d %H:%M:%S')
)

(1142) Scott Meyers – The Most Important Design Guideline - YouTube 
 https://www.youtube.com/watch?v=5tg1ONG18H8 
 2020-04-08 22:32:34


#### Extraction

In [108]:
"""Testing Samples"""
# for session in sample:
#     record = {
#         'session_id': session['id'],
#         'session_tabsNumber': session['tabsNumber'],
#         'session_name': session['name'],
#         'session_date': session['date'],
#         'session_tag': session['tag'],
#         'session_sessionStartTime': session['sessionStartTime'],
#     }
    
#     for window_id in session['windows']:
#         for tab in session['windows'][window_id].values():
#             print(tab['title'])
#             print(tab['url'])
#             print(tab['lastAccessed'])
#         break
#     break

Google Translate
https://translate.google.com/#view=home&op=translate&sl=en&tl=id&text=desperation
1572102539132
(1425) Computer Networking Complete Course by Google - Beginner to Advanced - YouTube
https://www.youtube.com/watch?v=QKfk7YFILws&t=25s
1572102299151
(1425) Network Direction - YouTube
https://www.youtube.com/channel/UCtuXekfqj-paqsxtqVNCC2A
1572102364663
(1425) CS144 Introduction to Computer Networking Fall 2016 Stanford University - YouTube
https://www.youtube.com/playlist?list=PLvFG2xYBrYAQCyz4Wx3NPoYJOFjvU7g2Z
1572102312487
(1425) Computer Networks (CIS 345) - YouTube
https://www.youtube.com/playlist?list=PLLFIgriuZPAcCkmSTfcq7oaHcVy3rzEtc
1572102319580
(1425) Computer Networking: Part 1 of 3 - Georgia Tech - YouTube
https://www.youtube.com/playlist?list=PLAwxTw4SYaPn21MqCiFq2r0FSjk9l6cW2
1572102336044


In [196]:
"""Debugging"""
# test_data = [
#     {
#         'windows': {
#             '1': {
#                 '1': { 'name': 'data 1' },
#                 '2': { 'name': 'data 2' }
#             }
#         }
#     },
#     {
#        'windows': {
#             '1': {
#                 '1': { 'name': 'next data 1' },
#                 '2': { 'name': 'next data 2' }
#             }
#         }
#     }, 
# ]

# def extract_data_from_window(data):
#     records = []
#     for session in data:
#         record = { }
#         for window_id in session['windows']:
#             for tab in session['windows'][window_id].values():
#                 record['name'] = tab['name']
#                 print(f'appending {record}')
#                 records.append(record)

#     return records

# extract_data_from_window(test_data)

'Debugging'

In [192]:
def extract_data_from_window(data):
    records = []
    for session in data:
        for window_id in session['windows']:
            for tab in session['windows'][window_id].values():
                record = {
                    'session_id': session['id'],
                    'session_tabsNumber': session['tabsNumber'],
                    'session_name': session['name'],
                    'session_date': session['date'],
                    'session_tag': session['tag'],
                    'session_sessionStartTime': session['sessionStartTime'],
                    'tab_name': tab['title'],
                    'tab_url': tab['url'],
                    'tab_lastAccessed': tab['lastAccessed']
                }
                records.append(record)

    return records

In [183]:
"""Join all the data for easier analysis"""
data = []

for file in FILES:
    print(f'processing {file}')
    json_data = _load_file(file)
    records = extract_data_from_window(json_data)
    data.extend(records)
    
print(f'Amount of data: {len(data)}')

processing /home/uchan/Documents/me/browsing-sessions/Browsing-Sessions - 2020-08-16 22-22-41.json
processing /home/uchan/Documents/me/browsing-sessions/884 Sessions - 2022-04-18 06-36-16.json
processing /home/uchan/Documents/me/browsing-sessions/Sessions - 2020-12-05 15-59-23.json
processing /home/uchan/Documents/me/browsing-sessions/883 Sessions - 2022-03-25 09-19-12.json
processing /home/uchan/Documents/me/browsing-sessions/Sessions-Feb2020.json


82509

In [None]:
df = pd.DataFrame(data)

In [184]:
df.head()

Unnamed: 0,session_id,session_tabsNumber,session_name,session_date,session_tag,session_sessionStartTime,tab_name,tab_url,tab_lastAccessed
0,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,1572102539130,[networking],1572060943565,Google Translate,https://translate.google.com/#view=home&op=tra...,1572102539132
1,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,1572102539130,[networking],1572060943565,(1425) Computer Networking Complete Course by ...,https://www.youtube.com/watch?v=QKfk7YFILws&t=25s,1572102299151
2,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,1572102539130,[networking],1572060943565,(1425) Network Direction - YouTube,https://www.youtube.com/channel/UCtuXekfqj-paq...,1572102364663
3,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,1572102539130,[networking],1572060943565,(1425) CS144 Introduction to Computer Networki...,https://www.youtube.com/playlist?list=PLvFG2xY...,1572102312487
4,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,1572102539130,[networking],1572060943565,(1425) Computer Networks (CIS 345) - YouTube,https://www.youtube.com/playlist?list=PLLFIgri...,1572102319580


In [197]:
"""Dumping files"""
CSV_PATH = 'merged_sessions.csv'
df.to_csv(CSV_PATH, index=False)

JSON_PATH = 'merged_sessions.json'
df.to_json(JSON_PATH, indent=4)

#### Transform data for QA

In [11]:
df.session_date = df.session_date.apply(timestamp_to_date, as_string=False)
df.session_sessionStartTime = df.session_sessionStartTime.apply(timestamp_to_date, as_string=False)
df.window_lastAccessed = df.window_lastAccessed.apply(timestamp_to_date, as_string=False)

df.head()

Unnamed: 0,session_id,session_tabsNumber,session_name,session_date,session_tag,session_sessionStartTime,window_name,window_url,window_lastAccessed
0,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,2019-10-26 15:08:59.130,[networking],2019-10-26 03:35:43.565,(1425) Computer Networking: Part 1 of 3 - Geor...,https://www.youtube.com/playlist?list=PLAwxTw4...,2019-10-26 15:05:36.044
1,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,2019-10-26 15:08:59.130,[networking],2019-10-26 03:35:43.565,(1425) Computer Networking: Part 1 of 3 - Geor...,https://www.youtube.com/playlist?list=PLAwxTw4...,2019-10-26 15:05:36.044
2,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,2019-10-26 15:08:59.130,[networking],2019-10-26 03:35:43.565,(1425) Computer Networking: Part 1 of 3 - Geor...,https://www.youtube.com/playlist?list=PLAwxTw4...,2019-10-26 15:05:36.044
3,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,2019-10-26 15:08:59.130,[networking],2019-10-26 03:35:43.565,(1425) Computer Networking: Part 1 of 3 - Geor...,https://www.youtube.com/playlist?list=PLAwxTw4...,2019-10-26 15:05:36.044
4,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,2019-10-26 15:08:59.130,[networking],2019-10-26 03:35:43.565,(1425) Computer Networking: Part 1 of 3 - Geor...,https://www.youtube.com/playlist?list=PLAwxTw4...,2019-10-26 15:05:36.044


In [12]:
df.tail()

Unnamed: 0,session_id,session_tabsNumber,session_name,session_date,session_tag,session_sessionStartTime,window_name,window_url,window_lastAccessed
82504,fe2ece1d-f7e0-4e6b-92ab-37da4dbbd38d,90,ETC,2020-10-13 14:46:29.142,[],2020-10-12 21:02:16.831,pretrained models deep learning - Google Search,https://www.google.com/search?client=ubuntu&ch...,2020-10-13 05:21:55.810
82505,fe2ece1d-f7e0-4e6b-92ab-37da4dbbd38d,90,ETC,2020-10-13 14:46:29.142,[],2020-10-12 21:02:16.831,pretrained models deep learning - Google Search,https://www.google.com/search?client=ubuntu&ch...,2020-10-13 05:21:55.810
82506,fe2ece1d-f7e0-4e6b-92ab-37da4dbbd38d,90,ETC,2020-10-13 14:46:29.142,[],2020-10-12 21:02:16.831,pretrained models deep learning - Google Search,https://www.google.com/search?client=ubuntu&ch...,2020-10-13 05:21:55.810
82507,fe2ece1d-f7e0-4e6b-92ab-37da4dbbd38d,90,ETC,2020-10-13 14:46:29.142,[],2020-10-12 21:02:16.831,pretrained models deep learning - Google Search,https://www.google.com/search?client=ubuntu&ch...,2020-10-13 05:21:55.810
82508,fe2ece1d-f7e0-4e6b-92ab-37da4dbbd38d,90,ETC,2020-10-13 14:46:29.142,[],2020-10-12 21:02:16.831,pretrained models deep learning - Google Search,https://www.google.com/search?client=ubuntu&ch...,2020-10-13 05:21:55.810


#### Quality Check (Date Missing Assumptions)

In [18]:
dummy_years_and_months = set([
    (year, month) for year in range(2019,2023) for month in range(1,13)
])

In [14]:
def extract_years_and_months(date):
    return (date.year, date.month)

In [15]:
data_years_and_months = df.session_date.apply(extract_years_and_months)
data_years_and_months = set(data_years_and_months)

In [19]:
# it's dates that are not on the data.
# possible explanation:
#     2019-1 ~ 2019-3 I was on an internship;
#     2019-7 ~ 2019-8 I was on a social service
dummy_years_and_months.difference(data_years_and_months)

{(2019, 1),
 (2019, 2),
 (2019, 3),
 (2019, 7),
 (2019, 8),
 (2022, 5),
 (2022, 6),
 (2022, 7),
 (2022, 8),
 (2022, 9),
 (2022, 10),
 (2022, 11),
 (2022, 12)}

In [20]:
df.shape

(82509, 9)

In [21]:
sum(df[['session_id', 'session_date', 'window_name', 'window_lastAccessed']].duplicated())

81088