In [13]:
import os
import json
from datetime import datetime

#### Initial Data Loading

In [2]:
SESSION_FOLDER_PATH = '/home/uchan/Documents/me/browsing-sessions'
FILES = [os.path.join(SESSION_FOLDER_PATH, f)
         for f in os.listdir(SESSION_FOLDER_PATH)
         if os.path.isfile(os.path.join(SESSION_FOLDER_PATH, f))
        ]

In [3]:
FILES

['/home/uchan/Documents/me/browsing-sessions/Sessions - 2022-09-09 23-37-22.json',
 '/home/uchan/Documents/me/browsing-sessions/Sessions - 2020-12-05 15-59-23.json',
 '/home/uchan/Documents/me/browsing-sessions/Sessions - 2020-08-16 22-22-41.json',
 '/home/uchan/Documents/me/browsing-sessions/Sessions-Feb2020.json']

In [4]:
def timestamp_to_date(some_date, as_string=True):
    """convert timestamp to date type or date string type"""
    converted_date = datetime.utcfromtimestamp(some_date/1000)  # only right after divided by 1000
    if as_string:
        return converted_date.strftime('%d-%m-%Y %H:%M:%S')
    return converted_date

def _load_file(file):
    """accept a file name. load data from the file"""
    with open(file, 'r') as f:
        data = json.load(f)
    return data

def load_data(files):
    """accept a list of files. load data in all files"""
    data = []
    for file in files:
        data.extend(_load_file(file))
    return data

def check_browsing_periods(file):
    """check earliest and latest browsing periods from the data"""
    print(f'loading {file}')
    data = _load_file(file)
    
    dates = list(map(lambda d: timestamp_to_date(d['date'], False), data))
    sorted_dates = sorted(dates)
        
    print(f'Total record   : {len(data)}')
    print(f'Earliest record: {sorted_dates[0]}')
    print(f'Latest record  : {sorted_dates[-1]}')
    print('=' * 15)

In [5]:
"""Files and records' overlapping timestamps"""
for file in FILES:
    check_browsing_periods(file)

loading /home/uchan/Documents/me/browsing-sessions/Sessions - 2022-09-09 23-37-22.json
Total record   : 976
Earliest record: 2020-07-06 14:53:21.279000
Latest record  : 2022-09-09 16:37:02.711000
loading /home/uchan/Documents/me/browsing-sessions/Sessions - 2020-12-05 15-59-23.json
Total record   : 324
Earliest record: 2018-12-15 15:21:45.508000
Latest record  : 2020-12-05 08:59:09.831000
loading /home/uchan/Documents/me/browsing-sessions/Sessions - 2020-08-16 22-22-41.json
Total record   : 166
Earliest record: 2018-12-15 15:21:45.508000
Latest record  : 2020-08-16 15:22:38.862000
loading /home/uchan/Documents/me/browsing-sessions/Sessions-Feb2020.json
Total record   : 468
Earliest record: 2018-12-15 15:21:45.508000
Latest record  : 2021-02-16 15:00:27.819000


In [6]:
data = load_data(FILES)

In [7]:
s = data[-2]
len(s)

10

In [40]:
len(s['windows'].keys()) == s['windowsNumber']

True

In [37]:
for winId, v in s['windows'].items(): # ['53'].keys()
    print(winId)
    print(len(v.keys()))

53
29
29


In [13]:
s.keys()

dict_keys(['windows', 'windowsNumber', 'windowsInfo', 'tabsNumber', 'name', 'date', 'lastEditedTime', 'tag', 'sessionStartTime', 'id'])

In [36]:
# for d in data:
#     if d['name'] == "AZZ":
#         print(d)
# s['windows'].values()
for v in s['windows'].values():
    for k, vv in v.items():
        print(k)
        print(vv)
    break

7
{'id': 7, 'index': 26, 'windowId': 53, 'highlighted': False, 'active': False, 'attention': False, 'pinned': False, 'status': 'complete', 'hidden': False, 'discarded': False, 'incognito': False, 'width': 1366, 'height': 667, 'lastAccessed': 1612633439827, 'audible': False, 'mutedInfo': {'muted': False}, 'isArticle': False, 'isInReaderMode': False, 'sharingState': {'camera': False, 'microphone': False}, 'successorTabId': -1, 'cookieStoreId': 'firefox-default', 'url': 'https://www.youtube.com/watch?v=NcUhYQQrvyI', 'title': 'Champagne Supernova (Remastered) - YouTube', 'favIconUrl': '

In [15]:
s['windows']['53'].keys()

dict_keys(['7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35'])

In [17]:
# Trying to access some properties
s['windowsInfo']

{'53': {'id': 53,
  'focused': False,
  'top': 27,
  'left': 0,
  'width': 1366,
  'height': 741,
  'incognito': False,
  'type': 'normal',
  'state': 'maximized',
  'alwaysOnTop': False,
  'title': 'perspectivism - Google Search — Mozilla Firefox'}}

In [18]:
s['windows']['53']['7']

{'id': 7,
 'index': 26,
 'windowId': 53,
 'highlighted': False,
 'active': False,
 'attention': False,
 'pinned': False,
 'status': 'complete',
 'hidden': False,
 'discarded': False,
 'incognito': False,
 'width': 1366,
 'height': 667,
 'lastAccessed': 1612633439827,
 'audible': False,
 'mutedInfo': {'muted': False},
 'isArticle': False,
 'isInReaderMode': False,
 'sharingState': {'camera': False, 'microphone': False},
 'successorTabId': -1,
 'cookieStoreId': 'firefox-default',
 'url': 'https://www.youtube.com/watch?v=NcUhYQQrvyI',
 'title': 'Champagne Supernova (Remastered) - YouTube',
 'favIconUrl': '

In [19]:
def print_windows_info(s):
    """Utility for printing sessions information"""
    for k, v in s.items():
        print(f"Key: {k}")
        print(f"Value type: {type(v)}")

        if type(v) == dict:
            # if item is a windows, dig deeper
            if k == "windows":
                for k2, v2 in v.items():
                    print(f"Length of Windows Values' Keys: {len(v2.keys())}")
            # if not windows, just print the value
            print(f"Value Keys: {v.keys()}")
        else:
            # type of value is either an int, str, or list
            # if key is a timestamp field, convert it to time
            if k in ('date', 'lastEditedTime', 'sessionStartTime'):
                v = timestamp_to_date(v, True)
            print(f"{k}: {v}\n")

In [20]:
# print several sessions info
for index, sample in enumerate(data[:10], start=1):
    print(f"Printing Information for Windows {index}")
    print_windows_info(sample)
    print("=" * 40)

Printing Information for Windows 1
Key: windows
Value type: <class 'dict'>
Length of Windows Values' Keys: 6
Value Keys: dict_keys(['1'])

Key: windowsNumber
Value type: <class 'int'>
windowsNumber: 1

Key: windowsInfo
Value type: <class 'dict'>
Value Keys: dict_keys(['1'])

Key: tabsNumber
Value type: <class 'int'>
tabsNumber: 6

Key: name
Value type: <class 'str'>
name: Networking

Key: date
Value type: <class 'int'>
date: 26-10-2019 15:08:59

Key: tag
Value type: <class 'list'>
tag: ['networking']

Key: sessionStartTime
Value type: <class 'int'>
sessionStartTime: 26-10-2019 03:35:43

Key: id
Value type: <class 'str'>
id: 002383d7-cf3d-4d17-b9c2-701bbfb99b22

Key: lastEditedTime
Value type: <class 'int'>
lastEditedTime: 08-04-2020 15:37:46

Printing Information for Windows 2
Key: windows
Value type: <class 'dict'>
Length of Windows Values' Keys: 27
Length of Windows Values' Keys: 49
Value Keys: dict_keys(['1', '274'])

Key: windowsNumber
Value type: <class 'int'>
windowsNumber: 2

Ke

---

In [52]:
# TODO: 
#     (0) load all the data together ✅️
#     (1) create a validator with Pydantic ✅️
#     (2) loop through all the data and pass them all through the validator ✅️
#     (3) dump them to csv (for other purposes) ✅️
#     (4) create a lightweight loader after the data is validated ✅️

In [40]:
# How much rows do I expect to have?
row_count = 0
for d in data:
    row_count += d['tabsNumber']
row_count

120462

In [21]:
"""Trying out Pydantic!"""
from typing import List, Optional
from pydantic import BaseModel

In [44]:
"""These two classes are used to model the data"""
class Tab(BaseModel):
    id: int
    index: int
    windowId: int
    pinned: bool
    lastAccessed: datetime
    url: str
    title: str
    
class Session(BaseModel):
    id: str
    name: str
    sessionStartTime: datetime
    date: datetime
    tag: List[str]
    tabs: List[Tab]  # A session can have many tabs

In [45]:
for i, d in enumerate(data):
    tabs = []
    for _, tabsId in d['windows'].items():
        for _, tab in tabsId.items():
            tabs.append(tab)
    # number of tab per-session info is given in the session data
    # assert if the currently looped number of tab matches that
    assert len(tabs) == d['tabsNumber'], f"Wrong on index {i}"
    d['tabs'] = tabs

In [25]:
sessions: List[Session] = []

for d in data:
    session = Session(**d)
    sessions.append(session)
    
assert len(sessions) == len(data)

In [None]:
sessions: List[Session] = [Session(**d) for d in data]
assert len(sessions) == len(data)

In [26]:
sessions[0].schema()['properties'].keys(), sessions[0].tabs[0].schema()['properties'].keys()

(dict_keys(['id', 'name', 'sessionStartTime', 'date', 'tag', 'tabs']),
 dict_keys(['id', 'index', 'windowId', 'pinned', 'lastAccessed', 'url', 'title']))

In [27]:
sessions[0].sessionStartTime.strftime('%Y-%m-%d %H:%M:%S'), \
sessions[0].tabs[0].lastAccessed.strftime('%Y-%m-%d %H:%M:%S')

('2019-10-26 03:35:43', '2019-10-26 15:08:59')

In [48]:
import csv

DUMP_PATH = './data/dump.csv'
with open(DUMP_PATH, 'w') as f:
    session_headers = ['id', 'name', 'sessionStartTime', 'date', 'tag']
    tabs_headers = ['tab_id', 'index', 'windowId', 'pinned', 'lastAccessed', 'url', 'title']
    
    csv_writer = csv.writer(f)
    csv_writer.writerow(session_headers + tabs_headers)

    for s in sessions:
        d = s.dict()
        
        d['date'] = d['date'].strftime('%Y-%m-%d %H:%M:%S')
        d['sessionStartTime'] = d['sessionStartTime'].strftime('%Y-%m-%d %H:%M:%S')
        
        row_session_part = [d[k] for k in session_headers]
        
        for tab in d['tabs']:
            tab['tab_id'] = tab['id']  # lol
            tab['lastAccessed'] = tab['lastAccessed'].strftime('%Y-%m-%d %H:%M:%S')
            row_tabs_part = [tab[k] for k in tabs_headers]
        
            csv_writer.writerow(
                row_session_part + row_tabs_part
            )

#### FOR EXPLORATION PURPOSES

Data Structure:

Session: 
    ___ attr: windows, tabsNumber, name, date, tag, sessionStartTime, id
    
Windows:
    ___ attr: id, name, url, lastAccessed

Session: {
    'windows': {
        '1': {
            {
                '1': Windows...,
                '2': Windows...,
                '3': Windows...,
            }
        }
    }
}

In [155]:
sample = _load_file(FILES[0])
sample[0].keys()

sample[0]['windows']['1']['2'].keys()
sample[0]['windows']['1']['2']['lastAccessed']

1572102539132

In [156]:
# Windows ID
sample[5]['windows'].keys()

dict_keys(['1', '79', '261'])

In [157]:
sample[5]['windows']['79'].keys()

dict_keys(['33', '34', '51', '53', '60', '61', '62', '63', '64', '67', '69', '70', '71'])

In [158]:
sample[5]['windows']['261'].keys()

dict_keys(['55', '56', '58', '59'])

In [159]:
# Tabs ID
sample[5]['windows']['1'].keys()

dict_keys(['1', '4', '5', '6', '7', '8', '9', '10', '12', '13', '21', '22', '24', '25'])

In [160]:
sample[5]['windows']['79'].keys()

dict_keys(['33', '34', '51', '53', '60', '61', '62', '63', '64', '67', '69', '70', '71'])

In [161]:
sample[5]['windows']['261'].keys()

dict_keys(['55', '56', '58', '59'])

In [162]:
# Tabs properties
sample[5]['windows']['1']['1'].keys()

dict_keys(['id', 'index', 'windowId', 'highlighted', 'active', 'attention', 'pinned', 'status', 'hidden', 'discarded', 'incognito', 'width', 'height', 'lastAccessed', 'audible', 'mutedInfo', 'isArticle', 'isInReaderMode', 'sharingState', 'successorTabId', 'cookieStoreId', 'url', 'title', 'favIconUrl'])

In [163]:
# Print out tabs properties
print(
    sample[5]['windows']['1']['1']['title'], '\n',
    sample[5]['windows']['1']['1']['url'], '\n',
    datetime.fromtimestamp(
        int(sample[5]['windows']['1']['1']['lastAccessed'])/1000
    ).strftime('%Y-%m-%d %H:%M:%S')
)

Google Translate 
 https://translate.google.com/#view=home&op=translate&sl=en&tl=id&text=weave 
 2020-04-08 21:39:26


In [164]:
# Try out other tabs
print(
    sample[5]['windows']['1']['4']['title'], '\n',
    sample[5]['windows']['1']['4']['url'], '\n',
    datetime.fromtimestamp(
        int(sample[5]['windows']['1']['1']['lastAccessed'])/1000
    ).strftime('%Y-%m-%d %H:%M:%S')
)

10 Must-Read Software Development Blogs – The Phrase Blog | Software Localization Experts 
 https://phrase.com/blog/posts/10-must-read-blogs-for-software-developer/ 
 2020-04-08 21:39:26


In [165]:
# Try out other tabs
print(
    sample[5]['windows']['1']['5']['title'], '\n',
    sample[5]['windows']['1']['5']['url'], '\n',
    datetime.fromtimestamp(
        int(sample[5]['windows']['1']['1']['lastAccessed'])/1000
    ).strftime('%Y-%m-%d %H:%M:%S')
)

tech-interview-handbook/preparing at master · yangshun/tech-interview-handbook 
 https://github.com/yangshun/tech-interview-handbook/tree/master/preparing 
 2020-04-08 21:39:26


In [66]:
# Try out other windows
print(
    sample[5]['windows']['79']['33']['title'], '\n',
    sample[5]['windows']['79']['33']['url'], '\n',
    datetime.fromtimestamp(
        int(sample[5]['windows']['79']['33']['lastAccessed'])/1000
    ).strftime('%Y-%m-%d %H:%M:%S')
)

(71) etcd - YouTube 
 https://www.youtube.com/results?search_query=etcd 
 2020-04-08 22:24:39


In [67]:
# Try out other windows
print(
    sample[5]['windows']['79']['34']['title'], '\n',
    sample[5]['windows']['79']['34']['url'], '\n',
    datetime.fromtimestamp(
        int(sample[5]['windows']['79']['33']['lastAccessed'])/1000
    ).strftime('%Y-%m-%d %H:%M:%S')
)

(71) The Twelve-Factor Container — Casey West - YouTube 
 https://www.youtube.com/watch?v=69UlcL5DTao 
 2020-04-08 22:24:39


In [69]:
# Try out other windows
print(
    sample[5]['windows']['261']['55']['title'], '\n',
    sample[5]['windows']['261']['55']['url'], '\n',
    datetime.fromtimestamp(
        int(sample[5]['windows']['261']['55']['lastAccessed'])/1000
    ).strftime('%Y-%m-%d %H:%M:%S')
)

Google Translate 
 https://translate.google.com/#view=home&op=translate&sl=en&tl=id&text=sophisticated 
 2020-04-08 22:32:34


In [70]:
# Try out other windows
print(
    sample[5]['windows']['261']['56']['title'], '\n',
    sample[5]['windows']['261']['56']['url'], '\n',
    datetime.fromtimestamp(
        int(sample[5]['windows']['261']['56']['lastAccessed'])/1000
    ).strftime('%Y-%m-%d %H:%M:%S')
)

(1142) Scott Meyers – The Most Important Design Guideline - YouTube 
 https://www.youtube.com/watch?v=5tg1ONG18H8 
 2020-04-08 22:32:34


#### Extraction

In [108]:
"""Testing Samples"""
# for session in sample:
#     record = {
#         'session_id': session['id'],
#         'session_tabsNumber': session['tabsNumber'],
#         'session_name': session['name'],
#         'session_date': session['date'],
#         'session_tag': session['tag'],
#         'session_sessionStartTime': session['sessionStartTime'],
#     }
    
#     for window_id in session['windows']:
#         for tab in session['windows'][window_id].values():
#             print(tab['title'])
#             print(tab['url'])
#             print(tab['lastAccessed'])
#         break
#     break

Google Translate
https://translate.google.com/#view=home&op=translate&sl=en&tl=id&text=desperation
1572102539132
(1425) Computer Networking Complete Course by Google - Beginner to Advanced - YouTube
https://www.youtube.com/watch?v=QKfk7YFILws&t=25s
1572102299151
(1425) Network Direction - YouTube
https://www.youtube.com/channel/UCtuXekfqj-paqsxtqVNCC2A
1572102364663
(1425) CS144 Introduction to Computer Networking Fall 2016 Stanford University - YouTube
https://www.youtube.com/playlist?list=PLvFG2xYBrYAQCyz4Wx3NPoYJOFjvU7g2Z
1572102312487
(1425) Computer Networks (CIS 345) - YouTube
https://www.youtube.com/playlist?list=PLLFIgriuZPAcCkmSTfcq7oaHcVy3rzEtc
1572102319580
(1425) Computer Networking: Part 1 of 3 - Georgia Tech - YouTube
https://www.youtube.com/playlist?list=PLAwxTw4SYaPn21MqCiFq2r0FSjk9l6cW2
1572102336044


In [196]:
"""Debugging"""
# test_data = [
#     {
#         'windows': {
#             '1': {
#                 '1': { 'name': 'data 1' },
#                 '2': { 'name': 'data 2' }
#             }
#         }
#     },
#     {
#        'windows': {
#             '1': {
#                 '1': { 'name': 'next data 1' },
#                 '2': { 'name': 'next data 2' }
#             }
#         }
#     }, 
# ]

# def extract_data_from_window(data):
#     records = []
#     for session in data:
#         record = { }
#         for window_id in session['windows']:
#             for tab in session['windows'][window_id].values():
#                 record['name'] = tab['name']
#                 print(f'appending {record}')
#                 records.append(record)

#     return records

# extract_data_from_window(test_data)

'Debugging'

In [192]:
def extract_data_from_window(data):
    records = []
    for session in data:
        for window_id in session['windows']:
            for tab in session['windows'][window_id].values():
                record = {
                    'session_id': session['id'],
                    'session_tabsNumber': session['tabsNumber'],
                    'session_name': session['name'],
                    'session_date': session['date'],
                    'session_tag': session['tag'],
                    'session_sessionStartTime': session['sessionStartTime'],
                    'tab_name': tab['title'],
                    'tab_url': tab['url'],
                    'tab_lastAccessed': tab['lastAccessed']
                }
                records.append(record)

    return records

In [183]:
"""Join all the data for easier analysis"""
data = []

for file in FILES:
    print(f'processing {file}')
    json_data = _load_file(file)
    records = extract_data_from_window(json_data)
    data.extend(records)
    
print(f'Amount of data: {len(data)}')

processing /home/uchan/Documents/me/browsing-sessions/Browsing-Sessions - 2020-08-16 22-22-41.json
processing /home/uchan/Documents/me/browsing-sessions/884 Sessions - 2022-04-18 06-36-16.json
processing /home/uchan/Documents/me/browsing-sessions/Sessions - 2020-12-05 15-59-23.json
processing /home/uchan/Documents/me/browsing-sessions/883 Sessions - 2022-03-25 09-19-12.json
processing /home/uchan/Documents/me/browsing-sessions/Sessions-Feb2020.json


82509

In [None]:
df = pd.DataFrame(data)

In [184]:
df.head()

Unnamed: 0,session_id,session_tabsNumber,session_name,session_date,session_tag,session_sessionStartTime,tab_name,tab_url,tab_lastAccessed
0,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,1572102539130,[networking],1572060943565,Google Translate,https://translate.google.com/#view=home&op=tra...,1572102539132
1,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,1572102539130,[networking],1572060943565,(1425) Computer Networking Complete Course by ...,https://www.youtube.com/watch?v=QKfk7YFILws&t=25s,1572102299151
2,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,1572102539130,[networking],1572060943565,(1425) Network Direction - YouTube,https://www.youtube.com/channel/UCtuXekfqj-paq...,1572102364663
3,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,1572102539130,[networking],1572060943565,(1425) CS144 Introduction to Computer Networki...,https://www.youtube.com/playlist?list=PLvFG2xY...,1572102312487
4,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,1572102539130,[networking],1572060943565,(1425) Computer Networks (CIS 345) - YouTube,https://www.youtube.com/playlist?list=PLLFIgri...,1572102319580


In [197]:
"""Dumping files"""
CSV_PATH = 'data/merged_sessions.csv'
df.to_csv(CSV_PATH, index=False)

JSON_PATH = 'data/merged_sessions.json'
df.to_json(JSON_PATH, indent=4)

#### Transform data for QA

In [11]:
df.session_date = df.session_date.apply(timestamp_to_date, as_string=False)
df.session_sessionStartTime = df.session_sessionStartTime.apply(timestamp_to_date, as_string=False)
df.window_lastAccessed = df.window_lastAccessed.apply(timestamp_to_date, as_string=False)

df.head()

Unnamed: 0,session_id,session_tabsNumber,session_name,session_date,session_tag,session_sessionStartTime,window_name,window_url,window_lastAccessed
0,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,2019-10-26 15:08:59.130,[networking],2019-10-26 03:35:43.565,(1425) Computer Networking: Part 1 of 3 - Geor...,https://www.youtube.com/playlist?list=PLAwxTw4...,2019-10-26 15:05:36.044
1,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,2019-10-26 15:08:59.130,[networking],2019-10-26 03:35:43.565,(1425) Computer Networking: Part 1 of 3 - Geor...,https://www.youtube.com/playlist?list=PLAwxTw4...,2019-10-26 15:05:36.044
2,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,2019-10-26 15:08:59.130,[networking],2019-10-26 03:35:43.565,(1425) Computer Networking: Part 1 of 3 - Geor...,https://www.youtube.com/playlist?list=PLAwxTw4...,2019-10-26 15:05:36.044
3,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,2019-10-26 15:08:59.130,[networking],2019-10-26 03:35:43.565,(1425) Computer Networking: Part 1 of 3 - Geor...,https://www.youtube.com/playlist?list=PLAwxTw4...,2019-10-26 15:05:36.044
4,002383d7-cf3d-4d17-b9c2-701bbfb99b22,6,Networking,2019-10-26 15:08:59.130,[networking],2019-10-26 03:35:43.565,(1425) Computer Networking: Part 1 of 3 - Geor...,https://www.youtube.com/playlist?list=PLAwxTw4...,2019-10-26 15:05:36.044


In [12]:
df.tail()

Unnamed: 0,session_id,session_tabsNumber,session_name,session_date,session_tag,session_sessionStartTime,window_name,window_url,window_lastAccessed
82504,fe2ece1d-f7e0-4e6b-92ab-37da4dbbd38d,90,ETC,2020-10-13 14:46:29.142,[],2020-10-12 21:02:16.831,pretrained models deep learning - Google Search,https://www.google.com/search?client=ubuntu&ch...,2020-10-13 05:21:55.810
82505,fe2ece1d-f7e0-4e6b-92ab-37da4dbbd38d,90,ETC,2020-10-13 14:46:29.142,[],2020-10-12 21:02:16.831,pretrained models deep learning - Google Search,https://www.google.com/search?client=ubuntu&ch...,2020-10-13 05:21:55.810
82506,fe2ece1d-f7e0-4e6b-92ab-37da4dbbd38d,90,ETC,2020-10-13 14:46:29.142,[],2020-10-12 21:02:16.831,pretrained models deep learning - Google Search,https://www.google.com/search?client=ubuntu&ch...,2020-10-13 05:21:55.810
82507,fe2ece1d-f7e0-4e6b-92ab-37da4dbbd38d,90,ETC,2020-10-13 14:46:29.142,[],2020-10-12 21:02:16.831,pretrained models deep learning - Google Search,https://www.google.com/search?client=ubuntu&ch...,2020-10-13 05:21:55.810
82508,fe2ece1d-f7e0-4e6b-92ab-37da4dbbd38d,90,ETC,2020-10-13 14:46:29.142,[],2020-10-12 21:02:16.831,pretrained models deep learning - Google Search,https://www.google.com/search?client=ubuntu&ch...,2020-10-13 05:21:55.810


#### Quality Check (Date Missing Assumptions)

In [18]:
dummy_years_and_months = set([
    (year, month) for year in range(2019,2023) for month in range(1,13)
])

In [14]:
def extract_years_and_months(date):
    return (date.year, date.month)

In [15]:
data_years_and_months = df.session_date.apply(extract_years_and_months)
data_years_and_months = set(data_years_and_months)

In [19]:
# it's dates that are not on the data.
# possible explanation:
#     2019-1 ~ 2019-3 I was on an internship;
#     2019-7 ~ 2019-8 I was on a social service
dummy_years_and_months.difference(data_years_and_months)

{(2019, 1),
 (2019, 2),
 (2019, 3),
 (2019, 7),
 (2019, 8),
 (2022, 5),
 (2022, 6),
 (2022, 7),
 (2022, 8),
 (2022, 9),
 (2022, 10),
 (2022, 11),
 (2022, 12)}

In [20]:
df.shape

(82509, 9)

In [21]:
sum(df[['session_id', 'session_date', 'window_name', 'window_lastAccessed']].duplicated())

81088