In [28]:
import datetime as datetime
import html
import os
import shutil
import sqlite3
from datetime import datetime as dt
from urllib.parse import urlparse
from urllib.parse import urlparse, parse_qs

import matplotlib.pyplot as plt
import pandas as pd
import pytz
from IPython.display import display, HTML

In [29]:
# Locate the Chrome browsing-history database (a SQLite file).
# NOTE: both paths below are specific to one Windows user and Chrome profile;
# adjust them before running this notebook on another machine.
history_db_path = r'C:\Users\xjing\AppData\Local\Google\Chrome\User Data\Profile 5\History'

# Create a copy of the database to avoid issues with running Chrome at the same time
temp_history_db_path = r'C:\Users\xjing\AppData\Local\Temp\History_Copy'

# SQL query for obtaining the url, url title, visit time and (if any) search term
# of every recorded visit.
query = """
    SELECT 
        visits.id, 
        urls.url, 
        urls.title, 
        visits.visit_time, 
        keyword_search_terms.term AS search_term
    FROM 
        visits
    INNER JOIN 
        urls ON visits.url = urls.id
    LEFT JOIN 
        keyword_search_terms ON visits.url = keyword_search_terms.url_id
"""

shutil.copyfile(history_db_path, temp_history_db_path)
try:
    # Connect to the copied SQLite database and read the result into a dataframe.
    conn = sqlite3.connect(temp_history_db_path)
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # Always release the connection, even if the query fails; on Windows the
        # file cannot be deleted while it is still open.
        conn.close()
finally:
    # Always remove the temporary copy so no stray copy of the history is left behind.
    os.remove(temp_history_db_path)


def convert_time(timestamp):
    """Convert a raw ``visits.visit_time`` value to Melbourne local time.

    Parameters
    ----------
    timestamp : int
        Number of microseconds elapsed since 1601-01-01 00:00:00 UTC.

    Returns
    -------
    datetime.datetime
        Timezone-aware datetime in 'Australia/Melbourne', truncated to whole seconds.
    """
    melbourne_tz = pytz.timezone('Australia/Melbourne')
    utc_time = datetime.datetime(1601, 1, 1, tzinfo=pytz.UTC) + datetime.timedelta(microseconds=timestamp)
    local_time = utc_time.astimezone(melbourne_tz)
    return local_time.replace(microsecond=0)


# Apply the conversion to the 'visit_time' column.
df['visit_time'] = df['visit_time'].apply(convert_time)

In [30]:
# Process the browsing history.
# If records have the same url and were visited within the same second, keep only the last of them.

# 's' is the current pandas alias for seconds; the upper-case 'S' spelling is deprecated.
df['visit_time_second'] = df['visit_time'].dt.floor('s')
duplicates = df.duplicated(subset=['visit_time_second', 'url'], keep='last')

# Keep the non-duplicated rows, drop the helper 'visit_time_second' column from the
# result and renumber the rows from 0.
df_filtered = (
    df.loc[~duplicates]
    .drop(columns='visit_time_second')
    .reset_index(drop=True)
)

In [31]:
# TODO: decide how to handle the "duplicated" URLs that result from ad-click tracking, auto-refresh, etc. Consider standardizing them?

In [32]:
def standardize_url(url):
    """Return ``url`` reduced to its scheme, netloc (domain) and path.

    Path parameters (``;params``), the query string and the fragment are dropped,
    so visits that differ only in tracking parameters or anchors map to the same
    value.

    The URL is re-assembled by ``urllib`` rather than by string formatting, so
    URLs without a network location keep their original shape (``about:blank``
    stays ``about:blank``) and an empty string stays empty.

    Parameters
    ----------
    url : str
        The URL to standardize.

    Returns
    -------
    str
        The standardized URL.
    """
    parsed = urlparse(url)
    # Blank out everything after the path and let urllib put the rest back together.
    return parsed._replace(params="", query="", fragment="").geturl()

In [35]:
#--------------------------------browsing history by day (list view)-------------------------------------------
# Add a column for the visit date, then sort newest first by visit date and visit time.
df_filtered = (
    df_filtered
    .assign(visit_date=df_filtered['visit_time'].dt.date)
    .sort_values(['visit_date', 'visit_time'], ascending=[False, False])
)

# Group by visit date; sort=False keeps the descending order established above.
grouped_by_day = df_filtered.groupby('visit_date', sort=False)

# Print out the records for each day: time + url + url title.
for name, group in grouped_by_day:
    print(f"Date:{name}")
    print("="*50)  # add separator between days
    for row in group.itertuples(index=False):
        print(f",{row.visit_time.time()}, {row.url},{row.title}")
    print()


Date:2023-09-07
https://en.wikipedia.org/wiki/Ultimate_Fighting_Championship#Anti-doping_policy,Ultimate Fighting Championship - Wikipedia,18:16:04
https://en.wikipedia.org/wiki/Ultimate_Fighting_Championship#UFC_events,Ultimate Fighting Championship - Wikipedia,18:15:33
https://en.wikipedia.org/wiki/Ultimate_Fighting_Championship#UFC_Hall_of_Fame,Ultimate Fighting Championship - Wikipedia,18:15:30
https://en.wikipedia.org/wiki/Ultimate_Fighting_Championship,Ultimate Fighting Championship - Wikipedia,18:13:51
https://www.google.com/search?q=ufc&oq=&aqs=chrome.0.69i59i450l8.2502j0j15&sourceid=chrome&ie=UTF-8&bshm=rime/1,ufc - Google Search,18:13:47
https://www.google.com/search?q=ufc&oq=&aqs=chrome.0.69i59i450l8.2502j0j15&sourceid=chrome&ie=UTF-8,ufc - Google Search,18:13:45
https://www.google.com/search?q=ufc&oq=&aqs=chrome.0.69i59i450l8.2502j0j15&sourceid=chrome&ie=UTF-8,ufc - Google Search,18:13:44
https://www.formula1.com/,F1 - The Official Home of Formula 1® Racing,13:22:15
https:/

In [41]:
# Alternatively, print out the records for each day as time + page title rendered as a hyperlink.
for name, group in grouped_by_day:
    print(f"Date:{name}")
    print("="*50)  # add separator between days
    for row in group.itertuples(index=False):
        # Use the title for display if it exists, otherwise use the URL.
        display_text = row.title if row.title else row.url
        # Titles and URLs come from arbitrary web pages: escape them so characters such
        # as <, & and " are shown literally instead of being interpreted as markup.
        safe_url = html.escape(str(row.url), quote=True)
        safe_text = html.escape(str(display_text))
        hyperlink = f'<a href="{safe_url}">{safe_text}</a>'
        display(HTML(f"{row.visit_time.time()} {hyperlink}"))
    print()

Date:2023-09-07



Date:2023-09-06



Date:2023-09-03



Date:2023-09-02



