In [1]:
import sqlite3
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np


In [2]:
# NOTE(review): presumably a Firefox profile's places.sqlite (moz_* tables) — confirm.
# The connection is intentionally left open here in case later cells reuse
# `conn`; call conn.close() once no more queries are needed.
conn = sqlite3.connect("../../places.sqlite")

# One row per visit that has a recorded referrer visit:
#   * `url`      – URL of the visited page (moz_places via place_id)
#   * `from_url` – URL of the visit it was reached from; the sub-select `t2`
#                  maps every visit id to its URL and is joined on `from_visit`.
# Both joins are inner joins, so visits whose `from_visit` does not match any
# visit id are not returned.
# Identifiers are written unquoted: the previous 'visit_date' spelling used
# string-literal quotes, which SQLite only tolerates as a legacy quirk.
full_data = pd.read_sql_query(
    """
SELECT moz_historyvisits.id,
    moz_places.url AS url,
    moz_historyvisits.visit_date AS visit_date,
    moz_historyvisits.from_visit AS from_visit,
    t2.url AS from_url
FROM moz_historyvisits
    JOIN moz_places ON moz_places.id = moz_historyvisits.place_id
    JOIN (
        SELECT moz_historyvisits.id,
            moz_places.url AS url
        FROM moz_historyvisits
            JOIN moz_places ON moz_places.id = moz_historyvisits.place_id
    ) AS t2 ON t2.id = moz_historyvisits.from_visit;
""",
    conn,
)

# Unused draft, kept for reference: the first half of a UNION that would also
# return visits whose `from_visit` is not an existing visit id, with a NULL
# `from_url`. It is not part of the query executed above.
#
# SELECT moz_historyvisits.id,
#     moz_places.url as url,
#     moz_historyvisits.'visit_date' as 'visit_date',
#     moz_historyvisits.from_visit as from_visit,
#     NULL AS from_url
# FROM moz_historyvisits
#     JOIN moz_places ON moz_places.id = moz_historyvisits.place_id
# WHERE moz_historyvisits.from_visit NOT IN (
#         SELECT id
#         FROM moz_historyvisits
#     )
# UNION


In [3]:
# Preview the first rows of the freshly loaded visit/referrer table.
full_data.head()


Unnamed: 0,id,url,visit_date,from_visit,from_url
0,5,https://addons.mozilla.org/en-US/firefox/searc...,1685958697019279,4,https://addons.mozilla.org/en-US/firefox/searc...
1,6,https://addons.mozilla.org/en-US/firefox/addon...,1685958699268107,5,https://addons.mozilla.org/en-US/firefox/searc...
2,8,https://firefox.com/,1685958833083090,7,http://firefox.com/
3,9,https://www.mozilla.org/firefox/new/?redirect_...,1685958833572129,8,https://firefox.com/
4,10,https://www.mozilla.org/en-US/firefox/new/?red...,1685958834207886,9,https://www.mozilla.org/firefox/new/?redirect_...


In [4]:
import csv
import re

# Matches an http(s) URL and captures everything between the scheme and the
# first "/" as group 1, provided that part contains at least one dot.
# A port, when present, is part of the captured text.
HOST_PATTERN = re.compile(r"https?:\/\/(([^/]+\.)+[^/]+)")


def extract_host(url):
    """Return the host part of an http(s) URL, or ``pd.NA`` if there is none.

    Parameters
    ----------
    url : str or missing value
        The URL to inspect. Anything that is not a string (``None``, NaN,
        ``pd.NA``) is treated as missing instead of raising.

    Returns
    -------
    str or pd.NA
        Group 1 of ``HOST_PATTERN`` when the URL matches, otherwise ``pd.NA``.
    """
    if not isinstance(url, str):
        return pd.NA
    match = HOST_PATTERN.match(url)
    return pd.NA if match is None else match.group(1)


# Same extraction for both ends of the transition; the pattern is compiled
# once and evaluated once per value.
full_data["from_host"] = full_data["from_url"].map(extract_host)
full_data["to_host"] = full_data["url"].map(extract_host)

# Number of visited URLs for which no host could be extracted.
full_data["to_host"].isna().sum()


661

In [5]:
# Import kept although this cell no longer needs it: cells outside this view
# may still rely on `datetime` being in the namespace.
from datetime import datetime

# `visit_date` holds integer microseconds since the Unix epoch. Converting with
# pandas is vectorised, exact (no float division) and yields naive UTC
# timestamps, so the result no longer depends on the time zone of the machine
# that runs the notebook (datetime.fromtimestamp used local time).
full_data["visit_date_obj"] = pd.to_datetime(full_data["visit_date"], unit="us")


In [6]:
# Reduce the frame to host-level information: remove the raw URL columns,
# mark every row as a single visit, discard rows with any missing value and
# renumber the index from zero.
full_data = (
    full_data.drop(columns=["url", "from_url"])
    .assign(visit_count=1)
    .dropna(axis=0, how="any")
    .reset_index(drop=True)
)

full_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27229 entries, 0 to 27228
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id              27229 non-null  int64         
 1   visit_date      27229 non-null  int64         
 2   from_visit      27229 non-null  int64         
 3   from_host       27229 non-null  object        
 4   to_host         27229 non-null  object        
 5   visit_date_obj  27229 non-null  datetime64[ns]
 6   visit_count     27229 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 1.5+ MB


In [7]:
# Preview the cleaned frame: host columns instead of URLs, no missing values.
full_data.head()


Unnamed: 0,id,visit_date,from_visit,from_host,to_host,visit_date_obj,visit_count
0,5,1685958697019279,4,addons.mozilla.org,addons.mozilla.org,2023-06-05 09:51:37.019279,1
1,6,1685958699268107,5,addons.mozilla.org,addons.mozilla.org,2023-06-05 09:51:39.268107,1
2,8,1685958833083090,7,firefox.com,firefox.com,2023-06-05 09:53:53.083090,1
3,9,1685958833572129,8,firefox.com,www.mozilla.org,2023-06-05 09:53:53.572129,1
4,10,1685958834207886,9,www.mozilla.org,www.mozilla.org,2023-06-05 09:53:54.207886,1


In [8]:
# Every row counts as one visit; summing this column per host pair gives the
# number of observed transitions.
full_data["visits"] = 1
unique_hosts = full_data["from_host"].unique()

# One row per (to_host, from_host) pair with its total number of visits.
final_data = (
    full_data.groupby(["to_host", "from_host"])["visits"]
    .sum()
    .reset_index()
)

final_data.head()


Unnamed: 0,to_host,from_host,visits
0,10.0.0.1:4100,10.0.0.1:4100,1771
1,127.0.0.1:1123,app.tabnine.com,1
2,127.0.0.1:36081,codeium.com,1
3,127.0.0.1:39635,codeium.com,1
4,127.0.0.1:63342,account.jetbrains.com,1


In [9]:
def top_n(host: str, n: int, data=None):
    """Print the ``n`` most frequent destination hosts reached from ``host``.

    Parameters
    ----------
    host : str
        Value of ``from_host`` to select transitions for.
    n : int
        Maximum number of destination hosts to print.
    data : pandas.DataFrame, optional
        Transition table with ``from_host``, ``to_host`` and ``visits``
        columns. Defaults to the notebook-level ``final_data`` frame.

    Returns
    -------
    None
        The result is printed: a ``From: <host>`` header followed by a table
        of ``to_host`` and ``probability``, where probability is the share of
        all visits leaving ``host`` that went to that destination.
    """
    transitions = final_data if data is None else data

    ranked = transitions[transitions["from_host"] == host].sort_values(
        "visits", ascending=False
    )

    total_visits = ranked["visits"].sum()
    # assign() returns a new frame, so the filtered slice is never written to
    # (adding a column to a slice can raise SettingWithCopyWarning).
    ranked = ranked.assign(probability=ranked["visits"] / total_visits)

    res = ranked.head(n)[["to_host", "probability"]]

    print(f"From: {host}\n")
    print(res)
    print("")


# (source host, number of destinations to show) for each report, in order.
for source_host, limit in [
    ("stackoverflow.com", 5),
    ("www.startpage.com", 5),
    ("www.youtube.com", 5),
    ("twitter.com", 15),
    ("github.com", 15),
]:
    top_n(source_host, limit)


From: stackoverflow.com

                to_host  probability
1542  stackoverflow.com       0.3125
595     docs.gitlab.com       0.1875
563   doc.rust-lang.org       0.1250
608        docs.ipfs.io       0.0625
641     docs.traefik.io       0.0625

From: www.startpage.com

                    to_host  probability
1543      stackoverflow.com     0.087546
897              github.com     0.056720
29        access.redhat.com     0.032059
2152         www.reddit.com     0.024661
1954  www.geeksforgeeks.org     0.023428

From: www.youtube.com

                  to_host  probability
2273      www.youtube.com     0.996668
73    accounts.google.com     0.000606
1561   studio.youtube.com     0.000606
1972       www.google.com     0.000606
900            github.com     0.000454

From: twitter.com

             to_host  probability
1703     twitter.com     0.909155
1616            t.co     0.084457
1340   pbs.twimg.com     0.005678
1969  www.google.com     0.000710

From: github.com

              