In [5]:
import sqlite3
import json

# Path to the Brave browser's history database
history_db = './data/History'

# Connect to the SQLite database
conn = sqlite3.connect(history_db)
c = conn.cursor()

# Query to fetch browsing history
query = "SELECT url, title, visit_count, last_visit_time FROM urls ORDER BY last_visit_time DESC"
c.execute(query)
results = c.fetchall()

# Export to JSON
with open('./data/brave_history.json', 'w', encoding='utf-8') as f:
    history = [{'url': row[0], 'title': row[1], 'visit_count': row[2], 'last_visit_time': row[3]} for row in results]
    json.dump(history, f, ensure_ascii=False, indent=4)

In [7]:
import dotenv
import os

dotenv.load_dotenv()
google_api_key = os.getenv('GOOGLE_API_KEY')

In [None]:
import json
import requests
from urllib.parse import parse_qs, urlparse

# Load the JSON file
with open('./data/brave_history.json', 'r', encoding='utf-8') as f:
    history = json.load(f)

video_title_url = "https://www.googleapis.com/youtube/v3/videos?part=snippet&id={video_id}&key={google_api_key}"

# Iterate through the history and process YouTube URLs
youtube_history = []
for video in history:
    url = video['url']
    if 'youtube.com/watch' in url:
        try:
            video_id = parse_qs(urlparse(url).query)['v'][0]
        
            video_id = parse_qs(urlparse(video['url']).query)['v'][0]
            #print(f"video_id: {video_id}")
            json_result = requests.get(video_title_url.format(video_id=video_id, google_api_key=google_api_key)).json()
            
            if json_result['items']:
                snippet = json_result['items'][0]['snippet']
                video['title'] = snippet['title']
                
                video['publishedAt'] = snippet['publishedAt']
                video['description'] = snippet['description']
                video['channelTitle'] = snippet['channelTitle']
                video['channelId'] = snippet['channelId']
                video['publishedAt'] = snippet['publishedAt']
                
                if 'tags' in snippet:
                    video['tags'] = snippet['tags']
                youtube_history.append(video)
        except:
            #print(json.dumps(json_result, indent=2))
            continue

# Save the modified entries to a new file
with open('./data/youtube_history.json', 'w', encoding='utf-8') as f:
    json.dump(youtube_history, f, ensure_ascii=False, indent=4)

In [29]:
from langchain_community.llms import Ollama
from langchain.embeddings import OllamaEmbeddings

ollama_url = os.getenv('OLLAMA_URL')
model = OllamaEmbeddings(base_url=ollama_url, model="nomic-embed-text")
embedding = model.embed_documents(['Huginn: Free Open Source Automated Agents Platform open source software open source alternative elestio open source free software free open source software huginn platform huginn tutorial huginn platform overview'])
#print(embedding)

In [80]:
with open('./data/youtube_history.json', 'r', encoding='utf-8') as f:
    videos = json.load(f)

documents = []
    
for video in videos:
    text = video['title']
    if 'tags' in video:
        text += f" {" ".join(video['tags'])}"
    
    documents.append(text)    
    #text = ' '.join(set(text.split()))
    #print(text)
    
embeddings = model.embed_documents(documents)
#print(embeddings)  

In [81]:
from sklearn.cluster import KMeans
import numpy as np
import torch
import random
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import feedparser

In [82]:
X = np.array(embeddings)
kmeans = KMeans(n_clusters=10, random_state=42).fit(X)
#print(kmeans.cluster_centers_)

In [117]:
from dataclasses import dataclass, field
from datetime import datetime
from typing import List

@dataclass
class Article:
    feed_name: str
    title: str
    link: str
    summary: str
    embedding: List[float]
    pub_date: datetime
    updated: datetime
    distance: float = None

    def __init__(self, feed_name: str, title: str, link: str, summary: str, embedding: List[float], pub_date: datetime, updated: datetime):
        self.feed_name = feed_name
        self.title = title
        self.link = link
        self.summary = summary
        self.embedding = embedding
        self.pub_date = pub_date
        self.updated = updated

In [128]:
import html
from bs4 import BeautifulSoup

def clean_text(text):
  text = text.replace('\n', ' ').replace('\r', ' ').strip()
  text = BeautifulSoup(html.unescape(text), 'lxml').text
  return text

def parse_feed(url):
	feed_item = feedparser.parse(url)
	entries = []
	for article in feed_item.entries:
		entries.append(Article(
			feed_name = feed_item.feed.title,
			title = article.get('title', ''),
			link = article.link,
			summary = article.get('summary', ''),
			embedding = model.embed_query(f"{article.get('title', '')} {article.get('summary', '')}".strip()),
			pub_date = datetime.now(),
			updated = datetime.now()
		))
  
	return entries

entries =  parse_feed('https://www.reddit.com/r/selfhosted/rising.rss')
entries += parse_feed("https://www.battleswarmblog.com/?feed=rss2")
entries += parse_feed("https://cafehayek.com/feed")
entries += parse_feed("https://rss.slashdot.org/Slashdot/slashdotMain")
entries += parse_feed("https://www.realclearpolitics.com/index.xml")
entries += parse_feed("https://instapundit.com/feed/")
entries += parse_feed("https://twitchy.com/feed")
entries += parse_feed("https://hnrss.org/frontpage")
entries += parse_feed("https://www.reddit.com/r/selfhosted/top.rss?t=day")



In [126]:
def filter_articles(
    articles: list[Article],
    cluster_centers: np.ndarray,
    filter_ratio: float = 0.5,
    random_ratio: float = 0.1,
) -> list[Article]:
    """Filter out articles according to the user's preferences.

    This function tries to return a list of articles that are most relevant to
    the user's interests. It does so by clustering the read articles and then
    calculating the distance of each passed article to the closest cluster. The
    articles are then sorted by this distance and the top `filter_ratio` fraction
    of articles are returned. A small fraction of random articles are also included
    to add some diversity and allow for discovery of new topics.

    If there aren't enough read articles to form clusters, the original list of
    articles is returned identically.

    Args:
        articles: List of articles to filter.
        read_articles: List of articles that the user has read.
        filter_ratio: Fraction of articles to return (default 0.5).
        random_ratio: Fraction of random articles to include (default 0.1).

    Returns:
        List of articles sorted by relevance
    """
    random.Random(42).shuffle(articles)
    n_random = int(len(articles) * random_ratio)
    random_articles = articles[:n_random]
    del articles[:n_random]

    articles_embeddings_list = [article.embedding for article in articles if article.embedding]

    if not articles_embeddings_list:
        print("No embeddings found for articles. Returning articles as is.")
        return articles
    # Calculate distance of each passed article to the closest cluster
    articles_embeddings = np.array(articles_embeddings_list)
    distances = cdist(articles_embeddings, cluster_centers, metric="cosine")
    print(distances)
    min_distances = distances.min(axis=1)

    # Sort articles by distance to the closest cluster
    sorted_articles_with_distance = sorted(
        zip(min_distances, articles), key=lambda x: x[0]
    )
    
    for distance, article in sorted_articles_with_distance:
        article.distance = distance
    
    sorted_articles = [article for _, article in sorted_articles_with_distance]

    # Filter out articles based on the filter_ratio
    num_to_filter = int(len(sorted_articles) * filter_ratio)
    print(num_to_filter)
    return sorted(
        sorted_articles[:num_to_filter] + random_articles,
        key=lambda x: x.pub_date or x.updated,
        reverse=True,
    )

In [None]:
filtered_articles = filter_articles(
            articles=entries,
            cluster_centers=kmeans.cluster_centers_,
            filter_ratio=0.5,
            random_ratio=0,
        )

for article in filtered_articles:
	print(f"{article.distance} - {article.feed_name} - {article.title}")

In [129]:


def calculate_article_distances(kmeans, articles):
    # Extract the embeddings from the articles
    embeddings = [article.embedding for article in articles]
    X = np.array(embeddings)

    # Calculate the distances from each article embedding to the KMeans centers
    distances = []
    for embedding in X:
        min_distance = float('inf')
        for center in kmeans.cluster_centers_:
            distance = 1 - np.dot(embedding, center) / (np.linalg.norm(embedding) * np.linalg.norm(center))
            min_distance = min(min_distance, distance)
        distances.append(min_distance)

    # Set the distance attribute for each article
    for i, article in enumerate(articles):
        article.distance = distances[i]

    # Sort the articles by the calculated distances
    sorted_articles = sorted(articles, key=lambda x: x.distance)

    return sorted_articles

In [None]:
filtered_articles = calculate_article_distances(kmeans, entries)


for article in filtered_articles:
	print(f"{article.distance} - {article.feed_name} - {article.title}")

In [136]:
! pip install pandas matplotlib nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2023.12.25-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tqdm (from nltk)
  Downloading tqdm-4.66.2-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading regex-2023.12.25-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (789 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m789.1/789.1 kB[0m

In [137]:
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import json
from ast import literal_eval

import requests

from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer

import numpy as np
import sklearn.metrics.pairwise as pw

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import scipy

import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [138]:
tfidf = TfidfVectorizer(stop_words='english')

In [156]:
data = pd.DataFrame(documents)
tfidf_matrix = tfidf.fit_transform(data[0])

# Output the shape of tfidf_matrix
print(tfidf_matrix.shape)
# print(tfidf.get_stop_words())

# count vectorize
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(data[0])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix, True)
# display(cosine_sim.shape)
# display(cosine_sim)

(2810, 12324)


In [167]:
# Construct a reverse map of indices and movie titles
# Reset index of your main DataFrame and construct reverse mapping as before
metadata = data.reset_index()
# indices = pd.Series(metadata.index, index=metadata['title'])
indices = pd.Series(metadata.index, index=metadata[0]).drop_duplicates()
# print(indices.shape)
# display(indices[:10])


(2810,)


0
Build ENTIRE Apps With A Single Prompt - FREE Open-Source Devika Tutorial devika ai coding open ai llm ai open source                                                                                                                                                                                                                     0
Let's build a room sensor - Part 1 - Temperature, Humidity, and Bluetooth ESPHome DHT22 ESP ESP32 Home Assistant Bluetooth BLE                                                                                                                                                                                                            1
Everyone's Racing To Replace Redis - Who Will Win? web development full stack typescript javascript react programming programmer theo t3 stack t3 t3.gg t3dotgg                                                                                                                                                                           2
An

In [161]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, indices, cosine_sim, data):
    # Get the index of the article that matches the title
    idx = indices[title]
    # print(idx)
    # return 0
    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    # print(sim_scores)
    # return 0
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # print(sim_scores)
    # return 0
    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
    # print(sim_scores)
    # return 0
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    # print(movie_indices)
    # return 0
    # Return the top 10 most similar movies
    return data[0].iloc[movie_indices]

In [164]:
print(get_recommendations('The Rise And Growth of Ethereum Gets Mainstream Coverage', indices, cosine_sim, metadata))

KeyError: 'The Rise And Growth of Ethereum Gets Mainstream Coverage'

In [None]:
# print(get_recommendations('Google Data Center 360° Tour', indices, cosine_sim, metadata))
# print(get_recommendations('Intel\'s internal IoT platform for real-time enterprise analytics', indices, cosine_sim, metadata))