In [1]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split

data = pd.read_json('./data/youtube_history.json')



In [10]:
data['clean_tags'] = data['tags'].fillna('').apply(lambda x: list(set(x))).apply(lambda x: ' '.join(map(str, x))).apply(lambda x: x.strip())

def remove_duplicates(text):
    # Remove special characters and split the text into words
    words = re.findall(r'\b\w+\b', text)
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word.lower())
    return ' '.join(unique_words)

# Apply the function to the 'tags' column
data['clean_tags'] = data['clean_tags'].apply(remove_duplicates)

data['clean_tags'].head(10)

0                     open ai llm devika source coding
1    esp32 home assistant bluetooth esphome ble esp...
2    t3 programming react programmer typescript ful...
3                                                     
4    open source software free huginn tutorial plat...
5    learn ai openai api tutorial engineer course f...
6    learn ai openai api tutorial engineer course f...
7        bookmarks brave browser export how to in from
8                                                     
9                                                     
Name: clean_tags, dtype: object

In [11]:
data['clean_title'] = data['title'].apply(remove_duplicates)
data['clean_title'].head()

0    build entire apps with a single prompt free op...
1    let s build a room sensor part 1 temperature h...
2      everyone s racing to replace redis who will win
3                    anthropic claude prompt generator
4    huginn free open source automated agents platform
Name: clean_title, dtype: object

In [12]:
# Combine all the text features into a single column
data['text'] = data['clean_title'] + ' ' + data['clean_tags']
data['text']

0       build entire apps with a single prompt free op...
1       let s build a room sensor part 1 temperature h...
2       everyone s racing to replace redis who will wi...
3                      anthropic claude prompt generator 
4       huginn free open source automated agents platf...
                              ...                        
2805    diy synth vcf part 2 active filters resonance ...
2806             smart tmux sessions with zoxide and fzf 
2807    easiest guitar pedal build tutorial 7 minute f...
2808    cheapest budget way for soundproofing a room s...
2809    world s shortest ui ux design course ui design...
Name: text, Length: 2810, dtype: object

In [14]:
# Create a TF-IDF matrix
vectorizer = TfidfVectorizer(
  min_df=3, max_features=None,
  strip_accents='unicode',
  analyzer='word',
  token_pattern=r'\w{1,}',
  ngram_range=(1, 3),
  stop_words='english')
X = vectorizer.fit_transform(data['text'])

feature_names = vectorizer.get_feature_names_out()

X.shape


(2810, 7733)

In [15]:
from sklearn.metrics.pairwise import sigmoid_kernel

sig = sigmoid_kernel(X, X)
sig[0]

array([0.76164846, 0.76159551, 0.76159416, ..., 0.76159765, 0.76159416,
       0.76159454])

In [16]:
indices = pd.Series(data.index, index=data['text']).drop_duplicates()
indices

text
build entire apps with a single prompt free open source devika tutorial open ai llm devika source coding                                                                                 0
let s build a room sensor part 1 temperature humidity and bluetooth esp32 home assistant bluetooth esphome ble esp dht22                                                                 1
everyone s racing to replace redis who will win t3 programming react programmer typescript full stack gg theo web development javascript t3dotgg                                         2
anthropic claude prompt generator                                                                                                                                                        3
huginn free open source automated agents platform open source software free huginn tutorial platform alternative automation elestio overview zapier tool                                 4
                                                            