In [1]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split

def remove_duplicates(text):
    # Remove special characters and split the text into words
    words = re.findall(r'\b\w+\b', text)
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word.lower())
    return ' '.join(unique_words)

data = pd.read_json('./data/youtube_history.json')
data = data.drop(columns=['url', 'visit_count', 'last_visit_time', 'publishedAt', 'description', 'channelTitle', 'channelId'])
data['tags'] = data['tags'].fillna('').apply(lambda x: list(set(x))).apply(lambda x: ' '.join(map(str, x))).apply(lambda x: x.strip())

data['tags'] = data['tags'].apply(remove_duplicates)
data['title'] = data['title'].apply(remove_duplicates)

data['text'] = data['title'] + ' ' + data['tags']

if 'topics' not in data.columns:
	# Add a new column initialized as an empty list
	data['topics'] = [[]] * len(data)
	
if 'topic_words'	not in data.columns:
	# Add a new column initialized as an empty list
	data['topic_words'] = [[]] * len(data)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2810 entries, 0 to 2809
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        2810 non-null   object
 1   tags         2810 non-null   object
 2   text         2810 non-null   object
 3   topics       2810 non-null   object
 4   topic_words  2810 non-null   object
dtypes: object(5)
memory usage: 109.9+ KB


In [2]:
def create_topics(data, topic_count):
  vectorizer = TfidfVectorizer(stop_words='english')
  X = vectorizer.fit_transform(data['text'])
 
  nmf = NMF(n_components=topic_count, random_state=42)
  W = nmf.fit_transform(X)
  H = nmf.components_

  feature_names = vectorizer.get_feature_names_out()
  top_words_list = []
  for topic_idx, topic in enumerate(H):
    topic_words = " ".join([feature_names[i] for i in topic.argsort()[:-21:-1]])
    top_words_list.append(topic_words)
    
  topic_assignments = W.argmax(axis=1)
  top_words = [top_words_list[i] for i in topic_assignments]
  topics = [topic_assignments[i] for i in range(len(topic_assignments))]
  for index, row in data.iterrows():
    row['topics'].append(topics[index])
    row['topic_words'].append(top_words[index])
  

In [3]:
create_topics(data, 5)

In [4]:
data['topics']

0       [4, 4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 2, 4, 3, ...
1       [4, 4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 2, 4, 3, ...
2       [4, 4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 2, 4, 3, ...
3       [4, 4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 2, 4, 3, ...
4       [4, 4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 2, 4, 3, ...
                              ...                        
2805    [4, 4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 2, 4, 3, ...
2806    [4, 4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 2, 4, 3, ...
2807    [4, 4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 2, 4, 3, ...
2808    [4, 4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 2, 4, 3, ...
2809    [4, 4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 2, 4, 3, ...
Name: topics, Length: 2810, dtype: object

In [3]:
# Create a TF-IDF matrix
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['text'])

topic_count = 5

# Use Non-negative Matrix Factorization (NMF) to extract topics
nmf = NMF(n_components=topic_count, random_state=42)
W = nmf.fit_transform(X)
H = nmf.components_

# Get the topic assignments for each row
topic_assignments = W.argmax(axis=1)

# Count the number of rows matched to each topic
topic_counts = [sum(topic_assignments == i) for i in range(topic_count)]

data['first_topic_id'] = topic_assignments

# Print the top words for each topic
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(H):
    topic_words = " ".join([feature_names[i] for i in topic.argsort()[:-21:-1]])
    print(f"Topic[{topic_idx}] {topic_counts[topic_idx]} matches: {topic_words}")

print()
data.info()

Topic[0] 311 matches: fantasy map maps draw drawing cartography mountains tutorial making world building step mpa finish campaign beginner easy start procreate tabletop
Topic[1] 418 matches: comic book conservation age books comics cleaning cgc restoration pressing golden results wet clean label press paper purple removal tutorial
Topic[2] 222 matches: hornet dogfight eagle dcs combat air viper 22 raptor navy tomcat force military army f22 14 war f35 wwii thunder
Topic[3] 847 matches: dnd dragons dungeons dm rpg ttrpg dungeon tabletop game master screen role gaming 5e dice tips roleplaying table pathfinder diy
Topic[4] 1012 matches: web programming programmer software development engineer engineering developer humor design ai reacts prime open source javascript tutorial theo python linux

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2810 entries, 0 to 2809
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   t

In [4]:
# Create a list to store the top words for each topic
top_words_list = []

# Print the items for each topic
for topic_idx, topic in enumerate(H):
    topic_words = [feature_names[i] for i in topic.argsort()[:-21:-1]]
    top_words_list.append(" ".join(topic_words))


data['top_words'] = [top_words_list[i] for i in topic_assignments]

#data.iloc[2805]['top_words']
#data.style.set_properties(**{'text-align': 'left'}).set_table_styles([ dict(selector='th', props=[('text-align', 'left')] ) ])

topic_four = data[data['first_topic_id'] == 4]
topic_four.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1012 entries, 0 to 2809
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           1012 non-null   object
 1   tags            1012 non-null   object
 2   text            1012 non-null   object
 3   first_topic_id  1012 non-null   int64 
 4   top_words       1012 non-null   object
dtypes: int64(1), object(4)
memory usage: 47.4+ KB
