#### Setup

##### Install Packages

In [None]:
!pip install --quiet -U git+https://github.com/scikit-learn-contrib/hdbscan.git@ccd8535d3db241398afa9299cd279c4cd85133f5

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
  Building wheel for hdbscan (PEP 517) ... [?25l[?25hdone


In [None]:
!pip install --quiet -U transformers==4.20.1

[K     |████████████████████████████████| 4.4 MB 24.0 MB/s 
[K     |████████████████████████████████| 6.6 MB 55.1 MB/s 
[K     |████████████████████████████████| 163 kB 73.6 MB/s 
[?25h

In [None]:
!pip install --upgrade --upgrade-strategy only-if-needed --quiet -U bertopic

[K     |████████████████████████████████| 90 kB 1.8 MB/s 
[K     |████████████████████████████████| 85 kB 4.1 MB/s 
[K     |████████████████████████████████| 636 kB 53.3 MB/s 
[K     |████████████████████████████████| 88 kB 6.8 MB/s 
[K     |████████████████████████████████| 1.3 MB 25.4 MB/s 
[K     |████████████████████████████████| 1.1 MB 54.8 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone


In [None]:
!pip list | grep transformers

sentence-transformers         2.2.2
transformers                  4.20.1


##### Imports

In [None]:
import os
import re
import time
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Progress bar for pandas
from tqdm.autonotebook import tqdm
tqdm.pandas()

  


In [None]:
# Interactive cell for pandas
# https://youtu.be/rNgswRZ2C1Y
from google.colab import data_table

data_table.enable_dataframe_formatter()

##### Download Full Dataset

In [1]:
# !gdown https://drive.google.com/uc?id=1pXY7Tp6hArE6j7C0M7E-OXUXAkzevotG
import gdown
gdown.download_folder(id="1pXY7Tp6hArE6j7C0M7E-OXUXAkzevotG", quiet=True, use_cookies=False)

['/content/hackernews/hackernews-2019-2022-sessions.csv',
 '/content/hackernews/hackernews-data-from-phone.json',
 '/content/hackernews/hackernews-since-20221016.json',
 '/content/hackernews/hackernews-stories-since-2018.csv',
 '/content/hackernews/hackernews-stories-since-2022-10-14.csv',
 '/content/hackernews/hackernews-urls-from-browser-deduplicated.csv',
 '/content/hackernews/models/20221028',
 '/content/hackernews/tests/test_data_with_topics.csv',
 '/content/hackernews/tests/test_data.csv',
 '/content/hackernews/tests/test_set_with_topics.csv',
 '/content/hackernews/tests/test_set.csv',
 '/content/hackernews/tests/validation_set_with_topics.csv',
 '/content/hackernews/tests/validation_set.csv']

In [2]:
""" Careful!
One weird aspect of Colab's env is that it somehow has different folder structure
for the CPU and GPU runtime when downloading via gdown. The CPU runtime flatten
your folder, while the GPU runtime preserves the folder.

If there's ever error when loading from path, that might the cause
"""

" Careful!\nOne weird aspect of Colab's env is that it somehow has different folder structure\nfor the CPU and GPU runtime when downloading via gdown. The CPU runtime flatten\nyour folder, while the GPU runtime preserves the folder.\n\nIf there's ever error when loading from path, that might the cause\n"

In [3]:
# raw data
browser = '/content/hackernews/hackernews-urls-from-browser-deduplicated.csv'
phone = '/content/hackernews/hackernews-data-from-phone.json'
stories_2018 = '/content/hackernews/hackernews-stories-since-2018.csv'
stories_20221014 = '/content/hackernews/hackernews-stories-since-2022-10-14.csv'
session_since_2019 = '/content/hackernews/hackernews-2019-2022-sessions.csv'
session_since_20221016 = '/content/hackernews/hackernews-since-20221016.json'

In [4]:
# dataset
validation_set_path = '/content/hackernews/tests/validation_set.csv'
test_set_path = '/content/hackernews/tests/test_set.csv'
test_data_path = '/content/hackernews/tests/test_data.csv'
model_path = '/content/hackernews/models/20221028'

In [5]:
# dataset + feature
validation_set_with_topics_path = '/content/hackernews/tests/validation_set_with_topics.csv'
test_set_with_topics_path = '/content/hackernews/tests/test_set_with_topics.csv'
test_data_with_topics_path = '/content/hackernews/tests/test_data_with_topics.csv'

#### Data

##### Sample Data

In [None]:
# Sample HN titles
samples = [
    "I would like a job writing Haskell",
    "Hybrid recommender systems to improve recommendations for sparse datasets",
    "How poverty changes your mindset",
    "Rust in 2022",
    "PostgreSQL 14",
    "Improved distributed algorithms for fundamental graph problems (2017)",
    "Ask HN: What bits of fundamental knowledge are productivity multipliers?",
    "A first lesson in meta-rationality",
    "Django Newbie Mistakes",
    "Ask HN: Which are the best Go repositories to read to learn the language?",
    "Postgres full-text search: A search engine in a database (2021)",
    "Citybound – A city building game using actor-based distributed simulation",
    "BERTopic: The Future of Topic Modeling",
    "When to use memory safe languages",
    "Being OK with not being extraordinary",
    "TikTok reveals details of how its algorithm works",
    "A general overview of what happens before main() (2019)",
    "Becoming a Centaur",
    "Query serving systems: An emerging category of data systems",
    "Rust – A hard decision pays off ",
    "It's now your fault they don't know about it",
    "Deconstructing the Postgres planner to find indexing opportunities",
    "Ask HN: Why should I use Django?",
    "Common Infrastructure Errors I've Made",
    "Discover the best developer blogs on any tech stack",
    "Ask HN: Good C++ code bases to read?",
    "Ask HN: What is the SQLite of nosql databases?",
    "Building Fast Interpreters in Rust",
    "Ask HN: I suck at math, where to start?",
    "What Every Programmer Absolutely, Positively Needs To Know About Encodings (2011)",
    "Ask HN: Tools to visualize data in SQL databases?",
    "Ask HN: Mind bending books to read and never be the same as before?",
    "Ask HN: Does anybody still use bookmarking services?",
    "Patent Trolls Inbound: Our First Lawsuit",
    "Command line tools for productive programmers",
    "Ask HN: Have you ever switched cloud?",
    "Reliably Send an HTTP Request as a User Leaves a Page",
    "Fly.io: The reclaimer of Heroku's magic",
    "Ethereum Energy Consumption",
    "The Unreasonable Effectiveness of Makefiles",
    "The new silent majority: People who don't tweet",
    "On YouTube’s recommendation system",
    "I think I know why you can't hire engineers right now",
    "Wealth Inequality Is Even Worse in Reputation Economies",
    "Wealth isn’t created at the top, it is merely devoured there",
    "What ORMs have taught me: just learn SQL (2014)",
    "Bits of advice I wish I had known",
    "Show HN: I made a book with a hundred UI/UX tips",
    "Implementing a toy version of TLS 1.3",
    "How to take credit for someone else's work on GitHub",
    "I analyzed 20k recommendations made by Jim Cramer during the last 5 years",
    "In Defense of OpenStreetMap's Data Model",
    "Aging programmer",
    "GitHub Actions: Organization secrets",
    "Ask HN: How to improve as a struggling junior software engineer?",
    "Excel is pretty dang cool",
    "Choose Boring Technology",
    "Ask HN: What are you working on?",
    "Non-Obvious Docker Uses",
]

samples_new = [
    "Curl doesn't add libproxy due to its quality issues",
    "Eglot has landed on master: Emacs now has a built-in LSP client",
    "Space-based solar power is getting serious",
    "Is Anyone Else Tired of the Self Enforced Limits on AI Tech?",
    "A chill driving game with procedurally generate scenic landscapes",
    "Google has most of my email because it has all of yours",
    "I made a CMS that uses Git to store your data",
    "On Bruno Latour (1947–2022): The world was his laboratory",
    "Pocket Casts goes open source",
    "Pouring the 200 inch disk at Corning Glass Works",
    "C2Rust Transpiler",
    "On finally learning to program at the age of 40",
    "A Quantum Computer by Blasting It with the Fibonacci Sequence",
    "Epoll is fundamentally broken",
    "Remote working is exacerbating the urban-rural divide in the digital market",
    "Pure Data as a plugin, with a new GUI",
    "Oidbs: An Open Source MQTT Driven Benchmark Suite for IoT Data",
    "A History of Palomar Observatory",
    "I've Built a DHT Torrent Sniffer and Search Engine. Should I Release?",
    "The Wire retracts its Meta stories",
    "Differential mosquito attraction to humans is associated with skin acid levels",
    "The Mysterious Patient in Room 23: The Hermit Baroness",
    "A 'screenless smartphone'",
    "H.D.’S Art of Failure",
    "Eureka Finding the key to ancient Egypt",
    "News Music Search Archive",
    "A Firewall for Internet of Things",
    "I was tired of being a perfectionist so I built an app within 24 hours",
    "What happened when my wife died",
]

##### Actual Data

In [None]:
df_2018 = pd.read_csv(stories_2018,
                      usecols=['title', 'url', 'id', 'timestamp_formatted'],
                      parse_dates=['timestamp_formatted'])
df_2018 = df_2018.rename(columns={"timestamp_formatted": "ts"})
df_2018.info()

#### Exploration & Cleaning

In [None]:
""" Notes for future me
(1) Exploration are done using method chaining for fast iteration on the data
    (data are transformed and displayed without actually stored, signal options)

(2) Several approach I've just discovered might look weird, but somehow works:
    - Using `.head(n)` makes sure I'm operating on the same sample of the data
    - Chaining `apply` happens because I can't preserve values as columns
      (I wonder if this indicate the need to just create a function?
       but, what if it's an apply to avoid errors?)
    - The logic inside the `lambda` are frequently "swapped" based on whether I'm
      operating on dirty or clean data. For example, when exploring dirty data,
      I use "True if" to filter correct values and drop them with `dropna` because I
      want to see my transformation on the dirty columns. But when I finished
      cleaning, I have to "swap" the logic back to "else True" so that dirty data
      are the ones that is actually dropped (in this case I preserve the indexes)
      Variations of these patterns arise in the arithmetic comparison too

      example when cleaning one-word title:
        exploring dirty data: `True if len(sentence) == 1 else None`  # True and None
        keeping clean data  : `None if len(sentence) == 1 else True`  # is swapped

(3) Anyway, this data is MUCH. MORE DIRTY than I'd expected. Seriously didn't expect
    to spend this much time wrangling :/ really, bless community upvotes I guess
    (now that I thought about it, maybe the "dirty" patterns arises in the number
     of upvotes, kids, or empty urls? now I regret didn't get all the data hmm)
"""

##### Duplicate Titles :)

In [None]:
df_2018.shape

(1726813, 4)

In [None]:
(
    df_2018["title"].value_counts()[:30]
)

Y Combinator: Bookmarklet                                                 1558
Books for kids (death and bullying)                                        262
Test                                                                       119
WP Paint – WordPress Image Editor Free Version                             105
Abortion Is Murder (Available on Spotify and iTunes  By, Evon Latrail)      98
ghost                                                                       97
Keybase                                                                     83
Referral URL                                                                74
GraphQL with NodeJs: From Beginner to Advanced Concepts                     67
React Native Debug Tool                                                     60
Super Affiliate System                                                      59
Stowaway – Multi-hop Proxy Tool for pentesters                              58
Check Out 'One Punch Man – Home Workout'            

In [None]:
df_2018 = (
            df_2018.drop_duplicates(subset=['title'])
                   .reset_index(drop=True)
)

In [None]:
df_2018.shape

(1564079, 4)

##### One-word Titles

In [None]:
# Identify what "one-word" title seems to be
# spoiler: it's trash
f = (
    df_2018["title"]
        .head(500)
        .str
        .split()
        .apply(lambda x: len(x) if type(x) == list else True)  # weird float error :/
        .apply(lambda x: None if x > 1 else True)
        .dropna()
        .index
)
f

Int64Index([5, 12, 70, 131, 148, 236, 285, 339, 411, 445], dtype='int64')

In [None]:
df_2018.loc[f, 'title']

5                                                 Solve
12                                        SuperbowlEVE/
70                                                  Kek
131                                              Crypto
148                     -kazakhstan-u20-vs-usa-u20-live
236                                                 Aho
285    &#20154;&#29983;&#23601;&#26159;&#36825;&#26679;
339                                            Captable
411                                                Fwef
445                                               Nabil
Name: title, dtype: object

In [None]:
preserved_indexes = (
                        df_2018["title"]
                            .str.split()
                            .apply(lambda x: len(x) if type(x) == list else True)
                            .apply(lambda x: None if x == 1 else x)
                            .dropna()
                            .index
)
preserved_indexes[:15]

Int64Index([0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16], dtype='int64')

In [None]:
df_2018 = df_2018.loc[preserved_indexes].copy().reset_index(drop=True)
df_2018.shape

(1548517, 4)

##### Job postings in Titles

In [None]:
# Identify wrong type column, e.g. job postings that are posted on story
# typically has several dashes (minimum 3) on its title
# e.g. Company — Sites or Roles — Status — Other Criteria
#
# examples
"""
'Anyroad – Https://www.anyroad.com – Full-Time – San Francisco, CA ONSITE',
'Quobyte – Santa Clara, CA and Berlin, Germany – Full-Time – Onsite',
'Wyre – Senior Full Stack Engineers – Onsite – San Francisco, CA – $100-120k',
'BCG Gamma – Senior Software Engineer – Paris or London – Full-Time – Onsite',
'Ask HN: Best way to prepare for an Onsite interview at a Startup?',
'Tabella – Ethical Hacker – Full-Time – Onsite – Prague, Czechia (EU)',
'Sama AI – Sr Software Developer (Front-End) – Onsite (Mtl) or Remote – Full Time',
'Olo – Multiple Jobs Available (see Below) – Full-Time| REMOTE or ONSITE (NYC)'
"""

f = (
        df_2018['title']
            .head(5000)
            .str.lower()
            .str.count('–')
            .apply(lambda x: None if x < 2 else True)
            .dropna()
            .index
)
df_2018.head(5000).loc[f, 'title'].values

array(['Anyroad – Https://www.anyroad.com – Full-Time – San Francisco, CA ONSITE',
       'Revved – Engineer – Cofounder – Delhi/NCR, India',
       'We Are Hiring – Earn Rs.15000/- per Month – Simple Copy Paste Jobs',
       'Twitch (Amazon) – Software Engineers – San Francisco or Remote (US/Europe) – FTE',
       'Quobyte – Santa Clara, CA and Berlin, Germany – Full-Time – Onsite',
       'Maurice – Open Beta – Serverless Load Testing Tool',
       'AWS Blog Posts – Different Authors – Same Content?',
       'Yocan Evolve Plus – Yocan Evolve Plus Review – Justchillglass $39.99',
       'SmartKarma – Singapore| Full-Time Contract – React Native Developer',
       'Qliiq.com – Save Your Bookmarks in Qliiq.com – Seek for Feedback',
       'Wyre – Senior Full Stack Engineers – Onsite – San Francisco, CA – $100-120k',
       'BCG Gamma – Senior Software Engineer – Paris or London – Full-Time – Onsite',
       'BeFit Keto Cut– {Update 2020 Reviews} – Ingredients|Prise|Does It Work',
      

In [None]:
# extract clean indexes
preserved_indexes = (
        df_2018['title']
            .str.count('–')
            .apply(lambda x: None if x > 1 else True)
            .dropna()
            .index
)
df_2018.loc[preserved_indexes, 'title'].values

array(['Ask HN: Strategies to improve after finishing a product',
       "Ask HN: What to do with 10% of UK LTD i can't work at anymore",
       'Ask HN: What can I include in an effective communication workshop?',
       ...,
       "Global Forest Watch – Discover the world's forests through data",
       'Search Google inside sheets and excel',
       'Gov trying to ban magnet balls again'], dtype=object)

In [None]:
df_2018 = df_2018.loc[preserved_indexes].copy().reset_index(drop=True)

In [None]:
df_2018.shape

(1542775, 4)

##### Links in Title

In [None]:
# Identify trash link in the title, from "http" keyword
f = (
    df_2018['title'].head(1000)
                    .str.lower()
                    .str.contains('https?:\s*')
                    .values
)
df_2018.head(1000).loc[f].title.values

array(['HTTP: //wintersupplement.com/smart-blood-sugar-book/',
       'HTTP: //Www.zzlcdz.com/',
       'HTTP: //Wintersupplement.com/Vital-Xl/',
       'HTTPS: //Keto4us.org/Trubodx-Keto/',
       'HTTPS: //Buddysupplement.com/Biorexin/',
       'HTTPS: //www.annews24.com/top-10-most-expensive-watches-in-the-world/',
       'Essence CBD Oil HTTPS://buddysupplement.com/essence-cbd-oil-au/',
       'Visit Here HTTPS://djsupplement.com/empowered-boost-testosterone/',
       'HTTPS: //Djsupplement.com/Keto-Nutrition/',
       'Buy HTTPS://www.facebook.com/Trim-Life-Keto-101398769051708'],
      dtype=object)

In [None]:
# Invert the match to get clean values
f = (
    df_2018['title'].str.lower()
                    .str.contains('https?:\s*')
                    .values
)
df_2018 = df_2018.loc[~f].copy()

In [None]:
df_2018.shape

(1541923, 4)

##### Non-stories ([Something] HN)

In [None]:
# explore dirty titles
(
    df_2018['title'].head(50)
                    .str
                    .split("HN: ")
                    .apply(lambda x: x[0].strip())
                    .value_counts()
)

Ask                                                                                37
We need help with all aspects of marketing/ Contract help needed                    1
When PMF meets market: $2.7M ARR ACHIEVED                                           1
Tell                                                                                1
Corporation Warfare-Protonmail cyberattack sponsored by states and corporations     1
The impact of Airbnb on residential real estate price rises                         1
Show                                                                                1
It's Official. GitHub is slow                                                       1
Aspiring YC founder – Can you get into YC without a technical cofounder?            1
Looking for Job/Work Contract (Web Developer Javascript/PHP)                        1
What podcast hosting provides the best analytics?                                   1
What is the state of nanotech?                        

In [None]:
# clean the titles
(
    df_2018['title'].head(20)
                    .str
                    .split("HN: ")
                    .apply(lambda x: x[-1])  # take the last one
)

0       Strategies to improve after finishing a product
1     What to do with 10% of UK LTD i can't work at ...
2     What can I include in an effective communicati...
3     We need help with all aspects of marketing/ Co...
4           Cost Effective Options for Email Marketing?
5     Should I sell equity in a past startup on the ...
6     How to get the flexibilty of containers withou...
7     Why didn't the early web support more client s...
8     Something like Khan Academy but full curriculu...
9        What's the most fun tradition at your startup?
10            When PMF meets market: $2.7M ARR ACHIEVED
11    Programming book recommendations for autistic ...
12            Enterprise Software Developers in London?
13    Have you used Adderall or any other similar drug?
14         Why landline make crazy noise while hung up?
15    Where to Learn Kubernetes, Ansible and Terraform?
16    Banned site-wide from Reddit for helping a fel...
17    Is it possible to donate small amounts to 

In [None]:
df_2018['title'] = df_2018['title'].str.split("HN: ").apply(lambda x: x[-1])

##### Last Word [Year] and [PDF]

In [None]:
dirty_last_word_examples = ['A First Lesson in Econometrics (1970) [pdf]',
                    'Monarch: Google’s Planet-Scale In-Memory Time Series Database [pdf]',
                    'High System Load with Low CPU Utilization on Linux? (2020)']

In [None]:
# examples of cleaning dirty last word
pat = r'\s\(\d+\)|\s\[pdf\]'

(
    df_2018
        .loc[
            df_2018['title'].isin(dirty_last_word_examples),
            'title'
        ]
        .str
        .replace(pat, '')
        .values
)

  # Remove the CWD from sys.path while we load stuff.


array(['Monarch: Google’s Planet-Scale In-Memory Time Series Database',
       'High System Load with Low CPU Utilization on Linux?',
       'A First Lesson in Econometrics'], dtype=object)

In [None]:
df_2018['title'] = df_2018['title'].str.replace(pat, '', regex=True)

##### Non-English language

In [None]:
"""
Don't think I'll be using this for now because it might not work properly
with short texts according to the docs, which can be seen below, and my
tokenizer is probably good enough at this point (could recognize HN stuffs)
"""

In [None]:
f = (
    df_2018['title'].head(100)
        .apply(detect)
        .apply(lambda x: None if x == 'en' else 1)
        .dropna()
        .index
)
f

Int64Index([10, 12, 49, 59, 70, 73, 81, 87, 94], dtype='int64')

In [None]:
df_2018.loc[f, 'title']

10            When PMF meets market: $2.7M ARR ACHIEVED
12            Enterprise Software Developers in London?
49                                        Proposition 3
59                                   I Need Help Urgend
70                            Best real time data sets?
73    Oklahoma Sooners vs Alabama Crimson Tide Live ...
81                    Where do software engineers work?
87                                           Dark Mode?
94    (Directo-TV) Barcelona vs. Villarreal E.n Directo
Name: title, dtype: object

In [None]:
(
    df_2018
        .loc[f, 'title']
        .apply(detect_langs)
)

10                            [nl:0.9999972304748908]
12     [nl:0.5714292844563275, en:0.4285706111707865]
49      [en:0.5696024580914569, it:0.430396772621685]
59                            [nl:0.9999928553169629]
70     [ca:0.857141055642147, it:0.14285861979990916]
73                            [et:0.9999932651771479]
81    [af:0.8571386537057769, en:0.14285874783337843]
87     [da:0.8570648697305963, de:0.1429344431604382]
94                            [es:0.9999932855827895]
Name: title, dtype: object

#### All Cleaning Steps

In [None]:
"""
(1)
remove -> remove entries (view)
clean  -> mutates the data

(2)
I don't remove special characters because the tokenizer can handle it and it's
actually used in titles, e.g. "C++" would be "C" if special characters are removed:)
Also words are kept in their original case (I expect the tokenizer can handle that as well)
"""

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()

In [None]:
# All cleaning functions
def remove_one_word(title: pd.Series) -> pd.Series:
    f = (
        title
         .str
         .split()
         .apply(lambda x: len(x) if type(x) == list else 1)  # avoid weird float error :/
         .apply(lambda x: False if x == 1 else True)
    )
    return title.loc[f]
    
def remove_job_postings(title: pd.Series) -> pd.Series:
    f = (
        title
         .str
         .count('–')
         .apply(lambda x: False if x > 1 else True)  # risk losing some title, but yeh
    )
    return title.loc[f]

def remove_links(title: pd.Series) -> pd.Series:
    f = (
        title
         .str
         .lower()
         .str
         .contains('https?:\s*')
         .values
    )
    # invert bool filters because "regex aren't really for negative matching"
    return title.loc[~f]

def clean_non_stories(title: pd.Series) -> pd.Series:
    return title.str.split("HN: ").apply(lambda x: x[-1])

def clean_last_word_year_and_pdf(title: pd.Series) -> pd.Series:
    pat = r'\s\(\d+\)|\s\[pdf\]'
    return title.str.replace(pat, '', regex=True)

def lemmatize(title: pd.Series, lemmatizer=lemmatizer) -> pd.Series:
    return (
        title.str.split(" ")
             .apply(lambda row_list: [lemmatizer.lemmatize(word) for word in row_list])
             .apply(lambda x: " ".join(x))
    )

def finalizes(title: pd.Series) -> pd.Series:
    """finalizes the cleaning process by getting the copy of the passed-around view"""
    return title.copy().reset_index(drop=True)

In [None]:
# Reload
df_2018 = pd.read_csv(stories_2018,
                      usecols=['title', 'url', 'id', 'timestamp_formatted'],
                      parse_dates=['timestamp_formatted'])
df_2018 = df_2018.rename(columns={"timestamp_formatted": "ts"})
df_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1726813 entries, 0 to 1726812
Data columns (total 4 columns):
 #   Column  Dtype              
---  ------  -----              
 0   title   object             
 1   url     object             
 2   id      int64              
 3   ts      datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), int64(1), object(2)
memory usage: 52.7+ MB


In [None]:
titles = (df_2018
          ['title']
          .drop_duplicates()
          .pipe(remove_one_word)
          .pipe(remove_job_postings)
          .pipe(remove_links)
          .pipe(clean_non_stories)
          .pipe(clean_last_word_year_and_pdf)
          .pipe(lemmatize)
          .pipe(finalizes)
)
titles

0            Strategies to improve after finishing a product
1          What to do with 10% of UK LTD i can't work at ...
2          What can I include in an effective communicati...
3          We need help with all aspects of marketing/ Co...
4                Cost Effective Options for Email Marketing?
                                 ...                        
1541918    Association between naturally occurring lithiu...
1541919    Gene therapy cuts fat and builds muscle in sed...
1541920    Global Forest Watch – Discover the world's for...
1541921                Search Google inside sheets and excel
1541922                 Gov trying to ban magnet balls again
Name: title, Length: 1541923, dtype: object

In [None]:
corpus_sentences = titles.values

##### Dump & Reload Corpus

In [None]:
# Dump Corpus
import pickle
with open("corpus_sentences.pkl", "wb") as f:
    pickle.dump({'corpus_sentences': corpus_sentences}, f)

In [None]:
# Reload Corpus
import pickle
with open("corpus_sentences.pkl", "rb") as f:
    cached_data = pickle.load(f)
    corpus_sentences = cached_data['corpus_sentences']

#### Modeling

##### BERTopic

In [None]:
# Watch out for memory issues
# https://maartengr.github.io/BERTopic/faq.html#how-can-i-speed-up-bertopic

In [None]:
from bertopic import BERTopic
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA
from bertopic.vectorizers import OnlineCountVectorizer

In [None]:
# Batches
umap_model = IncrementalPCA(n_components=200)
cluster_model = MiniBatchKMeans(n_clusters=300, random_state=0)
vectorizer_model = OnlineCountVectorizer(stop_words="english", decay=.01)

In [None]:
# Prepare model
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=cluster_model, 
    vectorizer_model=vectorizer_model, 
    n_gram_range=(1, 2),  # this doesn't seem to work
    verbose=True
)

In [None]:
# Split list into N equal length
# https://stackoverflow.com/a/2135920/8996974
def split(a, n):
    k, m = divmod(len(a), n)
    return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n))

In [None]:
# 100K split is processed in 20 minutes! That's too long!!
# Let's try splitting it to smaller parts instead
N = 8
doc_chunks = split(corpus_sentences, N)

In [None]:
topics = []
for chunk_no, docs in doc_chunks:
    print(f"Processing chunk no: {chunk_no}")
    start_time = time.time()

    topic_model.partial_fit(docs)
    topics.extend(topic_model.topics_)

    print("--- {} minutes ---".format((time.time() - start_time) / 60))
topic_model.topics_ = topics

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/6024 [00:00<?, ?it/s]

2022-10-26 13:56:03,334 - BERTopic - Reduced dimensionality


In [None]:
""" Give name to the model (manual experiment)
Example:
    model_no = '10'
    model_name = '10-lemmatized-BERTopic-ipca90-batchkmean300-8N'
"""
model_no = ''
model_name = ''

if not model_no or not model_name:
    raise Exception(f"Model can't be saved because model_no and model_name is not set")
topic_model.save(f'/kaggle/working/{model_no}/{model_name}')

##### Visualizing Results

In [None]:
from bertopic import BERTopic

model_name = ''

if not model_name:
    raise Exception(f"Model can't be loaded because model_name is not set")
topic_model = BERTopic.load(model_name)

In [None]:
freq = topic_model.get_topic_info(); print(freq.to_markdown())

In [None]:
(
    freq.sort_values('Count', ascending=False)
        .head(20)
)

**Attributes**

There are a number of attributes that you can access after having trained your BERTopic model:


| Attribute | Description |
|------------------------|---------------------------------------------------------------------------------------------|
| topics_               | The topics that are generated for each document after training or updating the topic model. |
| probabilities_ | The probabilities that are generated for each document if HDBSCAN is used. |
| topic_sizes_           | The size of each topic                                                                      |
| topic_mapper_          | A class for tracking topics and their mappings anytime they are merged/reduced.             |
| topic_representations_ | The top *n* terms per topic and their respective c-TF-IDF values.                             |
| c_tf_idf_              | The topic-term matrix as calculated through c-TF-IDF.                                       |
| topic_labels_          | The default labels for each topic.                                                          |
| custom_labels_         | Custom labels for each topic as generated through `.set_topic_labels`.                                                               |
| topic_embeddings_      | The embeddings for each topic if `embedding_model` was used.                                                              |
| representative_docs_   | The representative documents for each topic if HDBSCAN is used.                                                |

For example, to access the predicted topics for the first 10 documents, we simply run the following:

In [None]:
# Select the most frequent topic
topic_no = 0  # fill this
topic_model.get_topic(topic_no)

In [None]:
topic_model.topics_[:10]

In [None]:
topic_model.visualize_topics()

In [None]:
# If probabilities are calculated, this cell can be run
# topic_model.visualize_distribution(probs[200], min_probability=0.015)

In [None]:
topic_model.visualize_hierarchy(top_n_topics=100)

In [None]:
topic_model.visualize_barchart(top_n_topics=10)

In [None]:
topic_model.visualize_heatmap(n_clusters=100, width=1000, height=1000)

In [None]:
example_sentence = "Rust – A hard decision pays off "
similar_topics, similarity = topic_model.find_topics(example_sentence, top_n=5)
print(similar_topics)
topic_model.get_topic(similar_topics[0])

#### Picking the Best Model

In [None]:
from collections import defaultdict

def _avg_topics(topics):
    """A reduce operation taking average value of the key"""
    result = {}
    for (topic, rate) in topics:
        if topic in result:
            result[topic] += rate
            result[topic] /= 2
            continue
        result[topic] = rate
    return result

def find_topics(sentence, topic_model=topic_model):
    """Find the main topic of a sentence and return 10 most representative words"""
    topics = []

    # Use a multiplier calculation so that lower-ranked topics got fewer influence
    multiplier = 1
    rate = .5
    decay = .1

    similar_topics, similarity = topic_model.find_topics(sentence, top_n=1)
    for st in similar_topics:
        ts = topic_model.get_topic(st)
        ts = [(t1, t2 * multiplier) for (t1, t2) in ts]
        topics.extend(ts)
        multiplier *= rate
        rate -= decay

    topics = _avg_topics(topics)
    return sorted(topics.items(), key=lambda item: item[1], reverse=True)[:10]

In [None]:
# Example, testing the model
from bertopic import BERTopic

topic_model = BERTopic.load(model_path)
for sample in samples[:5]:
    print(sample)
    print(find_topics(sample, topic_model))
    print('-------------')

##### Calculating Coherence Score

In [None]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

In [None]:
# https://stackoverflow.com/a/27737385/8996974
from functools import wraps

def timing(f):
    @wraps(f)
    def wrap(*args, **kw):
        ts = time.time()
        result = f(*args, **kw)
        te = time.time()
        print(f"func: {f.__name__} args: [{args}, {kw}] took: {te-ts:2.4f} sec")
        return result
    return wrap

In [None]:
import pickle

@timing
def save_coherence_params(model_path, corpus_sentences):
    # Load model
    print(f"\nLoading model from {model_path}")
    topic_model = BERTopic.load(model_path)

    # Preprocess Documents
    print(f"Preprocessing...")
    topics = topic_model.topics_
    documents = pd.DataFrame({"Document": corpus_sentences,
                              "ID": range(len(corpus_sentences)),
                              "Topic": topics})
    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)
    
    # Extract vectorizer and analyzer from BERTopic
    print(f"Extracting features...")
    vectorizer = topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()
    
    # Extract features for Topic Coherence evaluation
    words = vectorizer.get_feature_names()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
                   for topic in range(len(set(topics))-1)]
    
    print(f"Saving the coherence_params...")
    save_path = f'{model_path}-coherence_params.pkl'
    with open(save_path, "wb") as f:
        pickle.dump({'topic_words': topic_words,
                     'tokens': tokens,
                     'corpus': corpus,
                     'dictionary': dictionary
                    }
                    ,f)
    print(f"Done!\n")

In [None]:
# Example
models = [
    model_path
]

for model in models:
    save_coherence_params(model, corpus_sentences)

##### Evaluation

In [None]:
import pickle

@timing
def evaluate_coherence_scores(model_path):
    save_path = f'{model_path}-coherence_params.pkl'
    
    print(f"\nLoading coherence params from {model_path}")
    with open(save_path, "rb") as f:
        cached_data = pickle.load(f)
        topic_words = cached_data['topic_words']
        tokens = cached_data['tokens']
        corpus = cached_data['corpus']
        dictionary = cached_data['dictionary']
    
    print(f"Computing coherence score...")
    coherence_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_v')
    coherence = coherence_model.get_coherence()
    
    model_name = os.path.basename(model_path)
    print(f"Coherence score for {model_name}: {coherence}")
    
    return coherence

In [None]:
coherences = [evaluate_coherence_scores(model) for model in models]

#### Creating Tests Data

##### How the Data Look Like

In [None]:
!head $browser

Ask HN: Are most of us developers lying about how much work we do? | Hacker News,https://news.ycombinator.com/item?id=29581125,2021-12-17 00:22:10
Trying Out Generics in Go | Hacker News,https://news.ycombinator.com/item?id=29581112,2021-12-16 23:02:53
Ask HN: How can I stop worrying and start living? | Hacker News,https://news.ycombinator.com/item?id=29585542,2021-12-17 00:05:49
Ask HN: What's the easiest way to make a living with $600k in cash? | Hacker News,https://news.ycombinator.com/item?id=29585360,2021-12-17 00:15:01
"One decade later, Minecraft world generation is interesting again | Hacker News",https://news.ycombinator.com/item?id=29446877,2021-12-17 00:22:53
Python built-ins worth learning (2019) | Hacker News,https://news.ycombinator.com/item?id=30621552,2022-08-07 10:25:39
"Assume everyone is stupid, lazy and possibly insane, including you | Hacker News",https://news.ycombinator.com/item?id=29226322,2021-11-17 20:31:57
Using Git Commit Message Templates to Write Better Co

In [None]:
!head $phone

{"by":"todsacerdoti","descendants":181,"id":33156727,"kids":[33160466,33157055,33160053,33159569,33160937,33157829,33157087,33160780,33161093,33160729,33157077,33160669,33157785,33161758,33157662,33160716,33161004,33161588,33162540,33162439,33157777,33159336,33169207,33156974,33156865,33157582,33156910,33157996],"score":718,"time":1665438206,"title":"How Wine works 101","type":"story","url":"https://werat.dev/blog/how-wine-works-101/"}
{"by":"azhenley","descendants":56,"id":33135389,"kids":[33151873,33152518,33151850,33151946,33162443,33152695,33151855],"score":27,"time":1665258469,"title":"When to use memory safe languages","type":"story","url":"https://verdagon.dev/blog/when-to-use-memory-safe-part-1"}
{"by":"imartin2k","descendants":163,"id":33139297,"kids":[33139921,33140022,33140138,33140516,33145325,33140360,33139916,33140362,33139959,33140604,33140169,33139863,33140028,33139839,33146213,33141149,33139669,33141261,33140896,33140515,33143624,33144405,33140765,33140358,33142190,331

In [None]:
!head $stories_20221014

title,url,timestamp,type,id
Show HN: Texterous.com,https://texterous.com/wizard,2022-10-18 21:38:46.000000 UTC,story,33254373
Measuring Traffic During the Half Moon Bay Pumpkin Festival,https://simonwillison.net/2022/Oct/19/measuring-traffic/,2022-10-19 16:32:32.000000 UTC,story,33264578
"MergeStat, Now with Some PostgreSQL",https://docs.mergestat.com/blog/2022/10/14/mergestat-now-with-postgres,2022-10-14 13:42:04.000000 UTC,story,33203374
Bat: A cat(1) clone in rust; adds wings,https://github.com/sharkdp/bat,2022-10-14 15:22:45.000000 UTC,story,33204747
How to Show Upcoming Meetings for a Microsoft 365 User,https://www.freecodecamp.org/news/how-to-show-upcoming-meetings-for-a-microsoft-365-user/,2022-10-18 06:27:37.000000 UTC,story,33243666
EA teases the ‘next generation’ of The Sims with Project Rene,https://www.theverge.com/2022/10/18/23409302/the-sims-project-rene-next-generation-sequel,2022-10-18 17:42:00.000000 UTC,story,33251146
Building Transformers from Neurons and Astrocytes,

In [None]:
!head $session_since_20221016

{"by":"bubblehack3r","descendants":207,"id":33230216,"kids":[33232854,33232004,33230963,33230702,33230959,33232321,33230914,33232663,33230962,33230972,33240739,33230955,33233037,33232699,33231046,33231641,33237912,33232772,33238236,33233050,33230879,33237304,33232729,33238621],"score":290,"time":1665986628,"title":"Yagni Exceptions (2021)","type":"story","url":"https://lukeplant.me.uk/blog/posts/yagni-exceptions/"}
{"by":"kiyanwang","descendants":133,"id":12245909,"kids":[12249785,12247719,12248255,12248187,12246496,12246910,12246670,12248366,12246165,12249184,12247365,12248491,12246709,12246750,12246389,12247405,12246335,12248696,12249237,12246517,12246536],"score":386,"time":1470639744,"title":"Notes on Distributed Systems for Young Bloods (2013)","type":"story","url":"https://www.somethingsimilar.com/2013/01/14/notes-on-distributed-systems-for-young-bloods/"}
{"by":"simonsarris","descendants":53,"id":33226817,"kids":[33232181,33227933,33227903,33228033,33227649,33229053,33229377,332

In [None]:
!head $session_since_2019

title,url,timestamp
Ask HN: Are most of us developers lying about how much work we do? | Hacker News,https://news.ycombinator.com/item?id=29581125,2021-12-17 00:22:10
Trying Out Generics in Go | Hacker News,https://news.ycombinator.com/item?id=29581112,2021-12-16 23:02:53
Ask HN: How can I stop worrying and start living? | Hacker News,https://news.ycombinator.com/item?id=29585542,2021-12-17 00:05:49
Ask HN: What's the easiest way to make a living with $600k in cash? | Hacker News,https://news.ycombinator.com/item?id=29585360,2021-12-17 00:15:01
"One decade later, Minecraft world generation is interesting again | Hacker News",https://news.ycombinator.com/item?id=29446877,2021-12-17 00:22:53
Python built-ins worth learning (2019) | Hacker News,https://news.ycombinator.com/item?id=30621552,2022-08-07 10:25:39
"Assume everyone is stupid, lazy and possibly insane, including you | Hacker News",https://news.ycombinator.com/item?id=29226322,2021-11-17 20:31:57
Using Git Commit Message Template

##### Load the Data

In [None]:
def load_json_data(path: str) -> pd.DataFrame:
    """Also parses the time column and make it into a new timestamp column"""
    data = []
    with open(path, 'r') as f:
        for line in f.readlines():
            d = json.loads(line)
            data.append(d)
    df = pd.DataFrame(data, columns=['title', 'time'])
    df['timestamp'] = pd.to_datetime(df['time'], unit='s')
    return df.drop(columns=['time'])

In [None]:
df_browser = pd.read_csv(browser, usecols=[0, 2], names=['title', 'timestamp'], parse_dates=[1])
df_phone = load_json_data(phone)
df_stories_new = pd.read_csv(stories_20221014, usecols=[0, 2], parse_dates=[1])
df_session_new = load_json_data(session_since_20221016)
df_session_old = pd.read_csv(session_since_2019, usecols=[0, 2], parse_dates=[1])

In [None]:
df_stories_new.timestamp = df_stories_new.timestamp.dt.strftime("%Y-%m-%d %H:%M:%S")

##### Munging

In [None]:
df_browser.head(), df_stories_new.head(), df_session_old.head()

(                                               title           timestamp
 0  Ask HN: Are most of us developers lying about ... 2021-12-17 00:22:10
 1            Trying Out Generics in Go | Hacker News 2021-12-16 23:02:53
 2  Ask HN: How can I stop worrying and start livi... 2021-12-17 00:05:49
 3  Ask HN: What's the easiest way to make a livin... 2021-12-17 00:15:01
 4  One decade later, Minecraft world generation i... 2021-12-17 00:22:53,
                                                title            timestamp
 0                             Show HN: Texterous.com  2022-10-18 21:38:46
 1  Measuring Traffic During the Half Moon Bay Pum...  2022-10-19 16:32:32
 2                MergeStat, Now with Some PostgreSQL  2022-10-14 13:42:04
 3            Bat: A cat(1) clone in rust; adds wings  2022-10-14 15:22:45
 4  How to Show Upcoming Meetings for a Microsoft ...  2022-10-18 06:27:37,
                                                title           timestamp
 0  Ask HN: Are most of us dev

In [None]:
df_phone.head(), df_session_new.head()

(                                               title           timestamp
 0                                 How Wine works 101 2022-10-10 21:43:26
 1                  When to use memory safe languages 2022-10-08 19:47:49
 2                                       Take a break 2022-10-09 09:12:51
 3  Show HN: I built a site that lets users find p... 2022-10-10 11:30:21
 4             Stop using utcnow and utcfromtimestamp 2022-10-09 04:35:31,
                                                title           timestamp
 0                            Yagni Exceptions (2021) 2022-10-17 06:03:48
 1  Notes on Distributed Systems for Young Bloods ... 2016-08-08 07:02:24
 2  Show HN: Carefulwords.com, a more inspiring th... 2022-10-16 20:14:47
 3                  How to build software like an SRE 2022-10-17 06:36:32
 4  Streaming data in Postgres to 1M clients with ... 2022-10-16 22:43:43)

In [None]:
dfs = [
    df_browser,
    df_phone,
    df_stories_new,
    df_session_new,
    df_session_old,
]

In [None]:
def trailing_hn_titles(title):
    return title.str.split(" \| Hack").apply(lambda x: x[0])

def extract_titles(df):
    return (df
            ['title']
            .drop_duplicates()
            .pipe(remove_one_word)
            .pipe(remove_job_postings)
            .pipe(remove_links)
            .pipe(clean_non_stories)
            .pipe(clean_last_word_year_and_pdf)
            .pipe(trailing_hn_titles)
            .pipe(finalizes)
    )

In [None]:
for df in dfs:
    df['title'] = extract_titles(df)
    print(df[:5].values)

[['Are most of us developers lying about how much work we do?'
  Timestamp('2021-12-17 00:22:10')]
 ['Trying Out Generics in Go' Timestamp('2021-12-16 23:02:53')]
 ['How can I stop worrying and start living?'
  Timestamp('2021-12-17 00:05:49')]
 ["What's the easiest way to make a living with $600k in cash?"
  Timestamp('2021-12-17 00:15:01')]
 ['One decade later, Minecraft world generation is interesting again'
  Timestamp('2021-12-17 00:22:53')]]
[['How Wine works 101' Timestamp('2022-10-10 21:43:26')]
 ['When to use memory safe languages' Timestamp('2022-10-08 19:47:49')]
 ['Take a break' Timestamp('2022-10-09 09:12:51')]
 ['I built a site that lets users find playlists by songs they contain'
  Timestamp('2022-10-10 11:30:21')]
 ['Stop using utcnow and utcfromtimestamp'
  Timestamp('2022-10-09 04:35:31')]]
[['Texterous.com' '2022-10-18 21:38:46']
 ['Measuring Traffic During the Half Moon Bay Pumpkin Festival'
  '2022-10-19 16:32:32']
 ['MergeStat, Now with Some PostgreSQL' '2022-10-1

In [None]:
sessions_dfs = [
    df_browser,
    df_phone,
    df_session_new,
    df_session_old,
]
for sdf in sessions_dfs:
    print(sdf.shape)

(871, 2)
(569, 2)
(75, 2)
(1469, 2)


In [None]:
sessions_df = (
    pd.concat(sessions_dfs)
      .drop_duplicates(subset=['title'])
      .sort_values('timestamp', ignore_index=True)
)

sessions_df.shape

(2007, 2)

In [None]:
sessions_df.tail()

Unnamed: 0,title,timestamp
2002,Cloudflare Pages: Best server tech since CGI-bin?,2022-10-23 16:03:30
2003,Gamification affects software developers: Caut...,2022-10-23 20:27:23
2004,Maintaining sufficient free space with ZFS,2022-10-23 20:59:16
2005,Buffers on the Edge: Python and Rust,2022-10-24 02:21:10
2006,"Sonic: Fast, lightweight and schema-less searc...",2022-10-24 11:31:27


In [None]:
sessions_df.timestamp.min(), sessions_df.timestamp.max()

(Timestamp('2015-10-31 19:12:46'), Timestamp('2022-10-24 11:31:27'))

In [None]:
df_stories_new.shape, df_stories_new.timestamp.min(), df_stories_new.timestamp.max()

((8679, 2), '2022-10-14 00:01:15', '2022-10-24 23:57:40')

In [None]:
sessions_df.loc[sessions_df.timestamp < df_stories_new.timestamp.min()]

Unnamed: 0,title,timestamp
0,Why I Use Nim instead of Python for Data Proce...,2015-10-31 19:12:46
1,What distinguishes great software engineers?,2016-09-22 19:50:03
2,How to learn hacking,2016-12-10 05:04:23
3,How to improve as a struggling junior software...,2018-08-17 10:45:19
4,I don't want to learn your query language,2018-09-01 07:37:11
...,...,...
1967,Why Uber Engineering Switched from Postgres to...,2022-10-09 17:00:42
1968,I built a site that lets users find playlists ...,2022-10-10 11:30:21
1969,How Wine works 101,2022-10-10 21:43:26
1970,Where can I see many examples of real companie...,2022-10-11 04:03:37


##### Creating Test Set

In [None]:
validation_set = sessions_df.loc[sessions_df.timestamp < df_stories_new.timestamp.min()].copy()
test_set = sessions_df.loc[sessions_df.timestamp >= df_stories_new.timestamp.min()].copy()

In [None]:
validation_set.timestamp.max(), test_set.timestamp.min(), df_stories_new.timestamp.min()

(Timestamp('2022-10-11 23:20:45'),
 Timestamp('2022-10-14 14:30:38'),
 '2022-10-14 00:01:15')

In [None]:
test_data = df_stories_new.sort_values('timestamp', ignore_index=True)

In [None]:
validation_set.to_csv('validation_set.csv', index=False)
test_set.to_csv('test_set.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

In [None]:
!du -h *.csv

404K	test_data.csv
4.0K	test_set.csv
128K	validation_set.csv


In [None]:
!head -n 3 *.csv

==> test_data.csv <==
title,timestamp
The comparative recency of the proximal ancestors of SARS-CoV-1 and SARS-CoV-2,2022-10-14 00:01:15
A new algorithm predicts response to Sertraline with 83 percent accuracy,2022-10-14 00:01:41

==> test_set.csv <==
title,timestamp
The Deadlock Empire: A game that teaches locking and concurrency,2022-10-14 14:30:38
Implementing Hash Tables in C,2022-10-15 06:13:23

==> validation_set.csv <==
title,timestamp
Why I Use Nim instead of Python for Data Processing,2015-10-31 19:12:46
What distinguishes great software engineers?,2016-09-22 19:50:03


#### Recommendations

##### Load Validation & Test Set

In [None]:
validation_set = pd.read_csv(validation_set_path, usecols=[0])
test_set = pd.read_csv(test_set_path, usecols=[0])
test_data = pd.read_csv(test_data_path, usecols=[0])

In [None]:
validation_set = validation_set.dropna().reset_index(drop=True)

In [None]:
test_data = test_data.dropna().reset_index(drop=True)

##### Load Best Model

In [None]:
topic_model = BERTopic.load(model_path)
topic_model

<bertopic._bertopic.BERTopic at 0x7fc3db0ba1d0>

In [None]:
topic_model.topics_[:5]

[139, 7, 66, 172, 71]

In [None]:
example_sentence = "Rust – A hard decision pays off "
similar_topics, similarity = topic_model.find_topics(example_sentence, top_n=5)
print(similar_topics)
topic_model.get_topic(similar_topics[0])

[243, 125, 274, 152, 268]


[('rust', 0.12366704106289697),
 ('tonic', 0.04407518897360686),
 ('struct', 0.044067035774610144),
 ('ownership', 0.04126284745146919),
 ('golang', 0.0412602428722053),
 ('redux', 0.04125893708775355),
 ('cobalt', 0.035869093590621404),
 ('swift', 0.0348532585896845),
 ('guys', 0.03476520347221694),
 ('mod', 0.033813923883795456)]

##### Getting The Topics

In [None]:
def get_similar_topics(title):
    # ignores similarity values
    similar_topics, _ = topic_model.find_topics(title, top_n=5)
    return similar_topics

In [None]:
# Use this if similarity values is needed
"""
def get_topics(title):
    return topic_model.find_topics(title, top_n=5)

topics_and_similarity = validation_set["title"].apply(lambda x: get_topics(x))
val_attr = pd.DataFrame(topics_and_similarity.tolist(),
                        columns=['similar_topics', 'similarity'],
                        index=topics_and_similarity.index)
val_attr.head()
"""

In [None]:
# Similar topics only
validation_set["similar_topics"] = validation_set["title"].apply(lambda x: get_similar_topics(x))
validation_set.head()

Unnamed: 0,title,similar_topics
0,Why I Use Nim instead of Python for Data Proce...,"[129, 225, 135, 65, 9]"
1,What distinguishes great software engineers?,"[104, 253, 109, 158, 56]"
2,How to learn hacking,"[276, 36, 297, 105, 109]"
3,How to improve as a struggling junior software...,"[104, 253, 109, 148, 250]"
4,I don't want to learn your query language,"[144, 229, 117, 17, 109]"


In [None]:
# Similar topics only
test_set["similar_topics"] = test_set["title"].apply(lambda x: get_similar_topics(x))
test_set.head()

Unnamed: 0,title,similar_topics
0,The Deadlock Empire: A game that teaches locki...,"[170, 184, 65, 43, 84]"
1,Implementing Hash Tables in C,"[125, 64, 144, 248, 54]"
2,What to do to be healthy when old?,"[262, 0, 213, 96, 33]"
3,"If you used to be socially awkward and shy, ho...","[235, 250, 148, 29, 239]"
4,Moving from React to htmx,"[68, 278, 181, 53, 191]"


In [None]:
test_data["similar_topics"] = test_data["title"].apply(lambda x: get_similar_topics(x))
test_data.head()

Unnamed: 0,title,timestamp,similar_topics
0,The comparative recency of the proximal ancest...,2022-10-14 00:01:15,"[87, 286, 203, 48, 130]"
1,A new algorithm predicts response to Sertralin...,2022-10-14 00:01:41,"[96, 9, 151, 204, 54]"
2,AI Image Generation Is Advancing at Astronomic...,2022-10-14 00:16:07,"[201, 9, 4, 27, 2]"
3,The Russian Space Program Is Falling Back to E...,2022-10-14 00:17:16,"[130, 118, 102, 205, 280]"
4,It was all downhill after the Cuecat,2022-10-14 00:22:40,"[274, 40, 168, 155, 272]"


In [None]:
sets = [
    ('validation_set', validation_set),
    ('test_set', test_set),
    ('test_data', test_data),
]

for (name, df) in sets:
    if "similar_topics" not in df.columns:
        print(f'Getting topics for {name}')
        df["similar_topics"] = df["title"].apply(lambda x: get_similar_topics(x))
    print(f'Dumping {name}')
    df.to_csv(f'{name}_with_topics.csv', index=False)

Dumping validation_set
Dumping test_set
Dumping test_data


##### Fixing My Test Set

In [None]:
validation_set = pd.read_csv(validation_set_with_topics_path)
test_set = pd.read_csv(test_set_with_topics_path)
test_data = pd.read_csv(test_data_with_topics_path)

In [None]:
# Test set in test_data
test_set[test_set.title.isin(test_data.title)]

Unnamed: 0,title,similar_topics
4,Moving from React to htmx,"[68, 278, 181, 53, 191]"
9,"Carefulwords.com, a more inspiring thesaurus","[3, 295, 17, 281, 257]"
10,Streaming data in Postgres to 1M clients with ...,"[229, 34, 21, 173, 294]"
11,Yagni Exceptions,"[29, 154, 73, 263, 143]"
12,How to build software like an SRE,"[253, 152, 109, 158, 94]"
13,Projectbook – a free collection of 100 project...,"[148, 75, 253, 109, 122]"
14,I am endlessly fascinated with content tagging...,"[141, 47, 281, 17, 3]"
15,81% of IT teams directed to reduce or halt clo...,"[21, 216, 137, 23, 123]"
16,Wait vs. Interrupt Culture,"[83, 279, 256, 289, 56]"
17,Has anyone managed to find enjoyment in their ...,"[186, 167, 189, 13, 26]"


In [None]:
# Test set NOT in test_data
# Update: upon further introspection, these entries are actually from before
#         14-10-2022 and I somehow opened them in my tab after 14-10-2022.
#         to fix it, add it to the validation set
test_set_fix = test_set[~test_set.title.isin(test_data.title)].copy()
test_set_fix

Unnamed: 0,title,similar_topics
0,The Deadlock Empire: A game that teaches locki...,"[170, 184, 65, 43, 84]"
1,Implementing Hash Tables in C,"[125, 64, 144, 248, 54]"
2,What to do to be healthy when old?,"[262, 0, 213, 96, 33]"
3,"If you used to be socially awkward and shy, ho...","[235, 250, 148, 29, 239]"
5,Git Command Explorer,"[106, 152, 140, 39, 253]"
6,One million queries per second with MySQL,"[256, 294, 2, 229, 236]"
7,"SQLite: Past, Present, and Future","[34, 16, 294, 144, 70]"
8,Structural pattern matching in Python 3.10,"[225, 129, 65, 54, 29]"


In [None]:
# Add test_set_fix to the the validation_set
validation_set_fix = pd.concat([validation_set, test_set_fix]).reset_index(drop=True)

In [None]:
# Remove test_set_fix from the validation_set
test_set = test_set[test_set.title.isin(test_data.title)].copy()

In [None]:
validation_set_fix.to_csv(validation_set_with_topics_path, index=False)
test_set.to_csv(test_set_with_topics_path, index=False)

##### Constructing User's Profile

In [None]:
validation_set = pd.read_csv(validation_set_with_topics_path, converters={'similar_topics': eval})
test_set = pd.read_csv(test_set_with_topics_path, converters={'similar_topics': eval})
test_data = pd.read_csv(test_data_with_topics_path, parse_dates=['timestamp'], converters={'similar_topics': eval})

In [None]:
validation_set = validation_set.explode('similar_topics')
# test_set = test_set.explode('similar_topics')
# test_data = test_data.explode('similar_topics')

In [None]:
user_profile = (
    validation_set
        .similar_topics
        .value_counts(normalize=True)
)
user_profile

109    0.029611
64     0.023547
21     0.022335
144    0.020718
253    0.018696
         ...   
259    0.000202
157    0.000202
99     0.000101
134    0.000101
116    0.000101
Name: similar_topics, Length: 300, dtype: float64

##### Testing

In [None]:
test_data.head()

Unnamed: 0,title,timestamp,similar_topics
0,The comparative recency of the proximal ancest...,2022-10-14 00:01:15,"[87, 286, 203, 48, 130]"
1,A new algorithm predicts response to Sertralin...,2022-10-14 00:01:41,"[96, 9, 151, 204, 54]"
2,AI Image Generation Is Advancing at Astronomic...,2022-10-14 00:16:07,"[201, 9, 4, 27, 2]"
3,The Russian Space Program Is Falling Back to E...,2022-10-14 00:17:16,"[130, 118, 102, 205, 280]"
4,It was all downhill after the Cuecat,2022-10-14 00:22:40,"[274, 40, 168, 155, 272]"


In [None]:
def similarity_calculation(similar_topic):
    return sum([user_profile[topic_index]
                for topic_index in similar_topic]) * 1000

In [None]:
test_set['similarity_score'] = (
    test_set.similar_topics
             .apply(similarity_calculation)
)

In [None]:
print(f'Statistics for Test Set')
print(f'-----------------------')
print('Min: {}'.format(test_set['similarity_score'].min()))
print('Max: {}'.format(test_set['similarity_score'].max()))
print('Mean: {}'.format(test_set['similarity_score'].mean()))
print('Median: {}'.format(test_set['similarity_score'].quantile(.5)))

Statistics for Test Set
-----------------------
Min: 9.39868620515412
Max: 85.09348155634159
Mean: 41.62221847921696
Median: 42.14249621020718


In [None]:
test_set.sort_values('similarity_score')

Unnamed: 0,title,similar_topics,similarity_score
0,Moving from React to htmx,"[68, 278, 181, 53, 191]",9.398686
13,Kill Bill – Open-Source Subscription Billing a...,"[121, 10, 98, 92, 176]",12.733704
3,Yagni Exceptions,"[29, 154, 73, 263, 143]",13.441132
9,Has anyone managed to find enjoyment in their ...,"[186, 167, 189, 13, 26]",15.765538
21,How to show a not “so perfect” MVP to potentia...,"[163, 167, 92, 14, 261]",15.866599
8,Wait vs. Interrupt Culture,"[83, 279, 256, 289, 56]",19.403739
1,"Carefulwords.com, a more inspiring thesaurus","[3, 295, 17, 281, 257]",22.940879
19,I was tired of being a perfectionist so I buil...,"[25, 234, 298, 162, 253]",25.164224
6,I am endlessly fascinated with content tagging...,"[141, 47, 281, 17, 3]",29.408792
25,Buffers on the Edge: Python and Rust,"[125, 243, 256, 152, 274]",34.663972


In [None]:
test_data['similarity_score'] = (
    test_data.similar_topics
             .apply(similarity_calculation)
)
test_data['similarity_score'].mean()

23.400888442182687

In [None]:
take = 1000
top_result = test_data.sort_values('similarity_score', ascending=False)[:take]
got = top_result[top_result.title.isin(test_set.title)]
got

Unnamed: 0,title,timestamp,similar_topics,similarity_score
1155,"Quirks, Caveats, and Gotchas in SQLite",2022-10-15 13:54:09,"[34, 144, 16, 294, 21]",85.093482
4574,How to build software like an SRE,2022-10-19 20:08:53,"[253, 152, 109, 158, 94]",80.54573
2656,Projectbook – a free collection of 100 project...,2022-10-17 17:11:37,"[148, 75, 253, 109, 122]",73.673573
7627,GlueSQL: SQL database engine as a library,2022-10-24 08:47:14,"[34, 294, 21, 20, 229]",62.556847
2514,Streaming data in Postgres to 1M clients with ...,2022-10-17 14:41:00,"[229, 34, 21, 173, 294]",61.748358
1830,DevOps is broken,2022-10-16 15:18:12,"[39, 253, 106, 140, 152]",61.647297
6734,Maintaining sufficient free space with ZFS,2022-10-22 14:47:31,"[21, 170, 294, 125, 169]",56.897423
4150,Write better error messages,2022-10-19 12:27:18,"[64, 295, 144, 128, 186]",56.897423
8136,Gamification affects software developers: Caut...,2022-10-24 20:30:27,"[253, 140, 32, 25, 106]",47.902981
7436,Cloudflare Pages: Best server tech since CGI-bin?,2022-10-23 22:01:55,"[21, 216, 164, 187, 23]",47.094492


In [None]:
got.shape[0] / take  # ACCURACY OF 12 / 1000 ? LMAO

0.012

##### Recommendation

In [None]:
top_result['timestamp'] = pd.to_datetime(top_result['timestamp'], format='%Y-%m-%d %H:%M:%S')

title                       object
timestamp           datetime64[ns]
similar_topics              object
similarity_score           float64
dtype: object

In [None]:
top_result['date'] = top_result['timestamp'].dt.date

In [None]:
# https://stackoverflow.com/a/45992898/8996974
(
    top_result
    .sort_values(['date', 'similarity_score'], ascending=[True, False])
    .groupby('date')
    .head(3)
)  # seems like it's overfitting on "compiler" and "programming language" topics?

Unnamed: 0,title,timestamp,similar_topics,similarity_score,date
588,Formal Methods at Microsoft – Nikolaj Bjørner,2022-10-14 17:04:38,"[94, 144, 109, 64, 249]",98.635675,2022-10-14
730,A Minimalist Guide To Program Synthesis,2022-10-14 19:55:18,"[64, 109, 144, 225, 94]",96.715513,2022-10-14
624,How I Code at the Speed of Thought (Part 1),2022-10-14 17:47:18,"[250, 64, 144, 109, 2]",96.311268,2022-10-14
999,TMG (language): A self-hosted compiler from sc...,2022-10-15 08:21:17,"[64, 109, 253, 144, 94]",103.082365,2022-10-15
1464,Understanding the different flavors of Clang C...,2022-10-15 21:38:02,"[64, 109, 144, 253, 152]",102.273876,2022-10-15
873,Typosquatting programming language package man...,2022-10-15 01:00:12,"[109, 64, 253, 144, 295]",97.625063,2022-10-15
1862,An Overview of Generic Programming: Writing Co...,2022-10-16 15:56:15,"[64, 109, 144, 225, 94]",96.715513,2022-10-16
1721,When programs grovel into undocumented structures,2022-10-16 11:05:43,"[64, 109, 144, 94, 225]",96.715513,2022-10-16
1754,The TeX program: A program of study,2022-10-16 12:18:43,"[109, 225, 144, 250, 64]",96.311268,2022-10-16
2534,Differentiable Programming on Discrete Stochas...,2022-10-17 15:01:35,"[109, 144, 225, 64, 65]",96.109146,2022-10-17
