In [2]:
# NOTE: The course expects Spark, but since this is just about loading
# and previewing a handful of samples, we use pandas for simplicity.

import pandas as pd
import json

# Load a small sample of the C4 shard into a one-column DataFrame.
# The file is read as JSON Lines: one JSON object per line, of which only the
# "text" field is kept.
DATA_PATH = "data/c4-train.00000-of-01024-30K.json"
MAX_SAMPLES = 50     # number of records to load
PREVIEW_COUNT = 3    # number of records to print below
PREVIEW_CHARS = 100  # characters shown per previewed record

data = []
with open(DATA_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if len(data) >= MAX_SAMPLES:
            break
        # Skip blank lines (e.g. trailing newlines); json.loads would raise
        # JSONDecodeError on them.
        if not line.strip():
            continue
        data.append(json.loads(line)["text"])

df = pd.DataFrame(data, columns=["text"])

# Clamp the preview to the rows actually loaded so a short or empty file
# does not raise KeyError on df.loc.
for i in range(min(PREVIEW_COUNT, len(df))):
    print("sample", i)
    print(df.loc[i, "text"][:PREVIEW_CHARS])

sample 0
Beginners BBQ Class Taking Place in Missoula!
Do you want to get better at making delicious BBQ? You
sample 1
Discussion in 'Mac OS X Lion (10.7)' started by axboi87, Jan 20, 2012.
I've got a 500gb internal dri
sample 2
Foil plaid lycra and spandex shortall with metallic slinky insets. Attached metallic elastic belt wi


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Fit TF-IDF on the sampled texts: English stop words are removed and the
# vocabulary is capped at 4096 terms.
vectorizer = TfidfVectorizer(max_features=4096, stop_words="english")
X = vectorizer.fit_transform(df["text"])

# Materialize X as a dense NumPy array so it can be inspected directly.
vectors = X.toarray()
vocab_size = len(vectorizer.vocabulary_)

# Sanity checks: sign of the entries, whether any entry is fractional, and the
# Euclidean norm of each row (one row per sample).
all_positive = (vectors >= 0).all()
is_decimal = (np.mod(vectors, 1) != 0).any()
magnitudes = np.linalg.norm(vectors, axis=1)

value_min = vectors.min()
value_max = vectors.max()
value_mean = vectors.mean()

for label, value in (
    ("Vocab size:", vocab_size),
    ("Shape of vectors:", vectors.shape),
    ("All values non-negative?", all_positive),
    ("Any decimal values?", is_decimal),
    ("Per-sample vector magnitudes (first 5):", magnitudes[:5]),
    ("Total magnitude across all samples:", magnitudes.sum()),
):
    print(label, value)
print("Stats on values: min =", value_min, "max =", value_max, "mean =", value_mean)

Vocab size: 3371
Shape of vectors: (50, 3371)
All values non-negative? True
Any decimal values? True
Per-sample vector magnitudes (first 5): [1. 1. 1. 1. 1.]
Total magnitude across all samples: 50.00000000000001
Stats on values: min = 0.0 max = 0.7916261758784078 mean = 0.002120859786574523


In [4]:
# NOTE: Whether Spark's TF-IDF normalizes by default is not important here. scikit-learn's TfidfVectorizer L2-normalizes each vector by default (every magnitude above is 1), which is presumably the normalization the assignment refers to.
from sklearn.metrics.pairwise import cosine_similarity

# Change this query to experiment
query = "BBQ"

# Project the query into the TF-IDF space of the already-fitted vectorizer
# (transform only; the vocabulary is not refitted).
query_vec = vectorizer.transform([query])

# Cosine similarity between the query and every sample, flattened to one
# score per row of X.
sims = cosine_similarity(query_vec, X).flatten()

# Indices of the 5 highest-scoring samples, best first.
top_idx = sims.argsort()[::-1][:5]

print("Query:", query)
# When every score is 0 (e.g. the query contains only out-of-vocabulary or
# stop words), the ranking below carries no information. Say so explicitly
# instead of silently presenting unrelated samples as matches.
if not sims.any():
    print("Note: no sample shares a vocabulary term with the query; "
          "all scores are 0 and the ranking below is arbitrary.")
print("\nTop 5 most similar samples:\n")
for rank, idx in enumerate(top_idx, start=1):
    print(f"{rank}. Score={sims[idx]:.4f} | Sample={df.loc[idx, 'text'][:100]!r}")

Query: BBQ

Top 5 most similar samples:

1. Score=0.4550 | Sample='Beginners BBQ Class Taking Place in Missoula!\nDo you want to get better at making delicious BBQ? You'
2. Score=0.0000 | Sample='It is possible to erase up to 4 HDD / SSD at the same time. IDE HDD connection is also possible with'
3. Score=0.0000 | Sample='So many things happening this weekend!\nThe highly anticipated PBS Kids in the Park festival is tomor'
4. Score=0.0000 | Sample='Farmington, CT, August 30, 2016 -- Many Americans realize they need long-term care insurance, but ba'
5. Score=0.0000 | Sample='As I have been discussing the magic of wordpress themes before. WordPress has brough a revolution as'
