In [50]:
import os
import json
import re
import nltk
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction import text

# Make sure the NLTK "words" corpus is available locally before it is read below.
nltk.download('words')
from nltk.corpus import words

# spaCy pipeline used by tokenize() for lemmatization; the parser and NER
# components are disabled in this load call.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
# Dictionary used by tokenize() to filter tokens, held as a set for fast
# membership tests.
# NOTE(review): entries are stored exactly as the corpus returns them (no case
# normalization), while tokenize() lowercases its input — confirm that
# capitalized corpus entries are not expected to match.
ENGLISH_WORDS = set(words.words())

[nltk_data] Downloading package words to /Users/ppanda/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [51]:
# --- Tokenizer ---
def tokenize(text):
    """Turn raw text into a list of lemmas of dictionary-backed ASCII words.

    Steps:
      1. Lowercase the text and extract ``\\w+`` runs.
      2. Keep only tokens that are purely ASCII letters and that have a
         prefix of at least 4 characters (up to and including the whole
         token) present in ``ENGLISH_WORDS``.
      3. Lemmatize the surviving tokens with the module-level spaCy pipeline.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of lemma strings; empty when no token survives the filter.
    """
    def in_dict(t, vocabulary):
        # Accept the token when any prefix of length >= 4 is a known word.
        # The upper bound is len(t) + 1 so the *whole* token is tested too;
        # with range(4, len(t)) every 4-letter word and every word whose
        # proper prefixes are not words themselves was silently discarded.
        # Tokens shorter than 4 characters are still rejected.
        return any(t[:i] in vocabulary for i in range(4, len(t) + 1))

    raw_tokens = re.findall(r'\b\w+\b', text.lower())
    tokens = [
        t for t in raw_tokens
        if t.isascii() and t.isalpha() and in_dict(t, ENGLISH_WORDS)
    ]
    if not tokens:
        # Nothing to lemmatize; skip the spaCy call.
        return []
    return [token.lemma_ for token in nlp(" ".join(tokens))]

# --- Process JSON ---
def process_json_file(file_path):
    """Load one JSON document and return its text fields as a token string.

    The fields 'title', 'description', 'keywords', 'headings' and 'content'
    are read in that order. A string value is used as is; a list value
    contributes each of its string items. Any other value, and any
    non-string list item (e.g. a JSON null or number), is ignored.

    Args:
        file_path: Path to a UTF-8 encoded JSON file.

    Returns:
        The tokens produced by tokenize() joined with single spaces
        (an empty string when nothing survives tokenization).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    text_to_tokenize = []
    for field in ['title', 'description', 'keywords', 'headings', 'content']:
        if field not in data:
            continue
        value = data[field]
        if isinstance(value, str):
            text_to_tokenize.append(value)
        elif isinstance(value, list):
            # Keep only string items: " ".join() below raises TypeError on
            # anything else, which would abort processing of the whole file.
            text_to_tokenize.extend(
                item for item in value if isinstance(item, str)
            )
    combined_text = " ".join(text_to_tokenize)
    return " ".join(tokenize(combined_text))

# --- Load dataset ---
def load_dataset(folder, label):
    """Collect processed documents from a two-level folder of JSON files.

    ``folder`` is expected to contain one sub-folder per domain; every
    ``*.json`` file directly inside a sub-folder is run through
    process_json_file(). Entries of ``folder`` that are not directories and
    files without a ``.json`` suffix are skipped, as are documents whose
    processed text is empty or whitespace only.

    Args:
        folder: Root folder to scan.
        label: Label attached to every document that is kept.

    Returns:
        A ``(texts, labels)`` pair of equally long lists, ordered by
        sub-folder name and then by file name.
    """
    texts, labels = [], []
    # os.listdir() returns entries in arbitrary order. Sorting makes the
    # document order stable, so a seeded train/test split over these lists
    # yields the same partition on every run and every machine.
    for domain_folder in sorted(os.listdir(folder)):
        domain_path = os.path.join(folder, domain_folder)
        if not os.path.isdir(domain_path):
            continue
        for file_name in sorted(os.listdir(domain_path)):
            if not file_name.endswith(".json"):
                continue
            document = process_json_file(os.path.join(domain_path, file_name))
            if document.strip():  # only keep non-empty
                texts.append(document)
                labels.append(label)
    return texts, labels

In [None]:
# Root folder of the positive class: train() loads it with label 1.
DEV_FOLDER = "./mycelium/out"
# Root folder of the negative class: train() loads it with label 0.
NONDEV_FOLDER = "./mycelium/out_neg"

def train():
    """Train and evaluate the dev-blog classifier.

    Documents are loaded from DEV_FOLDER (label 1) and NONDEV_FOLDER
    (label 0), split 80/20 into train and test parts, vectorized with
    TF-IDF and fed to a logistic-regression model. A classification report
    for the held-out part is printed.

    Returns:
        A ``(clf, vectorizer)`` pair: the fitted LogisticRegression and the
        fitted TfidfVectorizer needed to transform new documents for it.
    """
    dev_texts, dev_labels = load_dataset(DEV_FOLDER, 1)
    nondev_texts, nondev_labels = load_dataset(NONDEV_FOLDER, 0)

    texts = dev_texts + nondev_texts
    labels = dev_labels + nondev_labels

    # Split the raw documents *before* vectorizing. Fitting TF-IDF on the
    # full corpus lets the test documents shape the vocabulary and the IDF
    # weights, which leaks test information into training and inflates the
    # reported scores. Stratifying keeps the class ratio equal in both parts.
    texts_train, texts_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42, stratify=labels
    )

    # Vectorize: statistics are learned from the training part only.
    vectorizer = TfidfVectorizer(
        max_features=10000,
        ngram_range=(1, 3),
        min_df=2,    # drop terms that occur in fewer than 2 documents
        max_df=0.9,  # drop terms that occur in more than 90% of documents
        stop_words=list(text.ENGLISH_STOP_WORDS),
    )
    X_train = vectorizer.fit_transform(texts_train)
    X_test = vectorizer.transform(texts_test)

    # Train classifier. The saga solver is stochastic, so it is seeded to
    # make the fitted coefficients repeatable.
    clf = LogisticRegression(max_iter=2000, C=10, solver="saga", random_state=42)
    clf.fit(X_train, y_train)

    # Evaluate on the held-out part.
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))

    return clf, vectorizer

# Run training and keep the fitted model and vectorizer at notebook level;
# the classification cell below passes both to classify_json().
clf, vectorizer = train()

In [49]:
# --- Classify a new JSON ---
def classify_json(file_path, clf, vectorizer):
    """Classify a single JSON document as a dev blog or not.

    Args:
        file_path: Path of the JSON file, processed with process_json_file().
        clf: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        "Dev Blog ✅" when the predicted label equals 1, otherwise
        "Not a Dev Blog ❌".
    """
    features = vectorizer.transform([process_json_file(file_path)])
    predicted_label = clf.predict(features)[0]
    if predicted_label == 1:
        return "Dev Blog ✅"
    return "Not a Dev Blog ❌"

# Spot check with one document from each folder: the first path lies under
# the negative folder (out_neg), the second under the positive folder (out).
print(classify_json("./mycelium/out_neg/1clickvpn.net/7c4170d3-65bd-49f0-919c-d919b3367243.json", clf, vectorizer))
print(classify_json("./mycelium/out/odonnellweb.com/6eba3ec3-fea9-4025-be00-2e7672f98766.json", clf, vectorizer))

Not a Dev Blog ❌
Dev Blog ✅
