In [65]:
import json
import re
import unicodedata
from nltk.corpus import stopwords
import nltk

import os
os.environ["MKL_VERBOSE"] = "0"

# Filtering the scraped data from MIT OpenCourseWare

We collected lecture video titles across various STEM-related fields, including their course titles. The goal of this filtration process is to clean and normalize these titles by removing non-STEM-specific stopwords, redundant patterns (e.g., Roman numerals, numbers, continuation markers), and punctuation anomalies, while preserving meaningful keywords. This ensures Unicode normalization, deduplication, and concise, contextually relevant keyword extraction to improve the identification of unique STEM-related lecture content.

In [66]:
# Define additional stopwords
additional_stop_words = {
    "introduction", "overview", "basics", "principles", "fundamentals", "essentials",
    "concepts", "topics", "outline", "scope", "insights", "lecture", "session",
    "class", "seminar", "discussion", "tutorial", "workshop", "exercise", "module",
    "part", "series", "unit", "goals", "objectives", "summary", "review", "highlights",
    "conclusion", "recap", "continued", "continuation", "advanced", "update", "notes",
    "refresher", "start", "beginning", "end", "final", "first", "second", "third", 
    "perspective", "focus", "study", "methods", "approaches", "applications", "field",
    "understanding", "applications of", "in context", "applied", "introduction to", 
    "explanation", "examining", "exam", "goals", "and", "quiz", "presentation", 
    "presentations", "case", "cases", "bonus", "project", "projects", "problem",
    "problems", "course", "courses", "solution", "solutions", "general", 
    "examples", "example", "lectures", "question", "questions", "answer", "answers",
    "reading", "assignment", "assignments", "definition", "definitions", "reaction",
    "reactions", "education", "clip", "clips", "intro"
}
generic_terms = [
    "Access", "Activity", "Adjustment", "Adult", "Advice", "Advanced", "Alternatives", 
    "Analysis", "Application", "Apple", "Approaches", "Artists", "Assignment", "Basics", 
    "Beginner", "Best", "Bonus", "Build", "Cases", "Challenges", "Class", "Clip", "Clips", 
    "Course", "Courses", "Concepts", "Conclusion", "Continued", "Continuation", "Creative", 
    "Custom", "Design", "DIY", "Discussion", "Education", "End", "Entry", "Example", 
    "Examples", "Exercise", "Explanation", "Fail", "First", "Final", "Focus", "Fun", 
    "Fundamentals", "Games", "Gaming", "General", "Goals", "Guide", "Hacks", "Highlights", 
    "How-to", "Ideas", "Implementation", "Insights", "Intro", "Introduction", "Lecture", 
    "Lessons", "Methods", "Module", "Motivation", "Notes", "Objectives", "Overview", 
    "Part", "Perspective", "Principles", "Problems", "Project", "Projects", "Question", 
    "Questions", "Quiz", "Reading", "Recap", "Refresher", "Review", "Scope", "Second", 
    "Series", "Session", "Skills", "Solution", "Solutions", "Start", "Study", "Summary", 
    "Support", "Techniques", "Third", "Tips", "Topics", "Tools", "Tutorial", "Tutorials", 
    "Unit", "Update", "Ways", "Webinar", "Workshop", "Work", "Code", "Codes", "Feedback",
    "Check-in"
]

all_stopwords = set(stopwords.words("english")).union(additional_stop_words).union(generic_terms)

# Patterns for cleaning
patterns_to_remove = r"Lecture\s*\d+:|Lecture\s*\d+|^(\d+[\.\:\;\,\-\—\—]\s*)"
continuation_pattern = r"\b(cont\.?|contd\.?|continued)\b"
parentheses_pattern = r"\(.*?\)"
roman_numeral_pattern = r"\b[IVXLCDM]+\b"
number_pattern = r"\b\d+[\.\:\;\,]?\b"
symbol_cleanup_pattern = r"(?<!\w)[^\w\s/-]|[^\w\s/-](?!\w)"
short_title_pattern = r"^[A-Za-z0-9]{1,2}$"
max_words_per_title = 4

# Normalize Unicode and replace non-ASCII characters
def normalize_unicode(text):
    text = re.sub(r"[\u2013\u2014\u2015—]", "-", text)  # Normalize dashes
    text = unicodedata.normalize("NFKD", text)
    return "".join(c if ord(c) < 128 else " " for c in text)

# Preprocess title
def preprocess_title(title):
    title = normalize_unicode(title)
    title = re.sub(parentheses_pattern, "", title)
    title = re.sub(roman_numeral_pattern, "", title, flags=re.IGNORECASE)
    title = re.sub(number_pattern, "", title)
    title = re.sub(patterns_to_remove, "", title, flags=re.IGNORECASE)
    return title.strip()

# Filter keywords
def filter_keywords(title):
    title = preprocess_title(title)
    title_parts = re.split(r'[,:;]', title)
    filtered_parts = []
    for part in title_parts:
        part = re.sub(r"^\.\s*", "", part).strip()
        part = re.sub(r"^-+|-+$", "", part).strip()
        filtered_words = [
            re.sub(symbol_cleanup_pattern, "", word) for word in part.split()
            if word.lower() not in all_stopwords 
            and not re.search(short_title_pattern, word)
            and len(word) > 2
        ]
        if filtered_words and len(filtered_words) <= max_words_per_title:
            filtered_parts.append(" ".join(filtered_words))
    return filtered_parts

# Cleaning the course title
def clean_course_title(title):
    # Remove any content within brackets
    title = re.sub(r"\[.*?\]|\(.*?\)", "", title).strip()
    return normalize_unicode(title)

# Recursive data cleaning
def clean_data_recursive(data):
    cleaned_data = {}
    for key, value in data.items():
        if isinstance(value, list):  # Base case: process a list of entries
            cleaned_entries = []
            for entry in value:
                if "title" in entry and entry["title"]:
                    filtered_title_list = filter_keywords(entry["title"])
                    if filtered_title_list:
                        entry["title"] = filtered_title_list
                        # Clean the resource_course_title field
                        if "resource_course_title" in entry:
                            entry["resource_course_title"] = clean_course_title(entry["resource_course_title"])
                        cleaned_entries.append(entry)
            if cleaned_entries:
                cleaned_data[key] = cleaned_entries
        elif isinstance(value, dict):  # Recursive case: process nested subcategories
            cleaned_subcategory = clean_data_recursive(value)
            if cleaned_subcategory:
                cleaned_data[key] = cleaned_subcategory
    return cleaned_data

# Load and clean data
with open('lecture_videos_data.json', 'r') as file:
    data = json.load(file)

cleaned_data = clean_data_recursive(data)

# Save to JSON
with open('filtered_stem_lectures.json', 'w') as outfile:
    json.dump(cleaned_data, outfile, indent=4)
print("Filtered, de-duplicated, and cleaned titles have been saved to 'filtered_stem_lectures.json'")

Filtered, de-duplicated, and cleaned titles have been saved to 'filtered_stem_lectures.json'


In [67]:
def get_skeleton_with_counts(json_data, depth=1, max_depth=3):
    """
    Recursively generate the skeleton of the JSON object up to a specified depth,
    including counts of elements under the last represented layer.
    :param json_data: JSON object (list, dict, etc.)
    :param depth: Current depth in the JSON hierarchy
    :param max_depth: Maximum depth to process
    :return: Skeleton representation of the JSON object with counts
    """
    if depth > max_depth:
        # Add counts for lists and dictionaries beyond the max depth
        if isinstance(json_data, dict):
            return f"{len(json_data)} keys"
        elif isinstance(json_data, list):
            return f"{len(json_data)} items"
        else:
            return None  # Non-nested values are ignored

    if isinstance(json_data, dict):
        return {key: get_skeleton_with_counts(value, depth + 1, max_depth) for key, value in json_data.items()}
    elif isinstance(json_data, list):
        return [f"{len(json_data)} items"]
    else:
        return None  # Non-nested values are ignored for the skeleton


file_name = "filtered_stem_lectures.json"
with open(file_name, 'r') as file:
    json_data = json.load(file)

# Generate skeleton with counts for the first two layers
skeleton = get_skeleton_with_counts(json_data, max_depth=6)

print("Skeleton with counts:")
print(json.dumps(skeleton, indent=4))

Skeleton with counts:
{
    "Engineering": [
        "1081 items"
    ],
    "Science": [
        "1152 items"
    ],
    "Mathematics": [
        "806 items"
    ],
    "Computer Science": [
        "474 items"
    ],
    "Physics": [
        "641 items"
    ],
    "Mechanical Engineering": [
        "153 items"
    ],
    "Systems Engineering": [
        "235 items"
    ],
    "Electrical Engineering": [
        "225 items"
    ],
    "Differential Equations": [
        "405 items"
    ],
    "Probability and Statistics": [
        "233 items"
    ],
    "Linear Algebra": [
        "310 items"
    ],
    "Biology": [
        "259 items"
    ],
    "Calculus": [
        "322 items"
    ],
    "Earth Science": [
        "6 items"
    ],
    "Materials Science and Engineering": [
        "188 items"
    ],
    "Applied Mathematics": [
        "230 items"
    ],
    "Chemistry": [
        "309 items"
    ],
    "Algorithms and Data Structures": [
        "172 items"
    ],
    "Thermodyn

# Creating a hierarchical structure

Since we have a flat JSON structure, we would like to order it into a hierarchical format, nesting subcategories under their respective parent categories (e.g., "Physics" under "Science"). As we are exploring STEM, it is natural to have these categories as the highest order (with the exception of Technology being replaced with Computer Science for our case). It assigns parent category names to entries, merges related subcategories, and removes duplicate entries based on their titles and course names. The final structure ensures clear categorization and deduplication, improving the organization and usability of the data for downstream analysis.

The desired hierarchy is as follows, limited to 3 levels:
#### Science
- **Physics**
  - Atomic, Molecular, Optical Physics
  - Theoretical Physics
  - Condensed Matter Physics
  - Nuclear Physics
  - Particle Physics
  - Astrophysics
  - Quantum Mechanics
  - Electromagnetism
- **Chemistry**
  - Physical Chemistry
  - Analytical Chemistry
  - Organic Chemistry
  - Inorganic Chemistry
- **Biology**
  - Biochemistry
  - Molecular Biology
  - Genetics
  - Neurobiology
  - Structural Biology
  - Synthetic Biology
  - Microbiology
  - Virology
  - Anatomy and Physiology
- **Earth Science**

#### Engineering
- **Electrical Engineering**
  - Digital Systems
  - Signal Processing
  - Electronics
  - Telecommunications
  - Electric Power
- **Mechanical Engineering**
  - Solid Mechanics
  - Thermodynamics
  - Transport Processes
  - Robotics and Control Systems
- **Systems Engineering**
  - Systems Optimization
  - Systems Design
- **Materials Science and Engineering**
- **Chemical Engineering**
- **Environmental Engineering**
- **Energy**
- **Biological Engineering**
  - Cell and Tissue Engineering
  - Biomedical Signal and Image Processing

#### Mathematics
- **Calculus**
- **Differential Equations**
- **Probability and Statistics**
- **Linear Algebra**
- **Discrete Mathematics**
- **Mathematical Analysis**
- **Econometrics**
- **Applied Mathematics**

#### Computer Science
- **Algorithms and Data Structures**
- **Artificial Intelligence**
- **Programming Languages**
- **Computer Networks**
- **Theory of Computation**
- **Graphics and Visualization**
- **Game Design**
- **Data Mining**
- **Cryptography**
- **Computation**
- **Computation and Systems Biology**
- **Computational Biology**
- **Computational Science and Engineering**
- **Computational Modeling and Simulation**

In [68]:
def reorganize_json(data):
    """
    Reorganize the JSON file to properly nest categories, merge with main categories,
    and remove duplicates while assigning the parent category name as the key for entries.
    """

    def pop_category(data, category_name):
        """
        Pop a category from the data dictionary, returning an empty list if the category does not exist.
        """
        return data.pop(category_name, [])

    def merge_categories(main_category, new_subcategories, main_category_name=None):
        """
        Merge existing subcategories with new ones for a main category.
        If the main category is a list, wrap it in a dictionary using the main_category_name as the key.
        """
        if isinstance(main_category, list) and main_category_name:
            main_category = {main_category_name: main_category}
        elif isinstance(main_category, list):
            raise ValueError("main_category_name must be provided if main_category is a list.")
        merged = {**main_category, **new_subcategories}
        return merged

    def assign_parent_category(entries, parent_category):
        """
        Assign the parent category name as the key for entries.
        """
        return {parent_category: entries}

    def remove_duplicates_from_titles(category_list):
        """
        Remove duplicate entries based on `resource_course_title` and `title`.
        """
        seen = set()
        unique = []
        for entry in category_list:
            if isinstance(entry, dict):
                titles = tuple(entry.get("title", []))
                course = entry.get("resource_course_title", "")
                if (course, titles) not in seen:
                    seen.add((course, titles))
                    unique.append(entry)
        return unique

    # Merge with main categories and assign parent categories
    science = merge_categories(
        assign_parent_category(data.get("Science", []), "Science"),
        {
            "Physics": merge_categories(
                assign_parent_category(pop_category(data, "Physics"), "Physics"),
                {
                    "Atomic, Molecular, Optical Physics": assign_parent_category(pop_category(data, "Atomic, Molecular, Optical Physics"), "Atomic, Molecular, Optical Physics"),
                    "Theoretical Physics": assign_parent_category(pop_category(data, "Theoretical Physics"), "Theoretical Physics"),
                    "Condensed Matter Physics": assign_parent_category(pop_category(data, "Condensed Matter Physics"), "Condensed Matter Physics"),
                    "Nuclear Physics": assign_parent_category(pop_category(data, "Nuclear Physics"), "Nuclear Physics"),
                    "Particle Physics": assign_parent_category(pop_category(data, "Particle Physics"), "Particle Physics"),
                    "Astrophysics": assign_parent_category(pop_category(data, "Astrophysics"), "Astrophysics"),
                    "Quantum Mechanics": assign_parent_category(pop_category(data, "Quantum Mechanics"), "Quantum Mechanics"),
                    "Electromagnetism": assign_parent_category(pop_category(data, "Electromagnetism"), "Electromagnetism"),
                },
                main_category_name="Physics"
            ),
            "Chemistry": merge_categories(
                assign_parent_category(pop_category(data, "Chemistry"), "Chemistry"),
                {
                    "Physical Chemistry": assign_parent_category(pop_category(data, "Physical Chemistry"), "Physical Chemistry"),
                    "Analytical Chemistry": assign_parent_category(pop_category(data, "Analytical Chemistry"), "Analytical Chemistry"),
                    "Organic Chemistry": assign_parent_category(pop_category(data, "Organic Chemistry"), "Organic Chemistry"),
                    "Inorganic Chemistry": assign_parent_category(pop_category(data, "Inorganic Chemistry"), "Inorganic Chemistry"),
                },
                main_category_name="Chemistry"
            ),
            "Biology": merge_categories(
                assign_parent_category(pop_category(data, "Biology"), "Biology"),
                {
                    "Biochemistry": assign_parent_category(pop_category(data, "Biochemistry"), "Biochemistry"),
                    "Molecular Biology": assign_parent_category(pop_category(data, "Molecular Biology"), "Molecular Biology"),
                    "Genetics": assign_parent_category(pop_category(data, "Genetics"), "Genetics"),
                    "Neurobiology": assign_parent_category(pop_category(data, "Neurobiology"), "Neurobiology"),
                    "Structural Biology": assign_parent_category(pop_category(data, "Structural Biology"), "Structural Biology"),
                    "Synthetic Biology": assign_parent_category(pop_category(data, "Synthetic Biology"), "Synthetic Biology"),
                    "Microbiology": assign_parent_category(pop_category(data, "Microbiology"), "Microbiology"),
                    "Virology": assign_parent_category(pop_category(data, "Virology"), "Virology"),
                    "Anatomy and Physiology": assign_parent_category(pop_category(data, "Anatomy and Physiology"), "Anatomy and Physiology"),
                },
                main_category_name="Biology"
            ),
            "Earth Science": assign_parent_category(pop_category(data, "Earth Science"), "Earth Science"),
        },
        main_category_name="Science"
    )

    engineering = merge_categories(
        assign_parent_category(data.get("Engineering", []), "Engineering"),
        {
            "Electrical Engineering": merge_categories(
                assign_parent_category(pop_category(data, "Electrical Engineering"), "Electrical Engineering"),
                {
                    "Digital Systems": assign_parent_category(pop_category(data, "Digital Systems"), "Digital Systems"),
                    "Signal Processing": assign_parent_category(pop_category(data, "Signal Processing"), "Signal Processing"),
                    "Electronics": assign_parent_category(pop_category(data, "Electronics"), "Electronics"),
                    "Telecommunications": assign_parent_category(pop_category(data, "Telecommunications"), "Telecommunications"),
                    "Electric Power": assign_parent_category(pop_category(data, "Electric Power"), "Electric Power"),
                },
                main_category_name="Electrical Engineering"
            ),
            "Mechanical Engineering": merge_categories(
                assign_parent_category(pop_category(data, "Mechanical Engineering"), "Mechanical Engineering"),
                {
                    "Solid Mechanics": assign_parent_category(pop_category(data, "Solid Mechanics"), "Solid Mechanics"),
                    "Thermodynamics": assign_parent_category(pop_category(data, "Thermodynamics"), "Thermodynamics"),
                    "Transport Processes": assign_parent_category(pop_category(data, "Transport Processes"), "Transport Processes"),
                    "Robotics and Control Systems": assign_parent_category(pop_category(data, "Robotics and Control Systems"), "Robotics and Control Systems"),
                },
                main_category_name="Mechanical Engineering"
            ),
            "Systems Engineering": merge_categories(
                assign_parent_category(pop_category(data, "Systems Engineering"), "Systems Engineering"),
                {
                    "Systems Optimization": assign_parent_category(pop_category(data, "Systems Optimization"), "Systems Optimization"),
                    "Systems Design": assign_parent_category(pop_category(data, "Systems Design"), "Systems Design"),
                },
                main_category_name="Systems Engineering"
            ),
            "Materials Science and Engineering": assign_parent_category(pop_category(data, "Materials Science and Engineering"), "Materials Science and Engineering"),
            "Chemical Engineering": assign_parent_category(pop_category(data, "Chemical Engineering"), "Chemical Engineering"),
            "Environmental Engineering": assign_parent_category(pop_category(data, "Environmental Engineering"), "Environmental Engineering"),
            "Energy": assign_parent_category(pop_category(data, "Energy"), "Energy"),
            "Biological Engineering": merge_categories(
                assign_parent_category(pop_category(data, "Biological Engineering"), "Biological Engineering"),
                {
                    "Cell and Tissue Engineering": assign_parent_category(pop_category(data, "Cell and Tissue Engineering"), "Cell and Tissue Engineering"),
                    "Biomedical Signal and Image Processing": assign_parent_category(pop_category(data, "Biomedical Signal and Image Processing"), "Biomedical Signal and Image Processing"),
                },
                main_category_name="Biological Engineering"
            ),
        },
        main_category_name="Engineering"
    )

    mathematics = merge_categories(
        assign_parent_category(data.get("Mathematics", []), "Mathematics"),
        {
            "Calculus": assign_parent_category(pop_category(data, "Calculus"), "Calculus"),
            "Differential Equations": assign_parent_category(pop_category(data, "Differential Equations"), "Differential Equations"),
            "Probability and Statistics": assign_parent_category(pop_category(data, "Probability and Statistics"), "Probability and Statistics"),
            "Linear Algebra": assign_parent_category(pop_category(data, "Linear Algebra"), "Linear Algebra"),
            "Discrete Mathematics": assign_parent_category(pop_category(data, "Discrete Mathematics"), "Discrete Mathematics"),
            "Mathematical Analysis": assign_parent_category(pop_category(data, "Mathematical Analysis"), "Mathematical Analysis"),
            "Econometrics": assign_parent_category(pop_category(data, "Econometrics"), "Econometrics"),
            "Applied Mathematics": assign_parent_category(pop_category(data, "Applied Mathematics"), "Applied Mathematics"),
        },
        main_category_name="Mathematics"
    )

    computer_science = merge_categories(
        assign_parent_category(data.get("Computer Science", []), "Computer Science"),
        {
            "Algorithms and Data Structures": assign_parent_category(pop_category(data, "Algorithms and Data Structures"), "Algorithms and Data Structures"),
            "Artificial Intelligence": assign_parent_category(pop_category(data, "Artificial Intelligence"), "Artificial Intelligence"),
            "Programming Languages": assign_parent_category(pop_category(data, "Programming Languages"), "Programming Languages"),
            "Computer Networks": assign_parent_category(pop_category(data, "Computer Networks"), "Computer Networks"),
            "Theory of Computation": assign_parent_category(pop_category(data, "Theory of Computation"), "Theory of Computation"),
            "Graphics and Visualization": assign_parent_category(pop_category(data, "Graphics and Visualization"), "Graphics and Visualization"),
            "Game Design": assign_parent_category(pop_category(data, "Game Design"), "Game Design"),
            "Data Mining": assign_parent_category(pop_category(data, "Data Mining"), "Data Mining"),
            "Cryptography": assign_parent_category(pop_category(data, "Cryptography"), "Cryptography"),
            "Computation": assign_parent_category(pop_category(data, "Computation"), "Computation"),
            "Computation and Systems Biology": assign_parent_category(pop_category(data, "Computation and Systems Biology"), "Computation and Systems Biology"),
            "Computational Biology": assign_parent_category(pop_category(data, "Computational Biology"), "Computational Biology"),
            "Computational Science and Engineering": assign_parent_category(pop_category(data, "Computational Science and Engineering"), "Computational Science and Engineering"),
            "Computational Modeling and Simulation": assign_parent_category(pop_category(data, "Computational Modeling and Simulation"), "Computational Modeling and Simulation"),
        },
        main_category_name="Computer Science"
    )

    # Construct the final structure
    new_structure = {
        "Science": science,
        "Engineering": engineering,
        "Mathematics": mathematics,
        "Computer Science": computer_science,
    }

    # Remove duplicates in all categories
    for main_category, subcategories in new_structure.items():
        if isinstance(subcategories, dict):
            for subcat, items in subcategories.items():
                if isinstance(items, list):
                    new_structure[main_category][subcat] = remove_duplicates_from_titles(items)

    return new_structure

input_file = "filtered_stem_lectures.json"
output_file = "reorganized_stem_lectures.json"

with open(input_file, "r", encoding="utf-8") as file:
    json_data = json.load(file)

updated_json_data = reorganize_json(json_data)

with open(output_file, "w", encoding="utf-8") as file:
    json.dump(updated_json_data, file, ensure_ascii=False, indent=4)

print(f"Reorganized JSON saved to {output_file}")

Reorganized JSON saved to reorganized_stem_lectures.json


In [55]:
file_name = "reorganized_stem_lectures.json"
with open(file_name, 'r') as file:
    json_data = json.load(file)

skeleton = get_skeleton_with_counts(json_data, max_depth=6)
print("Skeleton with counts:")
print(json.dumps(skeleton, indent=4))

Skeleton with counts:
{
    "Science": {
        "Science": [
            "961 items"
        ],
        "Physics": {
            "Physics": [
                "641 items"
            ],
            "Atomic, Molecular, Optical Physics": {
                "Atomic, Molecular, Optical Physics": [
                    "109 items"
                ]
            },
            "Theoretical Physics": {
                "Theoretical Physics": [
                    "83 items"
                ]
            },
            "Condensed Matter Physics": {
                "Condensed Matter Physics": [
                    "23 items"
                ]
            },
            "Nuclear Physics": {
                "Nuclear Physics": [
                    "51 items"
                ]
            },
            "Particle Physics": {
                "Particle Physics": [
                    "94 items"
                ]
            },
            "Astrophysics": {
                "Astrophysics": [
             

In [71]:
def remove_duplicates_within_category(data, main_category):
    """
    Remove duplicate entries from a parent category while keeping entries in the most nested layers,
    with logging to track the removals.
    """
    if main_category not in data:
        return data

    def collect_nested_entries(subcategories, parent_name):
        """
        Recursively collect all entries from the most nested subcategories.
        """
        all_entries = set()
        for subcat_name, subcat_data in subcategories.items():
            if subcat_name == parent_name and isinstance(subcat_data, list):
                # Skip the general category's direct entries
                continue
            if isinstance(subcat_data, list):
                for entry in subcat_data:
                    entry_title = tuple(entry.get("title", []))
                    all_entries.add(entry_title)
            elif isinstance(subcat_data, dict):
                if subcat_name in subcat_data and isinstance(subcat_data[subcat_name], list):
                    for entry in subcat_data[subcat_name]:
                        entry_title = tuple(entry.get("title", []))
                        all_entries.add(entry_title)
                # Recursively collect from deeper subcategories
                all_entries.update(collect_nested_entries(subcat_data, subcat_name))
        return all_entries

    subcategories = data[main_category]
    all_nested_entries = collect_nested_entries(subcategories, main_category)

    # Remove duplicates from the parent category's direct entries
    general_entries = data[main_category].get(main_category, [])
    remaining_entries = []
    for entry in general_entries:
        entry_title = tuple(entry.get("title", []))
        if entry_title in all_nested_entries:
            print(f"Removing duplicate entry from '{main_category}': {entry_title}")
        else:
            remaining_entries.append(entry)
    data[main_category][main_category] = remaining_entries
    return data

input_file = "reorganized_stem_lectures.json"
with open(input_file, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

main_categories_to_process = ["Science", "Engineering", "Mathematics", "Computer Science"]
for main_category in main_categories_to_process:
    json_data = remove_duplicates_within_category(json_data, main_category)

output_file = "final_stem_lectures.json"
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(json_data, file, ensure_ascii=False, indent=4)

print(f"Deduplicated data saved to {output_file}. Check 'deduplication_log.txt' for details.")

Removing duplicate entry from 'Science': ('Molecular Dynamics',)
Removing duplicate entry from 'Science': ('Monte Carlo Simulations',)
Removing duplicate entry from 'Science': ('Free Energies Physical Coarse-Graining',)
Removing duplicate entry from 'Science': ('Potentials', 'Supercells', 'Relaxation', 'Methodology')
Removing duplicate entry from 'Science': ('Ab-Initio Thermodynamics Structure Prediction',)
Removing duplicate entry from 'Science': ('Accelerated Molecular Dynamics',)
Removing duplicate entry from 'Science': ('Studies High Pressure',)
Removing duplicate entry from 'Science': ('Potentials',)
Removing duplicate entry from 'Science': ('Energy',)
Removing duplicate entry from 'Science': ('Studies DFT',)
Removing duplicate entry from 'Science': ('Studies',)
Removing duplicate entry from 'Science': ('Finite Temperature',)
Removing duplicate entry from 'Science': ('Monte Carlo Simulation',)
Removing duplicate entry from 'Science': ('Model Hamiltonions',)
Removing duplicate entr

In [72]:
file_name = "final_stem_lectures.json"
with open(file_name, 'r') as file:
    json_data = json.load(file)

skeleton = get_skeleton_with_counts(json_data, max_depth=6)
print("Skeleton with counts:")
print(json.dumps(skeleton, indent=4))

Skeleton with counts:
{
    "Science": {
        "Science": [
            "227 items"
        ],
        "Physics": {
            "Physics": [
                "641 items"
            ],
            "Atomic, Molecular, Optical Physics": {
                "Atomic, Molecular, Optical Physics": [
                    "109 items"
                ]
            },
            "Theoretical Physics": {
                "Theoretical Physics": [
                    "83 items"
                ]
            },
            "Condensed Matter Physics": {
                "Condensed Matter Physics": [
                    "23 items"
                ]
            },
            "Nuclear Physics": {
                "Nuclear Physics": [
                    "51 items"
                ]
            },
            "Particle Physics": {
                "Particle Physics": [
                    "94 items"
                ]
            },
            "Astrophysics": {
                "Astrophysics": [
             

# Creating keyword lists per category

We now extract unique keywords from lecture titles across categories and subcategories, cleaning them by removing any words containing numbers. It organizes these keywords into Python lists, sorted and grouped by category, ensuring clarity and relevance. The resulting lists provide a streamlined view of keywords for each category. 

In [73]:
def clean_keywords_of_numbers(keywords):
    """
    Remove any words with numbers from the given list of keywords.
    """
    cleaned_keywords = []
    for keyword in keywords:
        if isinstance(keyword, str):
            cleaned_keyword = " ".join(
                word for word in keyword.split() if not re.search(r'\d', word)
            )
            if cleaned_keyword:
                cleaned_keywords.append(cleaned_keyword)
    return cleaned_keywords


def extract_cleaned_keywords_per_category(json_data):
    """
    Extract unique, cleaned keywords per category and subcategory from the JSON data.
    """
    category_keywords = {}

    def extract_keywords_recursively(category_name, entries):
        """
        Recursively extract and clean keywords for each category and its subcategories.
        """
        keywords = set()
        if isinstance(entries, list):
            for entry in entries:
                title = entry.get("title", [])
                if isinstance(title, list):
                    cleaned_title = clean_keywords_of_numbers(title)
                    keywords.update(cleaned_title)
        elif isinstance(entries, dict):
            for subcategory_name, sub_entries in entries.items():
                sub_keywords = extract_keywords_recursively(subcategory_name, sub_entries)
                subcategory_key = subcategory_name.lower().replace(" ", "_")
                category_keywords[subcategory_key] = sorted(sub_keywords)
                keywords.update(sub_keywords)
        return keywords

    for main_category, subcategories in json_data.items():
        main_category_key = main_category.lower().replace(" ", "_")
        main_keywords = extract_keywords_recursively(main_category, subcategories)
        category_keywords[main_category_key] = sorted(main_keywords)

    return category_keywords

file_name = "final_stem_lectures.json"
with open(file_name, 'r') as file:
    json_data = json.load(file)

cleaned_keywords_per_category = extract_cleaned_keywords_per_category(json_data)

output_file = "cleaned_keywords_per_category.txt"
with open(output_file, "w") as file:
    for category, keywords in cleaned_keywords_per_category.items():
        file.write(f"{category} = {keywords}\n\n")