In [None]:
import json
import re
import unicodedata
from nltk.corpus import stopwords
import nltk

In [159]:
nltk.download("stopwords")

# Define lecture-related stopwords and patterns to filter out
additional_stop_words = {
    "introduction", "overview", "basics", "principles", "fundamentals", "essentials",
    "concepts", "topics", "outline", "scope", "insights", "lecture", "session",
    "class", "seminar", "discussion", "tutorial", "workshop", "exercise", "module",
    "part", "series", "unit", "goals", "objectives", "summary", "review", "highlights",
    "conclusion", "recap", "continued", "continuation", "advanced", "update", "notes",
    "refresher", "start", "beginning", "end", "final", "first", "second", "third", 
    "perspective", "focus", "study", "methods", "approaches", "applications", "field",
    "understanding", "applications of", "in context", "applied", "introduction to", 
    "explanation", "examining", "exam", "goals", "and", "quiz", "presentation", 
    "presentations", "case", "cases", "bonus", "project", "projects", "problem",
    "problems", "course", "courses", "solution", "solutions", "general", 
    "examples", "example", "lectures", "question", "questions", "answer", "answers",
    "reading", "assignment", "assignments", "definition", "definitions", "reaction",
    "reactions", "education", "clip", "clips", "intro"
}
all_stopwords = set(stopwords.words("english")).union(additional_stop_words)

# Patterns to clean and normalize the titles
patterns_to_remove = r"Lecture\s*\d+:|Lecture\s*\d+|^(\d+[\.\:\;\,\-\—\—]\s*)"
continuation_pattern = r"\b(cont\.?|contd\.?|continued)\b"
parentheses_pattern = r"\(.*?\)"
roman_numeral_pattern = r"\b[IVXLCDM]+\b"
number_pattern = r"\b\d+[\.\:\;\,]?\b"
symbol_cleanup_pattern = r"(?<!\w)[^\w\s/-]|[^\w\s/-](?!\w)"
short_title_pattern = r"^[A-Za-z0-9]{1,2}$"  # Exclude titles like "A", "B2", "A2"

max_words_per_title = 4

with open('lecture_videos_data.json', 'r') as file:
    stem_data = json.load(file)

# Function to normalize Unicode symbols and replace non-ASCII characters with spaces
def normalize_unicode(text):
    # Replace known Unicode symbols with ASCII equivalents
    text = re.sub(r"[\u2013\u2014\u2015—]", "-", text)  # Normalize dashes to hyphen
    # Normalize and replace non-ASCII characters with a space
    text = unicodedata.normalize("NFKD", text)
    return "".join(c if ord(c) < 128 else " " for c in text)  # Replace non-ASCII with space

# Function to pre-process and clean the title field, including Unicode normalization
def preprocess_title(title):
    # Normalize Unicode symbols and replace non-ASCII characters with spaces
    title = normalize_unicode(title)
    # Remove parentheses content
    title = re.sub(parentheses_pattern, "", title)
    # Remove Roman numerals
    title = re.sub(roman_numeral_pattern, "", title, flags=re.IGNORECASE)
    # Remove standalone numbers and any trailing punctuation
    title = re.sub(number_pattern, "", title)
    # Remove specified patterns and numbers with punctuation
    title = re.sub(patterns_to_remove, "", title, flags=re.IGNORECASE)
    return title.strip()

# Function to filter and separate keywords within title fields, splitting only by commas, colons, and semicolons
def filter_keywords(title):
    title = preprocess_title(title)  # Preprocess the title with Unicode handling
    title_parts = re.split(r'[,:;]', title)  # Split by punctuation to separate distinct concepts
    filtered_parts = []
    for part in title_parts:
        # Remove isolated dots and leading/trailing hyphens within each part
        part = re.sub(r"^\.\s*", "", part).strip()
        part = re.sub(r"^-+|-+$", "", part).strip()  # Remove leading/trailing hyphens
        filtered_words = [
            re.sub(symbol_cleanup_pattern, "", word) for word in part.split()
            if word.lower() not in all_stopwords 
            and not re.search(short_title_pattern, word)  # Exclude short titles
            and len(word) > 2  # Exclude single-character words
        ]
        # Rejoin words if they are below the word count threshold
        if filtered_words and len(filtered_words) <= max_words_per_title:
            filtered_parts.append(" ".join(filtered_words))
    return filtered_parts

# Function to remove words starting or ending with punctuation
def remove_punctuated_words(title_list):
    cleaned_title_list = [
        word for word in title_list if not re.search(r"^[^\w]|[^\w]$", word)
    ]
    return cleaned_title_list

# Process JSON data to filter out keywords in the title field and exclude continuation entries
unique_lectures = {}
for category, lectures in stem_data.items():
    filtered_lectures = []
    seen_titles = set()
    for lecture in lectures:
        if re.search(continuation_pattern, lecture["title"], re.IGNORECASE):  # Skip continuation markers
            continue
        filtered_title_list = filter_keywords(lecture["title"])  # Filter keywords
        filtered_title_list = remove_punctuated_words(filtered_title_list)  # Remove punctuated words
        if not filtered_title_list:  # Exclude lecture entry if all titles were removed
            continue
        unique_key = (lecture["resource_course_title"], tuple(filtered_title_list))
        if unique_key not in seen_titles:
            lecture["title"] = filtered_title_list
            filtered_lectures.append(lecture)
            seen_titles.add(unique_key)
    unique_lectures[category] = filtered_lectures

with open('filtered_stem_lectures.json', 'w') as outfile:
    json.dump(unique_lectures, outfile, indent=4)
print("Filtered, de-duplicated, and cleaned titles have been saved to 'filtered_stem_lectures.json'")

[nltk_data] Downloading package stopwords to /Users/yasminekroknes-
[nltk_data]     gomez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Filtered, de-duplicated, and cleaned titles have been saved to 'filtered_stem_lectures.json'


In [160]:
def deduplicate_within_category(lectures):
    """
    Deduplicate lectures within a single category based on title and course title.
    """
    seen_titles = set()
    filtered_lectures = []
    for lecture in lectures:
        title_clean = lecture["title"]  # Assume titles are already preprocessed
        unique_key = (lecture["resource_course_title"], tuple(title_clean))
        if unique_key not in seen_titles:
            filtered_lectures.append(lecture)
            seen_titles.add(unique_key)
    return filtered_lectures


def analyze_and_nest_categories(data, threshold=0.9):
    """
    Analyze duplicate overlaps across categories and nest highly overlapping ones under a parent.
    """
    category_items = {cat: {tuple(l["title"]) for l in lectures} for cat, lectures in data.items()}
    parent_map = {}  # Maps categories to their parents

    # Identify parent-child relationships based on overlap
    for cat1, items1 in category_items.items():
        for cat2, items2 in category_items.items():
            if cat1 != cat2 and cat1 not in parent_map:
                overlap = items1 & items2
                overlap_ratio = len(overlap) / len(items2) if items2 else 0
                if overlap_ratio >= threshold:
                    parent_map[cat2] = cat1

    # Create nested structure
    nested_data = {}
    for category, lectures in data.items():
        parent = parent_map.get(category, None)
        if parent:
            # Ensure the parent category is initialized as a dictionary to hold subcategories
            if parent not in nested_data:
                nested_data[parent] = {"own_lectures": []}
            nested_data[parent][category] = lectures
        else:
            # Add non-child categories directly
            if category not in nested_data:
                nested_data[category] = {"own_lectures": lectures}
    return nested_data


def clean_parent_categories(nested_data):
    """
    Remove duplicates from parent categories that exist in nested subcategories.
    """
    for parent, subcategories in nested_data.items():
        if isinstance(subcategories, dict):
            # Collect all titles from subcategories
            child_titles = set()
            for child, lectures in subcategories.items():
                if child != "own_lectures":
                    child_titles.update(tuple(l["title"]) for l in lectures)
            # Remove duplicates in parent's own lectures
            if "own_lectures" in subcategories:
                subcategories["own_lectures"] = [
                    lecture for lecture in subcategories["own_lectures"]
                    if tuple(lecture["title"]) not in child_titles
                ]
    return nested_data


# Main processing logic
with open('filtered_stem_lectures.json', 'r') as file:
    stem_data = json.load(file)

# Deduplicate within categories
deduplicated_data = {
    category: deduplicate_within_category(lectures)
    for category, lectures in stem_data.items()
}

# Nest overlapping categories
nested_data = analyze_and_nest_categories(deduplicated_data)

# Clean parent categories by removing duplicates present in children
final_data = clean_parent_categories(nested_data)

# Save the final processed data
with open('filtered_stem_lectures_initial_nest.json', 'w') as outfile:
    json.dump(final_data, outfile, indent=4)

print("Final nested and deduplicated categories saved to 'filtered_stem_lectures_initial_nest.json'")

Final nested and deduplicated categories saved to 'filtered_stem_lectures_initial_nest.json'


In [161]:
def remove_empty_own_lectures(data):
    """
    Remove entries where the entry contains only 'own_lectures' as a subcategory and it's empty.
    :param data: Nested dictionary of categories.
    :return: Cleaned dictionary with invalid entries removed.
    """
    if isinstance(data, dict):
        cleaned_data = {}
        for key, value in data.items():
            if isinstance(value, dict):
                subkeys = value.keys()
                # Check if the entry contains only 'own_lectures' and it's empty
                if "own_lectures" in subkeys and len(subkeys) == 1:
                    if isinstance(value["own_lectures"], (list, dict)) and not value["own_lectures"]:
                        print(f"Removing {key} because it only contains 'own_lectures' and it is empty.")
                        continue
                    if isinstance(value["own_lectures"], str) and value["own_lectures"] == "0 items":
                        print(f"Removing {key} because it only contains 'own_lectures' and it is '0 items'.")
                        continue
                # Recursively clean subcategories
                cleaned_value = remove_empty_own_lectures(value)
                if cleaned_value:
                    cleaned_data[key] = cleaned_value
            else:
                # Add non-dict values as is
                cleaned_data[key] = value
        return cleaned_data

    elif isinstance(data, list):
        # Recursively clean items in lists
        return [remove_empty_own_lectures(item) for item in data if item]

    # For other types (e.g., strings, numbers), return as-is
    return data

# File names
input_file = "filtered_stem_lectures_initial_nest.json"
output_file = "filtered_stem_lectures_no_empty.json"

# Load the JSON file with the current data
with open(input_file, "r", encoding="utf-8") as file:
    nested_data = json.load(file)

# Remove entries with only empty 'own_lectures'
cleaned_data = remove_empty_own_lectures(nested_data)

# Save the cleaned data back to the JSON file
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(cleaned_data, file, ensure_ascii=False, indent=4)

print(f"Entries with only empty 'own_lectures' removed and saved to '{output_file}'.")

Removing Cognitive Science because it only contains 'own_lectures' and it is empty.
Removing Fluid Mechanics because it only contains 'own_lectures' and it is empty.
Removing Mechanical Design because it only contains 'own_lectures' and it is empty.
Removing Ocean Engineering because it only contains 'own_lectures' and it is empty.
Removing Civil Engineering because it only contains 'own_lectures' and it is empty.
Removing Geophysics because it only contains 'own_lectures' and it is empty.
Removing Algebra and Number Theory because it only contains 'own_lectures' and it is empty.
Removing Hydrodynamics because it only contains 'own_lectures' and it is empty.
Removing Topology and Geometry because it only contains 'own_lectures' and it is empty.
Removing Geology because it only contains 'own_lectures' and it is empty.
Removing Propulsion Systems because it only contains 'own_lectures' and it is empty.
Removing Materials Selection because it only contains 'own_lectures' and it is empty.


In [162]:
def get_skeleton_with_counts(json_data, depth=1, max_depth=3):
    """
    Recursively generate the skeleton of the JSON object up to a specified depth,
    including counts of elements under the last represented layer.
    :param json_data: JSON object (list, dict, etc.)
    :param depth: Current depth in the JSON hierarchy
    :param max_depth: Maximum depth to process
    :return: Skeleton representation of the JSON object with counts
    """
    if depth > max_depth:
        # Add counts for lists and dictionaries beyond the max depth
        if isinstance(json_data, dict):
            return f"{len(json_data)} keys"
        elif isinstance(json_data, list):
            return f"{len(json_data)} items"
        else:
            return None  # Non-nested values are ignored

    if isinstance(json_data, dict):
        return {key: get_skeleton_with_counts(value, depth + 1, max_depth) for key, value in json_data.items()}
    elif isinstance(json_data, list):
        return [f"{len(json_data)} items"]
    else:
        return None  # Non-nested values are ignored for the skeleton

# Load JSON file
file_name = "filtered_stem_lectures_no_empty.json"  # Replace with your JSON file name
with open(file_name, 'r') as file:
    json_data = json.load(file)

# Generate skeleton with counts for the first two layers
skeleton = get_skeleton_with_counts(json_data, max_depth=6)

# Print the skeleton with counts
print("Skeleton with counts:")
print(json.dumps(skeleton, indent=4))

Skeleton with counts:
{
    "Engineering": {
        "own_lectures": [
            "848 items"
        ],
        "Telecommunications": [
            "53 items"
        ],
        "Graphics and Visualization": [
            "33 items"
        ]
    },
    "Science": {
        "own_lectures": [
            "668 items"
        ],
        "Classical Mechanics": [
            "235 items"
        ],
        "Nuclear Physics": [
            "42 items"
        ],
        "Astrophysics": [
            "22 items"
        ]
    },
    "Mathematics": {
        "own_lectures": [
            "588 items"
        ],
        "Computation": [
            "100 items"
        ]
    },
    "Computer Science": {
        "own_lectures": [
            "336 items"
        ],
        "Software Design and Engineering": [
            "95 items"
        ],
        "Computer Design and Engineering": [
            "22 items"
        ],
        "Digital Media": [
            "21 items"
        ],
        "Data Minin

In [163]:
def reorganize_json(data):
    """
    Reorganize the JSON file to properly nest categories and remove duplicates.
    """
    def pop_category(data, category_name):
        """
        Pop a category from the data dictionary, returning an empty structure if the category does not exist.
        """
        return data.pop(category_name, {})

    def merge_categories(main_category, new_subcategories):
        """
        Merge existing subcategories with new ones for a main category.
        """
        merged = {**main_category, **new_subcategories}
        return merged

    def remove_empty_own_lectures(category):
        """
        Remove categories with empty own_lectures and no subcategories.
        """
        if isinstance(category, dict):
            if "own_lectures" in category and not category["own_lectures"] and len(category) == 1:
                return None  # Remove this category
            cleaned_category = {key: remove_empty_own_lectures(value) for key, value in category.items()}
            return {k: v for k, v in cleaned_category.items() if v is not None}
        return category

    def remove_duplicates_from_parent(moved_category, parent_category):
        """
        Remove duplicate lectures from the parent category after moving to a subcategory.
        """
        if not moved_category or "own_lectures" not in moved_category:
            return

        # Collect all items from the moved category
        moved_items = set()
        for item in moved_category["own_lectures"]:
            if isinstance(item, dict):
                resource_course_title = item.get("resource_course_title", "")
                title = item.get("title", "")
                
                # Ensure title is a string; if it's a list, join it into a single string
                if isinstance(title, list):
                    title = " ".join(title)
                    
                moved_items.add((resource_course_title, title))

        def clean_category_recursively(category):
            """
            Recursively clean duplicates from all subcategories in the parent.
            """
            if isinstance(category, dict):
                # Clean duplicates in 'own_lectures'
                if "own_lectures" in category:
                    category["own_lectures"] = [
                        item for item in category["own_lectures"]
                        if not isinstance(item, dict) or 
                        (item.get("resource_course_title"), 
                        " ".join(item.get("title")) if isinstance(item.get("title"), list) else item.get("title")) 
                        not in moved_items
                    ]
                # Recursively clean subcategories
                for subcat in category.values():
                    clean_category_recursively(subcat)

        clean_category_recursively(parent_category)


    # Retrieve main categories without removing their existing structure
    science = data.get("Science", {}).copy()
    engineering = data.get("Engineering", {}).copy()
    mathematics = data.get("Mathematics", {}).copy()
    computer_science = data.get("Computer Science", {}).copy()

    # Define the new structure
    new_structure = {
        "Science": merge_categories(
            science,
            {
                "Physics": merge_categories(
                    science.get("Physics", {}),
                    {
                        "own_lectures": pop_category(data, "Physics").get("own_lectures", []),
                        "Atomic, Molecular, Optical Physics": pop_category(data, "Atomic, Molecular, Optical Physics"),
                        "Theoretical Physics": pop_category(data, "Theoretical Physics"),
                        "Condensed Matter Physics": pop_category(data, "Condensed Matter Physics")
                    }
                ),
                "Chemistry": pop_category(data, "Chemistry"),
                "Biology": merge_categories(
                    pop_category(data, "Biology"),
                    {"Biochemistry": pop_category(data, "Biochemistry")}
                )
            }
        ),
        "Engineering": merge_categories(
            engineering,
            {
                "Electrical Engineering": merge_categories(
                    engineering.get("Electrical Engineering", {}),
                    {
                        "own_lectures": pop_category(data, "Electrical Engineering").get("own_lectures", []),
                        "Digital Systems": pop_category(data, "Digital Systems"),
                        "Signal Processing": pop_category(data, "Signal Processing")
                    }
                ),
                "Mechanical Engineering": merge_categories(
                    engineering.get("Mechanical Engineering", {}),
                    {"own_lectures": pop_category(data, "Mechanical Engineering").get("own_lectures", [])}
                ),
                "Systems Engineering": merge_categories(
                    engineering.get("Systems Engineering", {}),
                    {"Systems Optimization": pop_category(data, "Systems Optimization")}
                ),
                "Materials Science and Engineering": pop_category(data, "Materials Science and Engineering"),
                "Chemical Engineering": pop_category(data, "Chemical Engineering"),
                "Environmental Engineering": pop_category(data, "Environmental Engineering"),
                "Energy": pop_category(data, "Energy"),
                "Biological Engineering": pop_category(data, "Biological Engineering")
            }
        ),
        "Mathematics": merge_categories(
            mathematics,
            {
                "Calculus": pop_category(data, "Calculus"),
                "Differential Equations": pop_category(data, "Differential Equations"),
                "Probability and Statistics": pop_category(data, "Probability and Statistics"),
                "Linear Algebra": pop_category(data, "Linear Algebra"),
                "Discrete Mathematics": pop_category(data, "Discrete Mathematics"),
                "Mathematical Analysis": pop_category(data, "Mathematical Analysis"),
                "Econometrics": pop_category(data, "Econometrics")
            }
        ),
        "Computer Science": merge_categories(
            computer_science,
            {
                "Algorithms and Data Structures": pop_category(data, "Algorithms and Data Structures"),
                "Artificial Intelligence": pop_category(data, "Artificial Intelligence"),
                "Programming Languages": pop_category(data, "Programming Languages"),
                "Computer Networks": pop_category(data, "Computer Networks"),
                "Theory of Computation": pop_category(data, "Theory of Computation")
            }
        )
    }

    # Move Nuclear Physics, Particle Physics, and Astrophysics under Physics
    physics = new_structure["Science"]["Physics"]
    for subcat in ["Nuclear Physics", "Particle Physics", "Astrophysics"]:
        if subcat in science:
            subcat_data = pop_category(science, subcat)
            if isinstance(subcat_data, dict):
                physics[subcat] = subcat_data
                remove_duplicates_from_parent(subcat_data, science)
            elif isinstance(subcat_data, list):  # If it's a list of lectures
                physics[subcat] = {"own_lectures": subcat_data}
                remove_duplicates_from_parent({"own_lectures": subcat_data}, science)

    # Remove empty own_lectures and return the updated structure
    return remove_empty_own_lectures(new_structure)


# Load the existing JSON file
input_file = "filtered_stem_lectures_no_empty.json"
output_file = "reorganized_stem_lectures_no_duplicates.json"

with open(input_file, "r", encoding="utf-8") as file:
    json_data = json.load(file)

# Reorganize the JSON structure
updated_json_data = reorganize_json(json_data)

# Save the updated JSON structure
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(updated_json_data, file, ensure_ascii=False, indent=4)

print(f"Reorganized JSON with duplicates removed saved to {output_file}")

Reorganized JSON with duplicates removed saved to reorganized_stem_lectures_no_duplicates.json


In [164]:
def remove_moved_physics_categories(science_category, moved_categories):
    """
    Remove specified moved physics subcategories from the higher-level Science category.
    
    :param science_category: The Science category in the JSON data.
    :param moved_categories: A list of subcategory names to remove from Science.
    """
    for category in moved_categories:
        if category in science_category:
            science_category.pop(category)

# Run this function after the reorganization
moved_physics_categories = ["Nuclear Physics", "Particle Physics", "Astrophysics"]

# Access the Science category from the updated JSON structure
science = updated_json_data.get("Science", {})

# Remove the moved physics categories from Science
remove_moved_physics_categories(science, moved_physics_categories)

# Save the cleaned-up JSON structure
output_file_cleaned = "reorganized_stem_lectures_cleaned.json"
with open(output_file_cleaned, "w", encoding="utf-8") as file:
    json.dump(updated_json_data, file, ensure_ascii=False, indent=4)

print(f"Physics categories removed from Science. Cleaned JSON saved to {output_file_cleaned}")


Physics categories removed from Science. Cleaned JSON saved to reorganized_stem_lectures_cleaned.json


In [165]:
# Load JSON file
file_name = "reorganized_stem_lectures_cleaned.json"  # Replace with your JSON file name
with open(file_name, 'r') as file:
    json_data = json.load(file)

# Generate skeleton with counts for the first two layers
skeleton = get_skeleton_with_counts(json_data, max_depth=6)

# Print the skeleton with counts
print("Skeleton of the first two layers with counts:")
print(json.dumps(skeleton, indent=4))

Skeleton of the first two layers with counts:
{
    "Science": {
        "own_lectures": [
            "668 items"
        ],
        "Classical Mechanics": [
            "235 items"
        ],
        "Physics": {
            "own_lectures": [
                "301 items"
            ],
            "Atomic, Molecular, Optical Physics": {
                "own_lectures": [
                    "0 items"
                ],
                "Electromagnetism": [
                    "23 items"
                ],
                "Transport Processes": [
                    "14 items"
                ],
                "Electronic Materials": [
                    "14 items"
                ],
                "Electric Power": [
                    "14 items"
                ]
            },
            "Theoretical Physics": {},
            "Condensed Matter Physics": {},
            "Nuclear Physics": {
                "own_lectures": [
                    "42 items"
                ]
       

In [166]:
def remove_empty_own_lectures(data):
    """
    Recursively remove `own_lectures` fields that are empty lists.
    """
    if isinstance(data, dict):
        cleaned_data = {}
        for key, value in data.items():
            if key == "own_lectures" and isinstance(value, list) and not value:
                continue  # Skip empty own_lectures
            cleaned_data[key] = remove_empty_own_lectures(value)
        return cleaned_data
    elif isinstance(data, list):
        return [remove_empty_own_lectures(item) for item in data]
    return data

# Load JSON file
file_name = "reorganized_stem_lectures_cleaned.json"  # Replace with your file name
with open(file_name, 'r') as file:
    json_data = json.load(file)

# Remove empty `own_lectures`
cleaned_data = remove_empty_own_lectures(json_data)

# Save the updated JSON
output_file = "reorganized_stem_lectures_cleaned.json"
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(cleaned_data, file, ensure_ascii=False, indent=4)

print(f"Empty `own_lectures` removed. Saved to {output_file}.")

Empty `own_lectures` removed. Saved to reorganized_stem_lectures_cleaned.json.


In [155]:
# Load JSON file
file_name = "cleaned_no_empty_own_lectures.json"  # Replace with your JSON file name
with open(file_name, 'r') as file:
    json_data = json.load(file)

# Generate skeleton with counts for the first two layers
skeleton = get_skeleton_with_counts(json_data, max_depth=6)

# Print the skeleton with counts
print("Skeleton of the first two layers with counts:")
print(json.dumps(skeleton, indent=4))

Skeleton of the first two layers with counts:
{
    "Science": {
        "own_lectures": [
            "668 items"
        ],
        "Classical Mechanics": [
            "235 items"
        ],
        "Physics": {
            "own_lectures": [
                "301 items"
            ],
            "Atomic, Molecular, Optical Physics": {
                "Electromagnetism": [
                    "23 items"
                ],
                "Transport Processes": [
                    "14 items"
                ],
                "Electronic Materials": [
                    "14 items"
                ],
                "Electric Power": [
                    "14 items"
                ]
            },
            "Theoretical Physics": {},
            "Condensed Matter Physics": {},
            "Nuclear Physics": {
                "own_lectures": [
                    "42 items"
                ]
            },
            "Astrophysics": {
                "own_lectures": [
           

In [167]:
def rename_own_lectures_to_parent(data, parent_name=None):
    """
    Recursively rename `own_lectures` to the parent category name if the list is not empty.
    """
    if isinstance(data, dict):
        updated_data = {}
        for key, value in data.items():
            if key == "own_lectures" and isinstance(value, list) and value:
                updated_data[parent_name] = value  # Rename to parent name
            else:
                updated_data[key] = rename_own_lectures_to_parent(value, parent_name=key)
        return updated_data
    elif isinstance(data, list):
        return [rename_own_lectures_to_parent(item, parent_name) for item in data]
    return data

# Load JSON file
file_name = "reorganized_stem_lectures_cleaned.json"  # Use the file from the previous step
with open(file_name, 'r') as file:
    json_data = json.load(file)

# Rename `own_lectures` to parent category name
renamed_data = rename_own_lectures_to_parent(json_data)

# Save the updated JSON
output_file = "final_stem_lectures.json"
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(renamed_data, file, ensure_ascii=False, indent=4)

print(f"`own_lectures` renamed to parent names where applicable. Saved to {output_file}.")

`own_lectures` renamed to parent names where applicable. Saved to final_stem_lectures.json.


In [168]:
# Load JSON file
file_name = "final_stem_lectures.json"  # Replace with your JSON file name
with open(file_name, 'r') as file:
    json_data = json.load(file)

# Generate skeleton with counts for the first two layers
skeleton = get_skeleton_with_counts(json_data, max_depth=6)

# Print the skeleton with counts
print("Skeleton of the first two layers with counts:")
print(json.dumps(skeleton, indent=4))

Skeleton of the first two layers with counts:
{
    "Science": {
        "Science": [
            "668 items"
        ],
        "Classical Mechanics": [
            "235 items"
        ],
        "Physics": {
            "Physics": [
                "301 items"
            ],
            "Atomic, Molecular, Optical Physics": {
                "Electromagnetism": [
                    "23 items"
                ],
                "Transport Processes": [
                    "14 items"
                ],
                "Electronic Materials": [
                    "14 items"
                ],
                "Electric Power": [
                    "14 items"
                ]
            },
            "Theoretical Physics": {},
            "Condensed Matter Physics": {},
            "Nuclear Physics": {
                "Nuclear Physics": [
                    "42 items"
                ]
            },
            "Astrophysics": {
                "Astrophysics": [
                  

In [169]:
import random

# Load the JSON data
file_name = "final_stem_lectures.json"  # Replace with your file name
with open(file_name, 'r') as file:
    json_data = json.load(file)

# Function to extract a random set of unique words from a given category
def get_random_unique_words(data, main_category, sub_category, word_count=150):
    """
    Extracts a random set of unique words from the 'title' sections in a given category.
    """
    words = set()  # Use a set to ensure uniqueness
    # Retrieve the specified subcategory
    if main_category in data and sub_category in data[main_category]:
        subcategory_data = data[main_category][sub_category]
        # Collect all unique words from the 'title' lists
        for entry in subcategory_data:
            words.update(entry.get("title", []))
    # Randomly sample words
    return random.sample(list(words), min(word_count, len(words)))

# Categories to extract from
categories = [
    ("Science", "Science"),
    ("Mathematics", "Mathematics"),
    ("Computer Science", "Computer Science"),
    ("Engineering", "Engineering"),
]

# Extract random unique words for each category
random_words = {}
for main_category, sub_category in categories:
    random_words[f"{main_category} - {sub_category}"] = get_random_unique_words(
        json_data, main_category, sub_category, word_count=150
    )

# Save the extracted unique words to a file
output_file = "random_unique_words_from_categories.json"
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(random_words, file, ensure_ascii=False, indent=4)

print(f"Random unique words extracted and saved to {output_file}.")


Random unique words extracted and saved to random_unique_words_from_categories.json.
