In [1]:
import json
import re
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK stop-word corpus (NLTK reports when it is already present).
nltk.download("stopwords")

# Lecture boilerplate vocabulary that carries no subject information and is
# filtered out of titles in addition to the generic English stop words.
additional_stop_words = set(
    """
    advanced applications applied approaches basics beginning class concepts
    conclusion continuation continued discussion end essentials exam examining
    exercise explanation field final first focus fundamentals goals highlights
    insights introduction lecture methods module notes objectives outline
    overview part perspective principles recap refresher review scope second
    seminar series session start study summary third topics tutorial
    understanding unit update workshop
    """.split()
)
# Multi-word entries.
# NOTE(review): a single whitespace-delimited token can never equal one of
# these phrases, so they only matter where whole phrases are compared.
additional_stop_words.update({"applications of", "in context", "introduction to"})

# Combined lookup set: lecture boilerplate plus NLTK's English stop words.
all_stopwords = additional_stop_words | set(stopwords.words("english"))

# Lecture-numbering boilerplate: "Lecture 3:" / "Lecture 3" anywhere in the
# title, or a leading numeric prefix such as "3." / "3:" / "3-" (hyphen,
# en dash and em dash are all accepted as the separator).
patterns_to_remove = r"Lecture\s*\d+:?|^(\d+[\.\:\;\,\-\u2013\u2014]\s*)"

# Marker identifying continuation lectures: 'cont', 'cont.', 'contd',
# 'contd.' or 'continued' as a standalone word.
continuation_pattern = r"\b(cont\.?|contd\.?|continued)\b"

# Any parenthesised text (non-greedy, so each pair is matched on its own).
parentheses_pattern = r"\(.*?\)"
# A token in which a punctuation character is attached to word characters,
# e.g. "3.2", "x-ray" or "it's".
punctuation_pattern = r"\b\w*[^\w\s]\w*\b"
# A well-formed Roman numeral (I .. MMMCMXCIX) standing alone as a word.
# The strict thousands/hundreds/tens/units grammar keeps ordinary words that
# merely consist of these letters ("civil", "vivid", "dim", ...) from
# matching when the pattern is applied case-insensitively. The leading
# lookahead together with the trailing (?!\w) guarantees a non-empty match.
roman_numeral_pattern = (
    r"\b(?=[IVXLCDM])"
    r"M{0,3}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})"
    r"(?!\w)"
)


# Load the raw lecture metadata; the code below iterates it as a mapping of
# category to a list of lecture records. JSON text is UTF-8 by specification,
# so the encoding is stated explicitly instead of relying on the platform's
# locale default, which would garble non-ASCII titles on some systems.
with open('lecture_videos_data.json', 'r', encoding='utf-8') as file:
    stem_data = json.load(file)

def preprocess_title(title):
    """Strip lecture boilerplate from a raw title string.

    The substitutions run in a fixed order:

    1. text in parentheses (``parentheses_pattern``),
    2. tokens with attached punctuation (``punctuation_pattern``),
    3. standalone Roman numerals (``roman_numeral_pattern``),
    4. "Lecture N" markers and a leading numeric prefix
       (``patterns_to_remove``).

    Args:
        title: The raw lecture title.

    Returns:
        The cleaned title with surrounding whitespace removed. Inner
        whitespace left behind by the removals is not collapsed.
    """
    title = re.sub(parentheses_pattern, "", title)
    title = re.sub(punctuation_pattern, "", title)
    # NOTE(review): matching is case-insensitive, so lowercase tokens that
    # happen to spell a numeral (e.g. "mix", "vi") are removed as well.
    title = re.sub(roman_numeral_pattern, "", title, flags=re.IGNORECASE)
    # The numeric-prefix alternative is anchored with '^'. Earlier removals
    # (e.g. a leading "(Video) ") can leave whitespace in front of the
    # number, so trim the left edge first or the prefix would survive.
    title = re.sub(patterns_to_remove, "", title.lstrip(), flags=re.IGNORECASE)
    return title.strip()


def filter_keywords(title):
    """Break a lecture title into its meaningful keyword tokens.

    The title is first cleaned with ``preprocess_title`` and then split on
    commas, colons, semicolons and whitespace. A token is kept only if it is
    longer than two characters, is not purely numeric, and its lowercase form
    is not in ``all_stopwords``.

    Args:
        title: The raw lecture title.

    Returns:
        A list of the surviving tokens, in their original order and casing.
    """
    cleaned = preprocess_title(title)
    keywords = []
    for token in re.split(r'[,:;\s]', cleaned):
        # Short fragments (including the empty strings produced between
        # adjacent delimiters) and bare numbers carry no topic information.
        if len(token) <= 2 or token.isdigit():
            continue
        if token.lower() in all_stopwords:
            continue
        keywords.append(token.strip())
    return keywords


# Build, for every category, the list of lectures that survive cleaning:
# continuation lectures are dropped, titles are replaced by their keyword
# lists, and repeats of the same (course title, keywords) pair are skipped.
unique_lectures = {}
for category, lectures in stem_data.items():
    seen_titles = set()  # de-duplication is tracked separately per category
    filtered_lectures = []
    for lecture in lectures:
        raw_title = lecture["title"]
        if re.search(continuation_pattern, raw_title, re.IGNORECASE):
            continue

        filtered_title_list = filter_keywords(raw_title)
        if not filtered_title_list:
            # Nothing meaningful was left once the boilerplate was removed.
            continue

        unique_key = (lecture["resource_course_title"], tuple(filtered_title_list))
        if unique_key in seen_titles:
            continue
        seen_titles.add(unique_key)

        # The record keeps its other fields; only the title becomes a list.
        lecture["title"] = filtered_title_list
        filtered_lectures.append(lecture)

    unique_lectures[category] = filtered_lectures

with open('filtered_stem_lectures.json', 'w') as outfile:
    outfile.write(json.dumps(unique_lectures, indent=4))

print("Filtered, de-duplicated, and cleaned titles have been saved to 'filtered_stem_lectures.json'")


# Lead-in phrases to strip from the resource_course_title field. Longer
# phrases come before their prefixes ("Introduction to" before
# "Introduction") because the regex tries alternatives in order.
introductory_phrases = [
    r"Introduction to",
    r"Principles of",
    r"Basics of",
    r"Overview of",
    r"Essentials of",
    r"Introduction",
    r"Intro to",
    r"Introductory",
    r"Foundations of",
]
# One alternation of all phrases, bounded by word boundaries on both sides.
intro_pattern = r'\b(?:{})\b'.format('|'.join(introductory_phrases))

# Read back the filtered lectures written above for the course-title pass.
with open('filtered_stem_lectures.json', 'r') as file:
    stem_data = json.loads(file.read())


def extract_main_subject(title):
    """Reduce a course title to its main subject.

    Removes the lead-in phrases matched by ``intro_pattern``
    (case-insensitively) and any parenthesised text, then normalises
    whitespace.

    Args:
        title: The course title string.

    Returns:
        The cleaned title with runs of whitespace collapsed to single spaces
        and no leading or trailing whitespace.
    """
    title = re.sub(intro_pattern, '', title, flags=re.IGNORECASE)
    # Parenthesised text is removed wholesale (e.g. course codes).
    title = re.sub(r'\(.*?\)', '', title)
    # Removing a phrase from the middle of a title leaves doubled spaces
    # ("An  Algorithms"); split/join collapses them and trims both ends.
    return ' '.join(title.split())


# Replace every lecture's course title with its cleaned main subject.
for lectures in stem_data.values():
    for lecture in lectures:
        cleaned_course_title = extract_main_subject(lecture["resource_course_title"])
        lecture["resource_course_title"] = cleaned_course_title

# Overwrite the intermediate file with the fully cleaned data.
with open('filtered_stem_lectures.json', 'w') as outfile:
    outfile.write(json.dumps(stem_data, indent=4))

print("Final cleaned course titles have been saved to 'filtered_stem_lectures.json'")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kroknes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Filtered, de-duplicated, and cleaned titles have been saved to 'filtered_stem_lectures.json'
Final cleaned course titles have been saved to 'filtered_stem_lectures.json'
