In [None]:
# Get the 8th-grade-level and common English words

## import

In [13]:
import csv
import nltk
from collections import Counter
import pandas as pd
import os

input_filepath = 'work flow v6 data.csv'
output_filepath = 'filtered_medical_dictionary.csv'
# Purpose of this cell: remove common English words from a medical dictionary.
#   input_filepath  -- path to the original medical dictionary CSV
#   output_filepath -- path where the filtered CSV is saved

# Thresholds that define what counts as a "common" English word.
MIN_WORD_FREQUENCY = 50   # minimum number of occurrences in the Brown Corpus
MIN_WORD_LENGTH = 4       # shorter words are never put in the filter
MAX_WORD_LENGTH = 20      # longer words are never put in the filter
MAX_COMMON_WORDS = 3000   # upper bound on the size of the final filter set

# --- Part 1: Build the list of common English words to filter out ---

print("Step 1: Building a list of common English words...")

# nltk.data.find raises LookupError when a corpus is not installed, so probe
# each corpus and download only the ones that are missing. After the first
# successful run this block performs no network access.
missing_corpora = []
for corpus_name in ('brown', 'stopwords'):
    try:
        nltk.data.find(f'corpora/{corpus_name}')
    except LookupError:
        missing_corpora.append(corpus_name)

if missing_corpora:
    print(f"Downloading missing NLTK corpora: {', '.join(missing_corpora)}")
    for corpus_name in missing_corpora:
        # nltk.download returns False when the download did not succeed.
        if not nltk.download(corpus_name):
            raise LookupError(f"Could not download NLTK corpus '{corpus_name}'.")
else:
    print("NLTK corpora (brown, stopwords) already downloaded.")
from nltk.corpus import brown, stopwords

# Count every purely alphabetical, lower-cased token of the Brown Corpus.
all_words = [word.lower() for word in brown.words() if word.isalpha()]
word_freq = Counter(all_words)
stop_words = set(stopwords.words('english'))

# Keep words that are frequent enough, within the length bounds, and that are
# not stop words.
common_words_set = {
    word
    for word, freq in word_freq.items()
    if freq >= MIN_WORD_FREQUENCY
    and MIN_WORD_LENGTH <= len(word) <= MAX_WORD_LENGTH
    and word not in stop_words
}

# Rank by descending frequency and cap the list at MAX_COMMON_WORDS. Ties are
# broken alphabetically: set iteration order is not stable between runs, so
# without the tie-break the cap could select different words each time.
sorted_common_words = sorted(common_words_set, key=lambda w: (-word_freq[w], w))
final_filter_set = set(sorted_common_words[:MAX_COMMON_WORDS])

print(f"Created a filter list with {len(final_filter_set)} common words.")

# --- Part 2: Load and filter your medical dictionary ---

print(f"\nStep 2: Loading and filtering your medical dictionary from '{input_filepath}'...")

# The CSV is read without a header row; its data is exposed under the single
# column name 'Term'.
med_terms_df = pd.read_csv(input_filepath, names=['Term'], header=None)
original_term_count = med_terms_df.shape[0]

# Flag every row whose lower-cased term is one of the common English words,
# then keep only the rows that were NOT flagged.
lowercase_terms = med_terms_df['Term'].str.lower()
is_common_word = lowercase_terms.isin(final_filter_set)
filtered_df = med_terms_df.loc[~is_common_word]

filtered_term_count = filtered_df.shape[0]
words_removed_count = original_term_count - filtered_term_count

# Summary of how many terms were dropped.
summary_lines = [
    "\nFiltering complete!",
    f" - Original number of terms: {original_term_count}",
    f" - Number of common terms removed: {words_removed_count}",
    f" - Final number of terms: {filtered_term_count}",
]
print("\n".join(summary_lines))

# --- Part 3: Save the result ---

# index=False keeps the DataFrame's row index out of the file. The column
# header row is still written, since to_csv emits headers by default.
filtered_df.to_csv(output_filepath, index=False)

saved_message = f"\nStep 3: Successfully saved the filtered medical dictionary to '{output_filepath}'"
print(saved_message)


Step 1: Building a list of common English words...
NLTK corpora (brown, stopwords) already downloaded.
Created a filter list with 1939 common words.

Step 2: Loading and filtering your medical dictionary from 'work flow v6 data.csv'...

Filtering complete!
 - Original number of terms: 9365
 - Number of common terms removed: 30
 - Final number of terms: 9335

Step 3: Successfully saved the filtered medical dictionary to 'filtered_medical_dictionary.csv'
