This categorizes adverse events into four severity levels—most severe, severe, moderate severe, and less severe—using predefined keywords. It assigns severity labels to text data in the FOI_TEXT column and filters out unlabeled entries. The labeled dataset is saved as a CSV file, providing a structured resource for analyzing the distribution and impact of adverse events.

In [1]:
# manual Categorization 
import pandas as pd
data = pd.read_csv('finaldata.csv')

def label_severity(text):
    text = str(text).lower()  

    most_severe_keywords = [
        'death', 'life-threatening', 'fatal', 'cardiac arrest', 'heart failure',
        'sepsis', 'systemic infection', 'loss of capture'
    ]
    severe_keywords = [
        'device failure', 'injury', 'serious', 'device malfunction',
        'lead fracture', 'myocardial perforation', 'arrhythmias', 'device recall', 'device migration',
        'chest pain', 'syncope','severe infection', 'surgery' ,'prolapse recurrence', 'pocket erosion'
    ]
    moderate_severe_keywords = [
        'pain', 'bowel problem', 'dizziness', 'shortness of breath', 'implant failure',
        'sexual problem', 'dyspareunia', 'chronic inflammation', 'pulse', 'tachycardia', 'sensing',
        'incontinence', 'recurrence', 'urinary problem', 'mental stress','swelling', 'bleeding', 'discomfort', 'sleeping issue',
        'constipation', 'diarrhea','allergic reaction', 'lead dislodgement', 'infection at the implantation site',
        'twiddler’s syndrome', 'fatigue', 'arm pain', 'arthritis','nausea','heart rate','migration'
    ]
    less_severe_keywords = ['resolved', 'mild', 'minor', 'improvement', 'no complications', 'itching', 'shoulder pain']

    if any(keyword in text for keyword in most_severe_keywords):
        return 'most severe'
    elif any(keyword in text for keyword in severe_keywords):
        return 'severe'
    elif any(keyword in text for keyword in moderate_severe_keywords):
        return 'moderate severe'
    elif any(keyword in text for keyword in less_severe_keywords):
        return 'less severe'
    else:
        return None  

data['Severity_Label'] = data['FOI_TEXT'].apply(label_severity)

labeled_data = data[data['Severity_Label'].notnull()]

print(labeled_data[['FOI_TEXT', 'Severity_Label']].head())

label_counts = labeled_data['Severity_Label'].value_counts()
print("\nCounts of Severity Labels:")
print(label_counts)

labeled_data.to_csv('labeled_dataset.csv', index=False)


                                             FOI_TEXT   Severity_Label
9   IT WAS REPORTED THAT THE PATIENT PRESENTED IN ...  moderate severe
10  IT WAS REPORTED THAT THE RIGHT ATRIAL (RA) LEA...  moderate severe
14  DURING FOLLOW-UP, A LOSS OF CAPTURE WAS OBSERV...      most severe
18  RELATED MANUFACTURER REPORT NUMBER: 2017865-20...  moderate severe
24  RELATED MANUFACTURER REFERENCE NUMBER: 2017865...           severe

Counts of Severity Labels:
Severity_Label
moderate severe    1451
most severe         313
severe              250
less severe          51
Name: count, dtype: int64


### By Using frequency model

In [3]:
import pandas as pd
from collections import Counter

# Load your dataset
df = pd.read_csv('finaldata.csv')

# Define a list of important keywords for each severity category

severity_keywords = {
    'Most Severe': ['sepsis', 'cardiac arrest', 'lead fracture', 'pocket erosion','death', 'life-threatening', 'heart failure',
        'systemic infection', 'loss of capture'],
    'Severe': ['tachycardia', 'infection', 'shortness of breath', 'syncope', 'device malfunction','device failure', 'injury', 'serious', 'device malfunction',
        'lead fracture', 'myocardial perforation', 'arrhythmias',],
    'Moderate Severe': ['dizziness', 'pain', 'lead dislodgement', 'swelling', 'itching', 'urinary problem', 'mental stress','nausea','heart rate','migration'],
    'Less Severe': ['discomfort', 'fatigue', 'itching', 'chest pain', 'sensing' ,'no complications', 'shoulder pain']
}

# Function to count keyword frequency in FOI_TEXT
def count_keywords(text, keywords):
    text_lower = text.lower()  # Convert to lowercase for case-insensitive matching
    keyword_counter = Counter()
    
    # Debugging: Print the text being processed
    print(f"Processing text: {text_lower[:100]}...")  # Show first 100 characters
    
    for severity, words in keywords.items():
        for word in words:
            if word in text_lower:
                keyword_counter[severity] += 1
                
                # Debugging: Print keyword match
                print(f"Matched keyword: '{word}' for severity: '{severity}'")
    
    return keyword_counter

# Apply the function to count keywords and assign severity based on the highest frequency
def assign_severity(text):
    counts = count_keywords(text, severity_keywords)
    if counts:
        return counts.most_common(1)[0][0]  # Return the severity with the highest frequency
    return 'Unlabeled'

# Apply the severity assignment to the dataset
df['Severity_Label'] = df['FOI_TEXT'].apply(assign_severity)

# Display a portion of the dataset with severity labels
print(df[['FOI_TEXT', 'Severity_Label']].head())

# Save the labeled dataset
df.to_csv('labeled_by_frequency.csv', index=False)


Processing text: it was reported that the device was prophylactically explanted and replaced as it is subject to the ...
Processing text: it was reported that a patient presented remotely with a false positive premature battery depletion ...
Processing text: it was reported that the device was prophylactically explanted and replaced as it is subject to the ...
Processing text: the reported event was the prophylactic explant of a device that was subject to the zenex, assurity,...
Processing text: it was reported the asymptomatic patient presented for a lead revision. upon interrogation, it was o...
Processing text: it was reported that the device was explanted and replaced as it is subject to the zenex, assurity, ...
Processing text: if information is provided in the future, a supplemental report will be issued....
Processing text: medtronic is submitting this report to comply with fda reporting regulations under 21 cfr parts 4 an...
Processing text: it was reported that the device was 

In [4]:
import pandas as pd

# Assuming your dataset is already loaded and labeled with 'Severity_Label'
# df = pd.read_csv('finaldata.csv')

# Remove rows where the 'Severity_Label' is 'Unlabeled'
filtered_df = df[df['Severity_Label'] != 'Unlabeled']

# Save the filtered dataset to a new CSV file with only the labeled severities
filtered_df.to_csv('labeled_severity_data.csv', index=False)

# Display confirmation
print("Filtered dataset saved as 'labeled_severity_data.csv'")


Filtered dataset saved as 'labeled_severity_data.csv'
