In [1]:
import pandas as pd
# Load your data
data = pd.read_csv('Opioid Prediction/test_healthcare_opioid_prediction_data.csv')
data.head(1)

Unnamed: 0,PatientID,Age,Gender,Race,ZipCode,ChronicPainConditions,NumOpioidPrescriptions,AverageDosage,DurationOfPrescriptions,NumHealthcareVisits,...,Refills,MedicationClass,Adherence,ClinicalNotes,Specialty,AppointmentType,SubSpecialty,TimeofAppointment,TimeSeenbyPhysician,TotalTimeSpentwithPhysician
0,2491941032034,22,Male,White,10001,Arthritis,12,7,25,8,...,3,Opioid,Low,Prescribed Oxymorphone for severe pain.,Primary Care,Consultation,Specialized,06:47:54,05:23:03,28


In [2]:
columns_to_remove = ['PatientID', 'PrescriptionDate', 'ClinicalNotes', 
                     'TimeofAppointment', 'TimeSeenbyPhysician', 
                     'TotalTimeSpentwithPhysician', 'ZipCode', 
                     'Specialty', 'SubSpecialty']

data = data.drop(columns=columns_to_remove)
data.head()

Unnamed: 0,Age,Gender,Race,ChronicPainConditions,NumOpioidPrescriptions,AverageDosage,DurationOfPrescriptions,NumHealthcareVisits,NumHospitalizations,PainManagementTreatment,MedicationName,Dosage,Frequency,Duration,Refills,MedicationClass,Adherence,AppointmentType
0,22,Male,White,Arthritis,12,7,25,8,1,Yes,Tramadol,20 mg,every 8 hours,20,3,Opioid,Low,Consultation
1,75,Male,Hispanic,Post-Surgery Pain,14,21,20,12,1,No,Hydromorphone,10 mg,every 4-6 hours,10,0,Opioid,High,Routine Check-up
2,46,Female,Hispanic,Arthritis,14,87,21,16,2,No,Tramadol,2.5 mg,every 8 hours,11,2,Analgesic,Low,Follow-up
3,33,Male,Other,Cancer Pain,19,41,19,19,3,No,Tramadol,40 mg,every 4-6 hours,18,1,Opioid,High,Follow-up
4,23,Male,Black,Fibromyalgia,5,9,18,3,3,No,Codeine,10 mg,every 12 hours,17,2,Analgesic,High,Consultation


In [3]:
import pandas as pd
# Select categorical columns
categorical_columns = data.select_dtypes(include=['object', 'category']).columns
# Print unique values for each column
for column in categorical_columns:
    unique_values = data[column].unique()
    print(f"Column: {column}")
    print(unique_values)
    print("\n")

Column: Gender
['Male' 'Female']


Column: Race
['White' 'Hispanic' 'Other' 'Black' 'Asian']


Column: ChronicPainConditions
['Arthritis' 'Post-Surgery Pain' 'Cancer Pain' 'Fibromyalgia'
 'Chronic Back Pain']


Column: PainManagementTreatment
['Yes' 'No']


Column: MedicationName
['Tramadol' 'Hydromorphone' 'Codeine' 'Meperidine' 'Buprenorphine'
 'Morphine' 'Hydrocodone' 'Tapentadol' 'Fentanyl' 'Oxycodone' 'Methadone'
 'Oxymorphone']


Column: Dosage
['20 mg' '10 mg' '2.5 mg' '40 mg' '80 mg' '5 mg' '50 mcg/hour'
 '75 mcg/hour' '100 mcg/hour' '12.5 mcg/hour' '60 mg' '30 mg']


Column: Frequency
['every 8 hours' 'every 4-6 hours' 'every 12 hours' 'once daily']


Column: MedicationClass
['Opioid' 'Analgesic' 'Narcotic']


Column: Adherence
['Low' 'High' 'Moderate']


Column: AppointmentType
['Consultation' 'Routine Check-up' 'Follow-up']




In [4]:
data["Dosage_numeric"] = data["Dosage"].str.extract("(\d+\.?\d*)", expand=False)
del data["Dosage"]  
data.head()

Unnamed: 0,Age,Gender,Race,ChronicPainConditions,NumOpioidPrescriptions,AverageDosage,DurationOfPrescriptions,NumHealthcareVisits,NumHospitalizations,PainManagementTreatment,MedicationName,Frequency,Duration,Refills,MedicationClass,Adherence,AppointmentType,Dosage_numeric
0,22,Male,White,Arthritis,12,7,25,8,1,Yes,Tramadol,every 8 hours,20,3,Opioid,Low,Consultation,20.0
1,75,Male,Hispanic,Post-Surgery Pain,14,21,20,12,1,No,Hydromorphone,every 4-6 hours,10,0,Opioid,High,Routine Check-up,10.0
2,46,Female,Hispanic,Arthritis,14,87,21,16,2,No,Tramadol,every 8 hours,11,2,Analgesic,Low,Follow-up,2.5
3,33,Male,Other,Cancer Pain,19,41,19,19,3,No,Tramadol,every 4-6 hours,18,1,Opioid,High,Follow-up,40.0
4,23,Male,Black,Fibromyalgia,5,9,18,3,3,No,Codeine,every 12 hours,17,2,Analgesic,High,Consultation,10.0


In [5]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
# Define the order for the ordinal categories
frequency_order = ['every 8 hours', 'every 4-6 hours', 'every 12 hours', 'once daily']
adherence_order = ['Low', 'Moderate', 'High']
# Ordinal Encoding for Frequency and Adherence
ordinal_columns = ['Frequency', 'Adherence']

ordinal_encoder = OrdinalEncoder(categories=[frequency_order, adherence_order])
data[ordinal_columns] = ordinal_encoder.fit_transform(data[ordinal_columns])
data.head()

Unnamed: 0,Age,Gender,Race,ChronicPainConditions,NumOpioidPrescriptions,AverageDosage,DurationOfPrescriptions,NumHealthcareVisits,NumHospitalizations,PainManagementTreatment,MedicationName,Frequency,Duration,Refills,MedicationClass,Adherence,AppointmentType,Dosage_numeric
0,22,Male,White,Arthritis,12,7,25,8,1,Yes,Tramadol,0.0,20,3,Opioid,0.0,Consultation,20.0
1,75,Male,Hispanic,Post-Surgery Pain,14,21,20,12,1,No,Hydromorphone,1.0,10,0,Opioid,2.0,Routine Check-up,10.0
2,46,Female,Hispanic,Arthritis,14,87,21,16,2,No,Tramadol,0.0,11,2,Analgesic,0.0,Follow-up,2.5
3,33,Male,Other,Cancer Pain,19,41,19,19,3,No,Tramadol,1.0,18,1,Opioid,2.0,Follow-up,40.0
4,23,Male,Black,Fibromyalgia,5,9,18,3,3,No,Codeine,2.0,17,2,Analgesic,2.0,Consultation,10.0


In [6]:
# Label Encoding for PainManagementTreatment
label_encoder = LabelEncoder()
data['PainManagementTreatment'] = label_encoder.fit_transform(data['PainManagementTreatment'])

In [7]:
data.head()

Unnamed: 0,Age,Gender,Race,ChronicPainConditions,NumOpioidPrescriptions,AverageDosage,DurationOfPrescriptions,NumHealthcareVisits,NumHospitalizations,PainManagementTreatment,MedicationName,Frequency,Duration,Refills,MedicationClass,Adherence,AppointmentType,Dosage_numeric
0,22,Male,White,Arthritis,12,7,25,8,1,1,Tramadol,0.0,20,3,Opioid,0.0,Consultation,20.0
1,75,Male,Hispanic,Post-Surgery Pain,14,21,20,12,1,0,Hydromorphone,1.0,10,0,Opioid,2.0,Routine Check-up,10.0
2,46,Female,Hispanic,Arthritis,14,87,21,16,2,0,Tramadol,0.0,11,2,Analgesic,0.0,Follow-up,2.5
3,33,Male,Other,Cancer Pain,19,41,19,19,3,0,Tramadol,1.0,18,1,Opioid,2.0,Follow-up,40.0
4,23,Male,Black,Fibromyalgia,5,9,18,3,3,0,Codeine,2.0,17,2,Analgesic,2.0,Consultation,10.0


In [8]:
# Encoding
# One-Hot Encoding for Gender, Race, ChronicPainConditions, MedicationName, Dosage, MedicationClass, AppointmentType
one_hot_columns = ['Gender', 'Race', 'ChronicPainConditions', 'MedicationName', 'MedicationClass', 'AppointmentType']
# Initialize the LabelEncoder
label_encoders = {}

# Apply LabelEncoder to each column
for col in one_hot_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Check the transformed data
data.head()

Unnamed: 0,Age,Gender,Race,ChronicPainConditions,NumOpioidPrescriptions,AverageDosage,DurationOfPrescriptions,NumHealthcareVisits,NumHospitalizations,PainManagementTreatment,MedicationName,Frequency,Duration,Refills,MedicationClass,Adherence,AppointmentType,Dosage_numeric
0,22,1,4,0,12,7,25,8,1,1,11,0.0,20,3,2,0.0,0,20.0
1,75,1,2,4,14,21,20,12,1,0,4,1.0,10,0,2,2.0,2,10.0
2,46,0,2,0,14,87,21,16,2,0,11,0.0,11,2,0,0.0,1,2.5
3,33,1,3,1,19,41,19,19,3,0,11,1.0,18,1,2,2.0,1,40.0
4,23,1,1,3,5,9,18,3,3,0,1,2.0,17,2,0,2.0,0,10.0


In [10]:
import joblib
# Load the saved Decision Tree model
model_filename = 'best_decision_tree_model.joblib'
loaded_model = joblib.load(model_filename)

# Make predictions on the test data
test_predictions = loaded_model.predict(data)
# Output the predictions
print("Predictions on the test data:")
print(test_predictions)

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- RiskScore
