In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize  # Import NLTK tokenization function
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Read the CSV data with UTF-8 encoding and handle potential errors
try:
    data = pd.read_csv("ICD10_openmed_UTF8.csv", delimiter=";", encoding="utf-8")
except FileNotFoundError:
    print("Error: CSV file not found. Please provide the correct path.")
    exit()
except pd.errors.ParserError:
    print("Error: CSV parsing error. Check the format of your data.")
    exit()

# Preprocess the text data using NLTK tokenization
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r"[^a-z0-9\s]", "", text)  # Remove non-alphanumeric characters
    tokens = word_tokenize(text)  # Tokenize using NLTK's word_tokenize
    return tokens

data["term"] = data["term"].apply(preprocess_text)

# Separate features (text) and target labels (ICD-10 codes)
X = data["term"]
y = data["icd10"]  # Assuming a single ICD-10 code per row (modify if multiple codes)

# Handle multi-label scenarios if needed (e.g., one-hot encoding)
if isinstance(y.iloc[0], list):  # Check if each label is a list of ICD-10 codes
    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

# Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed

# Transform the text data into TF-IDF vectors
X_train_tfidf = vectorizer.fit_transform([", ".join(doc) for doc in X_train])
X_test_tfidf = vectorizer.transform([", ".join(doc) for doc in X_test])  # Join tokens with commas
X_val_tfidf = vectorizer.transform([", ".join(doc) for doc in X_val])  # Join tokens with commas

# Train the Random Forest model with overfitting prevention strategies
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
# Adjust n_estimators (number of trees) and max_depth (tree depth) based on data and performance
# Consider techniques like grid search for hyperparameter optimization

# Train the model
model.fit(X_train_tfidf, y_train)

# Evaluate the model's performance on the evaluation set
y_pred_eval = model.predict(X_val_tfidf)
evaluation_accuracy = accuracy_score(y_val, y_pred_eval)
evaluation_precision = precision_score(y_val, y_pred_eval, average='weighted')  # Consider weighted average for multi-label classification
evaluation_recall = recall_score(y_val, y_pred_eval, average='weighted')  # Consider weighted average for multi-label classification
evaluation_f1 = f1_score(y_val, y_pred_eval, average='weighted')  # Consider weighted average for multi-label classification

print("Evaluation Accuracy:", evaluation_accuracy)
print("Evaluation Precision:", evaluation_precision)
print("Evaluation Recall:", evaluation_recall)
print("Evaluation F1-score:", evaluation_f1)

# Now, evaluate the model on the final test set
y_pred_test = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_pred_test)

# Evaluate the model's performance (consider adding metrics like F1-score)
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
# Save the trained model (consider using joblib, pickle, or ONNX for deployment)
import joblib
joblib.dump(model, "icd10_prediction_model.pkl")  # Replace with your preferred format
print("Model saved successfully!")

# Evaluate the model's performance
y_pred = model.predict(X_test_tfidf)  # Predicted labels

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Consider weighted average for multi-label classification
recall = recall_score(y_test, y_pred, average='weighted')  # Consider weighted average for multi-label classification
f1 = f1_score(y_test, y_pred, average='weighted')  # Consider weighted average for multi-label classification

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Save the model (using TensorFlow.js format)
model.save('icd10_prediction_model.h5')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluation Accuracy: 0.21150758251561105
Evaluation Precision: 0.14943407086049873
Evaluation Recall: 0.21150758251561105
Evaluation F1-score: 0.140972079720921
Model Accuracy: 0.2156
Model saved successfully!


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.21561106155218554
Precision: 0.15233047983376097
Recall: 0.21561106155218554
F1-score: 0.14134888690203146


AttributeError: 'RandomForestClassifier' object has no attribute 'save'

In [None]:
# WORKS CORRECT !!!!

import pandas as pd
from nltk.tokenize import word_tokenize  # Import NLTK tokenization function
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the CSV data with UTF-8 encoding and handle potential errors
try:
    data = pd.read_csv("ICD10_openmed_UTF8.csv", delimiter=";", encoding="utf-8")
except FileNotFoundError:
    print("Error: CSV file not found. Please provide the correct path.")
    exit()
except pd.errors.ParserError:
    print("Error: CSV parsing error. Check the format of your data.")
    exit()

# Preprocess the text data using NLTK tokenization
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r"[^a-z0-9\s]", "", text)  # Remove non-alphanumeric characters
    tokens = word_tokenize(text)  # Tokenize using NLTK's word_tokenize
    return tokens

data["term"] = data["term"].apply(preprocess_text)

# Separate features (text) and target labels (ICD-10 codes)
X = data["term"]
y = data["icd10"]  # Assuming a single ICD-10 code per row (modify if multiple codes)

# Handle multi-label scenarios if needed (e.g., one-hot encoding)
if isinstance(y.iloc[0], list):  # Check if each label is a list of ICD-10 codes
    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed

# Transform the text data into TF-IDF vectors
X_train_tfidf = vectorizer.fit_transform([", ".join(doc) for doc in X_train])
X_test_tfidf = vectorizer.transform([", ".join(doc) for doc in X_test])  # Join tokens with commas

# Train the Random Forest model with overfitting prevention strategies
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
# Adjust n_estimators (number of trees) and max_depth (tree depth) based on data and performance
# Consider techniques like grid search for hyperparameter optimization

# Train the model
model.fit(X_train_tfidf, y_train)

# Evaluate the model's performance (consider adding metrics like F1-score)
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Save the trained model (consider using joblib, pickle, or ONNX for deployment)
import joblib
joblib.dump(model, "icd10_prediction_model.pkl")  # Replace with your preferred format
print("Model saved successfully!")

# Save the model (using TensorFlow.js format)
model.save('icd10_prediction_model.h5')