In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
file_path = "D:/Projects/LLM Models/mlmodel/kidney Disease/kidney_disease.csv"
df = pd.read_csv(file_path)
df

Unnamed: 0,id,age,albumin,blood urea,Creatinine,sodium,potassium,hemoglobin,wbc count,rbc count,anemia,classification
0,0,48.0,1.0,36.0,1.2,,,15.4,7800,5.2,no,ckd
1,1,7.0,4.0,18.0,0.8,,,11.3,6000,,no,ckd
2,2,62.0,2.0,53.0,1.8,,,9.6,7500,,yes,ckd
3,3,48.0,4.0,56.0,3.8,111.0,2.5,11.2,6700,3.9,yes,ckd
4,4,51.0,2.0,26.0,1.4,,,11.6,7300,4.6,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,0.0,49.0,0.5,150.0,4.9,15.7,6700,4.9,no,notckd
396,396,42.0,0.0,31.0,1.2,141.0,3.5,16.5,7800,6.2,no,notckd
397,397,12.0,0.0,26.0,0.6,137.0,4.4,15.8,6600,5.4,no,notckd
398,398,17.0,0.0,50.0,1.0,135.0,4.9,14.2,7200,5.9,no,notckd


In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
file_path = 'D:/Projects/LLM Models/mlmodel/kidney Disease/kidney_disease_modified.csv'
df = pd.read_csv(file_path)

# Correctly drop the 'id' column
df.drop(columns=['id'], inplace=True)

# Step 1: Handle Missing Values
# Fill numerical columns with median
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

# Fill categorical columns with mode
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# Step 2: Encode Categorical Variables
# Encode categorical columns using LabelEncoder
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Step 3: Build a Predictive Model
# Define features and target
X = df.drop(columns=['classification'])
y = df['classification']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

# Function to preprocess a new input data point and make a prediction
def predict_kidney_disease(new_data):
    # Convert the input data to a DataFrame if it's not already one
    if isinstance(new_data, dict):
        new_data = pd.DataFrame([new_data])

    # Handle missing values using the same strategy as before
    new_data[numerical_cols] = new_data[numerical_cols].fillna(df[numerical_cols].median())
    
    # Drop the 'classification' column if it exists in the categorical columns list
    categorical_cols_temp = categorical_cols.drop('classification', errors='ignore')

    # Fill and encode categorical variables
    new_data[categorical_cols_temp] = new_data[categorical_cols_temp].fillna(df[categorical_cols_temp].mode().iloc[0])
    for col in categorical_cols_temp:
        if col in new_data:
            if new_data[col].isin(label_encoders[col].classes_).all():
                new_data[col] = label_encoders[col].transform(new_data[col])
            else:
                new_data[col] = new_data[col].apply(lambda x: label_encoders[col].transform([x])[0] 
                                                    if x in label_encoders[col].classes_ else -1)
    # Make a prediction using the trained model
    prediction = model.predict(new_data)

    # Decode the prediction back to original labels
    predicted_class = label_encoders['classification'].inverse_transform(prediction)

    return predicted_class[0]

# Example usage
sample_data = {
    'age': 30,
    'albumin': 4.00,
    'blood urea': 20,
    'Creatinine': 0.90,
    'sodium': 140,
    'potassium': 5,
    'hemoglobin': 11.2,
    'wbc count': 2.40,
    'rbc count': 3.85,
    'anemia': 'no'
}

# Predict kidney disease for the sample data
prediction = predict_kidney_disease(sample_data)
print(f"Prediction for kidney disease: {prediction}")


Accuracy: 0.9875
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        52
           2       0.97      1.00      0.98        28

    accuracy                           0.99        80
   macro avg       0.98      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80

Prediction for kidney disease: ckd


In [8]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle

# Load the dataset
file_path = 'D:/Projects/LLM Models/mlmodel/kidney Disease/kidney_disease_modified.csv'
df = pd.read_csv(file_path)

# Correctly drop the 'id' column
df.drop(columns=['id'], inplace=True)

# Step 1: Handle Missing Values
# Fill numerical columns with median
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

# Fill categorical columns with mode
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# Step 2: Encode Categorical Variables
# Encode categorical columns using LabelEncoder
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Step 3: Build a Predictive Model
# Define features and target
X = df.drop(columns=['classification'])
y = df['classification']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

# Save the model and label encoders as pickle files
# Specify the folder path where you want to save the pickle files
folder_path = 'D:/Projects/LLM Models/mlmodel/kidney Disease/saved_models'

# Create the folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Specify the paths for the pickle files
model_file_path = os.path.join(folder_path, 'kidney_disease_model.pkl')
label_encoders_file_path = os.path.join(folder_path, 'label_encoders.pkl')

# Save the RandomForest model to the specified path
with open(model_file_path, 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the label encoders dictionary to the specified path
with open(label_encoders_file_path, 'wb') as le_file:
    pickle.dump(label_encoders, le_file)

# Function to preprocess a new input data point and make a prediction
def predict_kidney_disease(new_data):
    # Convert the input data to a DataFrame if it's not already one
    if isinstance(new_data, dict):
        new_data = pd.DataFrame([new_data])

    # Handle missing values using the same strategy as before
    new_data[numerical_cols] = new_data[numerical_cols].fillna(df[numerical_cols].median())
    
    # Drop the 'classification' column if it exists in the categorical columns list
    categorical_cols_temp = categorical_cols.drop('classification', errors='ignore')

    # Fill and encode categorical variables
    new_data[categorical_cols_temp] = new_data[categorical_cols_temp].fillna(df[categorical_cols_temp].mode().iloc[0])
    for col in categorical_cols_temp:
        if col in new_data:
            if new_data[col].isin(label_encoders[col].classes_).all():
                new_data[col] = label_encoders[col].transform(new_data[col])
            else:
                new_data[col] = new_data[col].apply(lambda x: label_encoders[col].transform([x])[0] 
                                                    if x in label_encoders[col].classes_ else -1)
    # Make a prediction using the trained model
    prediction = model.predict(new_data)

    # Decode the prediction back to original labels
    predicted_class = label_encoders['classification'].inverse_transform(prediction)

    return predicted_class[0]


Accuracy: 0.9875
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        52
           2       0.97      1.00      0.98        28

    accuracy                           0.99        80
   macro avg       0.98      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80



In [11]:
import pandas as pd
import pickle
import os

# Load the model and label encoders from the pickle files
folder_path = 'D:/Projects/LLM Models/mlmodel/kidney Disease/saved_models'

# Specify the paths for the pickle files
model_file_path = os.path.join(folder_path, 'kidney_disease_model.pkl')
label_encoders_file_path = os.path.join(folder_path, 'label_encoders.pkl')

# Load the RandomForest model from the specified path
with open(model_file_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Load the label encoders dictionary from the specified path
with open(label_encoders_file_path, 'rb') as le_file:
    loaded_label_encoders = pickle.load(le_file)
print(list(label_encoders.keys()))
# Function to preprocess a new input data point and make a prediction using the loaded model
def predict_kidney_disease(new_data):
    # Convert the input data to a DataFrame if it's not already one
    if isinstance(new_data, dict):
        new_data = pd.DataFrame([new_data])

    # Handle missing values using the same strategy as before
    numerical_cols = new_data.select_dtypes(include=['float64', 'int64']).columns
    new_data[numerical_cols] = new_data[numerical_cols].fillna(df[numerical_cols].median())

    # Fill and encode categorical variables
    categorical_cols = new_data.select_dtypes(include=['object']).columns
    new_data[categorical_cols] = new_data[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])
    for col in categorical_cols:
        if col in new_data:
            if new_data[col].isin(loaded_label_encoders[col].classes_).all():
                new_data[col] = loaded_label_encoders[col].transform(new_data[col])
            else:
                new_data[col] = new_data[col].apply(lambda x: loaded_label_encoders[col].transform([x])[0] 
                                                    if x in loaded_label_encoders[col].classes_ else -1)
    
    # Make a prediction using the loaded model
    prediction = loaded_model.predict(new_data)

    # Decode the prediction back to original labels
    predicted_class = loaded_label_encoders['classification'].inverse_transform(prediction)

    return predicted_class[0]

sample_data = {
    'age': 30,
    'albumin': 4.00,
    'blood urea': 20,
    'Creatinine': 0.90,
    'sodium': 140,
    'potassium': 5,
    'hemoglobin': 11.2,
    'wbc count': 2.40,
    'rbc count': 3.85,
    'anemia': 'no'
}

# Predict kidney disease for the sample data
prediction = predict_kidney_disease(sample_data)
print(f"Prediction for kidney disease: {prediction}")


['rbc count', 'anemia', 'classification']
Prediction for kidney disease: ckd


In [13]:
categorical_cols = list(label_encoders.keys())  # Assuming label_encoders has all categorical columns
numerical_cols = ['age','albumin','blood urea','Creatinine','sodium','potassium','hemoglobin','wbc count']  
def predict_kidney_disease(new_data):
    # Convert the input data to a DataFrame if it's not already one
    if isinstance(new_data, dict):
        new_data = pd.DataFrame([new_data])
    
    # Handle missing values
    new_data[numerical_cols] = new_data[numerical_cols].fillna(new_data[numerical_cols].median())
    
    # Fill and encode categorical variables
    new_data[categorical_cols] = new_data[categorical_cols].fillna(new_data[categorical_cols].mode().iloc[0])
    for col in categorical_cols:
        if col in new_data:
            new_data[col] = new_data[col].apply(lambda x: label_encoders[col].transform([x])[0] 
                                                if x in label_encoders[col].classes_ else -1)
    
    # Ensure the order of columns matches the training data
    new_data = new_data[X.columns]

    # Make a prediction using the trained model
    prediction = model.predict(new_data)

    # Decode the prediction back to original labels
    if 'classification' in label_encoders:
        predicted_class = label_encoders['classification'].inverse_transform(prediction)
        return predicted_class[0]
    else:
        return prediction[0]
sample_data = {
    'age': 30,
    'albumin': 4.00,
    'blood urea': 20,
    'Creatinine': 0.90,
    'sodium': 140,
    'potassium': 5,
    'hemoglobin': 11.2,
    'wbc count': 2.40,
    'rbc count': 3.85,
    'anemia': 'no'
}

# Predict kidney disease for the sample data
prediction = predict_kidney_disease(sample_data)
print(f"Prediction for kidney disease: {prediction}")


KeyError: "['classification'] not in index"

In [14]:
import pandas as pd
import pickle

# Load the model and label encoders
folder_path = 'D:/Projects/LLM Models/mlmodel/kidney Disease/saved_models'
model_file_path = os.path.join(folder_path, 'kidney_disease_model.pkl')
label_encoders_file_path = os.path.join(folder_path, 'label_encoders.pkl')

with open(model_file_path, 'rb') as model_file:
    model = pickle.load(model_file)

with open(label_encoders_file_path, 'rb') as le_file:
    label_encoders = pickle.load(le_file)

# Define columns
categorical_cols = ['anemia']  # List of categorical columns
numerical_cols = ['age', 'albumin', 'blood urea', 'Creatinine', 'sodium', 'potassium', 'hemoglobin', 'wbc count', 'rbc count']

def predict_kidney_disease(new_data):
    # Convert the input data to a DataFrame if it's not already one
    if isinstance(new_data, dict):
        new_data = pd.DataFrame([new_data])
    # Handle missing values
    new_data[numerical_cols] = new_data[numerical_cols].fillna(new_data[numerical_cols].median())
    # Fill and encode categorical variables
    for col in categorical_cols:
        if col in new_data:
            new_data[col] = new_data[col].fillna(new_data[col].mode().iloc[0])
            if col in label_encoders:
                new_data[col] = new_data[col].apply(lambda x: label_encoders[col].transform([x])[0]
                                                    if x in label_encoders[col].classes_ else -1)
    # Ensure the order of columns matches the training data
    # Here you should match the columns with the training feature columns
    required_columns = numerical_cols + categorical_cols
    new_data = new_data[required_columns]
    # Make a prediction using the trained model
    prediction = model.predict(new_data)
    # If 'classification' is a categorical feature and encoded, you may need to decode
    if 'classification' in label_encoders:
        predicted_class = label_encoders['classification'].inverse_transform(prediction)
        return predicted_class[0]
    else:
        return prediction[0]

# Sample data for prediction
sample_data = {
    'age': 30,
    'albumin': 4.00,
    'blood urea': 20,
    'Creatinine': 0.90,
    'sodium': 140,
    'potassium': 5,
    'hemoglobin': 11.2,
    'wbc count': 2.40,
    'rbc count': 3.85,
    'anemia': 'no'
}

# Predict kidney disease for the sample data
prediction = predict_kidney_disease(sample_data)
print(f"Prediction for kidney disease: {prediction}")


Prediction for kidney disease: ckd
