In [9]:
import pandas as pd

# Load the dataset (update the path if needed)
df = pd.read_csv("anemia.csv")  # Replace with actual filename if different

# Display first few rows
df.head(4)
# Get basic info about the dataset
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1421 entries, 0 to 1420
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      1421 non-null   int64  
 1   Hemoglobin  1421 non-null   float64
 2   MCH         1421 non-null   float64
 3   MCHC        1421 non-null   float64
 4   MCV         1421 non-null   float64
 5   Result      1421 non-null   int64  
dtypes: float64(4), int64(2)
memory usage: 66.7 KB


In [10]:
# Check for duplicates
print("Duplicate Rows:", df.duplicated().sum())

# Remove duplicates if any exist
df = df.drop_duplicates()


Duplicate Rows: 887


In [11]:
df = df.drop_duplicates().reset_index(drop=True)

# Check the new dataset size
print("New dataset size:", df.shape)


New dataset size: (534, 6)


In [12]:
print(df['Gender'].value_counts())


Gender
1    279
0    255
Name: count, dtype: int64


In [13]:
from sklearn.preprocessing import MinMaxScaler

# Initialize scaler
scaler = MinMaxScaler()

# Apply normalization to numerical columns only
num_cols = ['Hemoglobin', 'MCH', 'MCHC', 'MCV']
df[num_cols] = scaler.fit_transform(df[num_cols])

# Check the first few rows after scaling
df.head()


Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result
0,1,0.805825,0.478571,0.276596,0.444099,0
1,0,0.902913,0.671429,0.106383,0.080745,0
2,0,0.23301,0.392857,0.382979,0.055901,1
3,0,0.805825,0.0,0.765957,0.562112,0
4,1,0.786408,0.428571,0.085106,0.934783,0


In [14]:
from sklearn.model_selection import train_test_split

# Define features and target
X = df.drop(columns=['Result'])  # Features
y = df['Result']  # Target variable

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Check split sizes
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


Training set size: (427, 5)
Testing set size: (107, 5)


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

# Detailed performance report
print(classification_report(y_test, y_pred))


Model Accuracy: 0.9532710280373832
              precision    recall  f1-score   support

           0       1.00      0.91      0.95        58
           1       0.91      1.00      0.95        49

    accuracy                           0.95       107
   macro avg       0.95      0.96      0.95       107
weighted avg       0.96      0.95      0.95       107



In [16]:
import joblib

# Save the trained model
joblib.dump(model, "anemia_model.joblib")

print("Model saved successfully!")


Model saved successfully!
