In [3]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix

# Read the diagnosis table from disk
DIAGNOSIS_PATH = '/content/sample_data/diagnosis.parquet'
df = pd.read_parquet(DIAGNOSIS_PATH)

# Tally null entries per column (isna is the canonical name for isnull)
missing_values = df.isna().sum()

# Show the header followed by the per-column null counts
print("Missing values for each column:", missing_values, sep="\n")

Missing values for each column:
PERSONID    0
271.0       0
276.51      0
283.9       0
401.9       0
           ..
Z99.2       0
Z99.3       0
Z99.81      0
Z99.89      0
POMPE       0
Length: 7509, dtype: int64


In [4]:
# Frequency of each label value in the 'POMPE' column
label_column = df.loc[:, "POMPE"]
class_counts = label_column.value_counts()

# Header (preceded by a blank line) and then the counts themselves
print("\nNumber of samples in each class:", class_counts, sep="\n")


Number of samples in each class:
0    89372
1        7
Name: POMPE, dtype: int64


In [5]:
# Flag every row holding at least one NaN and report the index labels
# of those rows before they are removed
rows_with_nan = df.isna().any(axis=1)
nan_indices = df.loc[rows_with_nan].index
print(f"Indices with NaN: {nan_indices.tolist()}")

# Keep only the complete rows
df = df.dropna()

# Report the remaining row count as a sanity check
print(f"Number of rows after removing NaNs: {len(df)}")

Indices with NaN: []
Number of rows after removing NaNs: 89379


In [None]:
# 'POMPE' is the binary label; every remaining column except the row
# identifier is used as a feature. 'PERSONID' is dropped because an arbitrary
# ID carries no signal and would only give the trees a meaningless axis to
# split on (and would break fitting outright if it is non-numeric).
X = df.drop(columns=['PERSONID', 'POMPE'])
y = df['POMPE']

# Isolation Forest for anomaly detection. The model is fit without the label;
# 'y' is only used afterwards to score how well flagged anomalies match it.
iso_forest = IsolationForest(n_estimators=300, contamination='auto', random_state=42)
preds = iso_forest.fit_predict(X)

# fit_predict returns -1 for anomalies and 1 for inliers. Map anomalies to 1
# (assuming 1 in 'POMPE' marks the minority class) and inliers to 0, using a
# vectorized comparison, and reuse y's index so predictions stay aligned
# with the rows they were computed from.
anomaly_labels = pd.Series(preds == -1, index=y.index, name='anomaly').astype(int)

# Evaluation. Labels are passed explicitly so the confusion matrix is always
# 2x2 in [0, 1] order, even if one class is never predicted.
print(classification_report(y, anomaly_labels))
print(confusion_matrix(y, anomaly_labels, labels=[0, 1]))