In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib

In [2]:
# Step 1: Load data
data = pd.read_csv('ASD_data.csv')

In [3]:
from sklearn.preprocessing import LabelEncoder

# Step 2: Preprocess
# Remove trailing spaces from column names
data.columns = data.columns.str.strip()

# Drop 'Qchat-10-Score' column if it exists
if 'Qchat-10-Score' in data.columns:
    data = data.drop(columns=['Qchat-10-Score'])

# Initialize label encoder
le = LabelEncoder()

# List of categorical columns to encode
categorical_columns = ['Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD', 'Who completed the test']

# Apply label encoding to categorical columns
for col in categorical_columns:
    if col in data.columns:
        data[col] = le.fit_transform(data[col])

# Convert target variable 'Class/ASD Traits' to binary values
if 'Class/ASD Traits' in data.columns:
    data['Class/ASD Traits'] = data['Class/ASD Traits'].map({'Yes': 1, 'No': 0})


In [4]:
# Step 3: Define X and y
X = data.drop(columns=['Class/ASD Traits'])
y = data['Class/ASD Traits']

# Split the dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Model Training
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, y_train)


In [5]:
# Step 4: Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Step 5: Model Training
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [7]:
# Step 6: Evaluate
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))

              precision    recall  f1-score   support

           0       0.94      0.99      0.96        69
           1       0.99      0.97      0.98       142

    accuracy                           0.98       211
   macro avg       0.97      0.98      0.97       211
weighted avg       0.98      0.98      0.98       211

[[ 68   1]
 [  4 138]]
ROC AUC Score: 0.998928352725046


In [17]:
# Step 7: Save Model
joblib.dump(rf, 'ASD_model.pkl')

['ASD_model.pkl']

In [10]:
import pandas as pd
import joblib
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load the saved model and preprocessing objects
model = joblib.load('ASD_model.pkl')
scaler = joblib.load('scaler.pkl')  # Only if scaling was used during training
label_encoders = joblib.load('label_encoders.pkl')  # Load label encoders

# Load test data
test_data = pd.read_csv('ASD_test_data.csv')  # Update with your test data path

# Remove extra spaces from column names
test_data.columns = test_data.columns.str.strip()

# Drop 'Qchat-10-Score' if it exists in the test data
if 'Qchat-10-Score' in test_data.columns:
    test_data = test_data.drop(columns=['Qchat-10-Score'])

# Apply label encoding to categorical columns
categorical_columns = ['Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD', 'Who completed the test']
for col in categorical_columns:
    if col in test_data.columns:
        # Get the classes of the label encoder used in training
        known_classes = set(label_encoders[col].classes_)
        
        # Replace unseen values with a known class (e.g., most common value from training set)
        test_data[col] = test_data[col].apply(lambda x: x if x in known_classes else list(known_classes)[0])
        
        # Transform the column using the label encoder
        test_data[col] = label_encoders[col].transform(test_data[col])

# Apply scaling to numerical features if needed
numerical_features = ['Age_Mons']  # Specify the numerical features used
if scaler:
    test_data[numerical_features] = scaler.transform(test_data[numerical_features])

# Define X and y if labeled test data
X_test = test_data.drop(columns=['Class/ASD Traits']) if 'Class/ASD Traits' in test_data.columns else test_data
y_test = test_data['Class/ASD Traits'] if 'Class/ASD Traits' in test_data.columns else None

# Make predictions
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Encode y_test if it contains 'Yes'/'No' instead of 1/0
if y_test is not None and y_test.dtype == object:  # Check if y_test contains strings
    y_test = y_test.map({'Yes': 1, 'No': 0})  # Map 'Yes' to 1 and 'No' to 0

# Proceed with model evaluation
if y_test is not None:
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("ROC AUC Score:", roc_auc_score(y_test, y_pred_prob))


# Save predictions if needed
test_data['Predicted_ASD_Traits'] = y_pred
test_data.to_csv('ASD_test_predictions.csv', index=False)


Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       326
           1       0.99      0.99      0.99       728

    accuracy                           0.98      1054
   macro avg       0.98      0.98      0.98      1054
weighted avg       0.98      0.98      0.98      1054

Confusion Matrix:
[[319   7]
 [ 10 718]]
ROC AUC Score: 0.9989487123306142


In [20]:
from sklearn.preprocessing import StandardScaler
import joblib

# Define numerical features for scaling
numerical_features = ['Age_Mons']  # Adjust this list based on your dataset

# Initialize and fit the scaler
scaler = StandardScaler()
scaler.fit(X_train[numerical_features])  # Fit the scaler to the training data

# Save the scaler
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [9]:
from sklearn.preprocessing import LabelEncoder
import joblib

# Initialize label encoders for each categorical column
label_encoders = {}
categorical_columns = ['Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD', 'Who completed the test']

# Fit and transform each categorical column, saving the encoder
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le  # Store the fitted encoder in a dictionary

# Save the dictionary of label encoders
joblib.dump(label_encoders, 'label_encoders.pkl')


['label_encoders.pkl']

In [8]:
from sklearn.preprocessing import LabelEncoder
import joblib

# Initialize label encoders for each categorical column
label_encoders = {}
categorical_columns = ['Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD', 'Who_completed_the_test']

for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])  # Fit and transform the training data
    label_encoders[col] = le  # Store the fitted encoder

# Save the label encoders for later use
joblib.dump(label_encoders, 'label_encoders.pkl')


KeyError: 'Who_completed_the_test'