In [1]:
# Libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
df = pd.read_csv("dataset.csv")

In [3]:
selected_features_with_qchat = ['Qchat-10-Score', 'A9', 'A5', 'A6', 'A7', 'A4']
selected_features_without_qchat = ['A9', 'A5', 'A6', 'A7', 'A4']
target = 'ClassASD_Traits'

In [4]:
df[target] = df[target].map({'Yes': 1, 'No': 0})

In [5]:
X_with_qchat = df[selected_features_with_qchat]
X_without_qchat = df[selected_features_without_qchat]
y = df[target]

In [6]:
X_train_q, X_test_q, y_train_q, y_test_q = train_test_split(X_with_qchat, y, test_size=0.2, random_state=42)
X_train_noq, X_test_noq, y_train_noq, y_test_noq = train_test_split(X_without_qchat, y, test_size=0.2, random_state=42)

In [8]:
scaler = StandardScaler()
X_train_q = scaler.fit_transform(X_train_q)
X_test_q = scaler.transform(X_test_q)

In [9]:
X_train_noq = scaler.fit_transform(X_train_noq)
X_test_noq = scaler.transform(X_test_noq)

In [10]:
models = {
    "Logistic Regression": {
        "model": LogisticRegression(max_iter=5000),
        "params": {"C": [0.1, 1, 10], "penalty": ['l2']}
    },
    "Random Forest": {
        "model": RandomForestClassifier(),
        "params": {"n_estimators": [50, 100, 200], "max_depth": [5, 10, 20]}
    },
    "SVM": {
        "model": SVC(),
        "params": {"C": [0.1, 1, 10], "kernel": ['linear', 'rbf']}
    },
    "KNN": {
        "model": KNeighborsClassifier(),
        "params": {"n_neighbors": [3, 5, 7], "weights": ['uniform', 'distance']}
    },
    "Decision Tree": {
        "model": DecisionTreeClassifier(),
        "params": {"max_depth": [5, 10, 20], "criterion": ['gini', 'entropy']}
    },
    "Gradient Boosting": {
        "model": GradientBoostingClassifier(),
        "params": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 0.2]}
    }
}

In [11]:
results = []

for config_name, (X_train, X_test, y_train, y_test) in {
    "With Qchat-10-Score": (X_train_q, X_test_q, y_train_q, y_test_q),
    "Without Qchat-10-Score": (X_train_noq, X_test_noq, y_train_noq, y_test_noq)
}.items():
    print(f"Configuration: {config_name}")
    for model_name, model_config in models.items():
        print(f"\nTraining {model_name}...")
        model = model_config["model"]
        params = model_config["params"]
        
        # Grid Search
        grid_search = GridSearchCV(model, params, cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        
        # Best model
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        
        # Evaluation
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)
        
        print(f"Best Parameters: {grid_search.best_params_}")
        print(f"Accuracy: {accuracy * 100:.2f}%")
        results.append({
            "Configuration": config_name,
            "Model": model_name,
            "Best Parameters": grid_search.best_params_,
            "Accuracy": accuracy,
            "Classification Report": report
        })

Configuration: With Qchat-10-Score

Training Logistic Regression...
Best Parameters: {'C': 10, 'penalty': 'l2'}
Accuracy: 100.00%

Training Random Forest...
Best Parameters: {'max_depth': 5, 'n_estimators': 50}
Accuracy: 100.00%

Training SVM...
Best Parameters: {'C': 1, 'kernel': 'linear'}
Accuracy: 100.00%

Training KNN...
Best Parameters: {'n_neighbors': 7, 'weights': 'distance'}
Accuracy: 100.00%

Training Decision Tree...
Best Parameters: {'criterion': 'gini', 'max_depth': 5}
Accuracy: 100.00%

Training Gradient Boosting...
Best Parameters: {'learning_rate': 0.01, 'n_estimators': 50}
Accuracy: 100.00%
Configuration: Without Qchat-10-Score

Training Logistic Regression...
Best Parameters: {'C': 0.1, 'penalty': 'l2'}
Accuracy: 92.42%

Training Random Forest...
Best Parameters: {'max_depth': 5, 'n_estimators': 100}
Accuracy: 91.47%

Training SVM...
Best Parameters: {'C': 0.1, 'kernel': 'rbf'}
Accuracy: 91.94%

Training KNN...
Best Parameters: {'n_neighbors': 7, 'weights': 'uniform'}


In [12]:
results_df = pd.DataFrame(results)
results_df.to_csv("model_results.csv", index=False)

In [13]:
print("\nTraining Summary:")
print(results_df)


Training Summary:
             Configuration                Model  \
0      With Qchat-10-Score  Logistic Regression   
1      With Qchat-10-Score        Random Forest   
2      With Qchat-10-Score                  SVM   
3      With Qchat-10-Score                  KNN   
4      With Qchat-10-Score        Decision Tree   
5      With Qchat-10-Score    Gradient Boosting   
6   Without Qchat-10-Score  Logistic Regression   
7   Without Qchat-10-Score        Random Forest   
8   Without Qchat-10-Score                  SVM   
9   Without Qchat-10-Score                  KNN   
10  Without Qchat-10-Score        Decision Tree   
11  Without Qchat-10-Score    Gradient Boosting   

                                 Best Parameters  Accuracy  \
0                     {'C': 10, 'penalty': 'l2'}  1.000000   
1           {'max_depth': 5, 'n_estimators': 50}  1.000000   
2                   {'C': 1, 'kernel': 'linear'}  1.000000   
3      {'n_neighbors': 7, 'weights': 'distance'}  1.000000   
4      

In [14]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [15]:
df = pd.read_csv("dataset.csv")

In [16]:
selected_features_with_qchat = ['Qchat-10-Score', 'A9', 'A5', 'A6', 'A7', 'A4']
selected_features_without_qchat = ['A9', 'A5', 'A6', 'A7', 'A4']
target = 'ClassASD_Traits'

In [17]:
df[target] = df[target].map({'Yes': 1, 'No': 0})

In [18]:
datasets = {
    "With Qchat-10-Score": df[selected_features_with_qchat + [target]],
    "Without Qchat-10-Score": df[selected_features_without_qchat + [target]]
}

In [19]:
balancing_methods = {
    "Original": None,
    "SMOTE": SMOTE(random_state=42),
    "Random Undersampling": RandomUnderSampler(random_state=42),
    "SMOTEENN": SMOTEENN(random_state=42)
}

In [20]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=5000),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}

In [21]:
results = []

for dataset_name, data in datasets.items():
    X = data.drop(columns=[target])
    y = data[target]
    
    for method_name, method in balancing_methods.items():
        print(f"Dataset: {dataset_name}, Balancing: {method_name}")
        
        # Split the dataset
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Apply balancing method if applicable
        if method:
            X_train, y_train = method.fit_resample(X_train, y_train)
        
        # Scale the data
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        for model_name, model in models.items():
            print(f"Training {model_name}...")
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)
            
            # Evaluate the model
            accuracy = accuracy_score(y_test, y_pred)
            report = classification_report(y_test, y_pred, output_dict=True)
            
            results.append({
                "Dataset": dataset_name,
                "Balancing": method_name,
                "Model": model_name,
                "Accuracy": accuracy,
                "Classification Report": report
            })

Dataset: With Qchat-10-Score, Balancing: Original
Training Logistic Regression...
Training Random Forest...
Training SVM...
Training KNN...
Training Decision Tree...
Training Gradient Boosting...
Dataset: With Qchat-10-Score, Balancing: SMOTE
Training Logistic Regression...
Training Random Forest...
Training SVM...
Training KNN...
Training Decision Tree...
Training Gradient Boosting...
Dataset: With Qchat-10-Score, Balancing: Random Undersampling
Training Logistic Regression...
Training Random Forest...
Training SVM...
Training KNN...
Training Decision Tree...
Training Gradient Boosting...
Dataset: With Qchat-10-Score, Balancing: SMOTEENN
Training Logistic Regression...
Training Random Forest...
Training SVM...
Training KNN...
Training Decision Tree...
Training Gradient Boosting...
Dataset: Without Qchat-10-Score, Balancing: Original
Training Logistic Regression...
Training Random Forest...
Training SVM...
Training KNN...
Training Decision Tree...
Training Gradient Boosting...
Dataset:

In [22]:
results_df = pd.DataFrame(results)
results_df.to_csv("balanced_model_results.csv", index=False)

In [23]:
print("\nTraining Summary:")
print(results_df)


Training Summary:
                   Dataset             Balancing                Model  \
0      With Qchat-10-Score              Original  Logistic Regression   
1      With Qchat-10-Score              Original        Random Forest   
2      With Qchat-10-Score              Original                  SVM   
3      With Qchat-10-Score              Original                  KNN   
4      With Qchat-10-Score              Original        Decision Tree   
5      With Qchat-10-Score              Original    Gradient Boosting   
6      With Qchat-10-Score                 SMOTE  Logistic Regression   
7      With Qchat-10-Score                 SMOTE        Random Forest   
8      With Qchat-10-Score                 SMOTE                  SVM   
9      With Qchat-10-Score                 SMOTE                  KNN   
10     With Qchat-10-Score                 SMOTE        Decision Tree   
11     With Qchat-10-Score                 SMOTE    Gradient Boosting   
12     With Qchat-10-Score  Rand