**Matthew Zlibut**

DATA set description

Autism spectrum disorder (ASD) is a developmental disorder that affects communication and behavior. Unfortunately, waiting for an ASD diagnosis is lengthy and procedures are expensive. The economic impact of autism and the increase in the number of ASD cases across the world reveals an urgent need for the development of easily implemented and effective ASD screening methods.

Column variables presented in this data: **AGE, GENDER, ETHNICITY, JAUNDICE, FAMILY with PDD, TEST TAKER, COUNTRY, etc.**

The **age** of the patient was a number presented in years old. 

**Gender** was only measured to be M or F, which is translated to a 1 or 0 in our data.

**Ethnicity** was a string which lists ethnicities in text format. 

Born with **jaundice** is a Boolean value (True or False)

**Family member with PDD** is a Boolean value (True or False)

**Who is completing the test** is a String value. ex: Parent, self, caregiver, medical staff, clinician ,etc.

**Country of residence** is a String, List countries in text format

**Used the screening app before** Boolean (yes or no) Whether the user has used a screening app

Question 1-10 Answer Binary (0, 1) The answer code of the question based on the screening

method used

Screening Score Integer The final score obtained based on the scoring algorithm of the
screening method used. This was computed in an automated manner



In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

def cross_validation(estimator, param_grid, X_train, y_train):
    #returns a grid search object and cross-validatied scores in that order
    gs = GridSearchCV(estimator=estimator,
                 param_grid = param_grid,
                 scoring='accuracy',
                 cv=5, iid=False, refit=True,
                n_jobs=-1)
    scores = cross_validate(estimator=gs, X=X_train, y=y_train, cv=10, scoring='accuracy', return_train_score=True)
    return gs, scores

def print_scoring_metrics(method, y_true, y_pred, estimator, normalize=False):
    acc = accuracy_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(('%s score'+4*'%s %2.2f\t')%(method, 'Accuracy', acc, 'Recall', rec, 'Precision', prec, 'F1', f1))
    

df = pd.read_csv('Autism-Child-Data.csv', na_values = '?')
df = df.dropna(axis=0)

df = pd.get_dummies(df, columns=['country_of_res'], prefix = 'country')
df = pd.get_dummies(df, columns=['ethnicity'])
df = pd.get_dummies(df, columns=['relation'])

df['relation_Self'] += df['relation_self']
df = df.drop(axis=1, columns=['country_Italy',
                              'ethnicity_Others',
                             'relation_Parent',
                             'relation_self',
                             'age_desc',
                             'result'])
df['gender'] = df['gender'].map({'m': 0, 'f': 1})

yn_mapping = {'yes':1,'YES':1,'no':0, 'NO':0}
for label in ['jaundice', 'autism', 'class', 'used_app_before']:
    df[label] = df[label].map(yn_mapping)
    
df.head()
df.tail()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,ethnicity_Asian,ethnicity_Black,ethnicity_Hispanic,ethnicity_Latino,ethnicity_Pasifika,ethnicity_Turkish,ethnicity_White-European,relation_'Health care professional',relation_Relative,relation_Self
287,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,1,0,0,0
288,1,0,0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
289,1,0,1,1,1,1,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
290,1,1,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
291,0,0,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
#SVC Data
clf = SVC(gamma='auto')
X = df.drop(axis=1, columns='class').values
y = df['class'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

clf.fit(X_train_std, y_train) 
clf.predict(X_test_std)
clf.score(X_test_std, y_test, sample_weight=None)

C_range = np.logspace(-2, 4, 10)
gamma_range = np.logspace(-4, 2, 10)

param_grid = [{'C': C_range, 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': gamma_range, 'degree':[2,3,4]}]

gs, scores = cross_validation(clf, param_grid, X_train_std, y_train)
gs.fit(X_train_std, y_train)

print(scores['test_score'].mean())

best_SVC = gs.best_estimator_
print(best_SVC.get_params())
print_scoring_metrics('Optimized SVC', y_test, best_SVC.predict(X_test_std), best_SVC)

0.9552130325814536
{'C': 100.0, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 2, 'gamma': 0.002154434690031882, 'kernel': 'sigmoid', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
Optimized SVC scoreAccuracy 0.88	Recall 0.84	Precision 0.91	F1 0.87	


In [3]:
#LR
lr = LogisticRegression(C=0.1, solver='lbfgs', multi_class='auto')
lr.fit(X_train_std, y_train)
lr.predict(X_test_std)
lr.score(X_test_std, y_test)

C_range = np.logspace(-2, 4, 10)
l1_range = np.linspace(0.1, 1, 10)

param_grid = [{'C': C_range, 'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
             ]#{'C': C_range, 'penalty': ['elasticnet'], 'l1_ratio': l1_range, 'solver': ['saga']}]

gs, scores = cross_validation(lr, param_grid, X_train_std, y_train)
gs.fit(X_train_std, y_train)

print(scores['test_score'].mean())




0.9747368421052632


In [4]:
best_lr = gs.best_estimator_
print(best_lr.get_params())
print_scoring_metrics('Optimized lr', y_test, best_lr.predict(X_test_std), best_lr)

{'C': 0.046415888336127774, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Optimized lr scoreAccuracy 0.90	Recall 0.92	Precision 0.88	F1 0.90	


In [None]:
#MLP
mlp = MLPClassifier(solver='lbfgs')
mlp.fit(X_train_std, y_train)
mlp.predict(X_test_std)
mlp.score(X_test_std, y_test)

C_range = np.logspace(-2, 4, 10)
l1_range = np.linspace(0.1, 1, 10)

param_grid = [{'learning_rate': ['constant', 'invscaling', 'adaptive'], 'solver': ['lbfgs', 'sgd', 'adam'],
               'activation':['identity', 'logistic', 'tanh', 'relu'],
               'hidden_layer_sizes': [(73,), (73,37), (73,73,37)]}]

gs, scores = cross_validation(mlp, param_grid, X_train_std, y_train)
gs.fit(X_train_std, y_train)

print(scores)
               
best_mlp = gs.best_estimator_
print(best_mlp.get_params())
print_scoring_metrics('Optimized MLP', y_test, best_mlp_predict(X_test_std), best_mlp)