# Childhood Autistic Spectrum Disorder Screening
### Description
In this project, we're going to take advantage of neural network using Keras API to diagnose Autistic Spectrum Disorder based on behavioral features and individual characteristics. 
### Dataset
This project use a dataset provided by the UCI Machine Learning Repository, which contains screening data for 292 patients.

In [28]:
import sys 
import pandas as pd
import numpy as np
import sklearn
import keras

print("Python: {}".format(sys.version))
print("pandas: {}".format(pd.__version__))
print("numpy: {}".format(np.__version__))
print("sklearn: {}".format(sklearn.__version__))
print("keras: {}".format(keras.__version__))

Python: 3.7.6 (tags/v3.7.6:43364a7ae0, Dec 19 2019, 00:42:30) [MSC v.1916 64 bit (AMD64)]
pandas: 1.2.2
numpy: 1.19.5
sklearn: 0.24.1
keras: 2.4.3


### Load the dataset

In [31]:
# read the dataset
from scipy.io import arff
data = arff.loadarff("d:/idealabs_ai/Autism-Child-Data.arff")

df = pd.DataFrame(data[0])

for col in df.columns:
    if df[col].dtypes != float:
        df[col] = df[col].apply(lambda x: str(x, 'utf-8')) # convert byte object to type string
        df[col] = pd.to_numeric(df[col], errors='ignore') # convert a number in string type to numeric

df['age'] = df['age'].apply(lambda x: str(x)) # convert column age to string type

In [32]:
# examine the dataset
print("Shape of the dataframe: {}".format(df.shape))
print(df.loc[0])

Shape of the dataframe: (292, 21)
A1_Score                    1
A2_Score                    1
A3_Score                    0
A4_Score                    0
A5_Score                    1
A6_Score                    1
A7_Score                    0
A8_Score                    1
A9_Score                    0
A10_Score                   0
age                       6.0
gender                      m
ethnicity              Others
jundice                    no
austim                     no
contry_of_res          Jordan
used_app_before            no
result                    5.0
age_desc           4-11 years
relation               Parent
Class/ASD                  NO
Name: 0, dtype: object


In [33]:
print(df.dtypes)

A1_Score             int64
A2_Score             int64
A3_Score             int64
A4_Score             int64
A5_Score             int64
A6_Score             int64
A7_Score             int64
A8_Score             int64
A9_Score             int64
A10_Score            int64
age                 object
gender              object
ethnicity           object
jundice             object
austim              object
contry_of_res       object
used_app_before     object
result             float64
age_desc            object
relation            object
Class/ASD           object
dtype: object


In [34]:
# examine the first ten patients
df.loc[:10]

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,0,0,1,1,0,1,0,0,...,m,Others,no,no,Jordan,no,5.0,4-11 years,Parent,NO
1,1,1,0,0,1,1,0,1,0,0,...,m,Middle Eastern,no,no,Jordan,no,5.0,4-11 years,Parent,NO
2,1,1,0,0,0,1,1,1,0,0,...,m,?,no,no,Jordan,yes,5.0,4-11 years,?,NO
3,0,1,0,0,1,1,0,0,0,1,...,f,?,yes,no,Jordan,no,4.0,4-11 years,?,NO
4,1,1,1,1,1,1,1,1,1,1,...,m,Others,yes,no,United States,no,10.0,4-11 years,Parent,YES
5,0,0,1,0,1,1,0,1,0,1,...,m,?,no,yes,Egypt,no,5.0,4-11 years,?,NO
6,1,0,1,1,1,1,0,1,0,1,...,m,White-European,no,no,United Kingdom,no,7.0,4-11 years,Parent,YES
7,1,1,1,1,1,1,1,1,0,0,...,f,Middle Eastern,no,no,Bahrain,no,8.0,4-11 years,Parent,YES
8,1,1,1,1,1,1,1,0,0,0,...,f,Middle Eastern,no,no,Bahrain,no,7.0,4-11 years,Parent,YES
9,0,0,1,1,1,0,1,1,0,0,...,f,?,no,yes,Austria,no,5.0,4-11 years,?,NO


In [35]:
# describe the dataset
df.describe()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,result
count,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0
mean,0.633562,0.534247,0.743151,0.55137,0.743151,0.712329,0.606164,0.496575,0.493151,0.726027,6.239726
std,0.482658,0.499682,0.437646,0.498208,0.437646,0.453454,0.489438,0.500847,0.500811,0.446761,2.284882
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
50%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,6.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10.0


### Preprocessing
Step 1: Drop some unnecessary columns

Step 2: Convert string data to categorical labels

In [36]:
# drop unwanted cols
df = df.drop(['result', 'age_desc'], axis=1)

In [37]:
df.loc[:10]

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,relation,Class/ASD
0,1,1,0,0,1,1,0,1,0,0,6.0,m,Others,no,no,Jordan,no,Parent,NO
1,1,1,0,0,1,1,0,1,0,0,6.0,m,Middle Eastern,no,no,Jordan,no,Parent,NO
2,1,1,0,0,0,1,1,1,0,0,6.0,m,?,no,no,Jordan,yes,?,NO
3,0,1,0,0,1,1,0,0,0,1,5.0,f,?,yes,no,Jordan,no,?,NO
4,1,1,1,1,1,1,1,1,1,1,5.0,m,Others,yes,no,United States,no,Parent,YES
5,0,0,1,0,1,1,0,1,0,1,4.0,m,?,no,yes,Egypt,no,?,NO
6,1,0,1,1,1,1,0,1,0,1,5.0,m,White-European,no,no,United Kingdom,no,Parent,YES
7,1,1,1,1,1,1,1,1,0,0,5.0,f,Middle Eastern,no,no,Bahrain,no,Parent,YES
8,1,1,1,1,1,1,1,0,0,0,11.0,f,Middle Eastern,no,no,Bahrain,no,Parent,YES
9,0,0,1,1,1,0,1,1,0,0,11.0,f,?,no,yes,Austria,no,?,NO


In [38]:
# create input X and output y for training
X = df.drop(['Class/ASD'], axis=1)
y = df['Class/ASD']

In [39]:
X.loc[:10]

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,relation
0,1,1,0,0,1,1,0,1,0,0,6.0,m,Others,no,no,Jordan,no,Parent
1,1,1,0,0,1,1,0,1,0,0,6.0,m,Middle Eastern,no,no,Jordan,no,Parent
2,1,1,0,0,0,1,1,1,0,0,6.0,m,?,no,no,Jordan,yes,?
3,0,1,0,0,1,1,0,0,0,1,5.0,f,?,yes,no,Jordan,no,?
4,1,1,1,1,1,1,1,1,1,1,5.0,m,Others,yes,no,United States,no,Parent
5,0,0,1,0,1,1,0,1,0,1,4.0,m,?,no,yes,Egypt,no,?
6,1,0,1,1,1,1,0,1,0,1,5.0,m,White-European,no,no,United Kingdom,no,Parent
7,1,1,1,1,1,1,1,1,0,0,5.0,f,Middle Eastern,no,no,Bahrain,no,Parent
8,1,1,1,1,1,1,1,0,0,0,11.0,f,Middle Eastern,no,no,Bahrain,no,Parent
9,0,0,1,1,1,0,1,1,0,0,11.0,f,?,no,yes,Austria,no,?


In [40]:
X = pd.get_dummies(X)

In [41]:
X.columns.values

array(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score',
       'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score',
       'age_10.0', 'age_11.0', 'age_4.0', 'age_5.0', 'age_6.0', 'age_7.0',
       'age_8.0', 'age_9.0', 'age_nan', 'gender_f', 'gender_m',
       'ethnicity_?', 'ethnicity_Asian', 'ethnicity_Black',
       'ethnicity_Hispanic', 'ethnicity_Latino',
       'ethnicity_Middle Eastern ', 'ethnicity_Others',
       'ethnicity_Pasifika', 'ethnicity_South Asian', 'ethnicity_Turkish',
       'ethnicity_White-European', 'jundice_no', 'jundice_yes',
       'austim_no', 'austim_yes', 'contry_of_res_Afghanistan',
       'contry_of_res_Argentina', 'contry_of_res_Armenia',
       'contry_of_res_Australia', 'contry_of_res_Austria',
       'contry_of_res_Bahrain', 'contry_of_res_Bangladesh',
       'contry_of_res_Bhutan', 'contry_of_res_Brazil',
       'contry_of_res_Bulgaria', 'contry_of_res_Canada',
       'contry_of_res_China', 'contry_of_res_Costa Rica',
       'con

In [42]:
# print an example patient from the categorical data
X.loc[1]

A1_Score                             1
A2_Score                             1
A3_Score                             0
A4_Score                             0
A5_Score                             1
                                    ..
relation_Health care professional    0
relation_Parent                      1
relation_Relative                    0
relation_Self                        0
relation_self                        0
Name: 1, Length: 96, dtype: int64

In [43]:
# convert the class data to categorical values (one-hot encoded vectors)
y_categorical = pd.get_dummies(y)

In [44]:
y_categorical.iloc[:10]

Unnamed: 0,NO,YES
0,1,0
1,1,0
2,1,0
3,1,0
4,0,1
5,1,0
6,0,1
7,0,1
8,0,1
9,1,0


### Split the dataset into training and testing datasets

In [45]:
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y_categorical, test_size=0.2)

In [46]:
# examine the shape of the new 4 generated datasets
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(233, 96)
(59, 96)
(233, 2)
(59, 2)


### Build the neural network with Keras

In [47]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [48]:
# define the training model
def create_model():
    # create model
    model = Sequential()
    model.add(Dense(8, input_dim=96, kernel_initializer='normal', activation='relu'))
    model.add(Dense(4, kernel_initializer='normal', activation='relu'))
    model.add(Dense(2, activation='sigmoid'))
    
    # compile the model
    adam = Adam(lr=0.001)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
    
    return model

In [49]:
# re-examine the model
model = create_model()
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 8)                 776       
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 36        
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 10        
Total params: 822
Trainable params: 822
Non-trainable params: 0
_________________________________________________________________
None


### Traing the network

In [50]:
model.fit(X_train, y_train, epochs=100, batch_size=10, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x1ba50eaf788>

### Testing and performance metrics

In [51]:
from sklearn.metrics import classification_report, accuracy_score

predictions = model.predict_classes(X_test)



In [52]:
predictions

array([0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1], dtype=int64)

In [53]:
print("results for categorical model")
print("Accuracy score: {}".format(accuracy_score(y_test[['YES']], predictions)))
print("Classification report:")
print(classification_report(y_test[['YES']], predictions))

results for categorical model
Accuracy score: 0.9491525423728814
Classification report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.96        35
           1       0.92      0.96      0.94        24

    accuracy                           0.95        59
   macro avg       0.95      0.95      0.95        59
weighted avg       0.95      0.95      0.95        59

