In [189]:
# importing libraries
from scipy.io import arff
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [190]:
# Loading the data
# loadarff returns a pair; element 0 holds the records and becomes the DataFrame.
data = arff.loadarff('Autism-Screening-Child-Data/Autism-Child-Data.arff')
df = pd.DataFrame(data[0])

# The columns listed here hold byte strings after loading: decode each value to str,
# then turn the '?' placeholder into NaN so pandas counts it as missing.
stringColumns = ['gender','ethnicity', 'jundice', 'austim', 'contry_of_res', 'used_app_before', 'age_desc', 'relation', 'Class/ASD']
for column in stringColumns:
    decoded = df[column].map(lambda raw: raw.decode("utf-8"))
    df[column] = decoded.replace('?', np.nan)

# Screening answers A1_Score .. A10_Score are converted to numeric values.
scoreColumns = ["A{}_Score".format(i) for i in range(1, 11)]
df[scoreColumns] = df[scoreColumns].apply(pd.to_numeric)

df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,0,0,1,1,0,1,0,0,...,m,Others,no,no,Jordan,no,5.0,4-11 years,Parent,NO
1,1,1,0,0,1,1,0,1,0,0,...,m,Middle Eastern,no,no,Jordan,no,5.0,4-11 years,Parent,NO
2,1,1,0,0,0,1,1,1,0,0,...,m,,no,no,Jordan,yes,5.0,4-11 years,,NO
3,0,1,0,0,1,1,0,0,0,1,...,f,,yes,no,Jordan,no,4.0,4-11 years,,NO
4,1,1,1,1,1,1,1,1,1,1,...,m,Others,yes,no,United States,no,10.0,4-11 years,Parent,YES


In [191]:
# Data description
# data[1] is the second element of the pair returned by arff.loadarff; printing it
# lists the dataset name and each attribute with its type and allowed values.
print(data[1])

# 9% of the children eventually diagnosed with autism had jaundice
# during their first days of life, compared to 3% of children without autism.
# NOTE(review): no source is given for these figures and nothing in this notebook
# computes them -- cite the reference or verify against the data.

Dataset: child
	A1_Score's type is nominal, range is ('0', '1')
	A2_Score's type is nominal, range is ('0', '1')
	A3_Score's type is nominal, range is ('0', '1')
	A4_Score's type is nominal, range is ('0', '1')
	A5_Score's type is nominal, range is ('0', '1')
	A6_Score's type is nominal, range is ('0', '1')
	A7_Score's type is nominal, range is ('0', '1')
	A8_Score's type is nominal, range is ('0', '1')
	A9_Score's type is nominal, range is ('0', '1')
	A10_Score's type is nominal, range is ('0', '1')
	age's type is numeric
	gender's type is nominal, range is ('m', 'f')
	ethnicity's type is nominal, range is ('Others', 'Middle Eastern ', 'White-European', 'Black', 'South Asian', 'Asian', 'Pasifika', 'Hispanic', 'Turkish', 'Latino')
	jundice's type is nominal, range is ('no', 'yes')
	austim's type is nominal, range is ('no', 'yes')
	contry_of_res's type is nominal, range is ('Jordan', 'United States', 'Egypt', 'United Kingdom', 'Bahrain', 'Austria', 'Kuwait', 'United Arab Emirates', 'Eur

In [192]:
# Size of the frame as (rows, columns)
df.shape

(292, 21)

In [193]:
# Column labels of the frame
df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender',
       'ethnicity', 'jundice', 'austim', 'contry_of_res', 'used_app_before',
       'result', 'age_desc', 'relation', 'Class/ASD'],
      dtype='object')

In [194]:
# Row index of the frame
df.index

RangeIndex(start=0, stop=292, step=1)

In [195]:
# Count the missing (NaN) entries in every column
df.isna().sum()

A1_Score            0
A2_Score            0
A3_Score            0
A4_Score            0
A5_Score            0
A6_Score            0
A7_Score            0
A8_Score            0
A9_Score            0
A10_Score           0
age                 4
gender              0
ethnicity          43
jundice             0
austim              0
contry_of_res       0
used_app_before     0
result              0
age_desc            0
relation           43
Class/ASD           0
dtype: int64

In [196]:
# Handling missing values

# The ethnicity and relation columns are removed from df
# (the missing-value count above reports 43 gaps in each).
# Missing values in the age column are replaced with the mean of the known ages.

# errors="ignore" keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because both columns are already gone.
df = df.drop(columns=["ethnicity", "relation"], errors="ignore")
df["age"] = df["age"].fillna(df["age"].mean())


In [197]:
# Preview the frame after the missing-value handling
df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,jundice,austim,contry_of_res,used_app_before,result,age_desc,Class/ASD
0,1,1,0,0,1,1,0,1,0,0,6.0,m,no,no,Jordan,no,5.0,4-11 years,NO
1,1,1,0,0,1,1,0,1,0,0,6.0,m,no,no,Jordan,no,5.0,4-11 years,NO
2,1,1,0,0,0,1,1,1,0,0,6.0,m,no,no,Jordan,yes,5.0,4-11 years,NO
3,0,1,0,0,1,1,0,0,0,1,5.0,f,yes,no,Jordan,no,4.0,4-11 years,NO
4,1,1,1,1,1,1,1,1,1,1,5.0,m,yes,no,United States,no,10.0,4-11 years,YES


In [199]:
# Encoding the columns with categorical values
stringColumns = ['gender', 'jundice', 'austim', 'contry_of_res', 'used_app_before', 'age_desc', 'Class/ASD']

# One fitted encoder is kept per column so every integer code can be mapped back
# to its label later (label_encoders[column].classes_ / .inverse_transform).
# A single shared encoder is refitted on each column and only remembers the last one.
label_encoders = {}
for column in stringColumns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder
# After the loop, label_encoder is the encoder fitted on 'Class/ASD' (the last column).

In [200]:
# Preview the frame after label encoding: the categorical columns now hold integer codes
df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,jundice,austim,contry_of_res,used_app_before,result,age_desc,Class/ASD
0,1,1,0,0,1,1,0,1,0,0,6.0,1,0,0,24,0,5.0,0,0
1,1,1,0,0,1,1,0,1,0,0,6.0,1,0,0,24,0,5.0,0,0
2,1,1,0,0,0,1,1,1,0,0,6.0,1,0,0,24,1,5.0,0,0
3,0,1,0,0,1,1,0,0,0,1,5.0,0,1,0,24,0,4.0,0,0
4,1,1,1,1,1,1,1,1,1,1,5.0,1,1,0,51,0,10.0,0,1
