# Autism Prediction Model

#### 1. Importing

In [395]:
#Basic Imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

#Visualization Imports
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('fivethirtyeight')

#Model Building Imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier

#### 2. Getting the Data and Exploring

In [396]:
df = pd.read_csv('autismData.csv')
df.head()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits
0,1,0,0,0,0,0,0,1,1,0,1,28,3,f,middle eastern,yes,no,family member,No
1,2,1,1,0,0,0,1,1,0,0,0,36,4,m,White European,yes,no,family member,Yes
2,3,1,0,0,0,0,0,1,1,0,1,36,4,m,middle eastern,yes,no,family member,Yes
3,4,1,1,1,1,1,1,1,1,1,1,24,10,m,Hispanic,no,no,family member,Yes
4,5,1,1,0,1,1,1,1,1,1,1,20,9,f,White European,no,yes,family member,Yes


In [397]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1054 entries, 0 to 1053
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Case_No                 1054 non-null   int64 
 1   A1                      1054 non-null   int64 
 2   A2                      1054 non-null   int64 
 3   A3                      1054 non-null   int64 
 4   A4                      1054 non-null   int64 
 5   A5                      1054 non-null   int64 
 6   A6                      1054 non-null   int64 
 7   A7                      1054 non-null   int64 
 8   A8                      1054 non-null   int64 
 9   A9                      1054 non-null   int64 
 10  A10                     1054 non-null   int64 
 11  Age_Mons                1054 non-null   int64 
 12  Qchat-10-Score          1054 non-null   int64 
 13  Sex                     1054 non-null   object
 14  Ethnicity               1054 non-null   object
 15  Jaun

Checking for null values

In [398]:
df.isnull().sum()

Case_No                   0
A1                        0
A2                        0
A3                        0
A4                        0
A5                        0
A6                        0
A7                        0
A8                        0
A9                        0
A10                       0
Age_Mons                  0
Qchat-10-Score            0
Sex                       0
Ethnicity                 0
Jaundice                  0
Family_mem_with_ASD       0
Who completed the test    0
Class/ASD Traits          0
dtype: int64

#### 3. Data Cleaning and Preprocessing

In [399]:
# We remove the following features, since they do not have a significant impact on our data.

df.drop('Case_No', axis=1, inplace=True)
df.drop('Who completed the test', axis=1, inplace=True)
df.drop('Qchat-10-Score', axis=1, inplace=True)

In [400]:
# Rename the columns

df.columns = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age', 
              'Sex', 'Ethnicity', 'Jaundice', 'Family_Member',
       'Result']

##### Converting categorical variables

Using lambda functions, we will convert the values to numerical and make them compatible for the model building process. 
The following conditions are applied:

|     Category     |   1  |    0   |
|------------------|------|--------|
|       Sex        | Male | Female |
|     Jaundice     |  Yes |   No   |
|   Family_Member  |  Yes |   No   |
|       Result     |  Yes |   No   |

In [401]:
df['Sex'] = df['Sex'].apply(lambda x: 1 if x == 'm' else 0)
df['Jaundice'] = df['Jaundice'].apply(lambda x: 1 if x=='yes' else 0)
df['Family_Member'] = df['Family_Member'].apply(lambda x: 1 if x=='yes' else 0)
df['Result'] = df['Result'].apply(lambda x: 1 if x=='Yes' else 0)

The outcomes of 'Ethnicity' feature are of more than two categories (White European, Asian, Hispanic, Latino, etc.)
Therefore, we will use the LabelEncoder from sklearn library to do the conversion.

In [402]:
lab_encoder = LabelEncoder()
df['Ethnicity']=lab_encoder.fit_transform(df[['Ethnicity']])


  y = column_or_1d(y, warn=True)


In [403]:
classes_val = lab_encoder.classes_
classes_ind = [0,1,2,3,4,5,6,7,8,9,10]
ethnicity_encoding = pd.DataFrame(data=(classes_ind, classes_val))
ethnicity_encoding = ethnicity_encoding.T
ethnicity_encoding.columns = ['Index', 'Ethnicity']

The results from the conversion are as following:

In [404]:
ethnicity_encoding

Unnamed: 0,Index,Ethnicity
0,0,Hispanic
1,1,Latino
2,2,Native Indian
3,3,Others
4,4,Pacifica
5,5,White European
6,6,asian
7,7,black
8,8,middle eastern
9,9,mixed


#### 4. Train-Test Split

In [405]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

X.shape, y.shape 

((1054, 15), (1054,))

In [409]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

#### 5. Model Building

#### Logistic Regression

In [414]:
log_regression = LogisticRegression(penalty='l1', C=0.3, solver='liblinear')
log_regression.fit(X_train, y_train)
log_pred = log_regression.predict(X_test)

In [415]:
log_score = log_regression.score(X_test, y_test)
log_score

0.9747634069400631

In [416]:
log_conf = confusion_matrix(y_test, log_pred)
log_conf

array([[ 96,   4],
       [  4, 213]], dtype=int64)

In [417]:
log_report = classification_report(y_test, log_pred)
log_report

'              precision    recall  f1-score   support\n\n           0       0.96      0.96      0.96       100\n           1       0.98      0.98      0.98       217\n\n    accuracy                           0.97       317\n   macro avg       0.97      0.97      0.97       317\nweighted avg       0.97      0.97      0.97       317\n'

#### K-Nearest Neighbors

In [420]:
knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)

In [421]:
knn_score = knn.score(X_test, y_test)
knn_score

0.9274447949526814

In [422]:
knn_conf = confusion_matrix(y_test, knn_pred)
knn_conf

array([[ 91,   9],
       [ 14, 203]], dtype=int64)

In [423]:
knn_report = classification_report(y_test, knn_pred)
knn_report

'              precision    recall  f1-score   support\n\n           0       0.87      0.91      0.89       100\n           1       0.96      0.94      0.95       217\n\n    accuracy                           0.93       317\n   macro avg       0.91      0.92      0.92       317\nweighted avg       0.93      0.93      0.93       317\n'

#### Support Vector Machine (SVM)

In [424]:
svm_c = svm.SVC()
svm_c.fit(X_train, y_train)
svm_c_pred = svm_c.predict(X_test)

In [425]:
svm_c_score = svm_c.score(X_test, y_test)
svm_c_score

0.8296529968454258

In [426]:
svm_c_conf = confusion_matrix(y_test, svm_c_pred)
svm_c_conf

array([[ 46,  54],
       [  0, 217]], dtype=int64)

In [427]:
svm_report = classification_report(y_test, svm_c_pred)
svm_report

'              precision    recall  f1-score   support\n\n           0       1.00      0.46      0.63       100\n           1       0.80      1.00      0.89       217\n\n    accuracy                           0.83       317\n   macro avg       0.90      0.73      0.76       317\nweighted avg       0.86      0.83      0.81       317\n'

#### Decision Tree Classifier

In [428]:
dtc = DecisionTreeClassifier()
dtc = dtc.fit(X_train,y_train)
dtc_pred = dtc.predict(X_test)

In [429]:
dtc_score = dtc.score(X_test, y_test)
dtc_score

0.8706624605678234

In [430]:
dtc_conf = confusion_matrix(y_test, dtc_pred)
dtc_conf

array([[ 81,  19],
       [ 22, 195]], dtype=int64)

In [431]:
dtc_report = classification_report(y_test, dtc_pred)
dtc_report

'              precision    recall  f1-score   support\n\n           0       0.79      0.81      0.80       100\n           1       0.91      0.90      0.90       217\n\n    accuracy                           0.87       317\n   macro avg       0.85      0.85      0.85       317\nweighted avg       0.87      0.87      0.87       317\n'