In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

Data collection & analysis

In [2]:
asdt = pd.read_csv('/content/Toddler Autism dataset csv.csv')
asdt.head()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits
0,1,0,0,0,0,0,0,1,1,0,1,28,3,f,middle eastern,yes,no,family member,No
1,2,1,1,0,0,0,1,1,0,0,0,36,4,m,White European,yes,no,family member,Yes
2,3,1,0,0,0,0,0,1,1,0,1,36,4,m,middle eastern,yes,no,family member,Yes
3,4,1,1,1,1,1,1,1,1,1,1,24,10,m,Hispanic,no,no,family member,Yes
4,5,1,1,0,1,1,1,1,1,1,1,20,9,f,White European,no,yes,family member,Yes


In [3]:
asdt = asdt.drop(['Ethnicity', 'Case_No'], axis=1)

In [4]:
asdt.shape

(1054, 17)

In [5]:
asdt.sum().isnull()

Unnamed: 0,0
A1,False
A2,False
A3,False
A4,False
A5,False
A6,False
A7,False
A8,False
A9,False
A10,False


In [6]:
asdt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1054 entries, 0 to 1053
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   A1                      1054 non-null   int64 
 1   A2                      1054 non-null   int64 
 2   A3                      1054 non-null   int64 
 3   A4                      1054 non-null   int64 
 4   A5                      1054 non-null   int64 
 5   A6                      1054 non-null   int64 
 6   A7                      1054 non-null   int64 
 7   A8                      1054 non-null   int64 
 8   A9                      1054 non-null   int64 
 9   A10                     1054 non-null   int64 
 10  Age_Mons                1054 non-null   int64 
 11  Qchat-10-Score          1054 non-null   int64 
 12  Sex                     1054 non-null   object
 13  Jaundice                1054 non-null   object
 14  Family_mem_with_ASD     1054 non-null   object
 15  Who 

In [7]:
asdt.describe()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score
count,1054.0,1054.0,1054.0,1054.0,1054.0,1054.0,1054.0,1054.0,1054.0,1054.0,1054.0,1054.0
mean,0.563567,0.448767,0.401328,0.512334,0.524668,0.57685,0.649905,0.459203,0.489564,0.586338,27.867173,5.212524
std,0.496178,0.497604,0.4904,0.500085,0.499628,0.494293,0.477226,0.498569,0.500128,0.492723,7.980354,2.907304
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,3.0
50%,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,30.0,5.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,36.0,8.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,36.0,10.0


In [8]:
asdt.columns

Index(['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons',
       'Qchat-10-Score', 'Sex', 'Jaundice', 'Family_mem_with_ASD',
       'Who completed the test', 'Class/ASD Traits '],
      dtype='object')

In [9]:
asdt['Class/ASD Traits '].value_counts()

Unnamed: 0_level_0,count
Class/ASD Traits,Unnamed: 1_level_1
Yes,728
No,326


In [10]:
# Convert columns with numeric data to numeric type
for column in asdt.columns:
    if column != 'Class/ASD Traits ':  # Exclude the grouping column
        try:
            asdt[column] = pd.to_numeric(asdt[column])
        except ValueError:
            # Handle columns that cannot be converted to numeric, e.g., print a warning
            print(f"Warning: Column '{column}' could not be converted to numeric.")

# Calculate the mean for numeric columns only
asdt.groupby('Class/ASD Traits ').mean(numeric_only=True)



Unnamed: 0_level_0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score
Class/ASD Traits,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
No,0.190184,0.104294,0.101227,0.134969,0.104294,0.156442,0.248466,0.141104,0.058282,0.453988,27.070552,1.693252
Yes,0.730769,0.603022,0.535714,0.681319,0.712912,0.76511,0.82967,0.601648,0.682692,0.645604,28.223901,6.788462


Test train split

In [11]:
# separating the data and labels
X = asdt.drop(columns = 'Class/ASD Traits ', axis=1)
Y = asdt['Class/ASD Traits ']

In [12]:
print(X)

      A1  A2  A3  A4  A5  A6  A7  A8  A9  A10  Age_Mons  Qchat-10-Score Sex  \
0      0   0   0   0   0   0   1   1   0    1        28               3   f   
1      1   1   0   0   0   1   1   0   0    0        36               4   m   
2      1   0   0   0   0   0   1   1   0    1        36               4   m   
3      1   1   1   1   1   1   1   1   1    1        24              10   m   
4      1   1   0   1   1   1   1   1   1    1        20               9   f   
...   ..  ..  ..  ..  ..  ..  ..  ..  ..  ...       ...             ...  ..   
1049   0   0   0   0   0   0   0   0   0    1        24               1   f   
1050   0   0   1   1   1   0   1   0   1    0        12               5   m   
1051   1   0   1   1   1   1   1   1   1    1        18               9   m   
1052   1   0   0   0   0   0   0   1   0    1        19               3   m   
1053   1   1   0   0   1   1   0   1   1    0        24               6   m   

     Jaundice Family_mem_with_ASD Who completed the

In [13]:
print(Y)

0        No
1       Yes
2       Yes
3       Yes
4       Yes
       ... 
1049     No
1050    Yes
1051    Yes
1052     No
1053    Yes
Name: Class/ASD Traits , Length: 1054, dtype: object


In [14]:
categorical_cols = X.select_dtypes(include=['object']).columns

# Apply Label Encoding to all categorical columns
for col in categorical_cols:
    le = LabelEncoder()  # Create a new LabelEncoder for each column
    X[col] = le.fit_transform(X[col]) #Fit and transform the column data

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, stratify=Y, random_state=2)

In [16]:
print(X.shape, X_train.shape, X_test.shape)

(1054, 16) (737, 16) (317, 16)


train model


In [17]:
sc = MinMaxScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

In [18]:
def train_model(model, X_train, y_train, X_test, y_test):
  model.fit(X_train, y_train)
  y_pred_train = model.predict(X_train)
  y_pred_test = model.predict(X_test)
  training_accuracy = accuracy_score(y_train, y_pred_train)
  testing_accuracy = accuracy_score(y_test, y_pred_test)
  print("Training Accuracy:", training_accuracy)
  print("Testing Accuracy:", testing_accuracy)

  results = pd.Series([training_accuracy, testing_accuracy], index=['Training Accuracy', 'Testing Accuracy'])
  return results

In [19]:
model = KNeighborsClassifier()
knn_results = train_model(model, X_train_scaled, Y_train, X_test_scaled, Y_test)
knn_results.index = ["KNeighbors Classifier Training Accuracy", "KNeighbors Classifier Testing Accuracy"]
knn_results

Training Accuracy: 0.9823609226594301
Testing Accuracy: 0.943217665615142


Unnamed: 0,0
KNeighbors Classifier Training Accuracy,0.982361
KNeighbors Classifier Testing Accuracy,0.943218


Making a Predictive System

In [21]:
input_data = (1,1,0,0,1,1,0,1,1,0,1,1,1,1,0,0) # Modified input data with 16 features, removing the extra feature
# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)
# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)
prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The person is not autistic')
else:
  print('The person is autistic')

['Yes']
The person is autistic


In [22]:
print(X.columns)

Index(['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons',
       'Qchat-10-Score', 'Sex', 'Jaundice', 'Family_mem_with_ASD',
       'Who completed the test'],
      dtype='object')


In [23]:
import pickle

In [25]:
filename = 'ASD_todd_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [26]:
# loading the saved model
loaded_model = pickle.load(open('ASD_todd_model.sav', 'rb'))