In [1]:
import pandas as pd

# Load the raw screening data from the CSV file (path is relative to the
# notebook's working directory).
df = pd.read_csv(filepath_or_buffer="autism_screening.csv")
# Display (n_rows, n_columns) as a quick sanity check of the load.
df.shape

(704, 21)

In [2]:
# Inspect the raw column labels exactly as they were read from the CSV,
# so that naming problems (e.g. stray whitespace, typos) can be spotted.
df.columns

Index(['A1_Score ', 'A2_Score ', 'A3_Score ', 'A4_Score ', 'A5_Score ',
       'A6_Score ', 'A7_Score ', 'A8_Score ', 'A9_Score ', 'A10_Score ',
       'age   ', 'gender ', 'ethnicity      ', 'jundice ', 'austim ',
       'contry_of_res        ', 'used_app_before ', 'result ', 'age_desc    ',
       'relation                 ', 'Class/ASD'],
      dtype='object')

In [3]:
# Remove every space character from each column label
# (the raw labels carry trailing padding).
df.columns = [label.replace(' ', '') for label in df.columns]
df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender',
       'ethnicity', 'jundice', 'austim', 'contry_of_res', 'used_app_before',
       'result', 'age_desc', 'relation', 'Class/ASD'],
      dtype='object')

In [4]:
# Correct misspelled column labels (old label -> corrected label).
# The rename is applied in place on the existing frame.
df.rename(
    {
        "austim": "autism",
        "contry_of_res": "country_of_res",
        "jundice": "jaundice",
    },
    axis="columns",
    inplace=True,
)

df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender',
       'ethnicity', 'jaundice', 'autism', 'country_of_res', 'used_app_before',
       'result', 'age_desc', 'relation', 'Class/ASD'],
      dtype='object')

In [5]:
# Give two columns more descriptive labels (old label -> new label).
# The rename is applied in place on the existing frame.
df.rename(
    {
        "autism": "fam_with_autism",
        "result": 'sum_score',
    },
    axis="columns",
    inplace=True,
)

df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender',
       'ethnicity', 'jaundice', 'fam_with_autism', 'country_of_res',
       'used_app_before', 'sum_score', 'age_desc', 'relation', 'Class/ASD'],
      dtype='object')

In [6]:
# Count null (NaN/None) entries per column.
# Note: placeholder strings such as '?' are ordinary values and are NOT
# counted here; they have to be checked separately.
df.isna().sum()

A1_Score           0
A2_Score           0
A3_Score           0
A4_Score           0
A5_Score           0
A6_Score           0
A7_Score           0
A8_Score           0
A9_Score           0
A10_Score          0
age                0
gender             0
ethnicity          0
jaundice           0
fam_with_autism    0
country_of_res     0
used_app_before    0
sum_score          0
age_desc           0
relation           0
Class/ASD          0
dtype: int64

In [7]:
# List the distinct raw values of the 'ethnicity' column.
df.loc[:, 'ethnicity'].unique()

array(['White-European ', 'Latino         ', '?              ',
       'Others         ', 'Black          ', 'Asian          ',
       'Middle Eastern ', 'Pasifika       ', 'South Asian    ',
       'Hispanic       ', 'Turkish        ', 'others         '],
      dtype=object)

In [8]:
# Strip leading/trailing whitespace from every string cell;
# non-string cells are passed through unchanged.
df = df.map(lambda cell: cell if not isinstance(cell, str) else cell.strip())
# Re-check the distinct ethnicity values after trimming.
df['ethnicity'].unique()

array(['White-European', 'Latino', '?', 'Others', 'Black', 'Asian',
       'Middle Eastern', 'Pasifika', 'South Asian', 'Hispanic', 'Turkish',
       'others'], dtype=object)

In [9]:
# Print the distinct values of every column to spot out-of-place entries.
for col_name, col_values in df.items():
    print(col_name, col_values.unique())

A1_Score [1 0]
A2_Score [1 0]
A3_Score [1 0]
A4_Score [1 0]
A5_Score [0 1]
A6_Score [0 1]
A7_Score [1 0]
A8_Score [1 0]
A9_Score [0 1]
A10_Score [0 1]
age ['26.0' '24.0' '27.0' '35.0' '40.0' '36.0' '17.0' '64.0' '29.0' '33.0'
 '18.0' '31.0' '30.0' '34.0' '38.0' '42.0' '43.0' '48.0' '37.0' '55.0'
 '50.0' '53.0' '20.0' '28.0' '21.0' '383.0' '47.0' '32.0' '44.0' '' '19.0'
 '58.0' '45.0' '22.0' '39.0' '25.0' '23.0' '54.0' '60.0' '41.0' '46.0'
 '56.0' '61.0' '59.0' '52.0' '49.0' '51.0']
gender ['f' 'm']
ethnicity ['White-European' 'Latino' '?' 'Others' 'Black' 'Asian' 'Middle Eastern'
 'Pasifika' 'South Asian' 'Hispanic' 'Turkish' 'others']
jaundice ['no' 'yes']
fam_with_autism ['no' 'yes']
country_of_res ['United States' 'Brazil' 'Spain' 'Egypt' 'New Zealand' 'Bahamas'
 'Burundi' 'Austria' 'Argentina' 'Jordan' 'Ireland' 'United Arab Emirates'
 'Afghanistan' 'Lebanon' 'United Kingdom' 'South Africa' 'Italy'
 'Pakistan' 'Bangladesh' 'Chile' 'France' 'China' 'Australia' 'Canada'
 'Saudi Arabi

In [10]:
# For each column of interest, print the shape of the subset of rows whose
# value is the '?' placeholder.
for col_name in ("relation", "ethnicity"):
    print(col_name, df[df[col_name] == '?'].shape)

relation (95, 21)
ethnicity (95, 21)


In [11]:
# Shape of the subset where BOTH 'ethnicity' and 'relation' are '?'.
df.loc[(df[['ethnicity', 'relation']] == "?").all(axis=1)].shape

(95, 21)

In [12]:
# Drop, in place, every row whose 'relation' value is the '?' placeholder.
df.drop(index=df.index[df['relation'] == '?'], inplace=True)
# Display the shape after the removal.
df.shape

(609, 21)

In [13]:
# Distinct ethnicity labels BEFORE the label clean-up in the next cell.
df.loc[:, 'ethnicity'].unique()

array(['White-European', 'Latino', 'Others', 'Black', 'Asian',
       'Middle Eastern', 'Pasifika', 'South Asian', 'Hispanic', 'Turkish',
       'others'], dtype=object)

In [14]:
# Harmonise ethnicity labels: unify the casing of 'others', fold 'Turkish'
# into 'Middle Eastern', and remove the hyphen from 'White-European'.
# The two lists are matched position by position (old -> new).
df['ethnicity'] = df['ethnicity'].replace(
    to_replace=['others', 'Turkish', 'White-European'],
    value=['Others', 'Middle Eastern', 'White European'],
)
# Distinct ethnicity labels AFTER the clean-up.
df['ethnicity'].unique()

array(['White European', 'Latino', 'Others', 'Black', 'Asian',
       'Middle Eastern', 'Pasifika', 'South Asian', 'Hispanic'],
      dtype=object)

In [15]:
# Convert 'age' and 'sum_score' to int64, then display the resulting dtypes.
# The cast goes through float64 first because values such as '26.0' cannot
# be converted straight to an integer.
# NOTE(review): an age of '383.0' was visible in the earlier unique-value
# check and may still be present here - confirm and handle if so.
df[['age', 'sum_score']] = (
    df[['age', 'sum_score']]
    .astype({'age': 'float64', 'sum_score': 'float64'})
    .astype({'age': 'int64', 'sum_score': 'int64'})
)
df.dtypes

A1_Score            int64
A2_Score            int64
A3_Score            int64
A4_Score            int64
A5_Score            int64
A6_Score            int64
A7_Score            int64
A8_Score            int64
A9_Score            int64
A10_Score           int64
age                 int64
gender             object
ethnicity          object
jaundice           object
fam_with_autism    object
country_of_res     object
used_app_before    object
sum_score           int64
age_desc           object
relation           object
Class/ASD          object
dtype: object

In [16]:
# Preview the first rows of the two converted columns.
df.loc[:, ['age', 'sum_score']].head()

Unnamed: 0,age,sum_score
0,26,6
1,24,5
2,27,8
3,35,6
5,36,9


In [17]:
# List the distinct values stored in the 'age_desc' column.
df.loc[:, "age_desc"].unique()

array(['18 and more'], dtype=object)

In [18]:
# Drop the 'age_desc' column in place (it holds a single constant value,
# as shown by the unique-value check above), then show the remaining labels.
df.drop(columns="age_desc", inplace=True)
df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender',
       'ethnicity', 'jaundice', 'fam_with_autism', 'country_of_res',
       'used_app_before', 'sum_score', 'relation', 'Class/ASD'],
      dtype='object')

In [22]:
# Create DF for AQ-10 only: the first ten columns (A1_Score .. A10_Score)
# plus the summed score and the class label.
# An explicit .copy() is taken so that score_df is an independent frame;
# adding columns to a bare slice of df raises SettingWithCopyWarning and
# leaves it ambiguous whether df itself could be affected.
score_df = df.iloc[:, 0:10].copy()
# Column assignment aligns on the row index, so each row keeps its own
# sum and label.
score_df[['sum', 'target']] = df[['sum_score', 'Class/ASD']]
score_df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,sum,target
0,1,1,1,1,0,0,1,1,0,0,6,NO
1,1,1,0,1,0,0,0,1,0,1,5,NO
2,1,1,0,1,1,0,1,1,1,1,8,YES
3,1,1,0,1,0,0,1,1,0,1,6,NO
5,1,1,1,1,1,0,1,1,1,1,9,YES


In [21]:
# Persist both frames as CSV files; index=False keeps the row index out of
# the written files.
# AQ-10 score subset:
score_df.to_csv(path_or_buf="score.csv", index=False)
# Full cleaned dataset:
df.to_csv(path_or_buf="cleaned_autism_screening.csv", index=False)

In [20]:
# Show the column labels of the AQ-10 score frame.
score_df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'sum', 'target'],
      dtype='object')