In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

In [None]:
dataset = arff.loadarff('Autism-Child-Data.arff')
child = pd.DataFrame(dataset[0])

In [None]:
dataset = arff.loadarff('Autism-Adolescent-Data.arff')
adolescent = pd.DataFrame(dataset[0])

In [None]:
dataset = arff.loadarff('Autism-Adult-Data.arff')
adult = pd.DataFrame(dataset[0])

In [None]:
child.describe()

In [None]:
adolescent.describe()

In [None]:
adult.describe()

In [None]:
pd.options.display.max_columns = 22
pd.options.display.max_rows = 1500

## Pré processamento base Child

### Verificamos que há dados faltantes nas colunas ethnicity e relation

In [None]:
# Subtituindo por NAN
child = child.replace(b'?', np.nan)

In [None]:
# Verificando a quantidade de itens faltantes
child.isnull().sum()

### 43 das 288 instâncias da base Child Autism possuem valores nulos, o que representa 14,93% das instâncias; por isso iremos substituir os valores faltantes pela moda de cada coluna.

In [None]:
child['ethnicity'].mode()

In [None]:
child['ethnicity'] = child['ethnicity'].replace(np.nan, b'White-European')

In [None]:
child['relation'].mode()

In [None]:
child['relation'] = child['relation'].replace(np.nan, b'Parent')

In [None]:
child.head()

In [None]:
child = child.drop(child[child['age'].isnull() == True].index, axis = 0)

In [None]:
# Count duplicated rows in the child frame. This cell sits in the child
# pre-processing section, so it must inspect `child`; it previously inspected
# `adolescent`, which is only cleaned in its own section further down.
# NOTE(review): unlike the adolescent and adult sections, no drop_duplicates
# follows for child — confirm whether that is intentional.
child.duplicated().value_counts()

In [None]:
# Arrumar gráfico
child['age'].plot(kind = 'hist')

In [None]:
child['age'].value_counts()

In [None]:
child.boxplot()

In [None]:
child['age_desc'] = 0

## Pré Processando Adolescent Autism

In [None]:
adolescent = adolescent.replace(b'?', np.nan)

In [None]:
adolescent.isnull().sum()

### Foram encontradas 6 instâncias que apresentam valor nulo nos campos ethnicity e relation, o que representa 5,77% dos dados

In [None]:
adolescent['ethnicity'].mode()

In [None]:
adolescent['ethnicity'] = adolescent['ethnicity'].replace(np.nan, b'White-European')

In [None]:
adolescent['relation'].mode()

In [None]:
adolescent['relation'] = adolescent['relation'].replace(np.nan, b'Self')

In [None]:
adolescent.isnull().sum()

In [None]:
adolescent.duplicated().value_counts()

In [None]:
adolescent.drop_duplicates(subset=None, keep= 'first', inplace=True)

In [None]:
adolescent.boxplot()

In [None]:
adolescent['age'].plot(kind = 'hist')

In [None]:
adolescent['age'].value_counts()

In [None]:
adolescent.head()

In [None]:
adolescent['age_desc'] = 1

## Pré-processamento Adult Autism

### Foram identificadas 95 instâncias com itens faltantes, representando 13,49% das amostras

In [None]:
adult = adult.replace(b'?', np.nan)

In [None]:
adult.isnull().sum()

In [None]:
adult['ethnicity'].mode()

In [None]:
adult['ethnicity'] = adult['ethnicity'].replace(np.nan, b'White-European')

In [None]:
adult['relation'].mode()

In [None]:
adult['relation'] = adult['relation'].replace(np.nan, b'Self')

In [None]:
adult.isnull().sum()

In [None]:
adult = adult.drop(adult[adult['age'].isnull() == True].index, axis = 0)

In [None]:
adult.duplicated().value_counts()

In [None]:
adult.drop_duplicates(subset=None, keep= 'first', inplace=True)

In [None]:
adult.boxplot()

In [None]:
adult['age_desc'] = 2

### Removendo outlier de idade

In [None]:
adult['age'].nlargest(2)

In [None]:
adult = adult.drop(52, axis = 0)

In [None]:
adult.boxplot()

# Unindo os Datasets

In [None]:
df = pd.concat([child,adolescent,adult])

In [None]:
df.head()

### Mudando valores binários para inteiro

In [None]:
columns_score = ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']

In [None]:
df[columns_score] = df[columns_score].apply(pd.to_numeric)

In [None]:
df.head()

## Realizando label encoder

In [None]:
from sklearn.preprocessing import LabelEncoder
# Label encoder gender
df['gender'] = LabelEncoder().fit_transform(df['gender'])

In [None]:
#0 Feminino
#1 Masculino
df['gender'].unique()

0. Asian
1. Black
2. Hispanic
3. Latino
4. Middle Eastern
5. Others
6. Pasifika
7. South Asian
8. Turkish
9. White-European

In [None]:
# Label encoder ethnicity
ethnicity = LabelEncoder().fit_transform(df['ethnicity'])

In [None]:
df['ethnicity'] = LabelEncoder().fit_transform(ethnicity)

In [None]:
# Label-encode the three yes/no columns: jundice, austim, used_app_before.
# Author's note on the resulting codes for each of them: 0 = NO, 1 = YES
# A fresh encoder is fitted per column, so each column gets its own codes.
for binary_col in ('jundice', 'austim', 'used_app_before'):
    df[binary_col] = LabelEncoder().fit_transform(df[binary_col])

In [None]:
df['relation'].unique()

In [None]:
df['relation'] = df['relation'].replace(b'self', b'Self')

In [None]:
df['relation'].unique()

In [None]:
# Label encoder used_app_before
# 0. Health Care Professional
# 1. Others
# 2. Parent
# 3. Relative
# 4. Self

df['relation'] = LabelEncoder().fit_transform(df['relation'])

In [None]:
df['relation'].unique()

In [None]:
# Label encoder na coluna target

#0. NO
#1. YES

df['Class/ASD'] = LabelEncoder().fit_transform(df['Class/ASD'])

In [None]:
# Não ha necessidade da cidade de residencia

df.drop('contry_of_res', axis = 1, inplace = True)