<a href="https://colab.research.google.com/github/svponte/md/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mineração de Dados**

In [5]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

# Carrega dataframe
pdEnade = pd.read_csv("microdados_enade_2019.txt", sep = ';', low_memory=False)

# Quantas linhas e colunas no dataframe
pdEnade.shape


(130127, 137)

In [2]:
# Qual o nome das variáveis (colunas)
pdEnade.columns


Index(['NU_ANO', 'CO_IES', 'CO_CATEGAD', 'CO_ORGACAD', 'CO_GRUPO', 'CO_CURSO',
       'CO_MODALIDADE', 'CO_MUNIC_CURSO', 'CO_UF_CURSO', 'CO_REGIAO_CURSO',
       ...
       'QE_I59', 'QE_I60', 'QE_I61', 'QE_I62', 'QE_I63', 'QE_I64', 'QE_I65',
       'QE_I66', 'QE_I67', 'QE_I68'],
      dtype='object', length=137)

In [3]:

# Visualizar as primeiras linhas do dataframe
pdEnade.head()



Unnamed: 0,NU_ANO,CO_IES,CO_CATEGAD,CO_ORGACAD,CO_GRUPO,CO_CURSO,CO_MODALIDADE,CO_MUNIC_CURSO,CO_UF_CURSO,CO_REGIAO_CURSO,...,QE_I59,QE_I60,QE_I61,QE_I62,QE_I63,QE_I64,QE_I65,QE_I66,QE_I67,QE_I68
0,2019,1,10002,10028,5710,3,1,5103403,51,5,...,2.0,5.0,1.0,1.0,2.0,5.0,8.0,7.0,1.0,2.0
1,2019,1,10002,10028,5710,3,1,5103403,51,5,...,1.0,4.0,2.0,2.0,2.0,5.0,4.0,4.0,2.0,2.0
2,2019,1,10002,10028,5710,3,1,5103403,51,5,...,3.0,4.0,4.0,3.0,3.0,4.0,1.0,1.0,1.0,4.0
3,2019,1,10002,10028,5710,3,1,5103403,51,5,...,3.0,5.0,2.0,2.0,2.0,3.0,3.0,4.0,3.0,3.0
4,2019,1,10002,10028,5710,3,1,5103403,51,5,...,,,,,,,,,,


In [4]:
# Quais os dados faltantes
pdEnade.isnull().sum()



NU_ANO           0
CO_IES           0
CO_CATEGAD       0
CO_ORGACAD       0
CO_GRUPO         0
              ... 
QE_I64        5211
QE_I65        5211
QE_I66        5211
QE_I67        5211
QE_I68        5213
Length: 137, dtype: int64

In [6]:
# Sumário estatístico 
pdEnade.describe()



Unnamed: 0,NU_ANO,CO_IES,CO_CATEGAD,CO_ORGACAD,CO_GRUPO,CO_CURSO,CO_MODALIDADE,CO_MUNIC_CURSO,CO_UF_CURSO,CO_REGIAO_CURSO,...,QE_I59,QE_I60,QE_I61,QE_I62,QE_I63,QE_I64,QE_I65,QE_I66,QE_I67,QE_I68
count,130127.0,130127.0,130127.0,130127.0,130127.0,130127.0,130127.0,130127.0,130127.0,130127.0,...,124917.0,124917.0,124916.0,124915.0,124914.0,124916.0,124916.0,124916.0,124916.0,124914.0
mean,2019.0,651.845981,8092.331207,10026.15677,1978.701568,19807.223405,1.0,3436635.0,34.179778,3.068541,...,5.113371,5.098265,4.910692,4.787712,4.919569,5.345168,5.35463,5.27424,4.936269,4.970468
std,0.0,1018.862493,3906.414128,3.217906,2736.568759,15330.67588,0.0,825450.7,8.254537,0.908658,...,1.328882,1.323329,1.343456,1.456123,1.381967,1.111004,1.562077,1.220731,1.561222,1.427707
min,2019.0,1.0,93.0,10019.0,5.0,3.0,1.0,1100049.0,11.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2019.0,243.0,10002.0,10022.0,17.0,9709.0,1.0,3106200.0,31.0,3.0,...,5.0,4.0,4.0,4.0,4.0,5.0,5.0,5.0,4.0,4.0
50%,2019.0,527.0,10005.0,10028.0,23.0,15475.0,1.0,3509502.0,35.0,3.0,...,6.0,6.0,5.0,5.0,5.0,6.0,6.0,6.0,5.0,6.0
75%,2019.0,634.0,10007.0,10028.0,5710.0,21599.0,1.0,4106902.0,41.0,4.0,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
max,2019.0,18759.0,10009.0,10028.0,6405.0,56664.0,1.0,5300108.0,53.0,5.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0


In [7]:
# Calculando a idade
pdEnade["Tempo"] = pdEnade["ANO_IN_GRAD"] - pdEnade["ANO_FIM_EM"]
pdEnade.head()

Unnamed: 0,NU_ANO,CO_IES,CO_CATEGAD,CO_ORGACAD,CO_GRUPO,CO_CURSO,CO_MODALIDADE,CO_MUNIC_CURSO,CO_UF_CURSO,CO_REGIAO_CURSO,...,QE_I60,QE_I61,QE_I62,QE_I63,QE_I64,QE_I65,QE_I66,QE_I67,QE_I68,Tempo
0,2019,1,10002,10028,5710,3,1,5103403,51,5,...,5.0,1.0,1.0,2.0,5.0,8.0,7.0,1.0,2.0,2
1,2019,1,10002,10028,5710,3,1,5103403,51,5,...,4.0,2.0,2.0,2.0,5.0,4.0,4.0,2.0,2.0,6
2,2019,1,10002,10028,5710,3,1,5103403,51,5,...,4.0,4.0,3.0,3.0,4.0,1.0,1.0,1.0,4.0,4
3,2019,1,10002,10028,5710,3,1,5103403,51,5,...,5.0,2.0,2.0,2.0,3.0,3.0,4.0,3.0,3.0,1
4,2019,1,10002,10028,5710,3,1,5103403,51,5,...,,,,,,,,,,1


In [8]:
# Prediction Target
y = pdEnade.Tempo
pdEnade.head() 

Unnamed: 0,NU_ANO,CO_IES,CO_CATEGAD,CO_ORGACAD,CO_GRUPO,CO_CURSO,CO_MODALIDADE,CO_MUNIC_CURSO,CO_UF_CURSO,CO_REGIAO_CURSO,...,QE_I60,QE_I61,QE_I62,QE_I63,QE_I64,QE_I65,QE_I66,QE_I67,QE_I68,Tempo
0,2019,1,10002,10028,5710,3,1,5103403,51,5,...,5.0,1.0,1.0,2.0,5.0,8.0,7.0,1.0,2.0,2
1,2019,1,10002,10028,5710,3,1,5103403,51,5,...,4.0,2.0,2.0,2.0,5.0,4.0,4.0,2.0,2.0,6
2,2019,1,10002,10028,5710,3,1,5103403,51,5,...,4.0,4.0,3.0,3.0,4.0,1.0,1.0,1.0,4.0,4
3,2019,1,10002,10028,5710,3,1,5103403,51,5,...,5.0,2.0,2.0,2.0,3.0,3.0,4.0,3.0,3.0,1
4,2019,1,10002,10028,5710,3,1,5103403,51,5,...,,,,,,,,,,1


In [9]:
enade_features = ['CO_IES', 'CO_CATEGAD', 'CO_ORGACAD', 'CO_GRUPO', 'CO_CURSO', 'ANO_FIM_EM', 'ANO_IN_GRAD', 'NT_OBJ_FG', 'NT_DIS_FG', 'QE_I15', 'QE_I17', 'QE_I19', 'QE_I23', 'QE_I25']
X = pdEnade[enade_features]
X.describe()

Unnamed: 0,CO_IES,CO_CATEGAD,CO_ORGACAD,CO_GRUPO,CO_CURSO,ANO_FIM_EM,ANO_IN_GRAD
count,130127.0,130127.0,130127.0,130127.0,130127.0,130127.0,130127.0
mean,651.845981,8092.331207,10026.15677,1978.701568,19807.223405,2010.84834,2014.21615
std,1018.862493,3906.414128,3.217906,2736.568759,15330.67588,6.348421,18.917391
min,1.0,93.0,10019.0,5.0,3.0,1111.0,14.0
25%,243.0,10002.0,10022.0,17.0,9709.0,2010.0,2014.0
50%,527.0,10005.0,10028.0,23.0,15475.0,2012.0,2015.0
75%,634.0,10007.0,10028.0,5710.0,21599.0,2014.0,2015.0
max,18759.0,10009.0,10028.0,6405.0,56664.0,2612.0,2092.0


In [11]:
# 1. Definir o modelo -- my_model = ModelName()
enade_model = DecisionTreeRegressor(random_state=1) # Cria um novo modelo não treinado


In [None]:
# 2. Fitting a model -- my_model.fit(data)
enade_model.fit(X, y)

print(X.head())
print(enade_model.predict(X.head()))