# Configuração de ambiente

Importação das bibliotecas:
- Numpy: para manipulação de arrays
- Pandas: para manipulação de dataframes
- Matplotlib: para visualização de dados
- Seaborn: para visualização de dados

Além dessas, usamos o Plotly para gráficos interativos e o SciPy e o scikit-learn para o aprendizado não supervisionado.

In [45]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering

# Leitura e visualização do dataset

Leitura do arquivo "dataset_clean.csv" (hospedado no GitHub) através da função "read_csv" do Pandas e armazenamento do resultado na variável "df". O parâmetro "sep" informa o separador de colunas do arquivo e o parâmetro "encoding" informa a codificação do arquivo.

In [46]:
# Location of the semicolon-separated, UTF-8 encoded CSV on GitHub
# (`?raw=true` asks for the raw file instead of the HTML page).
csv_url = "https://github.com/viniciusgugelmin/data-science-2/blob/master/projects/cursos-prouni/data/dataset_clean.csv?raw=true"

# low_memory=False makes pandas parse the file in a single pass instead of
# in chunks, which avoids mixed-dtype inference within a column.
df = pd.read_csv(
    csv_url,
    sep=";",
    encoding="utf-8",
    low_memory=False,
)

Exibição das 5 primeiras linhas do DataFrame através da função "head" do Pandas para verificar se o arquivo foi carregado corretamente e ter uma ideia do que ele contém.

In [47]:
# Preview the first five rows to confirm the file was parsed as expected.
df.head(n=5)

Unnamed: 0,grau,turno,mensalidade,bolsa_integral_cotas,bolsa_integral_ampla,bolsa_parcial_cotas,bolsa_parcial_ampla,curso_id,curso_busca,cidade_busca,uf_busca,cidade_filtro,universidade_nome,campus_nome,campus_id,nome,nota_integral_ampla,nota_integral_cotas,nota_parcial_ampla,nota_parcial_cotas
0,Bacharelado,Integral,9999.99,15.0,14.0,0.0,0.0,706710394154,Medicina,Campo Grande,MS,NTAwMjAwNDAyNzA0,Universidade Anhanguera - UNIDERP,CAMPO GRANDE - SEDE - Miguel Couto,706710,Medicina,740.22,726.46,0.0,0.0
1,Bacharelado,Noturno,9836.4,1.0,0.0,0.0,0.0,104191210567043,Enfermagem,Crateus,CE,MjMwNDAxODA0MTAz,Faculdade Princesa do Oeste - FPO,UNIDADE SEDE - São Vicente,1041912,Enfermagem,663.36,0.0,0.0,0.0
2,Bacharelado,Integral,9715.61,2.0,5.0,6.0,10.0,1002328574024,Medicina,Sao Paulo,SP,MzUxNTA2MTUwMzA4,Universidade Cidade de São Paulo - UNICID,UNIVERSIDADE CIDADE DE SÃO PAULO - UNICID - SE...,1002328,Medicina,739.62,738.08,738.96,718.64
3,Bacharelado,Noturno,9689.34,3.0,2.0,0.0,0.0,104191212798093,Psicologia,Crateus,CE,MjMwNDAxODA0MTAz,Faculdade Princesa do Oeste - FPO,UNIDADE SEDE - São Vicente,1041912,Psicologia,651.0,652.22,0.0,0.0
4,Bacharelado,Integral,9674.34,4.0,1.0,5.0,2.0,65899611932754,Medicina,Rio Branco,AC,MTIwMjAwNDAwNDAx,Faculdade Barão do Rio Branco - FAB,CAMPUS - RIO BRANCO - JARDIM EUROPA II - Jard...,658996,Medicina,758.32,723.94,734.92,711.26


Geramos alguns gráficos para entender a distribuição dos dados com que estamos trabalhando.

In [48]:
# Count of rows per "turno" value; update_layout returns the same figure,
# so the sizing and title can be chained onto the constructor call.
hist = px.histogram(df, x="turno", nbins=60).update_layout(
    width=600,
    height=400,
    title_text='Distribuição de turnos',
)
hist.show()

In [49]:
# Sort by state code so the bars appear in alphabetical order.
# sort_values already returns a new DataFrame (df is not modified), so the
# explicit df.copy() that used to precede it was a redundant full-frame copy.
df_uf_sort = df.sort_values(by=['uf_busca'], ascending=True)

# Count of rows per state ("uf_busca").
hist = px.histogram(df_uf_sort, x="uf_busca", nbins=60)
hist.update_layout(width=1000, height=400, title_text='Distribuição de estados')
hist.show()

Verificamos o total de linhas e colunas

In [50]:
# (number of rows, number of columns) of the loaded DataFrame.
df.shape

(41447, 20)

# Limpeza e troca de valores

Removemos as colunas desnecessárias para o aprendizado e convertemos os valores de texto em códigos numéricos usando o LabelEncoder.

In [51]:
# Text columns that are replaced by integer codes.
categorical_columns = [
    "grau",
    "turno",
    "curso_busca",
    "cidade_busca",
    "uf_busca",
    "universidade_nome",
    "campus_nome",
    "nome",
]

# drop() returns a new DataFrame, so `df` itself is left untouched.
df_new = df.drop(columns=["cidade_filtro"])

# Keep one fitted encoder per column. Refitting a single shared encoder
# (as was done before) discards every mapping except the last one, which
# makes it impossible to translate codes back with inverse_transform later.
encoders = {}
for column in categorical_columns:
    encoders[column] = LabelEncoder()
    df_new[column] = encoders[column].fit_transform(df_new[column].values)

# Backward compatibility: `le` used to end up fitted on the "nome" column.
le = encoders["nome"]

# NOTE(review): in the rows displayed below, "curso_busca" and "nome" carry
# identical codes, so one of them may be redundant -- confirm before clustering.
# NOTE(review): LabelEncoder codes impose an arbitrary numeric order on nominal
# values; distance-based clustering will treat that order as meaningful.
df_new

Unnamed: 0,grau,turno,mensalidade,bolsa_integral_cotas,bolsa_integral_ampla,bolsa_parcial_cotas,bolsa_parcial_ampla,curso_id,curso_busca,cidade_busca,uf_busca,universidade_nome,campus_nome,campus_id,nome,nota_integral_ampla,nota_integral_cotas,nota_parcial_ampla,nota_parcial_cotas
0,0,1,9999.99,15.0,14.0,0.0,0.0,706710394154,210,207,11,1230,371,706710,210,740.22,726.46,0.00,0.00
1,0,3,9836.40,1.0,0.0,0.0,0.0,104191210567043,79,314,5,588,4574,1041912,79,663.36,0.00,0.00,0.00
2,0,1,9715.61,2.0,5.0,6.0,10.0,1002328574024,210,1006,25,1242,4678,1002328,210,739.62,738.08,738.96,718.64
3,0,3,9689.34,3.0,2.0,0.0,0.0,104191212798093,248,314,5,588,4574,1041912,248,651.00,652.22,0.00,0.00
4,0,1,9674.34,4.0,1.0,5.0,2.0,65899611932754,210,881,0,254,568,658996,210,758.32,723.94,734.92,711.26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41442,1,0,149.00,1.0,0.0,0.0,0.0,994312865605,281,751,26,1251,2697,9943,281,502.36,0.00,0.00,0.00
41443,2,0,144.00,1.0,2.0,2.0,5.0,65868712869275,205,1020,25,988,4408,658687,205,533.34,450.00,450.00,450.00
41444,2,4,139.00,1.0,0.0,0.0,0.0,1056445674232,169,154,6,73,1236,1056445,169,580.76,0.00,0.00,0.00
41445,0,0,139.00,1.0,0.0,0.0,0.0,96781210935,270,1147,7,1251,4753,9678,270,548.26,0.00,0.00,0.00


# Preparação para aplicação dos algoritmos

Pegamos uma amostra aleatória de 10% do dataset.

In [52]:
# Fraction of rows kept for the clustering steps.
SAMPLE_FRACTION = 0.1
# Fixed seed so that Restart & Run All always selects the same rows; without
# it every run clustered a different subset and results were not reproducible.
SAMPLE_SEED = 42

# sample() returns a new DataFrame, so no explicit copy of df_new is needed.
df_ul = df_new.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)

df_ul

Unnamed: 0,grau,turno,mensalidade,bolsa_integral_cotas,bolsa_integral_ampla,bolsa_parcial_cotas,bolsa_parcial_ampla,curso_id,curso_busca,cidade_busca,uf_busca,universidade_nome,campus_nome,campus_id,nome,nota_integral_ampla,nota_integral_cotas,nota_parcial_ampla,nota_parcial_cotas
3880,0,3,1256.00,3.0,3.0,0.0,0.0,1049909714333,137,1108,10,586,844,1049909,137,634.86,603.36,0.00,0.0
28954,2,0,293.00,2.0,5.0,0.0,0.0,200563213636925,168,614,25,58,3106,2005632,168,485.32,450.00,0.00,0.0
35571,1,0,257.19,0.0,0.0,1.0,1.0,105131913230005,30,799,25,1276,3318,1051319,30,0.00,0.00,594.80,556.9
7879,0,3,960.00,0.0,0.0,2.0,0.0,105860113304503,88,367,4,889,4628,1058601,88,0.00,0.00,602.76,0.0
20020,1,0,410.00,0.0,0.0,1.0,1.0,107270414204465,144,1006,25,17,4684,1072704,144,0.00,0.00,490.06,450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10519,2,2,813.54,4.0,13.0,0.0,0.0,65873813494671,254,558,14,882,500,658738,254,553.48,544.52,0.00,0.0
26306,1,0,308.00,0.0,1.0,0.0,0.0,24604999405,177,138,23,58,3028,24604,177,552.30,0.00,0.00,0.0
35372,1,0,257.19,0.0,0.0,1.0,1.0,200180712667385,136,143,25,1242,2009,2001807,136,0.00,0.00,450.00,450.0
10852,0,3,798.00,0.0,2.0,0.0,0.0,657723574443,30,560,23,1271,807,657723,30,618.92,0.00,0.00,0.0


# Escalonamento

In [53]:
# Standardise every column of the sample: learn the per-column statistics
# first, then apply them to the same data.
scale = StandardScaler()
scale.fit(df_ul)
df_scale = scale.transform(df_ul)
df_scale

array([[-0.98149636,  1.2845193 ,  1.03758735, ...,  1.34755799,
        -0.81264445, -0.37905932],
       [ 1.30342496, -0.87079004, -0.53665925, ...,  0.79989829,
        -0.81264445, -0.37905932],
       [ 0.1609643 , -0.87079004, -0.59519899, ..., -0.80708441,
         1.45743606,  2.87528413],
       ...,
       [ 0.1609643 , -0.87079004, -0.59519899, ..., -0.80708441,
         0.90480045,  2.25059511],
       [-0.98149636,  1.2845193 ,  0.28888025, ..., -0.80708441,
        -0.81264445, -0.37905932],
       [ 1.30342496, -0.87079004, -0.59573845, ..., -0.80708441,
        -0.81264445, -0.37905932]])

# Dendrograma

Vamos ver uma representação gráfica em forma de árvore (dendrograma) da estrutura dos agrupamentos.

In [54]:
# Complete-linkage hierarchical clustering of the scaled sample. The linkage
# matrix is kept in its own variable so it can be reused without recomputing.
linkage_matrix = linkage(df_scale, method="complete")

fig, ax = plt.subplots(figsize=(10, 10))

# The recorded run of this cell ended in KeyboardInterrupt while the figure
# was being rendered. With one leaf per sampled row, drawing a tick label for
# every leaf is presumably the expensive (and unreadable) part, so the labels
# are skipped. The tree itself and the returned dict are unchanged.
# For a condensed view, truncate_mode="lastp" with a small p is an alternative.
df_dendrogram = dendrogram(linkage_matrix, no_labels=True, ax=ax)

ax.set_title("Dendrograma (ligação completa)")
ax.set_ylabel("Distância")
plt.show()

Error in callback <function install_repl_displayhook.<locals>.post_execute at 0x7ffb22826ef0> (for post_execute):


KeyboardInterrupt: ignored

Error in callback <function flush_figures at 0x7ffb21723d40> (for post_execute):


KeyboardInterrupt: ignored