In [None]:
import pandas as pd
import geopandas as gpd
import gmaps
import gmaps.datasets
from IPython.display import display
import os
import re
import random
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# Render matplotlib figures at 300 dpi.
mpl.rcParams['figure.dpi'] = 300
# Root folder of the Argentina datasets; the paths used below are built from it.
DATA_DIR = os.path.join('data', 'argentina')
# The Google Maps key is read from the environment so that it is never
# committed together with the notebook. Export GOOGLE_MAPS_API_KEY before
# starting Jupyter.
# NOTE(review): a key was previously embedded in this cell; it should be
# revoked/rotated in the Google Cloud console since it lives in the history.
GMAPS_API_KEY = os.environ.get('GOOGLE_MAPS_API_KEY')
if not GMAPS_API_KEY:
    raise RuntimeError(
        'Set the GOOGLE_MAPS_API_KEY environment variable before running this notebook.'
    )
gmaps.configure(api_key=GMAPS_API_KEY)

In [None]:
# Import plotly and cufflinks and run them in offline mode.
import cufflinks as cf
import plotly.offline
cf.go_offline()
# Persist the same choice in the cufflinks config file: the cell previously
# stored offline=False right after calling go_offline(), which contradicted
# the offline mode requested above.
cf.set_config_file(offline=True, world_readable=True)

# Get all databases

In [None]:
# Import the project-local module and call its store_all() entry point.
# NOTE(review): presumably this builds/stores the data files that the cells
# below read — confirm in generate_databases, its body is not visible here.
import generate_databases
generate_databases.store_all()

# pxdpto geo test

In [None]:
from utils.utils import normalize_dpto_name, validate_dpto_indexes

# Shapefile stored under the `indec` folder of DATA_DIR.
PXLOC = os.path.join(DATA_DIR, 'indec', 'pxdptodatosok.shp')
geodata = gpd.read_file(PXLOC, encoding='utf-8')
# Pass every department name through the project helper and turn `link`
# into Python integers.
geodata['departamen'] = list(map(normalize_dpto_name, geodata['departamen']))
geodata['link'] = list(map(int, geodata['link']))
geodata

In [None]:
# Per-department mean of the numeric columns for the rows whose province is
# 'Santa Fe'. numeric_only=True keeps the geometry and text columns out of the
# aggregation; without it pandas >= 2.0 raises instead of silently dropping them.
geodata[geodata['provincia'] == 'Santa Fe'].groupby('departamen').mean(numeric_only=True)

In [None]:
# Show the geometry stored at index 1 of the `geometry` column.
geodata['geometry'][1]

In [None]:
# Print the centroid of that same geometry.
print(geodata['geometry'][1].centroid)

In [None]:
# Heat map with one point per geodata row, placed at the centroid of its
# geometry and weighted by the `hogares` column.
centroides = [dpto.centroid for dpto in geodata['geometry']]
latitudes = [c.y for c in centroides]
longitudes = [c.x for c in centroides]
puntos = [[lat, lon] for lat, lon in zip(latitudes, longitudes)]
pesos = geodata['hogares']

# The largest weight is used as the intensity ceiling of the layer.
capa_calor = gmaps.heatmap_layer(
    puntos,
    weights=pesos,
    max_intensity=float(max(pesos)),
    point_radius=10.0,
)
m = gmaps.Map()
m.add_layer(capa_calor)
m

# Densidad

In [None]:
# GeoJSON stored under the `datosgobar-densidad-poblacion` folder of DATA_DIR.
DENS_FILE = os.path.join(DATA_DIR, 'datosgobar-densidad-poblacion', 'pais.geojson')
db_densidad = gpd.read_file(DENS_FILE, encoding='utf-8')
db_densidad['area'] = [float(a) for a in db_densidad['area']]
# Falsy entries (None / empty string) are kept untouched; everything else
# becomes a float.
db_densidad['poblacion'] = [float(a) if a else a for a in db_densidad['poblacion']]
# Parse the household count through float() instead of the former regex
# r'(\d+).0+': its unescaped dot matched any character, so a value such as
# '1000' was rewritten to '10' before the int() conversion. Falsy entries
# still map to 0.
db_densidad['hogares'] = [int(float(x)) if x else 0 for x in db_densidad['hogares']]
db_densidad

In [None]:
db_densidad['poblacion'].iplot(kind="histogram", bins=100, theme="white", title="Histograma de poblacion de secciones del pais", xTitle='Poblacion', yTitle='Cant.')

In [None]:
f"population: {sum(map(lambda s: float(s) if s else 0.0, db_densidad['poblacion']))}"

In [None]:
_ = list(map(display, random.choices(db_densidad['geometry'], k=3)))

In [None]:
db_densidad["area"].iplot(kind="histogram", bins=10000, theme="white", title="Histograma de areas de secciones del pais", xTitle='Area', yTitle='Cant.')

In [None]:
db_densidad["densidad"].iplot(kind="histogram", bins=10000, theme="white", title="Histograma de densidades de secciones del pais", xTitle='Densidad', yTitle='Cant.')

In [None]:
diff=(set(map(int, db_densidad['dpto_id']))^set(map(int, geodata['link'])))
diff

In [None]:
print("Missing deparments in density")
geodata.loc[geodata['link'].isin(diff)][['link', 'departamen', 'provincia']]

In [None]:
ax = db_densidad[db_densidad['dpto_id']=='82084'].plot(column='densidad', cmap='hot')
ax.set_title('Densidad de secciones de Rosario')
ax.set_xlim(-60.7, -60.6)
ax.set_ylim(-33.0, -32.9)

In [None]:
ax = db_densidad[db_densidad['prov_id']=='82'].plot(column='densidad', cmap='hot')
ax.set_title('Densidad de secciones de Santa Fe')

In [None]:
# Heat map with one point per db_densidad row, placed at the centroid of its
# geometry and weighted by the `hogares` column.
centroides = [seccion.centroid for seccion in db_densidad['geometry']]
puntos = [[c.y, c.x] for c in centroides]
pesos = db_densidad['hogares']

# The largest weight is used as the intensity ceiling of the layer.
capa_calor = gmaps.heatmap_layer(
    puntos,
    weights=pesos,
    max_intensity=float(max(pesos)),
    point_radius=5.0,
)
m = gmaps.Map()
m.add_layer(capa_calor)
m

# Ministerio de Educación database

In [None]:
# Do not truncate the column list when a frame is displayed.
pd.set_option('display.max_columns', None)
SCHOOL_HDF = os.path.join(DATA_DIR, 'ministerio-educacion', 'matricula_y_secciones.hdf')
# Load the table and replace every "Ciudad de Buenos Aires" value with the
# longer spelling.
schooldb = pd.read_hdf(SCHOOL_HDF, 'matricula_y_secciones').replace(
    "Ciudad de Buenos Aires", "Ciudad Autónoma de Buenos Aires"
)
schooldb

In [None]:
count_cols = list(filter(lambda s: s.startswith('Alumnos con Sobreedad') or s.startswith('Repitentes') or s.startswith('Matrícula.'), schooldb.columns))

In [None]:
schooldb['total_alumnos'] = schooldb.loc[:,count_cols].sum(axis=1)

In [None]:
schooldb = schooldb[['Provincia', 'Ámbito', 'total_alumnos']].groupby(['Provincia', 'Ámbito']).mean().reset_index()

In [None]:
PXLOCDPTO = os.path.join(DATA_DIR, 'indec', 'pxdptodatosok.shp')
geodata = gpd.read_file(PXLOCDPTO, encoding='utf-8')
geodata['link'] = [int(n) for n in geodata['link']]
geodata

In [None]:
schooldb = pd.merge(geodata[['link', 'provincia']], schooldb, left_on='provincia', right_on='Provincia')

In [None]:
schooldb = schooldb.pivot_table('total_alumnos', ['link'], 'Ámbito').reset_index()[['link', 'Rural', 'Urbano']]
schooldb

In [None]:
schooldb.rename(columns={'link':'area', 'Rural': 'Alumnos rural', 'Urbano': 'Alumnos urbano'}, inplace=True)
schooldb

In [None]:
schooldb[schooldb['Ámbito'] == 'Rural']["total_alumnos"].iplot(kind="histogram", bins=1000, theme="white", title="Histograma de cantidad de alumnos por escuela", xTitle='Cantidad de alumnos', yTitle='Cant.')

In [None]:
schooldb[schooldb['Ámbito'] == 'Urbano']["total_alumnos"].iplot(kind="histogram", bins=1000, theme="white", title="Histograma de cantidad de alumnos por escuela", xTitle='Cantidad de alumnos', yTitle='Cant.')

In [None]:
schooldb[(schooldb['Ámbito'] == 'Urbano') & (schooldb['Provincia'] == 'Buenos Aires')]["total_alumnos"].iplot(kind="histogram", bins=1000, theme="white", title="Histograma de cantidad de alumnos por escuela", xTitle='Cantidad de alumnos', yTitle='Cant.')

In [None]:
schooldb['total_alumnos'].mean()

In [None]:
grouped = schooldb[['Provincia', 'Ámbito', 'total_alumnos']].groupby(['Provincia', 'Ámbito']).mean()
grouped

In [None]:
pd.options.display.max_columns = None
SCHOOL_HDF = os.path.join(DATA_DIR, 'ministerio-educacion', 'matricula_por_edad.hdf')
schooldb = pd.read_hdf(SCHOOL_HDF, 'matricula_por_edad')
schooldb

In [None]:
count_cols = list(filter(lambda s: 'años' in s, schooldb.columns))
count_cols

In [None]:
schooldb['total_alumnos'] = schooldb[count_cols].sum(axis=1)

In [None]:
# Mean `total_alumnos` per (Provincia, Ámbito), returned as a flat frame with
# exactly those three columns. The column is selected before aggregating:
# averaging every column first did needless work and, on pandas >= 2.0,
# raises as soon as the frame contains a non-numeric column.
class_size = (
    schooldb
    .groupby(['Provincia', 'Ámbito'])['total_alumnos']
    .mean()
    .reset_index()
)

In [None]:
# Histogram of `total_alumnos` over every row of schooldb.
schooldb['total_alumnos'].iplot(
    kind="histogram",
    bins=1000,
    theme="white",
    title="Histograma de cantidad de alumnos por escuela",
    xTitle='Cantidad de alumnos',
    yTitle='Cant.',
)

In [None]:
# Column totals per Ámbito. numeric_only=True restricts the sum to numeric
# columns; on pandas >= 2.0 the default also "sums" text columns by
# concatenating their strings.
# NOTE(review): assumes the age columns are numeric — confirm against the HDF.
group_id = schooldb.groupby(['Ámbito']).sum(numeric_only=True)
# Keep the Ámbito rows with a non-zero 'De 20 a 24 años' total.
group_id[group_id['De 20 a 24 años'] != 0]

In [None]:
# Age-bracket column names grouped into lists.
# NOTE(review): `cols` used to contain empty slots (`[, , ,, , ...]`), which is
# a SyntaxError and prevented the cell from running; only the two names that
# were still present are kept. Confirm whether other names belong here.
cols = ['25 años y más', '30 años y más']
jardin = ['0 años', '1 año', '2 años', '3 años', '4 años', '5 años', '6 años y más']
primaria = ['6 años', '7 años', '8 años', '9 años', '10 años', '11 años']
secundaria = ['11 años y menos', '12 años', '13 años', '14 años', '15 años', '16 años', '17 años', '18 años y más']
empa = ['Menos de 18 años', '19 años', '20 años', '21 años', '22 años', '23 años', '24 años', 'De 20 a 24 años', 'De 25 a 29 años']

In [None]:
# Column totals per (ID, Provincia, Ámbito). numeric_only=True restricts the
# sum to numeric columns; on pandas >= 2.0 the default also "sums" text
# columns by concatenating their strings.
grouped = schooldb.groupby(['ID', 'Provincia', 'Ámbito']).sum(numeric_only=True)

# Fake population generator

In [None]:
FAKE_POP_HDF = os.path.join(DATA_DIR, 'fake_population.hdf')

In [None]:
fake_pop = pd.read_hdf(FAKE_POP_HDF, 'population')
fake_pop