# 1. Imports, Settings

In [21]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [22]:
# Notebook-wide plotting defaults.
# Default matplotlib figure size, given as [width, height].
plt.rcParams["figure.figsize"] = [10, 6]
# Ask the inline backend for 'retina' figure output (higher-resolution inline images).
%config InlineBackend.figure_format = 'retina'

# 2. Load data

In [23]:
# Load the pipe-delimited document catalogue; the extracted text of each
# document lives in the 'Text' column (null when no text is available).
df = pd.read_csv("data/alertas.csv", sep="|")

# Report the overall size and how many rows have / lack text, counting
# directly on the null mask instead of filtering the frame each time.
print("Total Number of documents:", len(df))
print("Number of documents with no accesible text:", int(df["Text"].isnull().sum()))
print("Number of documents with accesible text:", int(df["Text"].notnull().sum()))

Total Number of documents: 1753
Number of documents with no accesible text: 5
Number of documents with accesible text: 1748


In [24]:
# Preview the first rows of the loaded catalogue to check the columns.
df.head()

Unnamed: 0,Filename,Text,Subtype,Type,Year,Path,Departamento
0,"AT N° 003-18 NAR-Cumbitara, Maguí Payán, Polic...",Defensoria \ndel Pueblo \nCOLOMB IA \n \nBog...,Alerta Temprana,Advertencia,2018,data\Advertencia_PDF\AT 2018\AT N° 003-18 NAR-...,Nariño
1,AT N° 004-18 NAR-Tumaco.pdf,Defensoría \ndel Pueblo \nCO LO Mllt \nCarre...,Alerta Temprana,Advertencia,2018,data\Advertencia_PDF\AT 2018\AT N° 004-18 NAR-...,Nariño
2,AT N° 005-18 COR-Tierralta.pdf,Carrera 9 16 21 Bogotá DC \nPBX 57 1 3147300...,Alerta Temprana,Advertencia,2018,data\Advertencia_PDF\AT 2018\AT N° 005-18 COR-...,Córdoba
3,AT N° 006-18 ARA-Saravena.pdf,Defensoría \ndel Pueblo \nCarrera 9 1621 Bo...,Alerta Temprana,Advertencia,2018,data\Advertencia_PDF\AT 2018\AT N° 006-18 ARA-...,Arauca
4,"AT N° 007-18 MET-Puerto Lleras, Puerto Rico y ...",San Vicente Bajo \nl \nMargen \nIzquierda ...,Alerta Temprana,Advertencia,2018,data\Advertencia_PDF\AT 2018\AT N° 007-18 MET-...,Meta


# 3. Number of Documents by Type, Subtype, Year 

In [25]:
# Grouping levels shared by both per-group tallies.
group_keys = ["Type", "Subtype", "Year"]
# True for rows whose 'Text' value is present (non-null).
has_text = df["Text"].notnull()

# Number of documents per (Type, Subtype, Year), split by text availability.
at = df[has_text].groupby(group_keys)["Filename"].count().to_frame("TxtAvailable")
nat = df[~has_text].groupby(group_keys)["Filename"].count().to_frame("NoTxtAvailable")

# Align both tallies on the group index. A group missing from one side shows up
# as NaN and is replaced by 0 (this is why the filled counts display as floats).
dfr = pd.concat([at, nat], axis=1).fillna(0)
dfr["Total"] = dfr["TxtAvailable"] + dfr["NoTxtAvailable"]
# Percentage of documents in the group that have no text, rounded to 2 decimals.
dfr["%"] = (dfr["NoTxtAvailable"] * 100 / dfr["Total"]).round(2)

# Display only the groups that contain at least one document without text.
dfr[dfr["%"] > 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,TxtAvailable,NoTxtAvailable,Total,%
Type,Subtype,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Advertencia,Informe de Riesgo,2003,82,2.0,84.0,2.38
Advertencia,Informe de Riesgo,2007,36,2.0,38.0,5.26
Advertencia,Informe de Riesgo,2008,31,1.0,32.0,3.12


# 4. Documents with no text available

In [26]:
# Full rows of the documents whose 'Text' value is missing.
df.loc[df["Text"].isna()]

Unnamed: 0,Filename,Text,Subtype,Type,Year,Path,Departamento
475,IR N° 079-03 ATLANTICO-Barranquilla.pdf,,Informe de Riesgo,Advertencia,2003,data\Advertencia_PDF\IR 2003PDF\IR N° 079-03 A...,Atlántico
476,IR N° 080-03 ATLANTICO-Barranquilla.pdf,,Informe de Riesgo,Advertencia,2003,data\Advertencia_PDF\IR 2003PDF\IR N° 080-03 A...,Atlántico
711,IR N° 024-07 META-Puerto López.pdf,,Informe de Riesgo,Advertencia,2007,data\Advertencia_PDF\IR 2007PDF\IR N° 024-07 M...,Meta
714,IR N° 027-07 A.I. META-Mapiripan y Puerto Conc...,,Informe de Riesgo,Advertencia,2007,data\Advertencia_PDF\IR 2007PDF\IR N° 027-07 A...,Meta
756,IR N° 031-08 BOLIVAR-Norosí - Inminencia.pdf,,Informe de Riesgo,Advertencia,2008,data\Advertencia_PDF\IR 2008PDF\IR N° 031-08 B...,Norte de Santander


In [27]:
# Untruncated filenames of the documents whose 'Text' value is missing.
df.loc[df["Text"].isnull(), "Filename"].tolist()

['IR N° 079-03 ATLANTICO-Barranquilla.pdf',
 'IR N° 080-03 ATLANTICO-Barranquilla.pdf',
 'IR N° 024-07 META-Puerto López.pdf',
 'IR N° 027-07 A.I. META-Mapiripan y Puerto Concordia GUAVIARE-San José del Guaviare.pdf',
 'IR N° 031-08 BOLIVAR-Norosí - Inminencia.pdf']

# 7. Export HTML

In [29]:
# Shell escape: convert the notebook file 2_Null_Documents.ipynb to HTML with nbconvert.
# NOTE(review): presumably this is the notebook itself — confirm it is saved before exporting.
!jupyter nbconvert --to html 2_Null_Documents.ipynb

[NbConvertApp] Converting notebook 2_Null_Documents.ipynb to html
[NbConvertApp] Writing 596079 bytes to 2_Null_Documents.html
