Skip to content

Commit

Permalink
Merge branch 'master' into #22-Policy-headings
Browse files Browse the repository at this point in the history
  • Loading branch information
Jordi Planas committed Dec 22, 2020
2 parents a81d786 + efa2d71 commit a472ebe
Show file tree
Hide file tree
Showing 563 changed files with 136,578 additions and 1,060 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Custom
input/
Omdena_key_S3.json

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down Expand Up @@ -92,4 +93,7 @@ target/
.mypy_cache/
tasks/preprocess_text/notebooks/client_secret.json
tasks/extract_text/notebooks/client_secret.json
tasks/sBERT/models/uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001
tasks/data_augmentation/models/uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001
tasks/Scrapy/scrapy_official_newspapers/Scraped_Documents_local.csv

8 changes: 8 additions & 0 deletions tasks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
## Tasks

To create a new task folder, run `./create_task_folder.sh --name={FOLDER_NAME}`. The script automatically creates a folder named `{FOLDER_NAME}` containing three subdirectories (each holds a `.gitkeep` file so that the otherwise empty structure is kept in the repository):
- input
- output
- src

In addition, if you also want a `notebooks` folder, add the argument `--add_notebooks=True` or `-an=True`.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"Keywords_Legal_jargon.ipynb","provenance":[],"collapsed_sections":[]},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.6"}},"cells":[
{"cell_type":"markdown","metadata":{"id":"fiQBQ95YqmNf"},"source":["## Legal Jargon Keywords\n","\n","This notebook builds a keyword dictionary of legal jargon (official document types) for the WRI Landscape Restoration Project and saves it as a JSON file."]},
{"cell_type":"code","metadata":{"id":"cJShIdz8H3ue"},"source":["import json\n","from pathlib import Path\n","\n","# Where the keyword dictionary is written, relative to the notebook's working directory.\n","OUTPUT_PATH = Path('Data/keywords_Legal_jargon.json')"],"execution_count":null,"outputs":[]},
{"cell_type":"markdown","metadata":{},"source":["### Keyword dictionary\n","\n","Each key names a keyword category; its value is the list of terms in that category."]},
{"cell_type":"code","metadata":{"id":"nWYEtOSeqj-Y"},"source":["keywords = {\n","    'Official_documents_types': [\n","        'Acta',\n","        'Acuerdo',\n","        'Anuncio',\n","        'Aviso',\n","        'Certificado',\n","        'Circular',\n","        'Comunicado',\n","        'Concesión',\n","        'Contrato',\n","        'Convenio',\n","        'Convocatoria',\n","        'Corrección',\n","        'Declaratoria',\n","        'Decreto',\n","        'Disposiciones',\n","        'Disposición',\n","        'Edicto',\n","        'Ejecutoria',\n","        'Ejecutoriada',\n","        'Extracto',\n","        'Fe de errata',\n","        'Ley',\n","        'Licitación',\n","        'Listado',\n","        'Norma',\n","        'Nota',\n","        'Notificación',\n","        'Oficio',\n","        'Orden',\n","        'Ordenanza',\n","        'Permiso',\n","        'Programa',\n","        'Rectificación',\n","        'Resoluciones',\n","        'Resolución',\n","    ],\n","}"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"grdDjU62H3uk","outputId":"00a5300e-50be-4523-b0bf-0cab878bab4b"},"source":["print(keywords)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["{'Official_documents_types': ['Acta', 'Acuerdo', 'Anuncio', 'Aviso', 'Certificado', 'Circular', 'Comunicado', 'Concesión', 'Contrato', 'Convenio', 'Convocatoria', 'Corrección', 'Declaratoria', 'Decreto', 'Disposiciones', 'Disposición', 'Edicto', 'Ejecutoria', 'Ejecutoriada', 'Extracto', 'Fe de errata', 'Ley', 'Licitación', 'Listado', 'Norma', 'Nota', 'Notificación', 'Oficio', 'Orden', 'Ordenanza', 'Permiso', 'Programa', 'Rectificación', 'Resoluciones', 'Resolución']}\n"],"name":"stdout"}]},
{"cell_type":"markdown","metadata":{},"source":["### Save to JSON"]},
{"cell_type":"code","metadata":{"id":"hi9oKzbOqlrW"},"source":["# Create the output folder if needed so the notebook also runs on a fresh checkout.\n","OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)\n","\n","# The handle is deliberately not called `dict`: that name would rebind the built-in\n","# `dict` to a closed file object for every cell executed afterwards in this kernel.\n","# json.dump keeps its default ASCII escaping, so accented terms are written as \\uXXXX.\n","with OUTPUT_PATH.open('w', encoding='utf-8') as keywords_file:\n","    json.dump(keywords, keywords_file)"],"execution_count":null,"outputs":[]}
]}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"nbformat":4,"nbformat_minor":0,"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.6.9"},"colab":{"name":"Keywords_maker.ipynb","provenance":[]}},"cells":[
{"cell_type":"markdown","metadata":{},"source":["## Keywords maker\n","\n","Builds `+`-joined keyword strings of the form `place+topic+document type+filetype+extension` by combining three term lists (places, topic terms, official document types) with a fixed set of file types."]},
{"cell_type":"code","metadata":{},"source":["from itertools import islice, product"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{},"source":["# File types appended to every combination as `filetype+<type>`.\n","FILE_TYPES = ('pdf', 'doc', 'html')\n","\n","# Number of (place, topic, document type) combinations to expand; None expands all of them.\n","MAX_COMBINATIONS = 30\n","\n","# Appended after every keyword string. This is a raw string, i.e. a literal backslash\n","# followed by 'n', not a line break. NOTE(review): confirm that a real newline is not intended.\n","QUERY_SEPARATOR = r'\\n'"],"execution_count":null,"outputs":[]},
{"cell_type":"markdown","metadata":{},"source":["### Term lists"]},
{"cell_type":"code","metadata":{"id":"bx_JaDQUsoBQ"},"source":["geostring = \"Aguascalientes,Baja California,Baja California Sur,Campeche,Coahuila de Zaragoza,Colima,Chiapas,Chihuahua,Distrito Federal,Durango,Guanajuato,Guerrero,Hidalgo,Jalisco,Estado de México,Michoacán de Ocampo,Morelos,Nayarit,Nuevo León,Oaxaca,Puebla,Querétaro,Quintana Roo,San Luis Potosí,Sinaloa,Sonora,Tabasco,Tamaulipas,Tlaxcala,Veracruz de Ignacio de la Llave,Yucatán,Zacatecas\""],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"FxCixUNFsoBZ"},"source":["geocodes = geostring.split(sep=',')"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"cO7RMSPHsoBg","outputId":"22cbb2d9-4fb8-4039-e861-311fcbfdd8cc"},"source":["geocodes"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['Aguascalientes',\n"," 'Baja California',\n"," 'Baja California Sur',\n"," 'Campeche',\n"," 'Coahuila de Zaragoza',\n"," 'Colima',\n"," 'Chiapas',\n"," 'Chihuahua',\n"," 'Distrito Federal',\n"," 'Durango',\n"," 'Guanajuato',\n"," 'Guerrero',\n"," 'Hidalgo',\n"," 'Jalisco',\n"," 'Estado de México',\n"," 'Michoacán de Ocampo',\n"," 'Morelos',\n"," 'Nayarit',\n"," 'Nuevo León',\n"," 'Oaxaca',\n"," 'Puebla',\n"," 'Querétaro',\n"," 'Quintana Roo',\n"," 'San Luis Potosí',\n"," 'Sinaloa',\n"," 'Sonora',\n"," 'Tabasco',\n"," 'Tamaulipas',\n"," 'Tlaxcala',\n"," 'Veracruz de Ignacio de la Llave',\n"," 'Yucatán',\n"," 'Zacatecas']"]},"metadata":{"tags":[]},"execution_count":null}]},
{"cell_type":"code","metadata":{"id":"AznbB0qtsoBl"},"source":["# NOTE(review): 'Conservación mediambiental' looks like a typo for 'medioambiental';\n","# confirm before changing it, since that alters the generated keyword strings.\n","spec_strings = \"Arbóreo,Boscoso,Bosque,Especie maderable,Especie para leña,Forestación,Forestal,Leña,Madera,Maderero,Plantación,Reforestación,Silvicultura,Sobrepastoreo,Tala,Árbol,Agricultura,Agropecuaria,Biocombustibles,Cultivo,Cultivo ilícito,Ganadero,Ganado,Germoplasma,Rebaño,Regadío,Riego,Rural,Vegetal,Vivero,Desarrollo urbano,Libertad de terreno,Promoción vivienda,Recalificación terreno,Suelo urbanizable,Urbanización,Uso del suelo,Área protegida privada,Área silvestre protegida,Áreas protegidas privadas,Áreas silvestres protegidas,Ambiental,Bienes naturales,Conservación mediambiental,Corredor biológico,Deforestación,Especies amenazadas,Fauna\""],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"Dvm-OhxVsoBo"},"source":["spec_codes = spec_strings.split(sep=',')"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"vaRtoCwHsoBr","outputId":"497c3e1c-4667-4d64-ffb0-68f08df6e7ee"},"source":["spec_codes"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['Arbóreo',\n"," 'Boscoso',\n"," 'Bosque',\n"," 'Especie maderable',\n"," 'Especie para leña',\n"," 'Forestación',\n"," 'Forestal',\n"," 'Leña',\n"," 'Madera',\n"," 'Maderero',\n"," 'Plantación',\n"," 'Reforestación',\n"," 'Silvicultura',\n"," 'Sobrepastoreo',\n"," 'Tala',\n"," 'Árbol',\n"," 'Agricultura',\n"," 'Agropecuaria',\n"," 'Biocombustibles',\n"," 'Cultivo',\n"," 'Cultivo ilícito',\n"," 'Ganadero',\n"," 'Ganado',\n"," 'Germoplasma',\n"," 'Rebaño',\n"," 'Regadío',\n"," 'Riego',\n"," 'Rural',\n"," 'Vegetal',\n"," 'Vivero',\n"," 'Desarrollo urbano',\n"," 'Libertad de terreno',\n"," 'Promoción vivienda',\n"," 'Recalificación terreno',\n"," 'Suelo urbanizable',\n"," 'Urbanización',\n"," 'Uso del suelo',\n"," 'Área protegida privada',\n"," 'Área silvestre protegida',\n"," 'Áreas protegidas privadas',\n"," 'Áreas silvestres protegidas',\n"," 'Ambiental',\n"," 'Bienes naturales',\n"," 'Conservación mediambiental',\n"," 'Corredor biológico',\n"," 'Deforestación',\n"," 'Especies amenazadas',\n"," 'Fauna']"]},"metadata":{"tags":[]},"execution_count":null}]},
{"cell_type":"code","metadata":{"id":"UjuShYDNsoBv"},"source":["law_strings = \"Acta,Acuerdo,Anuncio,Aviso,Circular,Comunicado,Concesión,Contrato,Convenio,Convocatoria,Declaratoria,Decreto,Disposiciones,Disposición,Ejecutoria,Fe de errata,Ley,Licitación,Norma,Nota,Notificación,Oficio,Orden,Ordenanza,Programa,Resolución\""],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"U4_RI2n3soBy"},"source":["law_codes = law_strings.split(sep=',')"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"a54Jxfi4soB4","outputId":"f7e9944c-0b37-46ed-d226-2b08ce58e756"},"source":["law_codes"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['Acta',\n"," 'Acuerdo',\n"," 'Anuncio',\n"," 'Aviso',\n"," 'Circular',\n"," 'Comunicado',\n"," 'Concesión',\n"," 'Contrato',\n"," 'Convenio',\n"," 'Convocatoria',\n"," 'Declaratoria',\n"," 'Decreto',\n"," 'Disposiciones',\n"," 'Disposición',\n"," 'Ejecutoria',\n"," 'Fe de errata',\n"," 'Ley',\n"," 'Licitación',\n"," 'Norma',\n"," 'Nota',\n"," 'Notificación',\n"," 'Oficio',\n"," 'Orden',\n"," 'Ordenanza',\n"," 'Programa',\n"," 'Resolución']"]},"metadata":{"tags":[]},"execution_count":null}]},
{"cell_type":"markdown","metadata":{},"source":["### Size of the full combination space\n","\n","Number of places, topic terms and document types, followed by the number of possible (place, topic, document type) combinations."]},
{"cell_type":"code","metadata":{"id":"M1W2f2vvsoCP","outputId":"7e6156fc-6407-4bd9-9134-1aa151471522"},"source":["n_total = len(geocodes) * len(spec_codes) * len(law_codes)\n","len(geocodes), len(spec_codes), len(law_codes), n_total"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(32, 48, 26, 39936)"]},"metadata":{"tags":[]},"execution_count":null}]},
{"cell_type":"markdown","metadata":{},"source":["### Build the keyword strings\n","\n","Only the first `MAX_COMBINATIONS` combinations are expanded, each once per entry of `FILE_TYPES`."]},
{"cell_type":"code","metadata":{"id":"u5ScbWvlsoCY"},"source":["def build_queries(places, topics, doc_types, file_types=FILE_TYPES, limit=MAX_COMBINATIONS):\n","    \"\"\"Return '+'-joined keyword strings, one per file type for each combination.\n","\n","    Combinations are taken in nested-loop order (places outermost, doc_types\n","    innermost). Only the first `limit` combinations are used; pass None to use\n","    all of them. Iteration stops as soon as the limit is reached.\n","    \"\"\"\n","    combinations = islice(product(places, topics, doc_types), limit)\n","    return [\n","        '+'.join((place, topic, doc_type, 'filetype', file_type))\n","        for place, topic, doc_type in combinations\n","        for file_type in file_types\n","    ]"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{},"source":["queries = build_queries(geocodes, spec_codes, law_codes)\n","\n","# Every keyword string, including the last one, is followed by QUERY_SEPARATOR.\n","keywords_string = ''.join(query + QUERY_SEPARATOR for query in queries)"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"rpd_7muBsoCc","outputId":"c88bb88e-03b4-4fc9-b5cc-bd7a5a22baa8"},"source":["print(keywords_string)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Aguascalientes+Arbóreo+Acta+filetype+pdf\\nAguascalientes+Arbóreo+Acta+filetype+doc\\nAguascalientes+Arbóreo+Acta+filetype+html\\nAguascalientes+Arbóreo+Acuerdo+filetype+pdf\\nAguascalientes+Arbóreo+Acuerdo+filetype+doc\\nAguascalientes+Arbóreo+Acuerdo+filetype+html\\nAguascalientes+Arbóreo+Anuncio+filetype+pdf\\nAguascalientes+Arbóreo+Anuncio+filetype+doc\\nAguascalientes+Arbóreo+Anuncio+filetype+html\\nAguascalientes+Arbóreo+Aviso+filetype+pdf\\nAguascalientes+Arbóreo+Aviso+filetype+doc\\nAguascalientes+Arbóreo+Aviso+filetype+html\\nAguascalientes+Arbóreo+Circular+filetype+pdf\\nAguascalientes+Arbóreo+Circular+filetype+doc\\nAguascalientes+Arbóreo+Circular+filetype+html\\nAguascalientes+Arbóreo+Comunicado+filetype+pdf\\nAguascalientes+Arbóreo+Comunicado+filetype+doc\\nAguascalientes+Arbóreo+Comunicado+filetype+html\\nAguascalientes+Arbóreo+Concesión+filetype+pdf\\nAguascalientes+Arbóreo+Concesión+filetype+doc\\nAguascalientes+Arbóreo+Concesión+filetype+html\\nAguascalientes+Arbóreo+Contrato+filetype+pdf\\nAguascalientes+Arbóreo+Contrato+filetype+doc\\nAguascalientes+Arbóreo+Contrato+filetype+html\\nAguascalientes+Arbóreo+Convenio+filetype+pdf\\nAguascalientes+Arbóreo+Convenio+filetype+doc\\nAguascalientes+Arbóreo+Convenio+filetype+html\\nAguascalientes+Arbóreo+Convocatoria+filetype+pdf\\nAguascalientes+Arbóreo+Convocatoria+filetype+doc\\nAguascalientes+Arbóreo+Convocatoria+filetype+html\\nAguascalientes+Arbóreo+Declaratoria+filetype+pdf\\nAguascalientes+Arbóreo+Declaratoria+filetype+doc\\nAguascalientes+Arbóreo+Declaratoria+filetype+html\\nAguascalientes+Arbóreo+Decreto+filetype+pdf\\nAguascalientes+Arbóreo+Decreto+filetype+doc\\nAguascalientes+Arbóreo+Decreto+filetype+html\\nAguascalientes+Arbóreo+Disposiciones+filetype+pdf\\nAguascalientes+Arbóreo+Disposiciones+filetype+doc\\nAguascalientes+Arbóreo+Disposiciones+filetype+html\\nAguascalientes+Arbóreo+Disposición+filetype+pdf\\nAguascalientes+Arbóreo+Disposición+filetype+doc\\nAguascalientes+Arbóreo+Disposición+filetype+html\\nAguascalientes+Arbóreo+Ejecutoria+filetype+pdf\\nAguascalientes+Arbóreo+Ejecutoria+filetype+doc\\nAguascalientes+Arbóreo+Ejecutoria+filetype+html\\nAguascalientes+Arbóreo+Fe de errata+filetype+pdf\\nAguascalientes+Arbóreo+Fe de errata+filetype+doc\\nAguascalientes+Arbóreo+Fe de errata+filetype+html\\nAguascalientes+Arbóreo+Ley+filetype+pdf\\nAguascalientes+Arbóreo+Ley+filetype+doc\\nAguascalientes+Arbóreo+Ley+filetype+html\\nAguascalientes+Arbóreo+Licitación+filetype+pdf\\nAguascalientes+Arbóreo+Licitación+filetype+doc\\nAguascalientes+Arbóreo+Licitación+filetype+html\\nAguascalientes+Arbóreo+Norma+filetype+pdf\\nAguascalientes+Arbóreo+Norma+filetype+doc\\nAguascalientes+Arbóreo+Norma+filetype+html\\nAguascalientes+Arbóreo+Nota+filetype+pdf\\nAguascalientes+Arbóreo+Nota+filetype+doc\\nAguascalientes+Arbóreo+Nota+filetype+html\\nAguascalientes+Arbóreo+Notificación+filetype+pdf\\nAguascalientes+Arbóreo+Notificación+filetype+doc\\nAguascalientes+Arbóreo+Notificación+filetype+html\\nAguascalientes+Arbóreo+Oficio+filetype+pdf\\nAguascalientes+Arbóreo+Oficio+filetype+doc\\nAguascalientes+Arbóreo+Oficio+filetype+html\\nAguascalientes+Arbóreo+Orden+filetype+pdf\\nAguascalientes+Arbóreo+Orden+filetype+doc\\nAguascalientes+Arbóreo+Orden+filetype+html\\nAguascalientes+Arbóreo+Ordenanza+filetype+pdf\\nAguascalientes+Arbóreo+Ordenanza+filetype+doc\\nAguascalientes+Arbóreo+Ordenanza+filetype+html\\nAguascalientes+Arbóreo+Programa+filetype+pdf\\nAguascalientes+Arbóreo+Programa+filetype+doc\\nAguascalientes+Arbóreo+Programa+filetype+html\\nAguascalientes+Arbóreo+Resolución+filetype+pdf\\nAguascalientes+Arbóreo+Resolución+filetype+doc\\nAguascalientes+Arbóreo+Resolución+filetype+html\\nAguascalientes+Boscoso+Acta+filetype+pdf\\nAguascalientes+Boscoso+Acta+filetype+doc\\nAguascalientes+Boscoso+Acta+filetype+html\\nAguascalientes+Boscoso+Acuerdo+filetype+pdf\\nAguascalientes+Boscoso+Acuerdo+filetype+doc\\nAguascalientes+Boscoso+Acuerdo+filetype+html\\nAguascalientes+Boscoso+Anuncio+filetype+pdf\\nAguascalientes+Boscoso+Anuncio+filetype+doc\\nAguascalientes+Boscoso+Anuncio+filetype+html\\nAguascalientes+Boscoso+Aviso+filetype+pdf\\nAguascalientes+Boscoso+Aviso+filetype+doc\\nAguascalientes+Boscoso+Aviso+filetype+html\\n\n"],"name":"stdout"}]}
]}
Loading

0 comments on commit a472ebe

Please sign in to comment.