In [1]:
import os, sys
sys.path.insert(0, os.path.abspath(".."))

In [2]:
import pandas as pd
import boto3
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
sns.set_style("whitegrid")

## Importamos la información

In [4]:
session = boto3.session.Session()
s3client = session.client('s3')

In [5]:
response = s3client.get_object(Bucket='inai-summerofdata', Key='mlpreproc/pickles/df.pkl')

body = response['Body'].read()
df = pickle.loads(body)

In [6]:
df.head()

Unnamed: 0,folio,fechasolicitud,dependencia,estatus,medioentrada,tiposolicitud,descripcionsolicitud,otrosdatos,archivoadjuntosolicitud,medioentrega,...,sector,tipo_archivo_respuesta,cantidad_archivos_respuesta,texto_respuesta_adjunto,respuestareal,calidad_respuesta,calidad_respuesta_real,solicitud_longitud,descripcionsolicitud_lemma,codigo_calidad_respuesta_real
0,1857500001912,2012-01-01 05:04:07,pemex exploracion y produccion,terminada,electronica,informacion publica,copia de los documentos y los con que cuenta a...,,,entrega por internet en el infomex,...,energia,pdf,1.0,pemex exploracion y produccion oficio fecha 02...,entrega de informacion en medio electronico,satisfactoria,satisfactoria,768,copiar documento comportamiento historico tipi...,0
1,1857500002012,2012-01-01 05:13:16,pemex exploracion y produccion,terminada,electronica,informacion publica,del informacion y usados en la de flujo de efe...,,,entrega por internet en el infomex,...,energia,pdf,1.0,pemex exploracion y produccion oficio fecha 01...,entrega de informacion en medio electronico,satisfactoria,satisfactoria,548,informacion usar flujo efectivo pozo tipico ag...,0
2,1800000712,2012-01-01 09:46:39,secretaria de energia,terminada,electronica,informacion publica,se me proporcione en magnetico y por este medi...,,,entrega por internet en el infomex,...,energia,zip,2.0,,inexistencia de la informacion solicitada,no respondida,no respondida,269,proporcionar magnetico relacion ocupar amparar...,1
3,600003712,2012-01-01 10:11:41,secretaria de hacienda y credito publico,terminada,electronica,informacion publica,se me proporcione en magnetico y por este medi...,,,entrega por internet en el infomex,...,hacienda y credito publico,pdf,1.0,inidos secretaria de hacienda y credito public...,la informacion esta disponible publicamente,satisfactoria,satisfactoria,269,proporcionar magnetico relacion ocupar amparar...,0
4,1500002812,2012-01-01 10:32:15,secretaria de desarrollo agrario territorial y...,terminada,electronica,informacion publica,se me proporcione en magnetico y por este medi...,,,entrega por internet en el infomex,...,reforma agraria,zip,4.0,,sin respuesta,satisfactoria,no respondida,398,proporcionar magnetico relacion ocupar amparar...,1


## Implementación LDA

Quitamos palabras identificadas en el primer analisis LDA validadas con SocialTIC.

In [7]:
banned = ['federativo', 'materia', 'activar', 'cargar', 'transparencia', 'poblacion', 'favor', 
          'directo', 'fecho', 'registrar', 'emplear', 'campar', 'atencion',
          'pagar', 'reportar', 'publicos', 'encontrar', 'entidad',
          'acceso', 'derecho', 'concepto', 'recurso', 'beneficiario', 'esperar', 'expedientar', 'basar',
          'documento', 'presentar', 'clavar', 'personal', 'solucion', 'adjuntar', 'adjunto',
          'respuesta','solicitud','solicitar', 'solicito', 'solicitamos', 'solicite', 'informacion', 'mexico', 
          'titular', 'copiar', 'copie', 'copiamos', 'dependencia', 'contratar', 
          'contrato', 'servicio', 'proyectar', 'proyectamos', 'proyecte', 'proyeccion', 
          'nacional', 'administrativo', 'correo', 'programar', 'programa',
          'programacion', 'personar', 'edad', 'unidad', 'circuito']

In [8]:
f = lambda x: ' '.join([item for item in x.split() if item not in banned])
df['descripcionsolicitud_lemma'] = df['descripcionsolicitud_lemma'].apply(f)

In [9]:
df['solicitud_lemma_longitud'] = df['descripcionsolicitud_lemma'].str.len()

In [10]:
df.head()

Unnamed: 0,folio,fechasolicitud,dependencia,estatus,medioentrada,tiposolicitud,descripcionsolicitud,otrosdatos,archivoadjuntosolicitud,medioentrega,...,tipo_archivo_respuesta,cantidad_archivos_respuesta,texto_respuesta_adjunto,respuestareal,calidad_respuesta,calidad_respuesta_real,solicitud_longitud,descripcionsolicitud_lemma,codigo_calidad_respuesta_real,solicitud_lemma_longitud
0,1857500001912,2012-01-01 05:04:07,pemex exploracion y produccion,terminada,electronica,informacion publica,copia de los documentos y los con que cuenta a...,,,entrega por internet en el infomex,...,pdf,1.0,pemex exploracion y produccion oficio fecha 02...,entrega de informacion en medio electronico,satisfactoria,satisfactoria,768,comportamiento historico tipicos comprender es...,0,424
1,1857500002012,2012-01-01 05:13:16,pemex exploracion y produccion,terminada,electronica,informacion publica,del informacion y usados en la de flujo de efe...,,,entrega por internet en el infomex,...,pdf,1.0,pemex exploracion y produccion oficio fecha 01...,entrega de informacion en medio electronico,satisfactoria,satisfactoria,548,usar flujo efectivo pozo tipico aguar fria ela...,0,289
2,1800000712,2012-01-01 09:46:39,secretaria de energia,terminada,electronica,informacion publica,se me proporcione en magnetico y por este medi...,,,entrega por internet en el infomex,...,zip,2.0,,inexistencia de la informacion solicitada,no respondida,no respondida,269,proporcionar magnetico relacion ocupar amparar...,1,143
3,600003712,2012-01-01 10:11:41,secretaria de hacienda y credito publico,terminada,electronica,informacion publica,se me proporcione en magnetico y por este medi...,,,entrega por internet en el infomex,...,pdf,1.0,inidos secretaria de hacienda y credito public...,la informacion esta disponible publicamente,satisfactoria,satisfactoria,269,proporcionar magnetico relacion ocupar amparar...,0,143
4,1500002812,2012-01-01 10:32:15,secretaria de desarrollo agrario territorial y...,terminada,electronica,informacion publica,se me proporcione en magnetico y por este medi...,,,entrega por internet en el infomex,...,zip,4.0,,sin respuesta,satisfactoria,no respondida,398,proporcionar magnetico relacion ocupar amparar...,1,188


In [11]:
df_new = df[df['descripcionsolicitud_lemma'].map(len) > 15]

In [12]:
df_new['solicitud_lemma_longitud'].describe()

count    1.142011e+06
mean     2.324750e+02
std      2.612111e+02
min      1.600000e+01
25%      8.100000e+01
50%      1.450000e+02
75%      2.800000e+02
max      3.681000e+03
Name: solicitud_lemma_longitud, dtype: float64

In [13]:
df_new['anosolicitud'] = pd.DatetimeIndex(df_new['fechasolicitud']).year

In [14]:
df_new.head()

Unnamed: 0,folio,fechasolicitud,dependencia,estatus,medioentrada,tiposolicitud,descripcionsolicitud,otrosdatos,archivoadjuntosolicitud,medioentrega,...,cantidad_archivos_respuesta,texto_respuesta_adjunto,respuestareal,calidad_respuesta,calidad_respuesta_real,solicitud_longitud,descripcionsolicitud_lemma,codigo_calidad_respuesta_real,solicitud_lemma_longitud,anosolicitud
0,1857500001912,2012-01-01 05:04:07,pemex exploracion y produccion,terminada,electronica,informacion publica,copia de los documentos y los con que cuenta a...,,,entrega por internet en el infomex,...,1.0,pemex exploracion y produccion oficio fecha 02...,entrega de informacion en medio electronico,satisfactoria,satisfactoria,768,comportamiento historico tipicos comprender es...,0,424,2012
1,1857500002012,2012-01-01 05:13:16,pemex exploracion y produccion,terminada,electronica,informacion publica,del informacion y usados en la de flujo de efe...,,,entrega por internet en el infomex,...,1.0,pemex exploracion y produccion oficio fecha 01...,entrega de informacion en medio electronico,satisfactoria,satisfactoria,548,usar flujo efectivo pozo tipico aguar fria ela...,0,289,2012
2,1800000712,2012-01-01 09:46:39,secretaria de energia,terminada,electronica,informacion publica,se me proporcione en magnetico y por este medi...,,,entrega por internet en el infomex,...,2.0,,inexistencia de la informacion solicitada,no respondida,no respondida,269,proporcionar magnetico relacion ocupar amparar...,1,143,2012
3,600003712,2012-01-01 10:11:41,secretaria de hacienda y credito publico,terminada,electronica,informacion publica,se me proporcione en magnetico y por este medi...,,,entrega por internet en el infomex,...,1.0,inidos secretaria de hacienda y credito public...,la informacion esta disponible publicamente,satisfactoria,satisfactoria,269,proporcionar magnetico relacion ocupar amparar...,0,143,2012
4,1500002812,2012-01-01 10:32:15,secretaria de desarrollo agrario territorial y...,terminada,electronica,informacion publica,se me proporcione en magnetico y por este medi...,,,entrega por internet en el infomex,...,4.0,,sin respuesta,satisfactoria,no respondida,398,proporcionar magnetico relacion ocupar amparar...,1,188,2012


# 2012

In [15]:
filtro = df_new['anosolicitud'] == 2012
df_2012 = df_new[filtro]

In [16]:
df_2012.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 87022 entries, 0 to 652131
Data columns (total 31 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   folio                          87022 non-null  object        
 1   fechasolicitud                 87022 non-null  datetime64[ns]
 2   dependencia                    87022 non-null  object        
 3   estatus                        87022 non-null  object        
 4   medioentrada                   87022 non-null  object        
 5   tiposolicitud                  87022 non-null  object        
 6   descripcionsolicitud           87022 non-null  object        
 7   otrosdatos                     87022 non-null  object        
 8   archivoadjuntosolicitud        87022 non-null  object        
 9   medioentrega                   87022 non-null  object        
 10  fechalimite                    82412 non-null  datetime64[ns]
 11  respuesta     

In [17]:
data = df_2012[['descripcionsolicitud_lemma']]
data['index'] = data.index
docs = data

In [18]:
docs.head()

Unnamed: 0,descripcionsolicitud_lemma,index
0,comportamiento historico tipicos comprender es...,0
1,usar flujo efectivo pozo tipico aguar fria ela...,1
2,proporcionar magnetico relacion ocupar amparar...,2
3,proporcionar magnetico relacion ocupar amparar...,3
4,proporcionar magnetico relacion ocupar amparar...,4


In [19]:
import spacy
import nltk
from nltk import SnowballStemmer

In [20]:
nlp = spacy.load('es_core_news_sm')

In [21]:
def ProcesarTexto(texto):
    
    doc = nlp(texto)
    words = [t.text for t in doc]
    
    return words

In [22]:
import multiprocessing
from multiprocessing.dummy import Pool as ThreadPool

In [23]:
cores = multiprocessing.cpu_count()
pool = ThreadPool(cores)
cores

12

In [24]:
processed_docs = pool.map(ProcesarTexto, docs['descripcionsolicitud_lemma'])
pool.close()

In [25]:
import gensim

In [26]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [27]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [28]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [29]:
from gensim import corpora, models
from pprint import pprint

In [30]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

### 5 tópicos TF-IDF

In [31]:
lda_model_tfidf_5 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=5,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [32]:
for idx, topic in lda_model_tfidf_5.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.005*"federal" + 0.004*"montar" + 0.004*"licitacion" + 0.004*"secretario" + 0.003*"relacion" + 0.003*"requerir" + 0.003*"profesional" + 0.003*"municipio" + 0.003*"correspondiente" + 0.003*"informar"
Topic: 1 Word: 0.005*"imss" + 0.004*"medicar" + 0.004*"federal" + 0.004*"trabajador" + 0.004*"medicamento" + 0.004*"periodo" + 0.004*"secretario" + 0.004*"hepatitis" + 0.003*"requerir" + 0.003*"informar"
Topic: 2 Word: 0.007*"secretario" + 0.005*"presupuestar" + 0.005*"publicar" + 0.004*"salud" + 0.004*"seguridad" + 0.004*"federal" + 0.004*"montar" + 0.003*"requerir" + 0.003*"obrar" + 0.003*"destinar"
Topic: 3 Word: 0.010*"pieza" + 0.009*"existencia" + 0.009*"medicamento" + 0.008*"inventario" + 0.006*"cuadrar" + 0.006*"social" + 0.006*"basico" + 0.006*"seguro" + 0.006*"almacen" + 0.006*"infomex"
Topic: 4 Word: 0.006*"federal" + 0.005*"tarde" + 0.005*"generar" + 0.004*"tener" + 0.004*"morbilidad" + 0.004*"excelente" + 0.004*"municipio" + 0.004*"montar" + 0.003*"institucion" +

### 10 tópicos TF-IDF

In [33]:
lda_model_tfidf_10 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=10,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [34]:
for idx, topic in lda_model_tfidf_10.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.010*"imss" + 0.010*"excelente" + 0.009*"desagregar" + 0.009*"hospital" + 0.009*"consecutivo" + 0.008*"virus" + 0.008*"diagnosticar" + 0.008*"sexo" + 0.008*"hospitalario" + 0.008*"numeros"
Topic: 1 Word: 0.013*"medicamento" + 0.012*"medicar" + 0.007*"pieza" + 0.006*"fundamentar" + 0.006*"hospital" + 0.006*"periodo" + 0.005*"paciente" + 0.005*"recetar" + 0.005*"aplicar" + 0.005*"parrafo"
Topic: 2 Word: 0.006*"municipio" + 0.005*"secretario" + 0.005*"federal" + 0.004*"publicar" + 0.004*"director" + 0.003*"gobernar" + 0.003*"informar" + 0.003*"casar" + 0.003*"documentar" + 0.003*"plaza"
Topic: 3 Word: 0.008*"hepatitis" + 0.006*"federal" + 0.005*"requerir" + 0.005*"tarde" + 0.005*"morbilidad" + 0.004*"secretario" + 0.004*"delito" + 0.004*"gracia" + 0.003*"educacion" + 0.003*"escuela"
Topic: 4 Word: 0.006*"secretario" + 0.006*"pemex" + 0.005*"federal" + 0.004*"municipio" + 0.004*"informar" + 0.004*"casar" + 0.004*"montar" + 0.004*"seguridad" + 0.004*"presupuestar" + 0.003*"a

### 20 tópicos TF-IDF

In [35]:
lda_model_tfidf_20 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=20,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [36]:
for idx, topic in lda_model_tfidf_20.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.008*"tabulador" + 0.006*"sueldo" + 0.006*"federal" + 0.005*"certificar" + 0.005*"secretario" + 0.004*"nivel" + 0.004*"periodo" + 0.004*"social" + 0.004*"trabajador" + 0.004*"silla"
Topic: 1 Word: 0.017*"medicar" + 0.015*"consecutivo" + 0.013*"diagnosticar" + 0.013*"numeros" + 0.013*"hospitalario" + 0.013*"desagregar" + 0.013*"virus" + 0.012*"imss" + 0.012*"sexo" + 0.012*"especialidad"
Topic: 2 Word: 0.010*"usuario" + 0.009*"comprador" + 0.008*"trabajar" + 0.007*"dato" + 0.007*"contactar" + 0.007*"afirmativo" + 0.007*"publicar" + 0.006*"informar" + 0.006*"secretario" + 0.006*"considerar"
Topic: 3 Word: 0.013*"morbilidad" + 0.013*"tarde" + 0.013*"tener" + 0.013*"excelente" + 0.012*"generar" + 0.010*"inconformidad" + 0.010*"hospital" + 0.008*"gobernar" + 0.007*"portal" + 0.005*"federal"
Topic: 4 Word: 0.006*"licitacion" + 0.006*"acta" + 0.005*"aguar" + 0.005*"secretario" + 0.004*"federal" + 0.004*"dictaminacion" + 0.004*"santo" + 0.004*"calificacion" + 0.004*"obrar" + 0.0

### Guardamos en S3

In [37]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2012/df_2012.pkl'
pickle_byte_obj = pickle.dumps(df_2012)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '3A73112D6F387002',
  'HostId': 'o8jjUqXIF4rm5TkK9fKuQdbL3mO4p8wFWSOAMKkdMWFFN8w8mPtHX023Pmb5YGh6lcJWAsZ4j7g=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'o8jjUqXIF4rm5TkK9fKuQdbL3mO4p8wFWSOAMKkdMWFFN8w8mPtHX023Pmb5YGh6lcJWAsZ4j7g=',
   'x-amz-request-id': '3A73112D6F387002',
   'date': 'Thu, 30 Jul 2020 16:50:38 GMT',
   'etag': '"0513246133c4bf08fe3804235e65837b"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"0513246133c4bf08fe3804235e65837b"'}

In [38]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2012/processed_docs.pkl'
pickle_byte_obj = pickle.dumps(processed_docs)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '53E4D906E40F3AE0',
  'HostId': 'etPll+TlI3xcMMm6Qwd20yCdDQte72aD7XdDaKQLjtbziE39MY57ElOBGNyZ2YUcOMpHH44J/o8=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'etPll+TlI3xcMMm6Qwd20yCdDQte72aD7XdDaKQLjtbziE39MY57ElOBGNyZ2YUcOMpHH44J/o8=',
   'x-amz-request-id': '53E4D906E40F3AE0',
   'date': 'Thu, 30 Jul 2020 16:51:15 GMT',
   'etag': '"4a42eff23258cba74ebc1ac3be93dc91"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"4a42eff23258cba74ebc1ac3be93dc91"'}

In [39]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2012/lda_model_tfidf_5.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_5)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': 'A0A95F327D7BE6DD',
  'HostId': 'SFpTom//7mM+AXUn8HSEEQBsOSMnQBlfCUhVYlvatcgx4vUTBLuA0Cph59pN7gaOV40dVxQ38kA=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'SFpTom//7mM+AXUn8HSEEQBsOSMnQBlfCUhVYlvatcgx4vUTBLuA0Cph59pN7gaOV40dVxQ38kA=',
   'x-amz-request-id': 'A0A95F327D7BE6DD',
   'date': 'Thu, 30 Jul 2020 16:51:29 GMT',
   'etag': '"0bfa0bd35975e4fd953efd567d660a0a"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"0bfa0bd35975e4fd953efd567d660a0a"'}

In [40]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2012/lda_model_tfidf_10.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_10)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': 'DA2BF73C3E71CEC3',
  'HostId': 'NQm/zhR9heLHnary/ekEADROEVy5txBvI8DLTkUikQXDjnI0MLXEUGqWOh3p+kLRrbRzUMzmpf4=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'NQm/zhR9heLHnary/ekEADROEVy5txBvI8DLTkUikQXDjnI0MLXEUGqWOh3p+kLRrbRzUMzmpf4=',
   'x-amz-request-id': 'DA2BF73C3E71CEC3',
   'date': 'Thu, 30 Jul 2020 16:51:34 GMT',
   'etag': '"3899b3bd83993372723da4d986957e23"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"3899b3bd83993372723da4d986957e23"'}

In [41]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2012/lda_model_tfidf_20.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_20)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': 'FV1V5G5HDZAX4KEG',
  'HostId': 'RFKIuU/lHeFIBMHE+OEovPCPaM6cux9aBwwwXcAH8DBb4/sZhkBv68fIm6YYqWvq7yMcv7nfgkM=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'RFKIuU/lHeFIBMHE+OEovPCPaM6cux9aBwwwXcAH8DBb4/sZhkBv68fIm6YYqWvq7yMcv7nfgkM=',
   'x-amz-request-id': 'FV1V5G5HDZAX4KEG',
   'date': 'Thu, 30 Jul 2020 16:51:38 GMT',
   'etag': '"ba06bd42b4fdeb84791b8eee97e57dea"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"ba06bd42b4fdeb84791b8eee97e57dea"'}

# 2013

In [42]:
filtro = df_new['anosolicitud'] == 2013
df_2013 = df_new[filtro]

In [43]:
df_2013.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101141 entries, 64999 to 735674
Data columns (total 31 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   folio                          101141 non-null  object        
 1   fechasolicitud                 101141 non-null  datetime64[ns]
 2   dependencia                    101141 non-null  object        
 3   estatus                        101141 non-null  object        
 4   medioentrada                   101141 non-null  object        
 5   tiposolicitud                  101141 non-null  object        
 6   descripcionsolicitud           101141 non-null  object        
 7   otrosdatos                     101141 non-null  object        
 8   archivoadjuntosolicitud        101141 non-null  object        
 9   medioentrega                   101141 non-null  object        
 10  fechalimite                    95819 non-null   datetime64[ns]
 

In [44]:
data = df_2013[['descripcionsolicitud_lemma']]
data['index'] = data.index
docs = data

In [45]:
docs.head()

Unnamed: 0,descripcionsolicitud_lemma,index
64999,recibir dirigir enlazar administracion tributa...,64999
65000,requerir sueldo confianza liconsa operativo ma...,65000
65001,guerra funcionar secretario marino,65001
65002,investigacion comision areas natural protegido...,65002
65005,referente buque clase reabastecer naval aguar ...,65005


In [46]:
import spacy
import nltk
from nltk import SnowballStemmer

In [47]:
nlp = spacy.load('es_core_news_sm')

In [48]:
def ProcesarTexto(texto):
    
    doc = nlp(texto)
    words = [t.text for t in doc]
    
    return words

In [49]:
import multiprocessing
from multiprocessing.dummy import Pool as ThreadPool

In [50]:
cores = multiprocessing.cpu_count()
pool = ThreadPool(cores)
cores

12

In [51]:
processed_docs = pool.map(ProcesarTexto, docs['descripcionsolicitud_lemma'])
pool.close()

In [52]:
import gensim

In [53]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [54]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [55]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [56]:
from gensim import corpora, models
from pprint import pprint

In [57]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

### 5 tópicos TF-IDF

In [58]:
lda_model_tfidf_5 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=5,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [59]:
for idx, topic in lda_model_tfidf_5.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.007*"existencia" + 0.006*"pieza" + 0.006*"proveedor" + 0.006*"inventario" + 0.005*"delegacion" + 0.005*"licitacion" + 0.005*"basico" + 0.005*"dental" + 0.005*"presupuestar" + 0.005*"cuadrar"
Topic: 1 Word: 0.022*"acta" + 0.015*"fallir" + 0.012*"presupuestal" + 0.011*"partir" + 0.009*"fundamentar" + 0.008*"apertura" + 0.008*"proposición" + 0.008*"procedimiento" + 0.008*"articular" + 0.008*"gubernamental"
Topic: 2 Word: 0.005*"cofepris" + 0.005*"documentar" + 0.005*"federal" + 0.004*"sanitario" + 0.004*"servidor" + 0.004*"tramitar" + 0.004*"publicar" + 0.004*"secretario" + 0.003*"periodo" + 0.003*"mencionar"
Topic: 3 Word: 0.005*"medicamento" + 0.005*"requerir" + 0.004*"anual" + 0.004*"proveedor" + 0.004*"municipio" + 0.004*"especificación" + 0.004*"informar" + 0.003*"presupuestar" + 0.003*"gobernar" + 0.003*"mensual"
Topic: 4 Word: 0.006*"techar" + 0.004*"publicar" + 0.004*"obrar" + 0.004*"empresa" + 0.003*"imss" + 0.003*"informar" + 0.003*"pemex" + 0.003*"fideicomiso" 

### 10 tópicos TF-IDF

In [60]:
lda_model_tfidf_10 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=10,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [61]:
for idx, topic in lda_model_tfidf_10.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.014*"cofepris" + 0.011*"documentar" + 0.010*"sanitario" + 0.007*"verificacion" + 0.007*"tramitar" + 0.006*"certificar" + 0.005*"ciudadano" + 0.005*"mencionar" + 0.005*"contraloria" + 0.005*"practicar"
Topic: 1 Word: 0.035*"acta" + 0.024*"fallir" + 0.013*"proposición" + 0.013*"suficiencia" + 0.013*"xiii" + 0.013*"apertura" + 0.013*"aclaración" + 0.013*"fundamentar" + 0.013*"compranet" + 0.013*"antecedente"
Topic: 2 Word: 0.012*"pieza" + 0.011*"existencia" + 0.009*"medicamento" + 0.008*"inventario" + 0.008*"cuadrar" + 0.008*"basico" + 0.006*"inicial" + 0.006*"aguar" + 0.005*"entrar" + 0.005*"consumir"
Topic: 3 Word: 0.006*"presupuestar" + 0.004*"secretario" + 0.004*"proyecto" + 0.004*"montar" + 0.004*"partir" + 0.004*"federal" + 0.004*"trabajador" + 0.004*"apoyar" + 0.004*"pemex" + 0.004*"educacion"
Topic: 4 Word: 0.020*"partir" + 0.017*"principal" + 0.017*"especificación" + 0.013*"proveedor" + 0.010*"dental" + 0.010*"presupuestal" + 0.009*"necesidad" + 0.009*"cepillo" +

### 20 tópicos TF-IDF

In [62]:
lda_model_tfidf_20 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=20,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [63]:
for idx, topic in lda_model_tfidf_20.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.009*"caracteristicas" + 0.006*"establecio" + 0.006*"terminacion" + 0.006*"publicar" + 0.006*"ancho" + 0.005*"banda" + 0.005*"tecnicas" + 0.005*"mensual" + 0.005*"minero" + 0.005*"proporcionar"
Topic: 1 Word: 0.043*"acta" + 0.028*"fallir" + 0.015*"junto" + 0.015*"suficiencia" + 0.015*"apertura" + 0.015*"xiii" + 0.015*"antecedente" + 0.015*"fraccion" + 0.014*"aclaración" + 0.014*"compranet"
Topic: 2 Word: 0.007*"credito" + 0.006*"sexto" + 0.006*"banco" + 0.005*"fondo" + 0.005*"clausular" + 0.004*"partir" + 0.004*"salud" + 0.004*"creditos" + 0.004*"secretario" + 0.004*"consultar"
Topic: 3 Word: 0.005*"distrito" + 0.005*"federal" + 0.004*"proporcionar" + 0.004*"empresa" + 0.004*"local" + 0.004*"domiciliar" + 0.004*"administrador" + 0.004*"delegacion" + 0.004*"denominacion" + 0.004*"administracion"
Topic: 4 Word: 0.008*"siniestro" + 0.007*"servidor" + 0.006*"director" + 0.006*"publicar" + 0.005*"vacante" + 0.005*"relacion" + 0.005*"departamento" + 0.005*"bien" + 0.005*"lici

### Guardamos en S3

In [64]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2013/df_2013.pkl'
pickle_byte_obj = pickle.dumps(df_2013)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '3B9F5F99CDA549ED',
  'HostId': '7tGrx5dwoCbNlti7s99pNvKofdAUk0gxi9oSVEr4R0KiRPxRRdmmTWJOyxI63pLpVSrc+cYmueg=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '7tGrx5dwoCbNlti7s99pNvKofdAUk0gxi9oSVEr4R0KiRPxRRdmmTWJOyxI63pLpVSrc+cYmueg=',
   'x-amz-request-id': '3B9F5F99CDA549ED',
   'date': 'Thu, 30 Jul 2020 17:06:00 GMT',
   'etag': '"6a50e1c9aea18d2c4cba34ab04909ddd"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"6a50e1c9aea18d2c4cba34ab04909ddd"'}

In [65]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2013/processed_docs.pkl'
pickle_byte_obj = pickle.dumps(processed_docs)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': 'C5D4C7F096374407',
  'HostId': 'iRb82CRkoosQ0Tq1M1FcYympKdZa930nFG+ysLZxT/Wn3hPqr9m9SHltbr+NsDZNnPxwO7RTyZo=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'iRb82CRkoosQ0Tq1M1FcYympKdZa930nFG+ysLZxT/Wn3hPqr9m9SHltbr+NsDZNnPxwO7RTyZo=',
   'x-amz-request-id': 'C5D4C7F096374407',
   'date': 'Thu, 30 Jul 2020 17:06:53 GMT',
   'etag': '"774514acc83c1e95a18feb90578ff8a1"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"774514acc83c1e95a18feb90578ff8a1"'}

In [66]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2013/lda_model_tfidf_5.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_5)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '787798C71D58DD1E',
  'HostId': 'DZ7pubYrxysN9DZFy6BtlyBh5KdjqkhMpkiaULNMrhpWOgNGZ0k5tDA6hr7gCPQ0+CNe+Dg1CFA=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'DZ7pubYrxysN9DZFy6BtlyBh5KdjqkhMpkiaULNMrhpWOgNGZ0k5tDA6hr7gCPQ0+CNe+Dg1CFA=',
   'x-amz-request-id': '787798C71D58DD1E',
   'date': 'Thu, 30 Jul 2020 17:07:07 GMT',
   'etag': '"63c1df4f0bd00015b507dfe3944802bf"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"63c1df4f0bd00015b507dfe3944802bf"'}

In [67]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2013/lda_model_tfidf_10.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_10)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '331288B63DBFC0EF',
  'HostId': '+LFc6xOp9ZJxa0ROoQhM24HQrxMDruWtzhn3caobPbPFGqwSSgk84d9iT88th+tgHNsE7oiAR2Y=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '+LFc6xOp9ZJxa0ROoQhM24HQrxMDruWtzhn3caobPbPFGqwSSgk84d9iT88th+tgHNsE7oiAR2Y=',
   'x-amz-request-id': '331288B63DBFC0EF',
   'date': 'Thu, 30 Jul 2020 17:07:11 GMT',
   'etag': '"9604dc912fba646bfb4a5f91bcc46515"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"9604dc912fba646bfb4a5f91bcc46515"'}

In [68]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2013/lda_model_tfidf_20.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_20)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '2A542280FC23A20F',
  'HostId': 'HgJNTs9M478SZ5z9P4emNx/5PW8ypHYX52nSE0aM2NbA/P/yPAQBi06Pd+xv5RTMb6Qo8s/bJF0=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'HgJNTs9M478SZ5z9P4emNx/5PW8ypHYX52nSE0aM2NbA/P/yPAQBi06Pd+xv5RTMb6Qo8s/bJF0=',
   'x-amz-request-id': '2A542280FC23A20F',
   'date': 'Thu, 30 Jul 2020 17:07:15 GMT',
   'etag': '"3e3324f0e71df72f21997152486251a3"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"3e3324f0e71df72f21997152486251a3"'}

# 2014

In [69]:
filtro = df_new['anosolicitud'] == 2014
df_2014 = df_new[filtro]

In [70]:
df_2014.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106645 entries, 129999 to 817123
Data columns (total 31 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   folio                          106645 non-null  object        
 1   fechasolicitud                 106645 non-null  datetime64[ns]
 2   dependencia                    106645 non-null  object        
 3   estatus                        106645 non-null  object        
 4   medioentrada                   106645 non-null  object        
 5   tiposolicitud                  106645 non-null  object        
 6   descripcionsolicitud           106645 non-null  object        
 7   otrosdatos                     106645 non-null  object        
 8   archivoadjuntosolicitud        106645 non-null  object        
 9   medioentrega                   106645 non-null  object        
 10  fechalimite                    101290 non-null  datetime64[ns]


In [71]:
data = df_2014[['descripcionsolicitud_lemma']]
data['index'] = data.index
docs = data

In [72]:
docs.head()

Unnamed: 0,descripcionsolicitud_lemma,index
129999,instituto temer flujo legal migratorio fronter...,129999
130001,juicio amparar juicio juicio penal existente c...,130001
130002,zona riesgo migrante frotera concretamente zon...,130002
130003,zona riesgo migrante frotera concretamente zon...,130003
130004,zona riesgo migrante frotera concretamente zon...,130004


In [73]:
import spacy
import nltk
from nltk import SnowballStemmer

In [74]:
nlp = spacy.load('es_core_news_sm')

In [75]:
def ProcesarTexto(texto):
    
    doc = nlp(texto)
    words = [t.text for t in doc]
    
    return words

In [76]:
import multiprocessing
from multiprocessing.dummy import Pool as ThreadPool

In [77]:
cores = multiprocessing.cpu_count()
pool = ThreadPool(cores)
cores

12

In [78]:
processed_docs = pool.map(ProcesarTexto, docs['descripcionsolicitud_lemma'])
pool.close()

In [79]:
import gensim

In [80]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [81]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [82]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [83]:
from gensim import corpora, models
from pprint import pprint

In [84]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

### 5 tópicos TF-IDF

In [85]:
lda_model_tfidf_5 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=5,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [86]:
for idx, topic in lda_model_tfidf_5.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.008*"presupuestar" + 0.006*"medicamento" + 0.005*"delegacion" + 0.005*"anual" + 0.005*"medicar" + 0.004*"hospital" + 0.004*"destinar" + 0.004*"social" + 0.004*"parrafo" + 0.004*"instituto"
Topic: 1 Word: 0.004*"federal" + 0.003*"publicar" + 0.003*"secretario" + 0.003*"escuela" + 0.003*"municipio" + 0.003*"informar" + 0.003*"periodo" + 0.003*"institucion" + 0.003*"educacion" + 0.003*"presupuestar"
Topic: 2 Word: 0.011*"acta" + 0.008*"fallir" + 0.007*"federal" + 0.006*"aplicar" + 0.006*"convocatorio" + 0.006*"fraccion" + 0.005*"articular" + 0.005*"presupuestal" + 0.005*"seguridad" + 0.005*"montar"
Topic: 3 Word: 0.006*"medicar" + 0.005*"medicamento" + 0.005*"imss" + 0.005*"especialidad" + 0.005*"hospital" + 0.004*"sanitario" + 0.004*"escribir" + 0.004*"delegacion" + 0.004*"sondar" + 0.004*"centrar"
Topic: 4 Word: 0.004*"federal" + 0.004*"requerir" + 0.004*"tramitar" + 0.004*"municipio" + 0.004*"internet" + 0.003*"publicar" + 0.003*"gracia" + 0.003*"empresa" + 0.003*"trab

### 10 tópicos TF-IDF

In [87]:
lda_model_tfidf_10 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=10,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [88]:
for idx, topic in lda_model_tfidf_10.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.005*"ambiental" + 0.004*"informar" + 0.004*"internet" + 0.004*"publicar" + 0.004*"monto" + 0.004*"federal" + 0.004*"municipio" + 0.003*"sanitario" + 0.003*"costo" + 0.003*"formato"
Topic: 1 Word: 0.007*"sindicato" + 0.006*"trabajador" + 0.005*"federal" + 0.005*"articular" + 0.005*"sindical" + 0.004*"fraccion" + 0.004*"global" + 0.004*"secretario" + 0.003*"requerir" + 0.003*"publicar"
Topic: 2 Word: 0.017*"medicamento" + 0.008*"pieza" + 0.007*"parrafo" + 0.007*"medicar" + 0.006*"hospital" + 0.006*"fundamentar" + 0.006*"saludo" + 0.005*"establecer" + 0.005*"gratuito" + 0.005*"basico"
Topic: 3 Word: 0.011*"acta" + 0.007*"fallir" + 0.006*"montar" + 0.005*"viajar" + 0.005*"adquisicion" + 0.005*"sistema" + 0.004*"convenio" + 0.004*"vigente" + 0.004*"extinguidores" + 0.004*"presupuestar"
Topic: 4 Word: 0.009*"pemex" + 0.007*"produccion" + 0.006*"exploracion" + 0.004*"aplicar" + 0.004*"obrar" + 0.004*"celebrar" + 0.004*"empresa" + 0.004*"licitacion" + 0.004*"tipo" + 0.004*"fue

### 20 tópicos TF-IDF

In [89]:
lda_model_tfidf_20 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=20,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [90]:
for idx, topic in lda_model_tfidf_20.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.008*"contener" + 0.008*"pensionario" + 0.007*"certificar" + 0.007*"seguridad" + 0.006*"casar" + 0.006*"previamente" + 0.006*"talar" + 0.006*"social" + 0.006*"prestacion" + 0.006*"instituto"
Topic: 1 Word: 0.006*"federal" + 0.005*"investigacion" + 0.005*"comision" + 0.005*"aplicar" + 0.004*"proteccion" + 0.004*"denunciar" + 0.004*"presupuestar" + 0.004*"documentar" + 0.004*"informar" + 0.004*"municipio"
Topic: 2 Word: 0.013*"pemex" + 0.010*"produccion" + 0.010*"exploracion" + 0.006*"celebrar" + 0.005*"oficio" + 0.005*"acuerdo" + 0.005*"escrito" + 0.004*"resolución" + 0.004*"plataforma" + 0.004*"tipo"
Topic: 3 Word: 0.009*"internet" + 0.008*"requerir" + 0.007*"formato" + 0.007*"salariar" + 0.007*"soldar" + 0.006*"servidor" + 0.006*"cuestionario" + 0.005*"costo" + 0.005*"contestar" + 0.005*"hacerse"
Topic: 4 Word: 0.026*"presupuestar" + 0.015*"anual" + 0.011*"partir" + 0.007*"presupuestal" + 0.005*"instituto" + 0.005*"gastar" + 0.005*"referente" + 0.004*"principal" + 0.00

### Guardamos en S3

In [91]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2014/df_2014.pkl'
pickle_byte_obj = pickle.dumps(df_2014)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '5582815593136E05',
  'HostId': '+G93VgabKphXlCH45C1pJmoH0R3PWFMPLhx8xG3PhPzu/YrCHdTaZp3qKLj3AgXhhZxcOvO3ViI=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '+G93VgabKphXlCH45C1pJmoH0R3PWFMPLhx8xG3PhPzu/YrCHdTaZp3qKLj3AgXhhZxcOvO3ViI=',
   'x-amz-request-id': '5582815593136E05',
   'date': 'Thu, 30 Jul 2020 17:22:12 GMT',
   'etag': '"1ea543a39daef472b7d9ea8b58c29b14"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"1ea543a39daef472b7d9ea8b58c29b14"'}

In [92]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2014/processed_docs.pkl'
pickle_byte_obj = pickle.dumps(processed_docs)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '7BA98940B16A76D5',
  'HostId': 'RAjD5TsH74FN9dKV+2H40Lit1+oHUcRliuDsswSk/etxsoX04BAcEyAsW9poH+fhU98CZszo/0U=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'RAjD5TsH74FN9dKV+2H40Lit1+oHUcRliuDsswSk/etxsoX04BAcEyAsW9poH+fhU98CZszo/0U=',
   'x-amz-request-id': '7BA98940B16A76D5',
   'date': 'Thu, 30 Jul 2020 17:23:35 GMT',
   'etag': '"aabf5cbe6515c21af962972c27b5c914"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"aabf5cbe6515c21af962972c27b5c914"'}

In [93]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2014/lda_model_tfidf_5.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_5)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '6W8JCS2N6G2V8Q0T',
  'HostId': 'gpEYM+sfXfkITWvg2HbKHgBrDcebPwKJhSMFMhE9TvR92RO9m3zB6vpN16/jCRN8QsxtlyIyM88=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'gpEYM+sfXfkITWvg2HbKHgBrDcebPwKJhSMFMhE9TvR92RO9m3zB6vpN16/jCRN8QsxtlyIyM88=',
   'x-amz-request-id': '6W8JCS2N6G2V8Q0T',
   'date': 'Thu, 30 Jul 2020 17:23:51 GMT',
   'etag': '"6fde0f0c2eba5c5502eb073c00ad6afd"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"6fde0f0c2eba5c5502eb073c00ad6afd"'}

In [94]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2014/lda_model_tfidf_10.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_10)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '87C4CE99360481C9',
  'HostId': 'GeyGIFTr0peaaRruhjewRrmecm/dT6tS47RqFTKW0CWCTjsbiccyoDB7I84ETeJ0CLAm17i/i2g=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'GeyGIFTr0peaaRruhjewRrmecm/dT6tS47RqFTKW0CWCTjsbiccyoDB7I84ETeJ0CLAm17i/i2g=',
   'x-amz-request-id': '87C4CE99360481C9',
   'date': 'Thu, 30 Jul 2020 17:23:55 GMT',
   'etag': '"3aeb4709ff6cf82bbc31c9c3f1074f8b"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"3aeb4709ff6cf82bbc31c9c3f1074f8b"'}

In [95]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2014/lda_model_tfidf_20.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_20)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '817201B5875D93E0',
  'HostId': 'HJt5UfouLcAa9BK4s6fp3jAKaZRT/SuYiOW2ScxW4LnmwRUKGCR7RYiBjiOXfg0FzLNwj5qEb6s=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'HJt5UfouLcAa9BK4s6fp3jAKaZRT/SuYiOW2ScxW4LnmwRUKGCR7RYiBjiOXfg0FzLNwj5qEb6s=',
   'x-amz-request-id': '817201B5875D93E0',
   'date': 'Thu, 30 Jul 2020 17:24:00 GMT',
   'etag': '"a78fd00bcdbdbb4e300cbe0326f5a956"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"a78fd00bcdbdbb4e300cbe0326f5a956"'}

# 2015

In [96]:
filtro = df_new['anosolicitud'] == 2015
df_2015 = df_new[filtro]

In [97]:
df_2015.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111590 entries, 194997 to 908310
Data columns (total 31 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   folio                          111590 non-null  object        
 1   fechasolicitud                 111590 non-null  datetime64[ns]
 2   dependencia                    111590 non-null  object        
 3   estatus                        111590 non-null  object        
 4   medioentrada                   111590 non-null  object        
 5   tiposolicitud                  111590 non-null  object        
 6   descripcionsolicitud           111590 non-null  object        
 7   otrosdatos                     111590 non-null  object        
 8   archivoadjuntosolicitud        111590 non-null  object        
 9   medioentrega                   111590 non-null  object        
 10  fechalimite                    106108 non-null  datetime64[ns]


In [98]:
data = df_2015[['descripcionsolicitud_lemma']]
data['index'] = data.index
docs = data

In [99]:
docs.head()

Unnamed: 0,descripcionsolicitud_lemma,index
194997,separar infraestructura construir lascar const...,194997
194998,separar infraestructurasconstruidas construcci...,194998
195000,proporcionar manual deprocedimientos utilizar ...,195000
195001,proporcionar calendario deconvocatorias public...,195001
195002,andres borunda valle jubilar central termoelec...,195002


In [100]:
import spacy
import nltk
from nltk import SnowballStemmer

In [101]:
nlp = spacy.load('es_core_news_sm')

In [102]:
def ProcesarTexto(texto):
    
    doc = nlp(texto)
    words = [t.text for t in doc]
    
    return words

In [103]:
import multiprocessing
from multiprocessing.dummy import Pool as ThreadPool

In [104]:
cores = multiprocessing.cpu_count()
pool = ThreadPool(cores)
cores

12

In [105]:
processed_docs = pool.map(ProcesarTexto, docs['descripcionsolicitud_lemma'])
pool.close()

In [106]:
import gensim

In [107]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [108]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [109]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [110]:
from gensim import corpora, models
from pprint import pprint

In [111]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

### 5 tópicos TF-IDF

In [112]:
lda_model_tfidf_5 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=5,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [113]:
for idx, topic in lda_model_tfidf_5.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.008*"presupuestar" + 0.005*"federal" + 0.005*"articular" + 0.005*"otorgar" + 0.004*"destinar" + 0.004*"sanitario" + 0.004*"municipio" + 0.004*"permiso" + 0.004*"partir" + 0.004*"escribir"
Topic: 1 Word: 0.005*"servidor" + 0.004*"publicar" + 0.004*"secretario" + 0.004*"federal" + 0.003*"municipio" + 0.003*"casar" + 0.003*"informar" + 0.003*"capacitacion" + 0.003*"requerir" + 0.003*"proporcionar"
Topic: 2 Word: 0.007*"medicamento" + 0.005*"hospital" + 0.005*"medicar" + 0.004*"delegacion" + 0.004*"imss" + 0.004*"informar" + 0.004*"ciudadano" + 0.004*"parrafo" + 0.004*"contraloria" + 0.004*"licitacion"
Topic: 3 Word: 0.004*"requerir" + 0.004*"informar" + 0.004*"documentar" + 0.003*"federal" + 0.003*"secretario" + 0.003*"municipio" + 0.003*"seguridad" + 0.003*"incluir" + 0.003*"escuela" + 0.003*"social"
Topic: 4 Word: 0.005*"montar" + 0.004*"trabajador" + 0.004*"federal" + 0.004*"informar" + 0.004*"secretario" + 0.004*"ayudar" + 0.003*"periodo" + 0.003*"administracion" + 0.

### 10 tópicos TF-IDF

In [114]:
lda_model_tfidf_10 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=10,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [115]:
for idx, topic in lda_model_tfidf_10.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.009*"ahorrador" + 0.008*"escribir" + 0.008*"requerimiento" + 0.007*"proteccion" + 0.007*"librar" + 0.007*"informar" + 0.006*"octubre" + 0.006*"ficrea" + 0.005*"consecutivo" + 0.005*"especificar"
Topic: 1 Word: 0.005*"federal" + 0.004*"empresa" + 0.004*"municipio" + 0.004*"ambiental" + 0.004*"padron" + 0.003*"santo" + 0.003*"energia" + 0.003*"obrar" + 0.003*"proagro" + 0.003*"impactar"
Topic: 2 Word: 0.006*"laborar" + 0.006*"ayudar" + 0.005*"federal" + 0.005*"informar" + 0.005*"trabajador" + 0.005*"prestación" + 0.005*"administracion" + 0.004*"secretario" + 0.004*"montar" + 0.004*"autorizar"
Topic: 3 Word: 0.007*"pormenorizar" + 0.006*"requerir" + 0.005*"documentar" + 0.005*"preve" + 0.005*"campana" + 0.004*"social" + 0.004*"preguntar" + 0.004*"anual" + 0.004*"difusion" + 0.004*"contener"
Topic: 4 Word: 0.008*"partir" + 0.007*"cuestionario" + 0.007*"contestar" + 0.007*"presupuestar" + 0.007*"montar" + 0.005*"destinar" + 0.005*"formato" + 0.004*"desglosar" + 0.004*"secre

### 20 tópicos TF-IDF

In [116]:
lda_model_tfidf_20 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=20,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [117]:
for idx, topic in lda_model_tfidf_20.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.007*"ciudadano" + 0.006*"gente" + 0.006*"delegacion" + 0.006*"contraloria" + 0.006*"detwitter" + 0.006*"imss" + 0.005*"corrupto" + 0.005*"acto" + 0.005*"denunciar" + 0.005*"decorrupcion"
Topic: 1 Word: 0.008*"social" + 0.007*"campana" + 0.006*"redar" + 0.006*"difusion" + 0.006*"salud" + 0.005*"comunicacion" + 0.005*"instituto" + 0.005*"consultar" + 0.004*"medio" + 0.004*"secretario"
Topic: 2 Word: 0.011*"requerimiento" + 0.010*"librar" + 0.010*"escribir" + 0.009*"octubre" + 0.005*"oficiar" + 0.005*"igualdad" + 0.005*"sonar" + 0.005*"derrotar" + 0.004*"secretario" + 0.004*"anexo"
Topic: 3 Word: 0.007*"partir" + 0.006*"trabajador" + 0.006*"prestacion" + 0.005*"informar" + 0.005*"montar" + 0.005*"autorizar" + 0.005*"nominar" + 0.005*"presupuestar" + 0.005*"presupuestal" + 0.005*"confianza"
Topic: 4 Word: 0.005*"prestar" + 0.005*"licenciar" + 0.005*"medicamento" + 0.005*"medicar" + 0.004*"administracion" + 0.004*"paciente" + 0.004*"recetar" + 0.004*"laborar" + 0.003*"estra

### Guardamos en S3

In [118]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2015/df_2015.pkl'
pickle_byte_obj = pickle.dumps(df_2015)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '9C0BF1D2304BBDFB',
  'HostId': 'h2bPi3pdzroPTkcs0phimkuPBGtLVtqEnsA35in0hAiXS487WGTQ4FYMCaalXSgiRR7uvPMMy6A=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'h2bPi3pdzroPTkcs0phimkuPBGtLVtqEnsA35in0hAiXS487WGTQ4FYMCaalXSgiRR7uvPMMy6A=',
   'x-amz-request-id': '9C0BF1D2304BBDFB',
   'date': 'Thu, 30 Jul 2020 17:39:43 GMT',
   'etag': '"c89018291ab1819d16567f250bb6dad1"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"c89018291ab1819d16567f250bb6dad1"'}

In [119]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2015/processed_docs.pkl'
pickle_byte_obj = pickle.dumps(processed_docs)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '202696941FB881E2',
  'HostId': 'zNVJfZdUHZyP9sX5Gpq4YLH6sAOY91C2+AHowKJ/JJ7A5N1ZBJP3bFlm2j5pJzNvMaEKATx5x4I=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'zNVJfZdUHZyP9sX5Gpq4YLH6sAOY91C2+AHowKJ/JJ7A5N1ZBJP3bFlm2j5pJzNvMaEKATx5x4I=',
   'x-amz-request-id': '202696941FB881E2',
   'date': 'Thu, 30 Jul 2020 17:40:25 GMT',
   'etag': '"d1deacf25555657471617e231e4204d9"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"d1deacf25555657471617e231e4204d9"'}

In [120]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2015/lda_model_tfidf_5.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_5)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '1Z3PANFMBK8QEX0G',
  'HostId': 'QIXsk8qA/83BNKvXcIl4CbHwWsZ0lqBAI3R1UrlKt8LPiB3ndJE01E40RhOcd4Qpg8ZpyJbQV2k=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'QIXsk8qA/83BNKvXcIl4CbHwWsZ0lqBAI3R1UrlKt8LPiB3ndJE01E40RhOcd4Qpg8ZpyJbQV2k=',
   'x-amz-request-id': '1Z3PANFMBK8QEX0G',
   'date': 'Thu, 30 Jul 2020 17:40:54 GMT',
   'etag': '"a5d555a90c03c14492f39c1b00638bdf"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"a5d555a90c03c14492f39c1b00638bdf"'}

In [121]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2015/lda_model_tfidf_10.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_10)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': 'EB65DAE63F3A6010',
  'HostId': 'ueyp0Pb1ejVhY2GVDDNG5dumMy4FLnQcY1rv5gkN33cIF3fCHES1zFwrM6nhcXJp3zMilKB8jWo=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'ueyp0Pb1ejVhY2GVDDNG5dumMy4FLnQcY1rv5gkN33cIF3fCHES1zFwrM6nhcXJp3zMilKB8jWo=',
   'x-amz-request-id': 'EB65DAE63F3A6010',
   'date': 'Thu, 30 Jul 2020 17:40:58 GMT',
   'etag': '"6632db186978d1f0d3747597377fca91"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"6632db186978d1f0d3747597377fca91"'}

In [122]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2015/lda_model_tfidf_20.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_20)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '609D5E0A06208EDC',
  'HostId': 'Py7duunpl9nU76p6frIUUgsI/nFhlUZ67DAwd+V/G1r9K7JlUlwjkC8A4kan7Wx97hpCImjX6vA=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'Py7duunpl9nU76p6frIUUgsI/nFhlUZ67DAwd+V/G1r9K7JlUlwjkC8A4kan7Wx97hpCImjX6vA=',
   'x-amz-request-id': '609D5E0A06208EDC',
   'date': 'Thu, 30 Jul 2020 17:41:03 GMT',
   'etag': '"9d4539abcd984ed56bf9da079082436e"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"9d4539abcd984ed56bf9da079082436e"'}

# 2016

In [123]:
filtro = df_new['anosolicitud'] == 2016
df_2016 = df_new[filtro]

In [124]:
df_2016.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132866 entries, 259996 to 1032903
Data columns (total 31 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   folio                          132866 non-null  object        
 1   fechasolicitud                 132866 non-null  datetime64[ns]
 2   dependencia                    132866 non-null  object        
 3   estatus                        132866 non-null  object        
 4   medioentrada                   132866 non-null  object        
 5   tiposolicitud                  132866 non-null  object        
 6   descripcionsolicitud           132866 non-null  object        
 7   otrosdatos                     132866 non-null  object        
 8   archivoadjuntosolicitud        132866 non-null  object        
 9   medioentrega                   132866 non-null  object        
 10  fechalimite                    126237 non-null  datetime64[ns]

In [125]:
data = df_2016[['descripcionsolicitud_lemma']]
data['index'] = data.index
docs = data

In [126]:
docs.head()

Unnamed: 0,descripcionsolicitud_lemma,index
259996,referenciar clinica imss medicar familiar atot...,259996
259997,referenciar clinica imss medicar familiar atot...,259997
259998,enterarme procesar ejecucion convenio establec...,259998
259999,contener completar covocatoria ingresar estudi...,259999
260000,informar rendicion empresa compraventa empresa...,260000


In [127]:
import spacy
import nltk
from nltk import SnowballStemmer

In [128]:
nlp = spacy.load('es_core_news_sm')

In [129]:
def ProcesarTexto(texto):
    
    doc = nlp(texto)
    words = [t.text for t in doc]
    
    return words

In [130]:
import multiprocessing
from multiprocessing.dummy import Pool as ThreadPool

In [131]:
cores = multiprocessing.cpu_count()
pool = ThreadPool(cores)
cores

12

In [132]:
processed_docs = pool.map(ProcesarTexto, docs['descripcionsolicitud_lemma'])
pool.close()

In [133]:
import gensim

In [134]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [135]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [136]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [137]:
from gensim import corpora, models
from pprint import pprint

In [138]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

### 5 tópicos TF-IDF

In [139]:
lda_model_tfidf_5 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=5,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [140]:
for idx, topic in lda_model_tfidf_5.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.006*"procampo" + 0.005*"padron" + 0.004*"delito" + 0.004*"federal" + 0.004*"delegacion" + 0.003*"documentar" + 0.003*"secretario" + 0.003*"denunciar" + 0.003*"municipio" + 0.003*"correspondiente"
Topic: 1 Word: 0.012*"presupuestar" + 0.005*"fiscal" + 0.005*"destinar" + 0.004*"partir" + 0.004*"empresa" + 0.004*"social" + 0.004*"gastar" + 0.004*"anual" + 0.004*"informar" + 0.004*"municipio"
Topic: 2 Word: 0.006*"sanitario" + 0.005*"federal" + 0.004*"informar" + 0.004*"documentar" + 0.004*"tramitar" + 0.004*"articular" + 0.004*"permiso" + 0.003*"cofepris" + 0.003*"casar" + 0.003*"relacionar"
Topic: 3 Word: 0.004*"secretario" + 0.004*"ciudad" + 0.003*"municipio" + 0.003*"montar" + 0.003*"listar" + 0.003*"empresa" + 0.003*"trabajador" + 0.003*"requerir" + 0.003*"federal" + 0.002*"ambiental"
Topic: 4 Word: 0.013*"medicamento" + 0.013*"medicar" + 0.006*"paciente" + 0.006*"recetar" + 0.006*"hospital" + 0.005*"nivel" + 0.004*"pieza" + 0.004*"articular" + 0.004*"periodo" + 0.004

### 10 tópicos TF-IDF

In [141]:
lda_model_tfidf_10 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=10,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [142]:
for idx, topic in lda_model_tfidf_10.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.015*"sanitario" + 0.010*"tramitar" + 0.009*"cofepris" + 0.007*"delito" + 0.006*"permiso" + 0.005*"consultar" + 0.005*"denunciar" + 0.005*"completo" + 0.005*"sustanciar" + 0.005*"informar"
Topic: 1 Word: 0.008*"municipio" + 0.007*"obrar" + 0.005*"santo" + 0.005*"construccion" + 0.005*"ubicar" + 0.004*"ambiental" + 0.004*"empresa" + 0.004*"zona" + 0.004*"aguar" + 0.004*"impactar"
Topic: 2 Word: 0.005*"laborar" + 0.005*"trabajador" + 0.004*"salariar" + 0.004*"secretario" + 0.004*"institucion" + 0.004*"plaza" + 0.004*"instituto" + 0.004*"soldar" + 0.004*"casar" + 0.003*"publicar"
Topic: 3 Word: 0.004*"tribunal" + 0.004*"amparar" + 0.004*"federal" + 0.004*"revision" + 0.003*"presupuestar" + 0.003*"oficiar" + 0.003*"instituto" + 0.003*"secretario" + 0.003*"colegiar" + 0.003*"resolucion"
Topic: 4 Word: 0.006*"escribir" + 0.005*"octubre" + 0.005*"librar" + 0.005*"requerimiento" + 0.004*"requerir" + 0.004*"electronicos" + 0.004*"institucional" + 0.004*"exel" + 0.004*"direcciona

### 20 tópicos TF-IDF

In [143]:
lda_model_tfidf_20 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=20,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [144]:
for idx, topic in lda_model_tfidf_20.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.006*"surgir" + 0.006*"imposible" + 0.006*"hernan" + 0.006*"hesse" + 0.003*"municipio" + 0.003*"obrar" + 0.003*"empresa" + 0.003*"produccion" + 0.003*"informar" + 0.003*"casar"
Topic: 1 Word: 0.011*"computar" + 0.010*"telecomunicación" + 0.010*"digitalizacion" + 0.010*"fotocopiar" + 0.009*"centro" + 0.009*"tecnologias" + 0.009*"tecnicos" + 0.009*"anexo" + 0.009*"relacionar" + 0.009*"cuyo"
Topic: 2 Word: 0.007*"chiapas" + 0.006*"investigacion" + 0.004*"guerrero" + 0.004*"protocolar" + 0.004*"municipio" + 0.004*"energia" + 0.004*"requerir" + 0.004*"conacyt" + 0.004*"oaxaca" + 0.004*"informar"
Topic: 3 Word: 0.007*"delito" + 0.006*"denunciar" + 0.006*"humano" + 0.005*"desglosar" + 0.004*"regresar" + 0.004*"robar" + 0.004*"asociacion" + 0.004*"cuestionario" + 0.004*"partir" + 0.004*"contestar"
Topic: 4 Word: 0.013*"sanitario" + 0.010*"consultar" + 0.009*"cofepris" + 0.006*"instituto" + 0.006*"recibir" + 0.005*"anexo" + 0.005*"contener" + 0.005*"completo" + 0.005*"informar" 

### Guardamos en S3

In [145]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2016/df_2016.pkl'
pickle_byte_obj = pickle.dumps(df_2016)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '99B1885E7223C724',
  'HostId': 'gT5L7NHcGwVoLFekdawQc6EyukYC6LekhewBBunwFT0SC/uKw7g8SW+ADuD83dKa7Y1Hv54E55w=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'gT5L7NHcGwVoLFekdawQc6EyukYC6LekhewBBunwFT0SC/uKw7g8SW+ADuD83dKa7Y1Hv54E55w=',
   'x-amz-request-id': '99B1885E7223C724',
   'date': 'Thu, 30 Jul 2020 17:59:45 GMT',
   'etag': '"af26342660663983f2fa930c79bf2ce0"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"af26342660663983f2fa930c79bf2ce0"'}

In [146]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2016/processed_docs.pkl'
pickle_byte_obj = pickle.dumps(processed_docs)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': 'CB468C77D19274EC',
  'HostId': 'XvhzMo/WWOUGO1XhRtGnHiNrdrrgosUwW+CN9FVmBp4vJYBAl/NjUTGeiKMLnzCLmeM0Jk7Kb24=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'XvhzMo/WWOUGO1XhRtGnHiNrdrrgosUwW+CN9FVmBp4vJYBAl/NjUTGeiKMLnzCLmeM0Jk7Kb24=',
   'x-amz-request-id': 'CB468C77D19274EC',
   'date': 'Thu, 30 Jul 2020 18:02:22 GMT',
   'etag': '"dba26894956c5a541806098885954de6"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"dba26894956c5a541806098885954de6"'}

In [147]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2016/lda_model_tfidf_5.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_5)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '7C30B40C15E75DD5',
  'HostId': 'wafH2GO4SkU6cC44a3dhTSyI8e8R9Q81i8GVFZrfwG+ZmDxDCtDGYWu4zvHRu68mSqRKnpXZ5Wg=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'wafH2GO4SkU6cC44a3dhTSyI8e8R9Q81i8GVFZrfwG+ZmDxDCtDGYWu4zvHRu68mSqRKnpXZ5Wg=',
   'x-amz-request-id': '7C30B40C15E75DD5',
   'date': 'Thu, 30 Jul 2020 18:02:38 GMT',
   'etag': '"774db951f40e34019daf868c40e56bed"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"774db951f40e34019daf868c40e56bed"'}

In [148]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2016/lda_model_tfidf_10.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_10)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': 'DC2D4B5B666987D6',
  'HostId': 'L4ONYdTlS1OsdKGxUuNihvSl1aDH61a6lGC4p7TL8tXTWIGaqA1WuPfD/uRJnPw8+BI60QbT9Jw=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'L4ONYdTlS1OsdKGxUuNihvSl1aDH61a6lGC4p7TL8tXTWIGaqA1WuPfD/uRJnPw8+BI60QbT9Jw=',
   'x-amz-request-id': 'DC2D4B5B666987D6',
   'date': 'Thu, 30 Jul 2020 18:02:43 GMT',
   'etag': '"222493be288caa31b22d5bb3a5e3417e"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"222493be288caa31b22d5bb3a5e3417e"'}

In [149]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2016/lda_model_tfidf_20.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_20)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': 'C1C8C26BF3AFC086',
  'HostId': 'qEKxBgtUKncTX9Bh4+LTcT+lZ50wXlUkAe/v9AeRCuYqdSGypnteWCUkZ8zFUw6YC4qBkycZRSo=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'qEKxBgtUKncTX9Bh4+LTcT+lZ50wXlUkAe/v9AeRCuYqdSGypnteWCUkZ8zFUw6YC4qBkycZRSo=',
   'x-amz-request-id': 'C1C8C26BF3AFC086',
   'date': 'Thu, 30 Jul 2020 18:02:48 GMT',
   'etag': '"222f773a6150a893ca01e651461c7e1c"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"222f773a6150a893ca01e651461c7e1c"'}

# 2017

In [150]:
filtro = df_new['anosolicitud'] == 2017
df_2017 = df_new[filtro]

In [151]:
df_2017.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 195396 entries, 324995 to 1221750
Data columns (total 31 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   folio                          195396 non-null  object        
 1   fechasolicitud                 195396 non-null  datetime64[ns]
 2   dependencia                    195396 non-null  object        
 3   estatus                        195396 non-null  object        
 4   medioentrada                   195396 non-null  object        
 5   tiposolicitud                  195396 non-null  object        
 6   descripcionsolicitud           195396 non-null  object        
 7   otrosdatos                     195396 non-null  object        
 8   archivoadjuntosolicitud        195396 non-null  object        
 9   medioentrega                   195396 non-null  object        
 10  fechalimite                    187922 non-null  datetime64[ns]

In [152]:
data = df_2017[['descripcionsolicitud_lemma']]
data['index'] = data.index
docs = data

In [153]:
docs.head()

Unnamed: 0,descripcionsolicitud_lemma,index
324995,alumno departamento obtener doctor ciencia cin...,324995
324996,alumno departamento obtener maestro ciencia ci...,324996
324997,alumno inscribir doctorar ciencia departamento...,324997
324998,curso seminario impartio carlos coello cinvest...,324998
324999,curso seminario impartio hector garcia compean...,324999


In [154]:
import spacy
import nltk
from nltk import SnowballStemmer

In [155]:
nlp = spacy.load('es_core_news_sm')

In [156]:
def ProcesarTexto(texto):
    
    doc = nlp(texto)
    words = [t.text for t in doc]
    
    return words

In [157]:
import multiprocessing
from multiprocessing.dummy import Pool as ThreadPool

In [158]:
cores = multiprocessing.cpu_count()
pool = ThreadPool(cores)
cores

12

In [159]:
processed_docs = pool.map(ProcesarTexto, docs['descripcionsolicitud_lemma'])
pool.close()

In [160]:
import gensim

In [161]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [162]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [163]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [164]:
from gensim import corpora, models
from pprint import pprint

In [165]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

### 5 tópicos TF-IDF

In [166]:
lda_model_tfidf_5 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=5,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [167]:
for idx, topic in lda_model_tfidf_5.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.014*"medicar" + 0.009*"medicamento" + 0.007*"paciente" + 0.006*"recetar" + 0.005*"nivel" + 0.005*"hospital" + 0.005*"publicar" + 0.004*"servidor" + 0.004*"periodo" + 0.004*"tipo"
Topic: 1 Word: 0.020*"remuneracion" + 0.017*"servidor" + 0.014*"publicar" + 0.013*"tipo" + 0.012*"deducción" + 0.011*"soldar" + 0.011*"nivelar" + 0.010*"prestación" + 0.010*"vincular" + 0.010*"neto"
Topic: 2 Word: 0.008*"sanitario" + 0.007*"concesion" + 0.007*"titulos" + 0.007*"numeros" + 0.005*"tramitar" + 0.005*"federal" + 0.005*"cofepris" + 0.005*"amparar" + 0.004*"tribunal" + 0.004*"informar"
Topic: 3 Word: 0.016*"nava" + 0.016*"vazquez" + 0.015*"telecomunicación" + 0.014*"subprocuraduria" + 0.012*"consultor" + 0.012*"organigrama" + 0.012*"aplicar" + 0.012*"dictamenes" + 0.011*"diario" + 0.011*"manual"
Topic: 4 Word: 0.007*"articular" + 0.006*"adquirir" + 0.004*"informar" + 0.004*"partir" + 0.004*"gastar" + 0.004*"casar" + 0.003*"presupuestal" + 0.003*"federal" + 0.003*"informatica" + 0.00

### 10 tópicos TF-IDF

In [168]:
lda_model_tfidf_10 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=10,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [169]:
for idx, topic in lda_model_tfidf_10.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.011*"amparar" + 0.010*"tribunal" + 0.009*"juicio" + 0.007*"colegiar" + 0.006*"dictar" + 0.006*"bebida" + 0.006*"federal" + 0.006*"montar" + 0.006*"distrito" + 0.006*"alcohol"
Topic: 1 Word: 0.013*"adquirir" + 0.012*"articular" + 0.010*"medicar" + 0.009*"medicamento" + 0.009*"informatica" + 0.007*"gastar" + 0.006*"adquisicion" + 0.006*"presupuestal" + 0.006*"partir" + 0.006*"mencionar"
Topic: 2 Word: 0.005*"federal" + 0.004*"electoral" + 0.004*"maderar" + 0.004*"articular" + 0.004*"informar" + 0.004*"partir" + 0.003*"sujeto" + 0.003*"instituto" + 0.003*"municipio" + 0.003*"recibir"
Topic: 3 Word: 0.007*"modificacion" + 0.007*"tramitar" + 0.005*"cofepris" + 0.005*"aplicar" + 0.005*"municipio" + 0.005*"sanitario" + 0.005*"zona" + 0.004*"secretario" + 0.004*"ambiental" + 0.004*"ubicar"
Topic: 4 Word: 0.009*"padron" + 0.009*"procampo" + 0.007*"partir" + 0.007*"original" + 0.006*"ejercer" + 0.006*"modificar" + 0.005*"presupuestal" + 0.005*"municipio" + 0.005*"presupuesto" + 

### 20 tópicos TF-IDF

In [170]:
lda_model_tfidf_20 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=20,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [171]:
for idx, topic in lda_model_tfidf_20.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.006*"proyecto" + 0.005*"obrar" + 0.004*"auditorio" + 0.004*"ambiental" + 0.004*"anexo" + 0.004*"computar" + 0.004*"municipio" + 0.004*"relacionar" + 0.004*"tribunal" + 0.004*"amparar"
Topic: 1 Word: 0.024*"organigrama" + 0.023*"manual" + 0.022*"dictamenes" + 0.022*"diario" + 0.021*"autorizacion" + 0.021*"oficial" + 0.021*"respectivo" + 0.020*"organizacion" + 0.018*"procedimiento" + 0.017*"federacion"
Topic: 2 Word: 0.012*"remuneracion" + 0.011*"servidor" + 0.011*"proporcionar" + 0.010*"publicar" + 0.010*"tipo" + 0.008*"propiedad" + 0.008*"solicitante" + 0.008*"industrial" + 0.008*"correspondiente" + 0.007*"soldar"
Topic: 3 Word: 0.016*"manufacturar" + 0.013*"maderar" + 0.009*"publicar" + 0.009*"servidor" + 0.008*"oficial" + 0.008*"guante" + 0.007*"organigrama" + 0.007*"dictamenes" + 0.007*"partir" + 0.007*"autorizacion"
Topic: 4 Word: 0.027*"procampo" + 0.027*"padron" + 0.017*"aviso" + 0.015*"original" + 0.014*"modificar" + 0.014*"partir" + 0.013*"vino" + 0.012*"alcoho

### Guardamos en S3

In [172]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2017/df_2017.pkl'
pickle_byte_obj = pickle.dumps(df_2017)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': 'F851D018F6D33049',
  'HostId': 'zqkPGPMh0BScooKNVRCgcIs3vDtmD+jwn5i42iFzxxY29U0QDxtE6jzq9Gry1H2tNlOCo7/vrcI=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'zqkPGPMh0BScooKNVRCgcIs3vDtmD+jwn5i42iFzxxY29U0QDxtE6jzq9Gry1H2tNlOCo7/vrcI=',
   'x-amz-request-id': 'F851D018F6D33049',
   'date': 'Thu, 30 Jul 2020 18:30:54 GMT',
   'etag': '"40eb26d44e51dfdfcd6b273623c41270"',
   'content-length': '0',
   'server': 'AmazonS3',
   'connection': 'close'},
  'RetryAttempts': 0},
 'ETag': '"40eb26d44e51dfdfcd6b273623c41270"'}

In [173]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2017/processed_docs.pkl'
pickle_byte_obj = pickle.dumps(processed_docs)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '36EBEDB498C2B8F7',
  'HostId': 'DyKdSFm1bvdbvD/l28je7IpqYBKc9XOiFLVeijLwpxvDd3JBXFYIKX8WXL/ytAVsRDt8/2lP00M=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'DyKdSFm1bvdbvD/l28je7IpqYBKc9XOiFLVeijLwpxvDd3JBXFYIKX8WXL/ytAVsRDt8/2lP00M=',
   'x-amz-request-id': '36EBEDB498C2B8F7',
   'date': 'Thu, 30 Jul 2020 18:36:39 GMT',
   'etag': '"0a0c72108705938a09e94c9929f3076a"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"0a0c72108705938a09e94c9929f3076a"'}

In [174]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2017/lda_model_tfidf_5.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_5)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '1NAW1S0R4K3V2W2J',
  'HostId': 'G1xPsJ9eNVb9dbZqPKW/IWpZ12lCZpXVB3a6DwoD9IadPFJvUFdQGtL8EacuiQLpha/mGnPAnqU=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'G1xPsJ9eNVb9dbZqPKW/IWpZ12lCZpXVB3a6DwoD9IadPFJvUFdQGtL8EacuiQLpha/mGnPAnqU=',
   'x-amz-request-id': '1NAW1S0R4K3V2W2J',
   'date': 'Thu, 30 Jul 2020 18:37:23 GMT',
   'etag': '"ae58190b8925ee447d1f565e645b18b4"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"ae58190b8925ee447d1f565e645b18b4"'}

In [175]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2017/lda_model_tfidf_10.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_10)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '2Z7HERCT6Y4KBV9R',
  'HostId': 'xz8aC2IjxLTEe58KlnyrWWckXxj1dczhg1bmLSYkErbb12M9P3ox84PCyOsY+Mfo1Tr2SApY+fk=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'xz8aC2IjxLTEe58KlnyrWWckXxj1dczhg1bmLSYkErbb12M9P3ox84PCyOsY+Mfo1Tr2SApY+fk=',
   'x-amz-request-id': '2Z7HERCT6Y4KBV9R',
   'date': 'Thu, 30 Jul 2020 18:37:28 GMT',
   'etag': '"9d206e460e6b16285b1fce4f89e60a59"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"9d206e460e6b16285b1fce4f89e60a59"'}

In [176]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2017/lda_model_tfidf_20.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_20)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '8A4CF9E33E740DB1',
  'HostId': 'Oe0jnkQZ5pq9xeq4zrHtQ02uGS2CIDQKvVYMC9y1E2H8Hm5q1I7U+N5q1yTBTZETeggA+GNHNIE=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'Oe0jnkQZ5pq9xeq4zrHtQ02uGS2CIDQKvVYMC9y1E2H8Hm5q1I7U+N5q1yTBTZETeggA+GNHNIE=',
   'x-amz-request-id': '8A4CF9E33E740DB1',
   'date': 'Thu, 30 Jul 2020 18:37:36 GMT',
   'etag': '"37718f4b90481bc57001453632c99e44"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"37718f4b90481bc57001453632c99e44"'}

# 2018

In [177]:
filtro = df_new['anosolicitud'] == 2018
df_2018 = df_new[filtro]

In [178]:
df_2018.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 184742 entries, 389995 to 1396674
Data columns (total 31 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   folio                          184742 non-null  object        
 1   fechasolicitud                 184742 non-null  datetime64[ns]
 2   dependencia                    184742 non-null  object        
 3   estatus                        184742 non-null  object        
 4   medioentrada                   184742 non-null  object        
 5   tiposolicitud                  184742 non-null  object        
 6   descripcionsolicitud           184742 non-null  object        
 7   otrosdatos                     184742 non-null  object        
 8   archivoadjuntosolicitud        184742 non-null  object        
 9   medioentrega                   184742 non-null  object        
 10  fechalimite                    177183 non-null  datetime64[ns]

In [179]:
data = df_2018[['descripcionsolicitud_lemma']]
data['index'] = data.index
docs = data

In [180]:
docs.head()

Unnamed: 0,descripcionsolicitud_lemma,index
389995,documentar legal expedir competente constar de...,389995
389997,listar escalafon cambiar turnar medicos especi...,389997
389999,acta adjudicacion contratacion integral banco ...,389999
390000,acta adjudicacion contratacion integral banco ...,390000
390001,querer agenciar tipo aguascalientes hablar con...,390001


In [181]:
import spacy
import nltk
from nltk import SnowballStemmer

In [182]:
nlp = spacy.load('es_core_news_sm')

In [183]:
def ProcesarTexto(texto):
    
    doc = nlp(texto)
    words = [t.text for t in doc]
    
    return words

In [184]:
import multiprocessing
from multiprocessing.dummy import Pool as ThreadPool

In [185]:
cores = multiprocessing.cpu_count()
pool = ThreadPool(cores)
cores

12

In [186]:
processed_docs = pool.map(ProcesarTexto, docs['descripcionsolicitud_lemma'])
pool.close()

In [187]:
import gensim

In [188]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [189]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [190]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [191]:
from gensim import corpora, models
from pprint import pprint

In [192]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

### 5 tópicos TF-IDF

In [193]:
lda_model_tfidf_5 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=5,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [194]:
for idx, topic in lda_model_tfidf_5.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.004*"permiso" + 0.004*"informar" + 0.004*"municipio" + 0.003*"mexicano" + 0.003*"electoral" + 0.003*"federal" + 0.003*"vigente" + 0.003*"casar" + 0.003*"sanitario" + 0.003*"gracia"
Topic: 1 Word: 0.005*"municipio" + 0.005*"concesion" + 0.004*"documentar" + 0.004*"titulos" + 0.004*"presupuestar" + 0.004*"numeros" + 0.004*"california" + 0.004*"informar" + 0.004*"federal" + 0.003*"bajo"
Topic: 2 Word: 0.005*"direccion" + 0.004*"montar" + 0.004*"federal" + 0.004*"requerir" + 0.004*"agostar" + 0.004*"instituto" + 0.004*"servidor" + 0.003*"gracia" + 0.003*"secretario" + 0.003*"funcionario"
Topic: 3 Word: 0.015*"medicamento" + 0.013*"medicar" + 0.011*"amparar" + 0.010*"tribunal" + 0.010*"juicio" + 0.009*"colegiar" + 0.006*"obsequiar" + 0.006*"recetar" + 0.006*"paciente" + 0.006*"plataforma"
Topic: 4 Word: 0.010*"sanitario" + 0.007*"cofepris" + 0.007*"tramitar" + 0.005*"salud" + 0.005*"ingresar" + 0.005*"secretario" + 0.005*"evidenciar" + 0.005*"consignar" + 0.005*"informar" +

### 10 tópicos TF-IDF

In [195]:
lda_model_tfidf_10 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=10,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [196]:
for idx, topic in lda_model_tfidf_10.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.033*"sanitario" + 0.018*"tramitar" + 0.016*"cofepris" + 0.013*"ingresar" + 0.010*"comision" + 0.009*"denominacion" + 0.008*"concesion" + 0.007*"titulos" + 0.007*"permiso" + 0.007*"informar"
Topic: 1 Word: 0.020*"amparar" + 0.019*"tribunal" + 0.018*"colegiar" + 0.016*"juicio" + 0.013*"obsequiar" + 0.012*"bajo" + 0.011*"plataforma" + 0.009*"requerir" + 0.007*"civil" + 0.006*"penal"
Topic: 2 Word: 0.006*"federacion" + 0.005*"magistrado" + 0.004*"diario" + 0.004*"oficial" + 0.004*"municipio" + 0.004*"montar" + 0.004*"requerir" + 0.004*"gastar" + 0.004*"fiscal" + 0.003*"ambiental"
Topic: 3 Word: 0.022*"medicamento" + 0.022*"medicar" + 0.011*"salud" + 0.011*"recetar" + 0.010*"adquisicion" + 0.010*"paciente" + 0.010*"hospital" + 0.008*"licitacion" + 0.008*"evidenciar" + 0.007*"consignar"
Topic: 4 Word: 0.008*"educacion" + 0.007*"convenio" + 0.007*"direccion" + 0.005*"secretario" + 0.005*"sindicato" + 0.005*"valioso" + 0.005*"quedar" + 0.005*"institucion" + 0.005*"casar" + 0.0

### 20 tópicos TF-IDF

In [197]:
lda_model_tfidf_20 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=20,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [198]:
for idx, topic in lda_model_tfidf_20.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.040*"medicamento" + 0.035*"medicar" + 0.018*"recetar" + 0.016*"paciente" + 0.012*"gubernamental" + 0.012*"pieza" + 0.012*"hospital" + 0.011*"adquisicion" + 0.011*"licitacion" + 0.011*"cuadrar"
Topic: 1 Word: 0.005*"tramitar" + 0.005*"prevención" + 0.005*"partir" + 0.005*"jubilacion" + 0.005*"modificacion" + 0.004*"seguridad" + 0.004*"electoral" + 0.004*"desglosar" + 0.004*"cofepris" + 0.004*"cortar"
Topic: 2 Word: 0.017*"valioso" + 0.015*"quedar" + 0.014*"cofepris" + 0.013*"ordenar" + 0.013*"modificación" + 0.013*"corresponder" + 0.013*"sanitario" + 0.011*"gracia" + 0.007*"tramitar" + 0.007*"acosar"
Topic: 3 Word: 0.006*"chihuahua" + 0.005*"confianza" + 0.005*"honorario" + 0.005*"requerir" + 0.005*"roer" + 0.005*"adscripcion" + 0.005*"quintana" + 0.004*"municipio" + 0.004*"etcétera" + 0.004*"delegacion"
Topic: 4 Word: 0.026*"obsequiar" + 0.010*"partir" + 0.007*"diputar" + 0.006*"bien" + 0.006*"oficialia" + 0.005*"protocolar" + 0.005*"campana" + 0.005*"andres" + 0.005*"

### Guardamos en S3

In [199]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2018/df_2018.pkl'
pickle_byte_obj = pickle.dumps(df_2018)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': 'DB4165D87816C1B5',
  'HostId': 'hqVti/+oYML1msWJPZb+Rz3VyYFYFAGm3Q5yM/DAKnX2vN4VE/MXtz/AhzkNvzFhBAVkq40KBQM=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'hqVti/+oYML1msWJPZb+Rz3VyYFYFAGm3Q5yM/DAKnX2vN4VE/MXtz/AhzkNvzFhBAVkq40KBQM=',
   'x-amz-request-id': 'DB4165D87816C1B5',
   'date': 'Thu, 30 Jul 2020 19:04:12 GMT',
   'etag': '"fa7233747bfea1f12925e6e16939fb55"',
   'content-length': '0',
   'server': 'AmazonS3',
   'connection': 'close'},
  'RetryAttempts': 0},
 'ETag': '"fa7233747bfea1f12925e6e16939fb55"'}

In [200]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2018/processed_docs.pkl'
pickle_byte_obj = pickle.dumps(processed_docs)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '9BFB9FF0B034910E',
  'HostId': 'hDCe2ihdk1NkrMwgF33P4u9vIJDbVK4YCQQFUJA3vFogI0HzKv3KUyErQWjGv/5b7EF9LaJ7iOI=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'hDCe2ihdk1NkrMwgF33P4u9vIJDbVK4YCQQFUJA3vFogI0HzKv3KUyErQWjGv/5b7EF9LaJ7iOI=',
   'x-amz-request-id': '9BFB9FF0B034910E',
   'date': 'Thu, 30 Jul 2020 19:09:37 GMT',
   'etag': '"d9218eb34bd34ae5f370ce4f4a35a07c"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"d9218eb34bd34ae5f370ce4f4a35a07c"'}

In [201]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2018/lda_model_tfidf_5.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_5)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': 'AT7Z8WAP9VEW1X8G',
  'HostId': 'CL2Dt/VtA1u/zTcXDRo5+hqBrt6xhibe22CR9vRmjkVx1IJ01fc+JsiajV5b9YXTXXh4VI+nPKA=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'CL2Dt/VtA1u/zTcXDRo5+hqBrt6xhibe22CR9vRmjkVx1IJ01fc+JsiajV5b9YXTXXh4VI+nPKA=',
   'x-amz-request-id': 'AT7Z8WAP9VEW1X8G',
   'date': 'Thu, 30 Jul 2020 19:13:23 GMT',
   'etag': '"42b10e2f8383acc303c6b6bb79bf672a"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"42b10e2f8383acc303c6b6bb79bf672a"'}

In [202]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2018/lda_model_tfidf_10.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_10)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '210E11C6EA93895C',
  'HostId': 'lFssXGaJ3B8bO68cTO7ka1W7rvqHNta0ZwJyk7LkApF7DfrxwFWDJgJozRaixqJsu07C1/sXkhI=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'lFssXGaJ3B8bO68cTO7ka1W7rvqHNta0ZwJyk7LkApF7DfrxwFWDJgJozRaixqJsu07C1/sXkhI=',
   'x-amz-request-id': '210E11C6EA93895C',
   'date': 'Thu, 30 Jul 2020 19:13:28 GMT',
   'etag': '"01f8562133e28669376820fbde9bd794"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"01f8562133e28669376820fbde9bd794"'}

In [203]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2018/lda_model_tfidf_20.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_20)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': '02DD0E387FE131B3',
  'HostId': 'Hms1G6VVQTHVyC4o8T9jeTC0u6tw+SVbndHTK28kFYlL7c56QyrtBnOF7w8fHlp93bGfW1yz3Kg=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'Hms1G6VVQTHVyC4o8T9jeTC0u6tw+SVbndHTK28kFYlL7c56QyrtBnOF7w8fHlp93bGfW1yz3Kg=',
   'x-amz-request-id': '02DD0E387FE131B3',
   'date': 'Thu, 30 Jul 2020 19:13:33 GMT',
   'etag': '"0148de123cda063ceddc767ef2c26afc"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"0148de123cda063ceddc767ef2c26afc"'}

# 2019

In [204]:
filtro = df_new['anosolicitud'] == 2019
df_2019 = df_new[filtro]

In [205]:
df_2019.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 222609 entries, 454993 to 1559008
Data columns (total 31 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   folio                          222609 non-null  object        
 1   fechasolicitud                 222609 non-null  datetime64[ns]
 2   dependencia                    222609 non-null  object        
 3   estatus                        222609 non-null  object        
 4   medioentrada                   222609 non-null  object        
 5   tiposolicitud                  222609 non-null  object        
 6   descripcionsolicitud           222609 non-null  object        
 7   otrosdatos                     222609 non-null  object        
 8   archivoadjuntosolicitud        222609 non-null  object        
 9   medioentrega                   222609 non-null  object        
 10  fechalimite                    215440 non-null  datetime64[ns]

In [206]:
data = df_2019[['descripcionsolicitud_lemma']]
data['index'] = data.index
docs = data

In [207]:
docs.head()

Unnamed: 0,descripcionsolicitud_lemma,index
454993,documentar listar funcionario local oficina pr...,454993
454994,documentar listar funcionario local oficina pr...,454994
454995,amparar revision indice decimo tribunal colegi...,454995
454996,amparar revision indice tribunal colegiar escr...,454996
454997,juicio amparar indirecto indice juzgar decimo ...,454997


In [208]:
import spacy
import nltk
from nltk import SnowballStemmer

In [209]:
nlp = spacy.load('es_core_news_sm')

In [210]:
def ProcesarTexto(texto):
    
    doc = nlp(texto)
    words = [t.text for t in doc]
    
    return words

In [211]:
import multiprocessing
from multiprocessing.dummy import Pool as ThreadPool

In [212]:
cores = multiprocessing.cpu_count()
pool = ThreadPool(cores)
cores

12

In [213]:
processed_docs = pool.map(ProcesarTexto, docs['descripcionsolicitud_lemma'])
pool.close()

In [214]:
import gensim

In [215]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [216]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [217]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [218]:
from gensim import corpora, models
from pprint import pprint

In [219]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

### 5 tópicos TF-IDF

In [220]:
lda_model_tfidf_5 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=5,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [221]:
for idx, topic in lda_model_tfidf_5.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.004*"servidor" + 0.004*"requerir" + 0.004*"institucion" + 0.004*"mensual" + 0.004*"informar" + 0.004*"trabajador" + 0.004*"secretario" + 0.004*"soldar" + 0.003*"federal" + 0.003*"octubre"
Topic: 1 Word: 0.012*"sanitario" + 0.011*"tramitar" + 0.008*"permiso" + 0.007*"control" + 0.007*"fumigacion" + 0.007*"plagar" + 0.005*"comision" + 0.005*"periodo" + 0.005*"montar" + 0.004*"ingresar"
Topic: 2 Word: 0.013*"medicamento" + 0.012*"medicar" + 0.007*"padron" + 0.006*"recetar" + 0.005*"municipio" + 0.005*"hospital" + 0.005*"paciente" + 0.004*"extintor" + 0.004*"grupo" + 0.004*"periodo"
Topic: 3 Word: 0.008*"amparar" + 0.007*"tribunal" + 0.007*"gracia" + 0.006*"valioso" + 0.006*"ordenar" + 0.006*"quedar" + 0.006*"dictar" + 0.005*"colegiar" + 0.005*"corresponder" + 0.005*"decimo"
Topic: 4 Word: 0.004*"denunciar" + 0.004*"firmar" + 0.004*"procedimiento" + 0.003*"convenio" + 0.003*"publicar" + 0.003*"documentar" + 0.003*"casar" + 0.003*"contratista" + 0.003*"avanzar" + 0.003*"fis

### 10 tópicos TF-IDF

In [222]:
lda_model_tfidf_10 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=10,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [223]:
for idx, topic in lda_model_tfidf_10.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.007*"medicamento" + 0.005*"grupo" + 0.004*"federal" + 0.004*"publicar" + 0.004*"facturar" + 0.004*"secretario" + 0.004*"adjudicacion" + 0.004*"servidor" + 0.004*"plaza" + 0.004*"direccion"
Topic: 1 Word: 0.027*"medicar" + 0.021*"medicamento" + 0.017*"recetar" + 0.013*"paciente" + 0.008*"hospital" + 0.006*"nivel" + 0.005*"matricular" + 0.005*"prescribir" + 0.005*"basico" + 0.005*"cuadrar"
Topic: 2 Word: 0.005*"ambiental" + 0.004*"secretario" + 0.004*"impactar" + 0.003*"credito" + 0.003*"convenio" + 0.003*"publicar" + 0.003*"federal" + 0.003*"presupuestar" + 0.003*"desarrollar" + 0.003*"financiero"
Topic: 3 Word: 0.008*"denunciar" + 0.006*"avanzar" + 0.005*"firmar" + 0.005*"articular" + 0.005*"publicar" + 0.005*"tipo" + 0.004*"social" + 0.004*"institucion" + 0.004*"montar" + 0.004*"acosar"
Topic: 4 Word: 0.011*"extintor" + 0.009*"enajenación" + 0.008*"octubre" + 0.008*"noviembre" + 0.008*"oficina" + 0.008*"salud" + 0.007*"completar" + 0.007*"concesion" + 0.006*"confianza

### 20 tópicos TF-IDF

In [224]:
lda_model_tfidf_20 = gensim.models.LdaMulticore(corpus_tfidf, 
                                            num_topics=20,
                                            id2word=dictionary,
                                            passes=2,
                                            workers=cores)

In [225]:
for idx, topic in lda_model_tfidf_20.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.017*"tramitar" + 0.009*"delito" + 0.008*"periodo" + 0.006*"denunciar" + 0.006*"detenido" + 0.006*"robar" + 0.006*"recibir" + 0.005*"responsable" + 0.005*"casar" + 0.005*"investigacion"
Topic: 1 Word: 0.007*"lotenal" + 0.004*"desarrollar" + 0.004*"comite" + 0.004*"requerir" + 0.003*"acta" + 0.003*"convenio" + 0.003*"consultar" + 0.003*"citar" + 0.003*"extraordinario" + 0.003*"casar"
Topic: 2 Word: 0.024*"montar" + 0.018*"extintor" + 0.016*"institucion" + 0.013*"detallar" + 0.012*"proveedor" + 0.011*"comer" + 0.009*"adjudicacion" + 0.009*"fiesta" + 0.009*"gracia" + 0.009*"gastar"
Topic: 3 Word: 0.010*"avanzar" + 0.008*"firmar" + 0.006*"proceso" + 0.005*"digital" + 0.005*"procedimiento" + 0.005*"certificar" + 0.005*"publicar" + 0.004*"educacion" + 0.004*"caracter" + 0.004*"articular"
Topic: 4 Word: 0.041*"sanitario" + 0.021*"valioso" + 0.019*"ordenar" + 0.018*"cofepris" + 0.017*"quedar" + 0.016*"corresponder" + 0.014*"comision" + 0.014*"tramitar" + 0.013*"gracia" + 0.011*

### Guardamos en S3

In [None]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2019/df_2019.pkl'
pickle_byte_obj = pickle.dumps(df_2019)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

In [None]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2019/processed_docs.pkl'
pickle_byte_obj = pickle.dumps(processed_docs)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

In [None]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2019/lda_model_tfidf_5.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_5)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

In [None]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2019/lda_model_tfidf_10.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_10)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

In [None]:
bucket='inai-summerofdata'
key='modeling/LDA/3/2019/lda_model_tfidf_20.pkl'
pickle_byte_obj = pickle.dumps(lda_model_tfidf_20)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)