In [1]:
# import basic libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('spanish')
import json
import unidecode
import spacy
nlp = spacy.load("es_core_news_md")

In [2]:
# Getting "anti-taxonomy"
anti_taxonomy_file_name = "anti_taxonomy_export_2.json"
anti_taxonomy_file = open(anti_taxonomy_file_name, 'r')
anti_taxonomy = json.load(anti_taxonomy_file) 
anti_taxonomy += ['stroke', 'span', 'trebuchet', 'webkit', 'kerning', 'calibri', 'font', 'text', 'margin', 'm' , 'ten', 'www', 'https' 'https', 'none']

In [3]:
from fuzzywuzzy import process

def is_in_anti_tax(sr):
    highest = process.extractOne(sr,anti_taxonomy)
    return highest[1] < 95



In [5]:
# function to clean text
def review_to_words(raw_review):
    # 1. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review) 

    # 2. Convert to lower case and only words
    only_words = letters_only.lower()
    only_words = " ".join(only_words.split())
    # 2.1 the nlp doc and words ❤️ 
    doc = nlp(only_words)
    lexical_tokens = [tok.lemma_ for tok in doc if tok.pos_ != 'PRON' and not (tok.is_punct | tok.is_stop)]
    words = [t.lower() for t in lexical_tokens if len(t) > 2 and t.isalpha()]
    # 3. Remove Stopwords. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("spanish"))  
    # 4. Remove stop words
    meaningful_words = [w for w in words if not w in stops]  #returns a list 
    # 4.1 if you want remove words in anti-taxonomy
    #meaningful_words = list(filter(is_in_anti_tax, meaningful_words)) 
    meaningful_words = [w for w in meaningful_words if w.strip() not in anti_taxonomy]
    # 5. Stem words. Need to define porter stemmer above
    #meaningful_words = [(lambda x: stemmer.stem(x) if len(x) > 5 else x)(word) for word in meaningful_words]
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))  

# Plural to singular lab (don't run)

In [59]:
processed_text = nlp("sifilis amores pasados perros casemonos guapos pepinillos")
lemma_tags = {"NNS", "NNPS"}
for token in processed_text:
    lemma = token.text
    if token.tag_ in lemma_tags:
        print(token.lemma_)
        lemma = token.lemma_

In [65]:
[p.lemma for p in processed_text]

[9993756361066428433,
 7569031656004518897,
 8365782747564503572,
 1889751909063932040,
 2518710851894020402,
 388792319402109172,
 1822172757761197604]

# lab final (run from next line)

In [6]:
file = open("sin_tildes_sexperto.json", 'r')
questions = json.load(file)

In [7]:
questions[0]

{'Name': '?Es necesario tomar anticoncepcion de emergencia cuando se utiliza el anillo anticonceptivo vaginal (NuvaRing) sin condon?',
 'respuestas_juntas': 'Solo cuando comienzas a usar el anillo anticonceptivo vaginal en un dia diferente a los primeros cinco dias de la menstruacion, es necesario utilizar un metodo anticonceptivo adicional. Si este es tu caso y no usaste condon si seria necesario usar anticoncepcion de emergencia. Te recomiendo asistir a una asesoria en anticoncepcion en tu centro de atencion en salud. Recuerda utilizar condon en todas las relaciones sexuales.'}

In [8]:
# apply it to our text data   
# dataset is named wine_data and the text are in the column "wmn"  
processed_wmn = [review_to_words("{} {}".format(question["Name"], question["respuestas_juntas"])) for question in questions] 
len(processed_wmn)

1137

In [9]:
" ".join(processed_wmn[0].split())

'tomar anticoncepcion emergencia anillar anticonceptivo vaginal nuvaring condon anillar anticonceptivo vaginal menstruacion anticonceptivo casar condon serio anticoncepcion emergencia recomer asistir asesoria anticoncepcion centrar atencion salud condon relacionar sexual'

In [10]:
# build a corpus for the word2vec model  
def build_corpus(data):
    corpus = []
    for sentence in data:
        sentence = " ".join(sentence.split())
        word_list = sentence.split(" ")  
        #corpus.append([(lambda x: x[:-2] if x[-2:] == 'es' and len(x) > 3 else (x[:-1] if x[-1:] == "s" and len(x) > 3 else x))(w) for w in word_list])   
        corpus.append(word_list)   
    return corpus

In [11]:
corpus = build_corpus(processed_wmn)
len(corpus)

1137

In [13]:
import pickle
with open('corpus_sexperto_with_lemma_with_s-04062020.pkl', 'wb') as corpus_export_file:
    pickle.dump(corpus, corpus_export_file)

# Thanks Sexperto ❤️🤓

In [52]:
# load the word2vec algorithm from the gensim library  
from gensim.models import word2vec  
# run the model  
model = word2vec.Word2Vec(corpus, size=3, window=4, min_count=30, workers=4)
#model = word2vec.Word2Vec(corpus, size=30, window=4, min_count=30, workers=4)
#model = word2vec.Word2Vec(corpus, size=300, window=4, min_count=30, workers=4)
#model = word2vec.Word2Vec(corpus, size=3000, window=5, min_count=30, workers=4)
len(model.wv.vocab)

317

In [None]:
# A ver iterando sobre 
def getModelByIteratingMin_count(words):
    modelBIMC = None
    min_count = 3000
    while true:
        modelBIMC = word2vec.Word2Vec(corpus, size=200, window=5, min_count=1000, workers=4)


In [None]:
min_words = ["embarazo", "pastillas", ]

In [53]:
[(item[0], round(item[1],5)) for item in model.wv.most_similar('anticonceptivo', topn=50)]

[('dudar', 0.99925),
 ('garantizar', 0.99924),
 ('requerir', 0.998),
 ('alcanzar', 0.99796),
 ('regular', 0.99764),
 ('voluntario', 0.99764),
 ('tomar', 0.99761),
 ('temer', 0.99756),
 ('asesorar', 0.99713),
 ('medicamento', 0.99711),
 ('metodos', 0.99699),
 ('emergencia', 0.99697),
 ('especificar', 0.99666),
 ('pastilla', 0.99633),
 ('vasectomia', 0.99613),
 ('descartar', 0.99597),
 ('necesidad', 0.99583),
 ('linear', 0.99528),
 ('hora', 0.99504),
 ('humano', 0.99467),
 ('acceso', 0.99377),
 ('acudir', 0.99374),
 ('querer', 0.99371),
 ('jovenes', 0.99369),
 ('comunicacion', 0.99342),
 ('cortar', 0.99322),
 ('efectividad', 0.99317),
 ('duracion', 0.99293),
 ('ser', 0.99277),
 ('apoyar', 0.99276),
 ('gratuito', 0.99251),
 ('recomendar', 0.99223),
 ('desear', 0.99186),
 ('estudio', 0.99176),
 ('fundamental', 0.99132),
 ('hijo', 0.99114),
 ('disminuir', 0.99113),
 ('dejar', 0.99073),
 ('situacion', 0.99071),
 ('disfrutar', 0.99054),
 ('https', 0.99031),
 ('correcto', 0.99002),
 ('subdermi

In [74]:
[x for x in model.wv.vocab][-25:]

['atraccion',
 'infectar',
 'cortar',
 'organo',
 'colombiano',
 'pubertad',
 'virgen',
 'presion',
 'referir',
 'olor',
 'malo',
 'higiene',
 'gustar',
 'delito',
 'citologia',
 'voluntario',
 'papiloma',
 'humanar',
 'vph',
 'legal',
 'pornografia',
 'vacuno',
 'victimar',
 'experiencia',
 'denunciar']

In [75]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# define the function to compute the dimensionality reduction
# and then produce the biplot

"Creates a TSNE model and plots it"
labels = []
tokens = []

for word in model.wv.vocab:
    tokens.append(model[word])
    labels.append(word)

tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500)
new_values = tsne_model.fit_transform(tokens)

x = []
y = []
for value in new_values:
    x.append(value[0])
    y.append(value[1])

plt.figure(figsize=(32, 32)) 
for i in range(len(x)):
    plt.scatter(x[i],y[i])
    plt.annotate(labels[i],
                 xy=(x[i], y[i]),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')
plt.savefig('foo_comp_4.png')
plt.savefig('foo_comp_4.pdf')

# Taxonomy build GENSIM

In [76]:
from gensim.parsing.preprocessing import remove_stopwords
from gensim.summarization import keywords
import warnings
warnings.filterwarnings("ignore")

In [77]:
textt = " ".join([" ".join(c) for c in corpus])

In [78]:
values = keywords(text=textt,split=True,scores=True, lemmatize=True)
len(values)

730

In [79]:
pd.options.display.max_rows = 999
data_keywords = pd.DataFrame(values,columns=['keyword','score'])
data_keywords = data_keywords.sort_values('score',ascending=False)
data_keywords.head(100)

Unnamed: 0,keyword,score
0,personar,0.265331
1,poder,0.188799
2,saludable,0.167994
3,relacionar sexual parejo embarazar eyacular va...,0.141382
4,condon,0.139714
5,casar,0.137672
6,ano,0.119634
7,sentir,0.119131
8,causar,0.104444
9,sexualidad,0.093179


In [22]:
len(data_keywords)

1022

# Taxonomy build using Spacy

In [105]:
from collections import Counter
from functools import reduce

In [106]:
words_for_tokenization = reduce(lambda x,y :x+y, corpus)
len(words_for_tokenization)

54300

In [115]:
word_freq = Counter(words_for_tokenization)
common_words = word_freq.most_common(int(54300/2))

In [118]:
[c[0] for c in common_words]

['sexual',
 'relacion',
 'infeccion',
 'person',
 'embaraz',
 'salud',
 'condon',
 'anticoncept',
 'poder',
 'transmision',
 'asist',
 'parej',
 'recom',
 'anticoncepcion',
 'consult',
 'sent',
 'ano',
 'casar',
 'present',
 'efect',
 'tomar',
 'informacion',
 'pagin',
 'sexo',
 'derech',
 'medic',
 'penar',
 'dolor',
 'cau',
 'adecu',
 'citar',
 'genital',
 'des',
 'sangr',
 'tipo',
 'recib',
 'preven',
 'normal',
 'quer',
 'vaginal',
 'entid',
 'edad',
 'practic',
 'atencion',
 'hormonal',
 'recomend',
 'inici',
 'metod',
 'vida',
 'asesori',
 'vagin',
 'pastill',
 'oral',
 'menstrual',
 'menstruacion',
 'cuerp',
 'vih',
 'enfermed',
 'sexpert',
 'segund',
 'problem',
 'sintom',
 'fisic',
 'emergent',
 'situacion',
 'orientacion',
 'eyaculacion',
 'ayud',
 'violenci',
 'encontr',
 'proteg',
 'siguient',
 'caso',
 'reproduct',
 'solicit',
 'nec',
 'femenin',
 'penetracion',
 'prob',
 'medicament',
 'hijo',
 'afect',
 'contact',
 'human',
 'disminu',
 'centr',
 'orgasm',
 'plac',
 'abo

In [119]:
import pickle
with open("corpus_sexperto_with_stemming_vector.pkl", 'wb') as fio_corpus:
    pickle.dump([c[0] for c in common_words], fio_corpus)

In [83]:
ret = []
for icm in range(len(common_words)):
    dic = {'word':common_words[icm][0]}
    dic["child"] = [item[0] for item in model.wv.most_similar(common_words[icm][0], topn=20)]
    ret.append(dic)
len(ret)

100

In [84]:
ret

[{'word': 'sexual',
  'child': ['relacionar',
   'parejo',
   'disfrutar',
   'relacion',
   'adquirir',
   'practicar',
   'significar',
   'gustar',
   'respetar',
   'vih',
   'consentimiento',
   'contactar',
   'identificar',
   'querer',
   'acto',
   'anal',
   'comunicacion',
   'placentero',
   'orientacion',
   'compartir']},
 {'word': 'relacionar',
  'child': ['adquirir',
   'sexual',
   'contactar',
   'significar',
   'comunicacion',
   'its',
   'transmitir',
   'presion',
   'respetar',
   'incluir',
   'consentimiento',
   'compartir',
   'relacion',
   'actividad',
   'ayudar',
   'diferenciar',
   'definir',
   'establecer',
   'protegerte',
   'malo']},
 {'word': 'personar',
  'child': ['atraccion',
   'sentir',
   'sexo',
   'parejo',
   'orientacion',
   'practicar',
   'gustar',
   'placer',
   'disfrutar',
   'identificar',
   'acto',
   'sexual',
   'querer',
   'sexualidad',
   'excitacion',
   'relacion',
   'consentimiento',
   'respetar',
   'orgasmo',
   'r

In [91]:
queries_plain = []
for word_main in ret:
    for word_two in word_main['child']:
        queries_plain.append("{} AND {}".format(word_main['word'], word_two))

In [93]:
queries_df = pd.DataFrame(queries_plain) 

In [98]:
with open("queries_bytes.pkl", "wb") as queries_outfile:
    pickle.dump(queries_plain, queries_outfile)

In [85]:
common_words

[('sexual', 2462),
 ('relacionar', 931),
 ('personar', 858),
 ('salud', 759),
 ('embarazar', 703),
 ('condon', 665),
 ('anticonceptivo', 633),
 ('infección', 627),
 ('poder', 596),
 ('transmision', 575),
 ('asistir', 525),
 ('parejo', 498),
 ('recomer', 473),
 ('anticoncepcion', 433),
 ('consultar', 395),
 ('ano', 356),
 ('sentir', 349),
 ('relacion', 343),
 ('casar', 342),
 ('tomar', 304),
 ('informacion', 299),
 ('presentar', 295),
 ('sexo', 278),
 ('derecho', 273),
 ('página', 270),
 ('medicar', 252),
 ('penar', 252),
 ('causar', 245),
 ('citar', 243),
 ('infeccion', 243),
 ('genital', 240),
 ('desear', 233),
 ('sangrar', 232),
 ('tipo', 225),
 ('dolor', 221),
 ('recibir', 214),
 ('sexualidad', 214),
 ('adecuar', 213),
 ('prevenir', 209),
 ('querer', 205),
 ('normal', 203),
 ('vaginal', 201),
 ('entidad', 200),
 ('edad', 196),
 ('atencion', 193),
 ('hormonal', 185),
 ('practicar', 184),
 ('metodo', 180),
 ('vida', 178),
 ('asesoria', 177),
 ('pastilla', 177),
 ('oral', 176),
 ('inic

In [None]:
set([1,2])

## another way

In [None]:
def arbol_build(child, yap, n=2):
    if n <= 0:
        return [{"name": son["name"], "children": [{"name":item[0]} for item in model.wv.most_similar(son["name"], topn=15) if item[0] not in yap]} for son in child]
    else:
        return [{"name": son["name"], "children": arbol_build([{"name":item[0]} for item in model.wv.most_similar(son["name"], topn=10)  if item[0] not in yap], yap + [son["name"]] + [item[0] for item in model.wv.most_similar(son["name"], topn=15)  if item[0] not in yap], n-1)} for son in child]

In [None]:
dic = {'name': 'embarazo'}
dic["children"] = [{"name":item[0]} for item in model.wv.most_similar(dic["name"], topn=15)]
dic["children"] = arbol_build(dic["children"], ['embarazo'] + [{"name":item[0]} for item in model.wv.most_similar(dic["name"], topn=10) if item[0] not in ['embarazo']] , 2)

In [None]:
json.dumps(dic)

# Getting Queries

In [86]:
def getQueries(dic, acum): 
    if "children" not in [attr for attr in dic]:
        if len(acum) > 0:
            return " AND ".join([acum, dic["name"]])
        else:
            return dic["name"]
    else:
        if len(acum) > 0:
            return " ; ".join([" AND ".join([acum, dic["name"]])] + [getQueries(d, "{} AND {}".format(acum, dic["name"])) for d in dic["children"]])
        else:
            return " ; ".join([getQueries(d, dic["name"]) for d in dic["children"]])
        

In [87]:
queries = getQueries(ret, "")

TypeError: list indices must be integers or slices, not str

In [None]:
bigQueries = [" AND ".join(sorted(("".join(qq.split(" "))).split("AND"))) for qq in [q for q in queries.split(";")]]
len(bigQueries)

In [None]:
nQueries = list(set(bigQueries))
nQueries.sort(key=len)
len(nQueries)

In [None]:
[qr for qr in nQueries if "anticoncepcion" in qr]

In [None]:
import pickle
with open("bigQueryList3.pkl", 'wb') as fiout:
    pickle.dump(nQueries, fiout)

### read pickle

In [None]:
import pickle
with open("bigQueryList3.pkl", 'rb') as fiin:
    queries = pickle.load(fiin)

In [None]:
len(queries)

In [None]:
sorted([qr for qr in queries if "anticoncepcion" in qr], key=len)

# Lab for synonims

In [None]:
nlp_lg = spacy.load("es_core_news_md")

In [None]:
vector = nlp(u"Fruta").vector
nlp_lg.vocab.vectors.most_similar(nlp_lg.vocab['fruta'].vector.reshape(1,50))

In [None]:
nlp_lg.vocab['fruta']

In [None]:
def most_similar(word, topn=5):
    word = nlp_lg.vocab[str(word)]
    queries = [
      w for w in word.vocab 
      if w.is_lower == word.is_lower and w.prob >= -15 and np.count_nonzero(w.vector)
    ]

    by_similarity = sorted(queries, key=lambda w: word.similarity(w), reverse=True)
    return [(w.lower_,w.similarity(word)) for w in by_similarity[:topn+1] if w.lower_ != word.lower_]

In [None]:
most_similar("embarazo", topn=10)

In [None]:
import time
'twitterscraper "{}" -o {}.json -p'.format("213123", "-".join(['123123']))