- Versiones de librerías, python 3.8.10

- numpy 1.20.3
- nltk 3.7
- gensim 4.1.2
- keras 2.9.0
- tensorflow 2.9.1
- instant-distance 0.3.5


In [1]:
import gensim.downloader

# Show the names of every pretrained model exposed by gensim's downloader
# (iterating the 'models' mapping yields its keys).
print(list(gensim.downloader.info()['models']))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [2]:
# Load the 'fasttext-wiki-news-subwords-300' vectors through gensim's downloader API.
ft_300 = gensim.downloader.load('fasttext-wiki-news-subwords-300')

In [3]:
# Analogy query: 'king' and 'woman' are the positive terms, 'man' the negative one.
ft_300.most_similar_cosmul(positive=['king', 'woman'], negative=['man'])

[('queen', 0.9390855431556702),
 ('queen-mother', 0.9078598618507385),
 ('king-', 0.8828966617584229),
 ('queen-consort', 0.882541835308075),
 ('child-king', 0.8680858016014099),
 ('monarch', 0.8670082688331604),
 ('ex-queen', 0.8654637932777405),
 ('princess', 0.8628991842269897),
 ('queen-', 0.8613532781600952),
 ('boy-king', 0.860465943813324)]

In [4]:
# Run the word-analogy benchmark stored in 'questions-words.txt'.
# NOTE(review): the path is relative, so the file presumably has to sit in the
# notebook's working directory — confirm it is shipped with the notebook.
score, results = ft_300.evaluate_word_analogies('questions-words.txt')

In [5]:
# Display the overall score returned by the analogy evaluation.
score

0.8827876424099353

In [6]:
import numpy as np

# Expose the model's embedding matrix and its vocabulary as NumPy arrays.
# Later cells pair them by position: vectors_ft[i] is used as the vector of labels_ft[i].
vectors_ft = np.asarray(ft_300.vectors)
labels_ft = np.asarray(ft_300.index_to_key)

In [7]:
import tensorflow
from tensorflow import keras
from sklearn.datasets import fetch_20newsgroups

# Fetch both 20-newsgroups splits as (texts, targets) pairs, with headers,
# footers and quoted replies removed from every post.
X_train_text, Y_train = fetch_20newsgroups(
    subset="train",
    remove=('headers', 'footers', 'quotes'),
    return_X_y=True,
)
X_test_text, Y_test = fetch_20newsgroups(
    subset="test",
    remove=('headers', 'footers', 'quotes'),
    return_X_y=True,
)

In [8]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer

# Distinct target ids present in the training split.
classes = np.unique(Y_train)

# Load stop-words
stop_words = set(stopwords.words('english'))

# Initialize the word tokenizer: keeps runs of ASCII letters and apostrophes.
# It's also possible to try with a stemmer or to mix a stemmer and a lemmatizer.
# It is deliberately NOT called `tokenizer`: a later cell binds that name to the
# Keras Tokenizer, and sharing the name would make tokenize() fail if the
# preprocessing cell is re-run after that rebinding.
word_tokenizer = RegexpTokenizer('[\'a-zA-Z]+')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()


def tokenize(document):
    """Normalize a raw document into a single space-separated string.

    The document is split into sentences, and each sentence into word tokens.
    A token is kept when its lower-cased form is not an English stop-word and
    the token is longer than two characters. Kept tokens are lower-cased and
    lemmatized.

    Args:
        document: Raw text of one document.

    Returns:
        The kept, lemmatized tokens joined by single spaces (an empty string
        when nothing is kept).
    """
    words = []

    for sentence in sent_tokenize(document):
        for token in word_tokenizer.tokenize(sentence):
            # Lower-case once; used for both the stop-word test and the lemma.
            lowered = token.lower()
            if lowered not in stop_words and len(token) > 2:
                words.append(lemmatizer.lemmatize(lowered))

    return ' '.join(words)

In [9]:
# Clean every raw post with tokenize(). Input order is preserved, so each
# cleaned document stays aligned with its entry in Y_train / Y_test.
train_docs = [tokenize(raw_text) for raw_text in X_train_text]
test_docs = [tokenize(raw_text) for raw_text in X_test_text]
    

In [10]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

max_tokens = 50 ## Hyperparameter, input length

# Fit the word -> integer-index vocabulary on the train and test documents together.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_docs+test_docs)

# Encode each split as index sequences of exactly `max_tokens` entries:
# longer documents are cut at the end, shorter ones are filled with 0 at the end.
X_train_vect, X_test_vect = (
    pad_sequences(
        tokenizer.texts_to_sequences(docs),
        maxlen=max_tokens,
        padding="post",
        truncating="post",
        value=0.,
    )
    for docs in (train_docs, test_docs)
)


X_train_vect.shape, X_test_vect.shape

((11314, 50), (7532, 50))

In [11]:
# Number of entries in the Keras tokenizer's index -> word mapping (vocabulary size).
len(tokenizer.index_word)

95077

In [12]:
embed_len = 300

# One row per tokenizer index plus one extra row; rows that never receive a
# fastText vector (including row 0, the value used for padding) stay all-zero.
ft_embeddings = np.zeros((len(tokenizer.index_word)+1, embed_len))

# Build a word -> row lookup once. Testing `word in labels_ft` and calling
# np.where() per word scans the whole fastText vocabulary twice for every
# tokenizer entry; a dict makes each lookup constant-time.
# setdefault keeps the first row of a label, matching np.where(...)[0][0].
ft_row_by_word = {}
for row, label in enumerate(labels_ft.tolist()):
    ft_row_by_word.setdefault(label, row)

# Copy the pretrained vector of every tokenizer word that fastText knows.
for idx, word in tokenizer.index_word.items():
    row = ft_row_by_word.get(word)
    if row is not None:
        ft_embeddings[idx] = vectors_ft[row]


In [13]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input

# Classifier: frozen pretrained embeddings -> mean over token positions -> two
# ReLU dense layers -> softmax over the classes.
inputs = Input(shape=(max_tokens, ))
# The embedding table is initialised from ft_embeddings and is not trained.
embeddings_layer = Embedding(input_dim=len(tokenizer.index_word)+1, output_dim=embed_len,
                             input_length=max_tokens, trainable=False, weights=[ft_embeddings])
dense1 = Dense(128, activation="relu")
dense2 = Dense(64, activation="relu")
# One output unit per class id found in Y_train.
dense3 = Dense(len(classes), activation="softmax")

x = embeddings_layer(inputs)
# NOTE(review): the mean is taken over all max_tokens positions, so padded
# positions presumably contribute zero vectors and shrink the average for
# short documents — confirm this is intended.
x = tensorflow.reduce_mean(x, axis=1) ### Averaged embeddings of tokens of each example
x = dense1(x)
x = dense2(x)
outputs = dense3(x)

model = Model(inputs=inputs, outputs=outputs)

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding (Embedding)       (None, 50, 300)           28523400  
                                                                 
 tf.math.reduce_mean (TFOpLa  (None, 300)              0         
 mbda)                                                           
                                                                 
 dense (Dense)               (None, 128)               38528     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 20)                1300      
                                                             

In [14]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss
# (targets are class indices rather than one-hot vectors), accuracy as metric.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [15]:
# Train for 8 epochs with batches of 32 examples; no validation data is passed.
model.fit(X_train_vect, Y_train, batch_size=32, epochs=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f77bb52cca0>

In [16]:
# Human-readable class names, listed in the order used as `target_names`
# for the classification report.
labels = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

In [17]:
from sklearn.metrics import accuracy_score, classification_report

# Predicted class = index of the largest output score for each test document.
Y_preds = np.argmax(model.predict(X_test_vect), axis=-1)

# Overall accuracy followed by the per-class precision / recall / F1 table.
print("Test Accuracy : {}".format(accuracy_score(Y_test, Y_preds)))
print("\nClassification Report : ")
print(classification_report(Y_test, Y_preds, target_names=labels))

Test Accuracy : 0.5350504514073288

Classification Report : 
                          precision    recall  f1-score   support

             alt.atheism       0.25      0.31      0.27       319
           comp.graphics       0.52      0.60      0.55       389
 comp.os.ms-windows.misc       0.48      0.34      0.39       394
comp.sys.ibm.pc.hardware       0.53      0.17      0.25       392
   comp.sys.mac.hardware       0.37      0.48      0.42       385
          comp.windows.x       0.57      0.58      0.58       395
            misc.forsale       0.63      0.62      0.62       390
               rec.autos       0.73      0.50      0.59       396
         rec.motorcycles       0.46      0.64      0.53       398
      rec.sport.baseball       0.41      0.80      0.55       397
        rec.sport.hockey       0.82      0.71      0.76       399
               sci.crypt       0.71      0.59      0.64       396
         sci.electronics       0.46      0.49      0.48       393
              

# Bilingual aligned word vectors

Ver técnica de word alignment en: https://arxiv.org/pdf/1710.04087.pdf

In [18]:
# Languages to download; the joined codes name the output files.
LANGS = ('en', 'fr')
# Placeholder inside DL_TEMPLATE that is swapped for a language code.
LANG_REPLACE = '$$lang'
# Output files: word -> embedding map (JSON) and the nearest-neighbour index.
WORD_MAP_PATH = "./data/{}.json".format('_'.join(LANGS))
BUILT_IDX_PATH = "./data/{}.idx".format('_'.join(LANGS))
# Download URL pattern for the aligned vectors of one language.
DL_TEMPLATE = "https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki." + LANG_REPLACE + ".align.vec"

# Accumulators filled by the download cell.
points, values = [], []
word_map = dict()

Ver documentación de los vectores alineados de fastText en: https://fasttext.cc/docs/en/aligned-vectors.html

In [19]:
import os, aiohttp

async with aiohttp.ClientSession() as session:
  for lang in LANGS:
    # Construct a url for each language
    url = DL_TEMPLATE.replace(LANG_REPLACE, lang)

    # Ensure the directory and files exist
    os.makedirs(os.path.dirname(BUILT_IDX_PATH), exist_ok=True)

    lineno = 0
    async with session.get(url) as resp:
      while True:
        lineno += 1
        line = await resp.content.readline()
        if not line:
          # EOF
          break

        linestr = line.decode('utf-8')
        tokens = linestr.split(' ')

        # The first token is the word and the rest
        # are the embedding
        value = tokens[0]
        embedding = [float(p) for p in tokens[1:]]

        # We only go from english to the other two languages
        if lang == 'en':
          word_map[value] = embedding
        else:
          # Don't index words that exist in english
          # to improve the quality of the results.
          if value in word_map:
              continue

          # We track values here to build the instant-distance index
          # Every value is prepended with 2 character language code.
          # This allows us to determine language output later.
          values.append(lang + value)
          points.append(embedding)



Ver documentación del motor de vecinos cercanos en: https://github.com/InstantDomain/instant-distance

In [20]:
import instant_distance, json

# Build the instant-distance index and dump it out to a file with .idx suffix
print('Building index... (this will take a while)')
hnsw = instant_distance.HnswMap.build(points, values, instant_distance.Config())
hnsw.dump(BUILT_IDX_PATH)

# Store the mapping from string to embedding in a .json file
with open(WORD_MAP_PATH, 'w') as f:
    json.dump(word_map, f)

Building index... (this will take a while)


In [22]:
word = 'hello'

# Get an embedding for the given word
embedding = word_map.get(word)
if not embedding:
  # Raise rather than call exit(1): exit() acts on the interpreter/kernel
  # session itself, whereas an exception simply stops this cell with a
  # readable error and leaves the kernel usable.
  raise LookupError(f"Word not recognized: {word}")

# Load the index written by the build step and search it with the word's embedding.
hnsw = instant_distance.HnswMap.load(BUILT_IDX_PATH)
search = instant_distance.Search()
hnsw.search(embedding, search)

# Print the first 10 results
for result in list(search)[:10]:
  # Each value is printed as stored: the language code that was prepended
  # when the index was built, immediately followed by the word.
  print(result.value)

frbonjours
frbonjour 
fr#bonjour
fr bonjour
frremerci
frbonjoursg
frbonsoir,
frrebonjour
fr>bonjour
frbonjour,
