In [None]:
# (Re)load IPython's autoreload extension; %reload_ext is safe to re-run in a live kernel.
%reload_ext autoreload
# Mode 2: re-import edited modules automatically before each cell executes.
%autoreload 2

In [None]:
from pathlib import Path
from google.colab import drive

# Colab working directory and the mount point for Google Drive beneath it.
filesystem_root = Path('/').absolute()
content_path = filesystem_root / 'content'
drive_path = content_path.joinpath('drive')

# Unmount first so that re-running this cell always starts from a fresh mount.
drive.flush_and_unmount()
drive.mount(str(drive_path))

In [16]:
import pandas as pd

# Location of the preprocessed questions inside the mounted Drive.
questions_path = drive_path / "My Drive" / "OCR" / "5_StackOverflow" / "preprocessed_questions.json"
df = pd.read_json(questions_path)

# Word2Vec

In [15]:
from tensorflow.keras.layers import Input, Embedding, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

import gensim
import numpy as np

In [6]:
# Word2Vec hyperparameters.
w2v_size = 300       # dimensionality of each word vector
w2v_window = 5       # context window size
w2v_min_count = 1    # keep every token, even those seen once
w2v_epochs = 100     # training passes over the corpus

# Whitespace-tokenised corpus: one list of tokens per row of the DataFrame.
sentences = df['sentence_bow_lem'].str.split().to_list()

# Longest sentence (in tokens); used later as the padded sequence length.
maxlen = max(map(len, sentences))

In [7]:
# Fixed seed and a single worker thread are set to keep training repeatable.
w2v_params = dict(
    vector_size=w2v_size,
    window=w2v_window,
    min_count=w2v_min_count,
    seed=42,
    workers=1,
)
w2v_model = gensim.models.Word2Vec(**w2v_params)

# The model is created without a corpus, so the vocabulary is built explicitly
# before training.
w2v_model.build_vocab(sentences)

# Kept as the last expression so the cell displays the value returned by train().
w2v_model.train(
    sentences,
    epochs=w2v_epochs,
    total_examples=w2v_model.corpus_count,
)

(351174, 468100)

In [8]:
# Vocabulary of the trained model, taken from its KeyedVectors.
w2v_words = w2v_model.wv.index_to_key
# Keyed word vectors themselves, used below for lookups by token.
model_vectors = w2v_model.wv

In [9]:
# Fit a Keras tokenizer on the token lists to obtain a word -> integer id mapping.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Convert every sentence into its list of token ids ...
token_id_sequences = tokenizer.texts_to_sequences(sentences)

# ... then pad at the end ('post') so all rows have length `maxlen`.
x_sentences = pad_sequences(token_id_sequences, padding='post', maxlen=maxlen)

In [12]:
# One row per tokenizer id, plus one extra row because `word_index` ids start at 1;
# row 0 therefore stays all-zero and lines up with the 0s added by padding.
vocab_size = len(tokenizer.word_index) + 1

# Zero-initialised so that any token without a Word2Vec vector keeps a zero row.
embedding_matrix = np.zeros((vocab_size, w2v_size))

for word, idx in tokenizer.word_index.items():
    # KeyedVectors raises KeyError for an unknown key rather than returning None,
    # so presence has to be checked with a membership test before the lookup.
    if word in model_vectors:
        embedding_matrix[idx] = model_vectors[word]

In [13]:
# Input of padded token ids: `maxlen` integer vocabulary indices per sentence.
# Integer dtype because the values are row indices into the embedding table.
word_input = Input(shape=(maxlen,), dtype='int32')

# Embedding layer initialised with the Word2Vec-derived matrix.
word_embedding = Embedding(
    input_dim=vocab_size,
    output_dim=w2v_size,
    weights=[embedding_matrix],
    input_length=maxlen
)(word_input)

# Average the word vectors along the sequence axis to get one vector per sentence.
# NOTE(review): no mask is configured, so padded positions (all-zero rows) are
# included in the average; shorter sentences are scaled down accordingly.
word_vec = GlobalAveragePooling1D()(word_embedding)

embed_model = Model([word_input], word_vec)

embed_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 14)]              0         
                                                                 
 embedding (Embedding)       (None, 14, 300)           315300    
                                                                 
 global_average_pooling1d (  (None, 300)               0         
 GlobalAveragePooling1D)                                         
                                                                 
Total params: 315300 (1.20 MB)
Trainable params: 315300 (1.20 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [29]:
# Feed the padded token ids through the embedding + average-pooling model,
# producing one pooled vector per sentence.
embeddings = embed_model.predict(x_sentences)



# Universal Sentence Encoder (USE)

In [22]:
import tensorflow_hub as hub

# TF Hub handle of the Universal Sentence Encoder (version 4).
use_model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(use_model_url)

In [23]:
def feature_USE(sentences, b_size):
    """Embed ``sentences`` batch by batch with the module-level ``embed`` callable.

    Args:
        sentences: Sliceable sequence of inputs accepted by ``embed``
            (the notebook passes a list built from a DataFrame column).
        b_size: Number of items passed to ``embed`` per call; must be positive.

    Returns:
        A single ``np.ndarray`` holding the embeddings of every input item, in
        input order. The final batch may be smaller than ``b_size``; it is
        embedded as well rather than dropped.

    Raises:
        ValueError: If ``b_size`` is not positive or ``sentences`` is empty.
    """
    batch_size = b_size
    if batch_size <= 0:
        raise ValueError(f"b_size must be a positive integer, got {b_size!r}")

    n_sentences = len(sentences)
    if n_sentences == 0:
        raise ValueError("sentences must contain at least one item")

    # Step through the whole input so a trailing partial batch is not lost
    # (integer-dividing the length by the batch size would skip it).
    batches = [
        np.asarray(embed(sentences[start:start + batch_size]))
        for start in range(0, n_sentences, batch_size)
    ]

    # Concatenate once at the end instead of re-copying the growing array on
    # every iteration; this also makes the return type always an ndarray.
    return np.concatenate(batches)

In [26]:
# Sentences to encode, taken from the `sentence_dl` column as a plain list.
sentences_USE = df['sentence_dl'].tolist()

# Number of sentences sent to the encoder per call.
batch_size = 10

features = feature_USE(sentences=sentences_USE, b_size=batch_size)

In [28]:
# Sanity check: display the dimensions of the USE feature matrix.
features.shape

(1000, 512)