In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
from google.colab import drive
import sys

content_path = Path('/').absolute() / 'content'
drive_path = content_path / 'drive'
drive.flush_and_unmount()
drive.mount(str(drive_path))

# Word2Vec

In [3]:
import tensorflow as tf
from tensorflow.keras import backend, metrics

from tensorflow.keras.layers import Input, Embedding, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

import gensim
import pandas as pd
import numpy as np

In [5]:
df = pd.read_json(drive_path / "My Drive" / "OCR" / "5_StackOverflow" / "preprocessed_questions.json")

In [6]:
w2v_size=300
w2v_window=5
w2v_min_count=1
w2v_epochs=100
sentences = df['sentence_bow_lem'].str.split().tolist()
maxlen = max(len(s) for s in sentences)

In [7]:
w2v_model = gensim.models.Word2Vec(
    min_count=w2v_min_count, window=w2v_window,
    vector_size=w2v_size,
    seed=42,
    workers=1
)

w2v_model.build_vocab(sentences)

w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=w2v_epochs
)

(351174, 468100)

In [8]:
model_vectors = w2v_model.wv
w2v_words = model_vectors.index_to_key

In [9]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

x_sentences = pad_sequences(
    tokenizer.texts_to_sequences(sentences),
    maxlen=maxlen,
    padding='post'
)

In [12]:
vocab_size = len(tokenizer.word_index) + 1

embedding_matrix = np.zeros((vocab_size, w2v_size))

for word, idx in tokenizer.word_index.items():
    if model_vectors[word] is not None:
        embedding_matrix[idx] = model_vectors[word]

In [13]:
input = Input(shape=(len(x_sentences),maxlen),dtype='float64')

word_input = Input(shape=(maxlen,),dtype='float64')

word_embedding = Embedding(
    input_dim=vocab_size,
    output_dim=w2v_size,
    weights = [embedding_matrix],
    input_length=maxlen
)(word_input)

word_vec = GlobalAveragePooling1D()(word_embedding)

embed_model = Model([word_input],word_vec)

embed_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 14)]              0         
                                                                 
 embedding (Embedding)       (None, 14, 300)           315300    
                                                                 
 global_average_pooling1d (  (None, 300)               0         
 GlobalAveragePooling1D)                                         
                                                                 
Total params: 315300 (1.20 MB)
Trainable params: 315300 (1.20 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
embeddings = embed_model.predict(x_sentences)
embeddings.shape



(1000, 300)

# BERT

In [None]:
import tensorflow as tf
import tensorflow.keras
from tensorflow.keras import backend as K

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import metrics as kmetrics
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

# Bert
import os
import transformers
from transformers import *

os.environ["TF_KERAS"]='1'

In [None]:
print(tf.__version__)
print(tensorflow.__version__)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print(tf.test.is_built_with_cuda())