## Vocab, CBOW, SkipGram 

In [1]:
import numpy as np
import random

def build_vocab(tokens):
    assert type(tokens) is list, "Tokens must be list"
    if type(tokens[0]) is str: tokens = [tokens] #creates 2d list if a 1d list is passed.
    word2int, idx = {"<pad>":0}, 1
    for sentence in tokens:
        for word in sentence:
            if word not in word2int:
                word2int[word] = idx
                idx += 1
    int2word = {index: token for token, index in word2int.items()}
    return word2int, int2word

def generate_cbow(tokens,window_size):
    assert window_size % 2 != 0, "Window Size must be odd number since context words are centered around target word."
    assert type(tokens) is list, "Tokens must be 2d list of sentences"
    if type(tokens[0]) in [int,str]: #could be word or int representation of word
        tokens = [tokens]
    window_size = int((window_size - 1) / 2)
    
    train_data = []
    for sentence in tokens:
        for idx in range(2, len(sentence) - 2):
            context = [raw_text[idx - 2], 
                       raw_text[idx - 1],
                       raw_text[idx + 1], 
                       raw_text[idx + 2]]

            target = raw_text[idx]
            train_data.append((context, target))
    return train_data


def generate_skipgrams(tokens, window_size, n_neg_samples):
    assert window_size % 2 != 0, "Window Size must be odd number since context words are centered around target word."
    assert type(tokens) is list, "Tokens must be 2d list of sentences"
    if type(tokens[0]) in [int,str]: #could be word or int representation of word
        tokens = [tokens]
    window_size = int((window_size - 1) / 2)
    X, y, targets = [], [], []
    
    vocab_ints = list(vocab.values())
    first_sample_size = int(n_neg_samples**2)
    
    label = np.zeros(n_neg_samples + 1)
    label[0] = 1
    
    for sentence in tokens:
        sentence_len = len(sentence)
        for idx, word in enumerate(sentence):
            window_start, window_end = max(idx - window_size, 0), min(idx + window_size, sentence_len)
            context = sentence[window_start : window_end + 1]
            neg_sampling = random.sample(vocab_ints, first_sample_size)
            neg_sampling = [x for x in neg_sampling if x not in context]#[:n_neg_samples]
            context.remove(word)
            for c in context:
                X.append(word)
                y.append( [c] + random.sample(neg_sampling,n_neg_samples) )
                #y.append([c]+neg_sampling)
                targets.append(label)
                
    return X, y, targets


In [143]:
# %%timeit
# generate_skipgrams(token_ints,5,3)

384 µs ± 374 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [124]:
# %%timeit
# generate_cbow(token_ints,5)

5.36 µs ± 95.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [2]:
import pandas as pd
import re


PATH = "/Users/trevor/DB_BACKUPS/26Apr2021/"
df = pd.read_csv(PATH + "ArticleData_26Apr21.csv",encoding="utf-8")


In [3]:
df.groupby("source").count()

Unnamed: 0_level_0,Unnamed: 0,id,authors,url,title,description,date,content,category,status,images
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ABC,2043,2043,2043,2043,2043,2043,2043,2043,2043,445,2043
AP,3183,3183,3183,3183,3183,3183,3183,3183,3183,616,3183
AlJazeera,1503,1503,1503,1503,1503,1503,1503,1503,1503,402,1503
BBC,2503,2503,2503,2503,2503,2503,2503,2052,2503,372,2503
CBN,224,224,224,224,224,37,224,224,224,33,224
CBS,326,326,326,326,326,326,326,326,326,131,326
CNBC,2696,2696,2696,2696,2696,2683,2696,2693,2696,303,2696
CNN,5858,5858,5858,5858,5846,4430,5858,3828,5858,731,5858
Forbes,481,481,481,481,481,481,481,481,481,43,481
FoxNews,3053,3053,3053,3053,3053,3014,3053,3053,3053,502,3053


In [4]:
df = df[df.source == "CNN"].reset_index(drop=True)
len(df)

5858

In [5]:
df = df.loc[df.content.apply(type) != float].reset_index(drop=True)
len(df)

3828

In [6]:
sample = df.content.to_list()
len(sample)


3828

In [16]:

#when mining, paragraphs are split by \n. News paragraphs can be considered sentences...
def clean_first(text):
    """
    Get rid of Locations and Source in starting tokens
    """
    text = re.sub(r"(\))(\w)",r"\1 \2",text) #add space after parenthesis if missing
    text = text.split()
    
    i = min(7, len(text) - 2)
    while i >= 0:
        token = text[i]
        if re.search(r"^[-–-——]+",token) != None:
            text = text[i+1:]
            break
        elif token.find(")") != -1:
            text = text[i+1:]
            break
        i -= 1
    text = " ".join(text).strip()
    return text

def clean(text):
    text = re.sub(r"\s{2,}"," ",text)
    text = re.sub(r"https?\:[\/]{2}[^\s]+","",text) #urls
    text = text.replace("&amp;","and")
    text = re.sub(r"\([^\(\)]+(Photo|Getty Images)[^\(\)]{0,}\)","",text)
    text = re.sub(r"’|\u2019","'",text)
    #text = re.sub(r"([A-Za-z]+)([\,\:\;])",r"\1 \2",text)
    #text = re.sub(r"([A-Za-z]{2,})(\.)", r"\1 \2", text)
    return text

def dequote(text, min_words = 4, tag=True, ret_quotes=False):
    """
    Dequotes text
    -----------
    Parameters
    -----------
        min_words --> min number of words in a quote to be considered valid quotation 
        tag --> replaces quote with a "<quote>" tag
        ret_quotes --> returns a tuple of dequoted_text, list of quotes
    """
    def _isQuote(match):
        match = match.group(0)
        if len(match.split()) > min_words:
            if tag:
                return "<quote>"
            return ""
        return match
    #***************************************
    
    text = re.sub(r"\u201C|\u201D",'"',text) #normalize fancy unicode quotes

    pattern = re.compile(r'((?<=^)|(?<=\W))\"(.+?)\"')
    dequoted_text = pattern.sub(_isQuote,text).strip()
    return dequoted_text

#**********************************************************************
import spacy
nlp = spacy.load("en_core_web_sm")
#nlp.disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]

def process_entities(text, 
                     ent_types=["ORG","PERSON","GPE", "LOCATION","ORGANIZATION"],
                     repl_char="_"):
    """
    Finds entities and transforms into a single token 
    -------------
    Parameters
    -------------
    ent_types --> types of entities to process
    repl_char --> character to replace spaces with in the entity
    """
    def replace_ent(match):
        match = match.group(0).strip()
        return match.replace(" ", repl_char)
        
    def custom_ent_tagger(text):
        rules = {
            "<year>":[
                (r"(?<=\s[iI]n\s)\d{4}", None), 
                (r"(<year>)(\sand\s|\sor\s|\,\s)(\d{4})", r"\1\2<year>")
            ], # in <year>, <year> and|,|or <year>
            "<day_of_week>":[
                (r"([Mm]onday|[Tt](ues|hurs)day|[Ww]ednesday|[Ff]riday|[Ss](at|un)day)", None)
            ],
            "<date>":[
                (r"(January|February|March|April|May|June|July|August|September|October|November|December)\s(\d{1,2})\,?\s(\d{4})", r"\1_\2_\3"),
                (r"(January|February|March|April|May|June|July|August|September|October|November|December)\s(\d{1,2})", None)
            ],
            "<money>":[
                (r"\$[\d\,\.]+", None)
            ]

            
        }

        for tag, patterns in rules.items():
            for query, repl in patterns:
                if repl is None:
                    text = re.sub(query,tag,text)
                else:
                    text = re.sub(query,repl,text)

        return text
        
        
    for ent in nlp(text).ents:
        if ent.label_ in ent_types:   
            pattern = ent.text
            pattern = re.sub("[Tt]he","",pattern).strip() #drop beginning "the"
            text = re.sub(re.escape(pattern),replace_ent,text)
            
        elif ent.label_ == "DATE": #get year from spaCy DATE ents
            pattern = ent.text
            if re.search(r"(^\'?\d\d$)|(^\d{4}$)",pattern) != None:
                text = re.sub(pattern,"<year>",text)
            #month day or year -- may 2020 | may 7
            elif re.search("([Jj](anuary|uly|une)|[Ff]ebruary|[Mm](arch|ay)|[Aa](pril|ugust)|([Ss]ept|[Nn]ov|[Dd]ec)ember|[Oo]ctober)(\s\d{2,})",pattern) != None:
                text = re.sub(pattern,"<date>",text)
            elif re.search("[Mm]ay",pattern) != None:
                text = re.sub(pattern,"<month_of_may>",text)


    ##seperate paranthesis attached to entity  #national_institute_of_allergy_and_infectious_diseases_(niaid)
    text = re.sub(r"([\w\_]+)(\_)(\(\w+\))",r"\1 \3", text) 
    #get rid of leading _ for ents
    text = re.sub(r"(\s)(\_)([\w])", r"\1 \3", text)
    #custom rules
    text = custom_ent_tagger(text)
    #print(text)   
    return text


def contraction_handling(text):
    contractions = {
        r"(\w+)n't":r"\1 not",
        r"(\w+)'ve":r"\1 have",
        r"(\w+)'re":r"\1 are",
        r"(\w+)'s":r"\1 's",
        r"(\w+)'ll":r"\1 will",
        r"([A-Za-z]+)(\-)([A-Za-z]+)":r"\1 \2 \3"
    }

    exceptions = {
        "can't":"cannot",
        "won't":"will not",
        "why'd":"why did",
        "he'd":"he would",
        r"([Ii]t|[Ss]?[Hh]e|[Tt]here)'s":r"\1 is"
    }
    
    for pattern, repl in exceptions.items():
        text = re.sub(pattern,repl,text)
    
    for pattern, repl in contractions.items():
        text = re.sub(pattern,repl,text)
        
    return text


def remove_punct(tokens):
    pattern = re.compile(r"[\,\.\?\!\:\;\"\-\)\(]+|(\'(?=\s))")
    if type(tokens) is str:
        return re.sub(pattern,"",tokens)
    else:
        return [re.sub(pattern,"",t) for t in tokens]

def remove_stopwords(tokens, negations=False):
    assert type(tokens) is list, "Tokens must be a list"
    
    stop_words = ['the', 'to', 'of', 'and', 'in', 'a', 'that', "'s", "if","them", "all", "into", "-","—", "you",
                  'for', 'on', 'is', 'said', 'with', 'it', 'as', 'was', 'he', 'has', "did", "only", "still", "back",
                  'by', 'are', 'from', 'at', 'have', 'but', 'his', 'be', 'an', 'will', 'who', 'more', "where",
                  'she', 'also', 'which', 'its', 'they', 'their', 'this', 'about', 'been', "there", "i", "any",
                  'had', 'one', 'her', 'than', 'were', 'us', 'or', 'when', "like", "him", "just", "both",
                  'some', 'after', 'last','told', 'up', "how", "those", "while", "what", "get", "then",
                  'out', 'over', 'new', 'during', "now", "so", "we", "do", "these", "can", "go", "my",
                 "through", "though", "yours", "your", "yet", "yours", "yourself", "yourselves",
                  "whatever", "again", "am", "become", "does", "doing", "each", "either", "else", "even",
                  "ever", "me", "keep", "hers", "off", "onto", "our", "per", "side", "such", "too", "via", "why"]

    if negations:
        stop_words = stop_words + ["not", "no", "cannot"]
    
    return [t for t in tokens if t not in stop_words]

def tokenize(text):
    #whitespace
    return text.split()


def remove_unicode(text):
    return re.sub(r"\\u[\d\b]","",text)

    
def clean_document(text):
    text = re.sub("<br>","",text)
    return text


def preprocess(sample, remove_stops=True):
    sample = clean_document(sample)
    paragraphs = [x.strip() for x in sample.split("\n") if len(x.strip()) > 0]
    if paragraphs[0][:14] == "Fox News Flash":
        paragraphs = paragraphs[1:]
    elif paragraphs[0].find("FoxCast") != -1:
        paragraphs = paragraphs[1:]
        
    paragraphs[0] = clean_first(paragraphs[0])
    new_paragraphs = []
    i =0 
    while i <= len(paragraphs) - 2:
        curr = paragraphs[i]
        if curr[-1].isalnum():
            n = paragraphs[i + 1]
            n = n.encode("ascii","ignore").decode()
            if n[0].islower():
                curr = curr + " " + n
                i += 1
        new_paragraphs.append(curr)
        i += 1
    all_sentences = []
    for para in new_paragraphs:
        cleaned = clean(para)
        cleaned = dequote(cleaned,tag=True)
        cleaned = process_entities(cleaned)
        cleaned = contraction_handling(cleaned)
        cleaned = remove_punct(cleaned)
        tokens = tokenize(cleaned.lower())
        if remove_stops:
            tokens = remove_stopwords(tokens)
        all_sentences.append(tokens)
    return all_sentences
    
#***************************************************
    
documents = []
for article in sample[50:55]:
    sentences = preprocess(article)
    for sentence in sentences:
        documents.append(sentence)


print(len(documents))
print(type(documents),type(documents[0]),type(documents[0][0]))
    


105
<class 'list'> <class 'list'> <class 'str'>


## Find Stop Words

In [18]:
from collections import Counter

def keep_n_common(tokens,n):
    n_common = [x[0] for x in Counter(tokens).most_common(n)]
    return n_common

all_tokens = []
for sentence in documents:
    for word in sentence:
        all_tokens.append(word)

keep_n_common(all_tokens,100)

['<quote>',
 'not',
 'arceneaux',
 'company',
 'space',
 'tread',
 'americans',
 'people',
 'home',
 'broadband',
 'would',
 'mission',
 'malware',
 'researchers',
 'st',
 'jude',
 'take',
 'crew',
 'peloton',
 'publishers',
 'fcc',
 'united',
 'cancer',
 '4',
 'plans',
 'many',
 'other',
 'say',
 'internet',
 'red',
 'use',
 'around',
 'month',
 'inspiration',
 'first',
 'isaacman',
 'could',
 'spacex',
 'media',
 'european',
 'irving',
 'income',
 'according',
 'canary',
 'silver',
 'sparrow',
 'found',
 'year',
 'millions',
 'astronauts',
 'million',
 'make',
 'trip',
 'including',
 'call',
 'lower',
 'treadmill',
 'tread+',
 'evancha',
 'classes',
 'news',
 'policy',
 'legislation',
 'school',
 'access',
 'low',
 'infected',
 'used',
 'search',
 'apple',
 'report',
 'because',
 'states',
 'additional',
 'members',
 'research',
 'less',
 'whose',
 'put',
 'no',
 'patients',
 'nasa',
 'work',
 'companies',
 'designed',
 'measures',
 'most',
 'high',
 'december',
 'president',
 'avail

In [11]:
vocab, inverse_vocab = build_vocab(documents)
all_sentences = [[vocab[x] for x in sentence] for sentence in documents]
print("Vocab Length: {}".format(len(vocab)))

Vocab Length: 42312


In [12]:
inputs, context, labels = generate_skipgrams(all_sentences,
                                             window_size=5,
                                             n_neg_samples=4)


In [15]:
with open("abc_training_data.txt", "w") as f:
    f.write(str(list(zip(inputs, context, labels))))


In [16]:
print(len(inputs))

2206316


In [17]:
import tensorflow as tf
from tensorflow import keras

BATCH_SIZE = 64
BUFFER_SIZE = len(inputs)
dataset = tf.data.Dataset.from_tensor_slices(((inputs, context), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<BatchDataset shapes: (((64,), (64, 5, 1)), (64, 5)), types: ((tf.int64, tf.int64), tf.float64)>


In [18]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<PrefetchDataset shapes: (((64,), (64, 5, 1)), (64, 5)), types: ((tf.int64, tf.int64), tf.float64)>


In [19]:
class Word2Vec(keras.models.Model):
    def __init__(self, vocab_size, embedding_dim, num_ns):
        super(Word2Vec,self).__init__()
        self.target_embedding = keras.layers.Embedding(vocab_size, embedding_dim, input_length = 1, name="W2V")
        self.context_embedding = keras.layers.Embedding(vocab_size,embedding_dim,input_length=num_ns+1)
        self.dots = keras.layers.Dot(axes=(3, 2))
        self.flatten = keras.layers.Flatten()
        
    def call(self, pair):
        target, context = pair
        word_emb = self.target_embedding(target)
        context_emb = self.context_embedding(context)
        dots = self.dots([context_emb, word_emb])
        return self.flatten(dots)

vocab_size = len(vocab)
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim, num_ns=4)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy']) 


In [20]:
word2vec.fit(dataset, epochs=5)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fc34ccd6210>

In [22]:
import io, string

weights = word2vec.get_layer('W2V').get_weights()[0]

out_v = io.open('abc_vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('abc_vocab.tsv', 'w', encoding='utf-8')
for index, word in enumerate(vocab):
    if index == 0:
        continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")
out_v.close()
out_m.close()