# Custom Word Embedding Vectors

The U.S. Patent Office releases the full text of all patents from 1976 to present on their website, USPTO Open Data. 

In this notebook, [a subset of that data](https://developer.uspto.gov/product/patent-grant-full-text-dataxml), from Jan 2022 to Jun 2022, will be used to train custom word embedding vectors using gensim.

In [9]:
import pandas as pd
import html
import re
import os

from contractions import CONTRACTION_MAP

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

from gensim.models import Word2Vec, Phrases, FastText
from nltk.tokenize import word_tokenize, sent_tokenize

### Text Preprocessing

In [26]:
lm = WordNetLemmatizer()
ps = PorterStemmer()

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)               
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text


def remove_newline_characters(text):
    pattern = r'[\r\n]'
    text = re.sub(pattern, '', text)
    return text


def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text


def lemmatize(text):
    return lm.lemmatize(text)


def stem(text):
    return ps.stem(text)


def normalize_corpus(corpus, method='lem'):
    corpus = corpus.lower()
    corpus = remove_newline_characters(corpus)
    corpus = expand_contractions(corpus, contraction_mapping=CONTRACTION_MAP)

    sents = sent_tokenize(corpus)
    norm_corpus = []

    for sent in sents:
        sent = remove_special_characters(sent)

        words = word_tokenize(sent)

        if method == 'lem':
            norm_words = [lemmatize(word) for word in words if word not in stopwords.words('english')]
        else:
            norm_words = [stem(word) for word in words if word not in stopwords.words('english')]

        norm_corpus.append(' '.join(norm_words))

    return norm_corpus

### Load & Parse Patents Data

In [42]:
%%time
data_dir = r'./data/'

with open('./data/training_corpus.txt', 'w') as f, open('./data/errors.txt', 'w') as err:
    for file in os.listdir(data_dir):
        if '.xml' in file:
            print('Processing file: %s' % file)
            
            patents = html.unescape(open(os.path.join(data_dir, file), 'r').read())

            titles = []
            abstracts = []
            descriptions = []
            
            for ind, patent in enumerate(patents.split(r'<?xml version="1.0" encoding="UTF-8"?>')):
                try:
                    bs = BeautifulSoup(patent)
                except:
                    err.writelines(patent + '\n')

                try:
                    titles.append([elm.text for elm in bs.find_all('invention-title')])
                except:
                    pass

                try:
                    abstracts.append([elm.text for elm in bs.find_all('abstract')])
                except:
                    pass

                try:
                    descriptions.append([elm.text for elm in bs.find_all('description')])
                except:
                    pass
                
                if (ind+1) % 2000 == 0:
                    print('Extracted text from %i patents..' % (ind+1))

            print('Total Patents: %i' % ind)
            print()

            for ind, item in enumerate(titles + abstracts + descriptions):
                if item:
                    norm_corpus = normalize_corpus(item[0], method='lem')
                    f.writelines('%s\n' % item for item in norm_corpus)

Processing file: ipg220614.xml
Extracted text from 2000 patents..
Extracted text from 4000 patents..
Extracted text from 6000 patents..
Total Patents: 7463

Processing file: ipg220412.xml
Extracted text from 2000 patents..
Extracted text from 4000 patents..
Extracted text from 6000 patents..
Total Patents: 7399

Processing file: ipg220201.xml
Extracted text from 2000 patents..
Extracted text from 4000 patents..
Extracted text from 6000 patents..
Total Patents: 7395

Processing file: ipg220215.xml
Extracted text from 2000 patents..
Extracted text from 4000 patents..
Extracted text from 6000 patents..
Total Patents: 7313

Processing file: ipg220405.xml
Extracted text from 2000 patents..
Extracted text from 4000 patents..
Extracted text from 6000 patents..
Total Patents: 7373

Processing file: ipg220607.xml
Extracted text from 2000 patents..
Extracted text from 4000 patents..
Extracted text from 6000 patents..
Total Patents: 7461

Processing file: ipg220503.xml
Extracted text from 2000 pa

### Iterator / Generator - Full Subset

Solution courtesy of [this post](https://stackoverflow.com/questions/56468865/sentence-iterator-to-pass-to-gensim-language-model)!

In [6]:
def tokens_generator(training_corpus_file, min_len=5):
    with open(training_corpus_file, 'r') as f:
        for line in f:
            if len(line.split(' ')) >= min_len:
                yield line[:-1]

In [7]:
class SentencesIterator():
    def __init__(self, generator_function, dir, min_len):
        self.generator_function = generator_function
        self.dir = dir
        self.min_len = min_len

        self.generator = self.generator_function(self.dir, self.min_len)

    def __iter__(self):
        # reset the generator
        self.generator = self.generator_function(self.dir, self.min_len)
        return self

    def __next__(self):
        result = next(self.generator)
        if result is None:
            raise StopIteration
        else:
            return result

### Word2Vec Model (gensim)

In [13]:
%%time
corpus = SentencesIterator(tokens_generator, dir=r'./data/training_corpus.txt', min_len=5)

model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, epochs=4)
model.save(r'./models/patents-2022-w2v.model')

CPU times: user 2h 35min 25s, sys: 1min 41s, total: 2h 37min 6s
Wall time: 1h 21s


In [14]:
corpus = SentencesIterator(tokens_generator, dir=r'./data/training_corpus.txt', min_len=5)

model = FastText(sentences=corpus, vector_size=100, window=5, min_count=1, epochs=4)
model.save(r'./models/patents-2022-fasttext.model')

In [15]:
corpus = SentencesIterator(tokens_generator, dir=r'./data/training_corpus.txt', min_len=5)
bigram_transformer = Phrases(corpus)

model = Word2Vec(sentences=bigram_transformer[corpus], vector_size=100, window=5, min_count=1, epochs=4)
model.save(r'./models/patents-2022-w2v-ngram.model')