In [1]:
import pandas as pd
import numpy as np

import nltk
import string
import gensim 
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('stopwords')

#for text preprocessing
from string import digits
#for building document term matrices
from sklearn.feature_extraction.text import TfidfVectorizer

#nltk stopwords 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from gensim.models import Word2Vec 
from gensim.models import KeyedVectors

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## TODOs

- how to save/load pre-processed corpus to train our own word2vec into our class
- add support for Glove and other word embeddings
- use other tokenizers than gensim's one
- Scraping/bucketizing by news source for the prediction task
- Add SEC form scraping to scraper

In [88]:
#added Word2Vec support
#add GloVE
#Look into NMF factorization
#Look into how we can use tf-idf

class Featurizer(object):
    """class to preprocess corpuses or text
    and ultimately create textual features."""
    
    def __init__(self, lang = 'english'):
        """class instantiator."""
        self.stopset = set(stopwords.words(lang))
        self.corpus = set()
        self.tfidf = None
        #for dimensionality reduction
        self.scaler = None
        self.pca = None
        #word2vec
        self.w2v = None
        
    def preprocess(self, input_str):
        """method to preprocess articles.
        removes punctuation, digits & stopwords
        for the given input string.
        
        Args:
            input_str :: str
                text to be preprocessed

        Returns:
            preprocessed string. Also stores 
            the cleaned string into corpus class variable
        """

        input_str = input_str.lower()

        #fastest way to remove/replace characters in python
        digits_table = str.maketrans('', '', string.digits)
        punct_table = str.maketrans('', '', string.punctuation)


        #can add more tables for future reference

        tables = [digits_table, punct_table]
        for t in tables:
            input_str = input_str.translate(t)

        #handling stopwords
        input_str = ' '.join([word for word in input_str.split() if word not in self.stopset])
        
        if input_str not in self.corpus:
            self.corpus.add(input_str)
        
        return input_str
    
    def PCA_fit(self, X_train, n_components=0.9):
        """fits PCA on training sample. stores the PCA
        object and appropriate scaler to use on the test set.
        
        Args:
            X_train :: DataFrame or np.array
                Training set, can be relatively large matrix
                of chosen unigram features.
            
            n_components :: float 
                number of components needed to explain
                n_components% of the variation in X_train
                
        Returns:
            PCs :: np.array
                principal components of the training set
        """
        
        sc = StandardScaler()
        X_train_std = sc.fit_transform(X_train)
        self.scaler = sc 
    
        #obtaining the PCs from training data
        pca_fit = PCA(n_components)
        PCs = pca_fit.fit(X_train_std)
        self.pca = pca_fit 
        
        return PCs
    
    def PCA_transform(self, X_test):
        """transform out-of-sample examples
        with PCs from training sample."""
        
        if (self.scaler == None) or (self.pca == None):
            return "must fit PCA to the training sample first"
        
        X_test_std = self.scaler.transform(X_test)
        
        return self.pca.transform(X_test_std)

    def tfidf_fit(self, use_idf = True, max_df = 1.0, min_df = 1, max_features = None):
        """generates the document term frequency
        matrix for the stored training corpus. Can serve as simple
        NLP baseline for the regression task.
        
        Args:
            use_idf :: bool
                whether or not to use idf weights. if
                set to False, simply uses tf weights
                
            max_df/min_df :: float or int
                if in [0,1], represents proportion 
                of terms to ignore with respect to corpus 

            max_features :: int
                considers only the "max_features" most frequent
                terms, if specified
                
        Returns:
            df :: DataFrame
                document term frequency matrix for the corpus
        """
        #params of the vectorizer
        tfidf = TfidfVectorizer(lowercase = False, 
                        preprocessor = self.preprocess,
                        stop_words = None,
                        ngram_range=(1,1),
                        tokenizer = None,
                        max_df = max_df, min_df = min_df,
                        max_features = max_features,
                        use_idf = use_idf)
        
        features = tfidf.fit_transform(self.corpus)
        self.tfidf = tfidf 
        df = pd.DataFrame(features.todense(), columns=tfidf.get_feature_names())

        return df
    
    def tfidf_transform(self, test_corpus):
        """transforms articles according to
        trained tfidf model. Use on out-of-sample articles to obtain
        textual tfidf features. 
        
        Args:
            test_corpus :: list(str)
                list of articles to transform into tfidf features
                
        Returns:
            X_test :: DataFrame
                articles transformed into features
        """
        
        if self.tfidf is None:
            return 'Must fit an tfidf model on training corpus first'
        
        X_test = self.tfidf.transform(test_corpus)
        X_test = pd.DataFrame(data=X_test.toarray(), columns= f.tfidf.get_feature_names())
        
        return X_test
    
    def word2vec(self, pre_train_path = None, size = 100, min_count = 1, window = 5):
        """loads or train Word2Vec embeddings 
        using class corpus.
        
        Args:
            pre_train_path :: str
                path to file containing pre-trained embeddings,
                e.g. GoogleNews embeddings
                
            size :: int
                dimension of the word embedding,
                e.g. # of features
        
            min_count :: int
                ignores words occuring less than min_count
                
        Stores:
            self.w2v :: Word2Vec object
        """
        
        #using pre_trained embeddings
        if pre_train_path:
            pre_trained = KeyedVectors.load_word2vec_format(pre_train_path, binary=True)
            
            self.w2v = pre_trained 
            return 'stored pre-trained word embeddings'
        
        #tokenizing articles
        tokenized_articles = []
        for a in f.corpus:
            #maybe use another tokenizer, not sure if gensim.utils is best
            tokenized_articles.append(gensim.utils.simple_preprocess(a))
        
        model = Word2Vec(sentences = tokenized_articles, sg = 0, size = size, 
                         min_count = min_count, workers = -1, window = window)
        self.w2v = model
        
        return 'trained word embeddings on corpus' 
    
    def article2vec(self, article):
        """given an aticle/text,
        returns a word embedding for that particular text
        using the self.w2v model (obtained with word2vec).
        
        Args:
            article :: str 
                article text
                
        Returns:
            vec :: np.array
                word embedding of the text
        """
        
        if self.w2v is None:
            return "must load/train a word2vec model first"
        
        vec = np.zeros(shape = (self.w2v.vector_size))
    
        txt = self.preprocess(article)
        #convert into tokens, can perhaps be more picky here
        tokens = gensim.utils.simple_preprocess(txt)
        
        for t in tokens:
            #check if token in vocab
            if t in self.w2v.wv:
                #maybe consider using a weighted word2vec embedding
                vec += self.w2v.wv[t]
        
        return vec

### Example of preprocessing

In [99]:
#example 
training_corpus = ['Vineet has said to concatenate articles', 'this is information about document one', 
                   '$3NoW, so!me^ information about the >,/?seco0nd! 488492document',
                  'this is the last article news!!']
f = Featurizer()
for a in training_corpus:
    f.preprocess(a)

print('pre-processed articles/text:', f.corpus)
print('')
doc_matrix = f.tfidf_fit(use_idf = True)
doc_matrix

pre-processed articles/text: {'vineet said concatenate articles', 'information document one', 'last article news', 'information second document'}



Unnamed: 0,article,articles,concatenate,document,information,last,news,one,said,second,vineet
0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5
1,0.0,0.0,0.0,0.526405,0.526405,0.0,0.0,0.667679,0.0,0.0,0.0
2,0.57735,0.0,0.0,0.0,0.0,0.57735,0.57735,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.526405,0.526405,0.0,0.0,0.0,0.0,0.667679,0.0


In [90]:
f.PCA_fit(doc_matrix, n_components=2)

PCA(n_components=2)

In [91]:
#new testing examples to be used as features
test_corpus = ['one test instance']

test_matrix = f.tfidf_transform(test_corpus)

In [92]:
f.PCA_transform(test_matrix)

array([[-0.06117865,  2.20290661]])

In [7]:
#example of pre-processing

stopset = set(stopwords.words('english'))
def preprocess(input_str):
    """removes punctuation and digits
    for the given input string."""
    
    input_str = input_str.lower()
    
    #fastest way to remove/replace characters in python
    digits_table = str.maketrans('', '', string.digits)
    punct_table = str.maketrans('', '', string.punctuation)
    
    
    #can add more tables for future reference

    tables = [digits_table, punct_table]
    for t in tables:
        input_str = input_str.translate(t)

    #handling stopwords
    input_str = ' '.join([word for word in input_str.split() if word not in stopset])
    
    return input_str

preprocess('This&/ $!!is a #42 {[impor^%tant]} +70TEST|')

'important test'

### PCA example

In [43]:
#using PCA to reduce dimensionality. Maybe try kernel PCA?
df = pd.read_csv('https://archive.ics.uci.edu/ml/'
                      'machine-learning-databases/wine/wine.data',
                      header=None)

X, y = df_wine.iloc[:, 1:].values, df.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3,
    stratify=y, random_state=0)

sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
#neeeds to perform scaling as per the training sample
X_test_std = sc.transform(X_test)


pca = PCA(n_components = 0.9)
#obtaining the principal components for training data
PCs = pca.fit_transform(X_train_std)
new_PC = pca.transform(X_test_std)

### Word2Vec Example

In [3]:
#getting data to train word2vec model, 10 days of articles on apple
%run Scraper.ipynb

search data stored, 11/15/2020
search data stored, 10/01/2020
search data stored, 10/02/2020
search data stored, 10/03/2020
search data stored, 10/04/2020
search data stored, 10/05/2020
search data stored, 10/06/2020
search data stored, 10/07/2020
search data stored, 10/08/2020
search data stored, 10/09/2020
search data stored, 10/10/2020
CPU times: user 50.7 s, sys: 1.13 s, total: 51.8 s
Wall time: 2min 50s


In [26]:
#### using word2vec ####
f = Featurizer()
#preprocessing articles
for t in s.final_df.text:
    if t:
        f.preprocess(t)
        
#sentences: list of lists of tokens. Can be changed to support n-grams as well.
#sg: whether to use skipgram or cbow  (default cbow)
#size: dimensionality of word space (e.g how many features to use)
#min_count: ignores words that occur less than this. Can probably handle else where in pre-processing.
#workers: how many cores to use

f.word2vec()

'trained word embeddings on corpus'

In [37]:
#obtain similar words with cosine similarity
#note that the result is pretty "awful" here (pun intended), since we trained on a very small corpus
#to remedy this, use either larger corpus or load a pre-trained model
f.w2v.wv.most_similar('awful')

[('sountrap', 0.36851558089256287),
 ('tamp', 0.32931700348854065),
 ('agreement', 0.3131377696990967),
 ('charts', 0.31090497970581055),
 ('administration', 0.30836647748947144),
 ('integral', 0.301155686378479),
 ('close', 0.2970578074455261),
 ('recognizable', 0.29581451416015625),
 ('read', 0.2942274808883667),
 ('filing', 0.2928212881088257)]

In [36]:
# example = s.df.text[0]
# f.article2vec(example)

In [39]:
#loading pre-stored word2vec
f.word2vec(pre_train_path ='PreTrained_Vecs.bin.gz')

'stored pre-trained word embeddings'

In [22]:
# load the google word2vec model 
#issue is that distribution might differ from articles we are pulling our data from, hence the need to train our own
# filename = 'PreTrained_Vecs.bin.gz'
# # pre_trained = KeyedVectors.load_word2vec_format(filename, binary=True)
# # calculate: (king - man) + woman = ?
# f.w2v.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)