In [56]:
import numpy as np
from copy import deepcopy
from collections import Counter
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from string import punctuation, digits
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

In [34]:
def get_tokens(corpus, pattern = None):
    '''
        input: 
            corpus, a list of documents
            pattern, if not None, we use RegexpTokenizer
                     else we use word_tokenize
        remove lower, stop words, punctuations, 
    '''
    tokens = None
    stop_words = stopwords.words('english')
    if pattern is not None:
        tokenizer = RegexpTokenizer.tokenize(pattern)
        tokens_list = [tokenizer.tokenize(doc.lower()) for doc in corpus]
    else:
        tokens_list = [word_tokenize(doc.lower().translate(None, punctuation + digits)) for doc in corpus]
    
    return [[token for token in tokens if token not in stop_words] for tokens in tokens_list]

In [38]:
def stemmize(tokens, stemmizer):
    return [stemmizer.stem(token) for token in tokens]

In [101]:
def lemmize(tokens, lemmizer):
    return [lemmizer.lemmatize(token) for token in tokens]

In [50]:
def stem_or_lem(tokens_list, stem = True):
    '''
        input: 
            tokens_list:
            stem, default is True. If False we use lemmize
        output:
            stemmized or lemmatized tokens_list
    '''
    if stem:
        stemmizer = PorterStemmer()
        return [stemmize(tokens, stemmizer) for tokens in tokens_list]
    else:
        lemmizer = WordNetLemmatizer()
        return [lemmize(tokens, lemmizer) for tokens in tokens_list]

In [51]:
def get_vocabulary(sl_tokens_list):
    '''
        get sorted vocabularies
    '''
    vocabulary = [token for tokens in sl_tokens_list for token in tokens]
    return sorted(list(set(vocabulary)))

In [58]:
def bow_vector(tokens, vocabulary):
    '''
        input:
            tokens are a list of tokens
        output:
            bag of word for this tokens
    '''
    bag_of_words = Counter(tokens)
    token_vector = np.zeros(len(vocabulary))
    for word_index, word in enumerate(vocabulary):
        if word in bag_of_words:
            token_vector[word_index] += bag_of_words[word]
    return token_vector

In [70]:
def get_bag_of_word(sl_tokens_list, vocabulary):
    '''
        get bag of word for all tokens_list
    '''
    return np.array([bow_vector(tokens, vocabulary) for tokens in sl_tokens_list])

In [67]:
def get_tf_matrix(bow):
    '''
        input:
            bow is bag of word
        output:
            normalized tf matrix
    '''
    tf_matrix = deepcopy(bow)
    for i in xrange(len(bog)):
        tf_matrix[i] = tf_matrix[i] / np.sum(tf_matrix[i])
    return tf_matrix

In [79]:
def get_idf(bow, vocabulary):
    N = len(bow)
    token_fre_array = np.sum(bow, axis = 0) + 1
    return np.log(N / token_fre_array)

In [75]:
def get_tf_idf_matrix(bow, vocabulary):
    tf_matrix = get_tf_matrix(bow)
    idf = get_idf(bow,vocabulary)
    return tf_matrix * idf

In [102]:
def main(corpus):
    tokens_list = get_tokens(corpus)
    sl_tokens_list = stem_or_lem(tokens_list, stem = False)
    vocabulary = get_vocabulary(sl_tokens_list)
    bow = get_bag_of_word(sl_tokens_list, vocabulary)
    print bow
    tf_idf_matrix = get_tf_idf_matrix(bog,vocabulary)
    return tf_idf_matrix

In [103]:
doc1 = 'Wise people thinking they shop  are foolish!'
doc2 = 'Foolish think shopping foolish people 4 * ( think they are wise wise'
doc3 = 'I am definitely wise; so this irritates me'
doc4 = '-- Trump is for sure like definitely foolish'
corpus = [doc1, doc2, doc3, doc4]

In [104]:
tf_idf_matrix = main(corpus)

[[ 0.  1.  0.  0.  1.  1.  0.  0.  0.  1.  0.  1.]
 [ 0.  2.  0.  0.  1.  0.  1.  0.  2.  0.  0.  2.]
 [ 1.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 1.  1.  0.  1.  0.  0.  0.  1.  0.  0.  1.  0.]]


In [91]:
tf_idf_matrix

array([[ 0.        , -0.04462871,  0.        ,  0.        ,  0.05753641,
         0.13862944,  0.        ,  0.        ,  0.        ,  0.13862944,
         0.        , -0.04462871],
       [ 0.        , -0.05578589,  0.        ,  0.        ,  0.03596026,
         0.        ,  0.0866434 ,  0.        ,  0.07192052,  0.        ,
         0.        , -0.05578589],
       [ 0.09589402, -0.        ,  0.23104906,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        , -0.07438118],
       [ 0.05753641, -0.04462871,  0.        ,  0.13862944,  0.        ,
         0.        ,  0.        ,  0.13862944,  0.        ,  0.        ,
         0.13862944, -0.        ]])