# Feature Engineering: Document-Term Matrix, TF-IDF using sklearn and Gensim

### Scikit-Learn

In [3]:
# Toy corpus of six short movie-review sentences, used by both the
# scikit-learn and the Gensim examples in this notebook.
my_docs = [
    'This movie was about spaceships and aliens.',
    'I really enjoyed the movie!',
    'Awesome action scenes, but boring characters.',
    'The movie was awful! I hate alien films.',
    'Space is cool! I like space movies.',
    'More space films, please!',
]

In [4]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vec = CountVectorizer()                      # Instantiate CountVectorizer with default settings

In [5]:
# CountVectorizer() and TfidfVectorizer() dictionary can be customized as:
#   stop_words=stopwords.words('english')      - Drop stopwords from dictionary
#   max_df=0.85           - Drop all terms that appear in over 85% of documents
#   max_features=10       - Restrict vocabulary to the 10 most frequent terms

# Variant 1: pass NLTK's English stopword list explicitly.
vec = CountVectorizer(max_df=0.85, stop_words=stopwords.words('english'))
# Variant 2: scikit-learn's built-in 'english' list plus a 10-term vocabulary cap.
# This line rebinds `vec`, so it is the configuration the following cells use.
vec = CountVectorizer(max_df=0.85, max_features=10, stop_words='english')

In [6]:
vec.fit(my_docs)                             # fit() learns the vocabulary from the corpus
print(vec.vocabulary_)                       # Display the term -> column-index mapping

{'movie': 5, 'really': 7, 'enjoyed': 1, 'action': 0, 'scenes': 8, 'hate': 3, 'films': 2, 'space': 9, 'like': 4, 'movies': 6}


In [7]:
# Iterating a dict yields its keys, so this lists the vocabulary terms alphabetically.
print(sorted(vec.vocabulary_))

['action', 'enjoyed', 'films', 'hate', 'like', 'movie', 'movies', 'really', 'scenes', 'space']


In [8]:
len(vec.vocabulary_)                         # Vocabulary size (vec was built with max_features=10)

10

In [9]:
dtm = vec.transform(my_docs)                 # transform() creates count vector
# fit_transform() refits the vocabulary and transforms in one call; it overwrites dtm from the line above.
dtm = vec.fit_transform(my_docs)             # You can fit and transform jointly
dtm.shape                                    # num_docs x dictionary_length

(6, 10)

In [10]:
# Convert the matrix to a dense array for display: one row per document,
# one column per vocabulary term (in the index order shown by vec.vocabulary_).
print(dtm.toarray())

[[0 0 0 0 0 1 0 0 0 0]
 [0 1 0 0 0 1 0 1 0 0]
 [1 0 0 0 0 0 0 0 1 0]
 [0 0 1 1 0 1 0 0 0 0]
 [0 0 0 0 1 0 1 0 0 2]
 [0 0 1 0 0 0 0 0 0 1]]


In [17]:
vec = TfidfVectorizer()                      # TF-IDF with default settings
# Rebinds vec: drop English stop words and terms appearing in more than 85% of documents.
vec = TfidfVectorizer(stop_words='english', max_df=0.85)

tfidf = vec.fit_transform(my_docs)           # Learn the vocabulary and return the TF-IDF weighted matrix
tfidf.shape                                  # num_docs x vocabulary_size

(6, 18)

In [18]:
# Dense view of the TF-IDF matrix: one row per document, one column per term.
print(tfidf.toarray())

[[0.         0.         0.63509072 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.4396812  0.         0.         0.         0.         0.63509072]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.63509072 0.         0.         0.
  0.4396812  0.         0.63509072 0.         0.         0.        ]
 [0.4472136  0.         0.         0.4472136  0.         0.4472136
  0.4472136  0.         0.         0.         0.         0.
  0.         0.         0.         0.4472136  0.         0.        ]
 [0.         0.490779   0.         0.         0.490779   0.
  0.         0.         0.         0.4024458  0.490779   0.
  0.3397724  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.41923309 0.         0.         0.         0.41923309
  0.         0.41923309 0.         0.         0.68755426 0.        ]
 [0.         0.         0.         0.   

In [19]:
tfidf.nnz                                    # nnz = number of non-zero entries in the matrix

22

In [20]:
# Non-zero entries divided by total cells (rows * cols) is the matrix *density*;
# sparsity is the complement, 1 - density.
tfidf.nnz/float(tfidf.shape[0]*tfidf.shape[1]) # Density (fraction of non-zero entries)

0.2037037037037037

### Gensim

In [21]:
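# pip install gensim   (NOTE: `lemmatize` is not used in this notebook and was removed from gensim.utils in gensim 4.0; drop it from the import below when running a newer gensim)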
from gensim.utils import simple_preprocess, lemmatize
from gensim.corpora import Dictionary

In [22]:
# Naive whitespace split, shown for contrast only: it keeps case and punctuation
# and is immediately replaced by the simple_preprocess result on the next line.
tokens = [doc.split() for doc in my_docs]
# simple_preprocess tokenizes and lowercases each document; deacc=True also strips accent marks.
tokens = [simple_preprocess(doc, deacc=True) for doc in my_docs]
my_dict = Dictionary(tokens)              # Build a Gensim dictionary (token <-> integer id)
print(my_dict)

Dictionary(27 unique tokens: ['about', 'aliens', 'and', 'movie', 'spaceships']...)


In [23]:
print(my_dict.token2id)                   # Token-to-id mapping (stopwords are still present here)

{'about': 0, 'aliens': 1, 'and': 2, 'movie': 3, 'spaceships': 4, 'this': 5, 'was': 6, 'enjoyed': 7, 'really': 8, 'the': 9, 'action': 10, 'awesome': 11, 'boring': 12, 'but': 13, 'characters': 14, 'scenes': 15, 'alien': 16, 'awful': 17, 'films': 18, 'hate': 19, 'cool': 20, 'is': 21, 'like': 22, 'movies': 23, 'space': 24, 'more': 25, 'please': 26}


In [24]:
len(my_dict.token2id)                     # Number of unique tokens in the dictionary

27

In [25]:
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()          # Single instance, reused for every token in the next cell

In [26]:
# Build the English stopword set once. The original called stopwords.words('english')
# inside the filter, i.e. once per token, and tested membership against the returned
# list (a linear scan). A set built up front gives the same result with O(1) lookups.
english_stopwords = set(stopwords.words('english'))

# Clean each document: lowercase -> alphabetic tokens -> drop stopwords -> lemmatize.
tokens = []
for doc in my_docs:
    words = regexp_tokenize(doc.lower(), r'[A-Za-z]+')       # keep runs of letters only
    words = [w for w in words if w not in english_stopwords]
    words = [lemmatizer.lemmatize(w) for w in words]
    tokens.append(words)
tokens

[['movie', 'spaceship', 'alien'],
 ['really', 'enjoyed', 'movie'],
 ['awesome', 'action', 'scene', 'boring', 'character'],
 ['movie', 'awful', 'hate', 'alien', 'film'],
 ['space', 'cool', 'like', 'space', 'movie'],
 ['space', 'film', 'please']]

In [27]:
my_dict = Dictionary(tokens)              # Rebuild the dictionary from the cleaned, lemmatized tokens
print(my_dict)

Dictionary(17 unique tokens: ['alien', 'movie', 'spaceship', 'enjoyed', 'really']...)


In [28]:
print(my_dict.token2id)                   # Token-to-id mapping after stopword removal and lemmatization

{'alien': 0, 'movie': 1, 'spaceship': 2, 'enjoyed': 3, 'really': 4, 'action': 5, 'awesome': 6, 'boring': 7, 'character': 8, 'scene': 9, 'awful': 10, 'film': 11, 'hate': 12, 'cool': 13, 'like': 14, 'space': 15, 'please': 16}


In [29]:
# Note: You can also add new words or lists of words to a dictionary, save a
# dictionary to a file, load it back later, or read a dictionary from a text file

# my_dict.add_documents(new_tokenized_docs)   # expects an iterable of token lists, e.g. [['new', 'words']]
# my_dict.save('saved_dict.dict')
# loaded_dict = Dictionary.load('saved_dict.dict')
# dictionary = Dictionary(line.split() for line in open('sample.txt', encoding='utf-8'))

In [30]:
# Create Gensim BOW corpus: doc2bow turns each token list into (token_id, count) pairs.
dtm = [my_dict.doc2bow(doc) for doc in tokens]
dtm                                         # Display the corpus

[[(0, 1), (1, 1), (2, 1)],
 [(1, 1), (3, 1), (4, 1)],
 [(5, 1), (6, 1), (7, 1), (8, 1), (9, 1)],
 [(0, 1), (1, 1), (10, 1), (11, 1), (12, 1)],
 [(1, 1), (13, 1), (14, 1), (15, 2)],
 [(11, 1), (15, 1), (16, 1)]]

In [31]:
# Show each bag-of-words with token ids translated back to the words themselves.
for doc in dtm:
    print([[my_dict[token_id], count] for token_id, count in doc])

[['alien', 1], ['movie', 1], ['spaceship', 1]]
[['movie', 1], ['enjoyed', 1], ['really', 1]]
[['action', 1], ['awesome', 1], ['boring', 1], ['character', 1], ['scene', 1]]
[['alien', 1], ['movie', 1], ['awful', 1], ['film', 1], ['hate', 1]]
[['movie', 1], ['cool', 1], ['like', 1], ['space', 2]]
[['film', 1], ['space', 1], ['please', 1]]


In [32]:
from gensim.models import TfidfModel
import numpy as np
tfidf = TfidfModel(dtm)                     # Build the TF-IDF model from the BOW corpus
# Applying the model to the corpus yields (token_id, weight) pairs per document;
# weights are rounded to two decimals purely for display.
for doc in tfidf[dtm]:
    print([[my_dict[token_id], np.around(weight, decimals=2)] for token_id, weight in doc])

[['alien', 0.51], ['movie', 0.19], ['spaceship', 0.84]]
[['movie', 0.16], ['enjoyed', 0.7], ['really', 0.7]]
[['action', 0.45], ['awesome', 0.45], ['boring', 0.45], ['character', 0.45], ['scene', 0.45]]
[['alien', 0.37], ['movie', 0.14], ['awful', 0.6], ['film', 0.37], ['hate', 0.6]]
[['movie', 0.12], ['cool', 0.53], ['like', 0.53], ['space', 0.65]]
[['film', 0.46], ['space', 0.46], ['please', 0.76]]


In [33]:
tfidf[dtm[0]]                              # TF-IDF weights for document 0 as (token_id, weight) pairs

[(0, 0.5132496009471523), (1, 0.1894251567009166), (2, 0.8370740451933879)]

In [34]:
tfidf[dtm[3]]                              # Rare words weighted higher (compare ids 10 and 12 with id 1, 'movie')

[(0, 0.3662223291232878),
 (1, 0.13516176525716664),
 (10, 0.5972828929894088),
 (11, 0.3662223291232878),
 (12, 0.5972828929894088)]