In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [4]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## **Skearn**

In [5]:
def preprocess(document):
    '''
    Function to to lower case and removes stopwords
    '''

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]

    # join words to make sentence
    document = " ".join(words)
    
    return document

In [6]:
documents = ["Gangs of Wasseypur is a great movie. Wasseypur is a town in Bihar.", "The success of a song depends on the music.", "There is a new movie releasing this week. The movie is fun to watch."]

In [7]:
documents = [preprocess(document) for document in documents]
documents

['gangs wasseypur great movie . wasseypur town bihar .',
 'success song depends music .',
 'new movie releasing week . movie fun watch .']

### 1) BOW

**Creating bag of words model using count vectorizer function**

In [8]:

def bow_vec(documents, ngram_range=(1,1)):
    '''
    Function to convert a collection of text documents to a matrix of token counts.
    
    Args:
      documents (list): List of documents
      ngram_range (tuple):  word n-grams

    '''

    vectorizer = CountVectorizer(ngram_range=ngram_range)
    bow_model = vectorizer.fit_transform(documents)
    return vectorizer, bow_model


In [9]:
bow_vectorizer, bow_model = bow_vec(documents)

In [10]:
bow_model.toarray()

array([[1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 2, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 1, 1]])

In [11]:
pd.DataFrame(bow_model.toarray(), columns = bow_vectorizer.get_feature_names_out())

Unnamed: 0,bihar,depends,fun,gangs,great,movie,music,new,releasing,song,success,town,wasseypur,watch,week
0,1,0,0,1,1,1,0,0,0,0,0,1,2,0,0
1,0,1,0,0,0,0,1,0,0,1,1,0,0,0,0
2,0,0,1,0,0,2,0,1,1,0,0,0,0,1,1


### 2) TFIDF

**Creating bag of words model using TfidfVectorizer function**

In [12]:
def tfidf_vec(documents, ngram_range=(1, 1)):
    '''
    Function to transform a count matrix to a normalized tf or tf-idf representation.
    
    Args:
      documents (list): List of documents
      ngram_range (tuple):  word n-grams

    '''
    vectorizer = TfidfVectorizer(
        strip_accents='unicode',    # Remove accents and perform other character normalization during the preprocessing step. 
        analyzer='word',            # Whether the feature should be made of word or character n-grams.
        token_pattern=r'\w{1,}',    # Regular expression denoting what constitutes a “token”, only used if analyzer == 'word'
        ngram_range=ngram_range,         # The lower and upper boundary of the range of n-values for different n-grams to be extracted
        stop_words='english')

    vectorizer.fit(documents)    # Fiting it on Train
    tfid_model = vectorizer.transform(documents)
    return vectorizer, tfid_model


In [13]:
tfidf_vectorizer, tfidf_model = tfidf_vec(documents)

In [14]:
tfidf_model.toarray()

array([[0.34142622, 0.        , 0.        , 0.34142622, 0.34142622,
        0.25966344, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.34142622, 0.68285244, 0.        , 0.        ],
       [0.        , 0.5       , 0.        , 0.        , 0.        ,
        0.        , 0.5       , 0.        , 0.        , 0.5       ,
        0.5       , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.36977238, 0.        , 0.        ,
        0.56244284, 0.        , 0.36977238, 0.36977238, 0.        ,
        0.        , 0.        , 0.        , 0.36977238, 0.36977238]])

In [15]:
pd.DataFrame(tfidf_model.toarray(), columns = tfidf_vectorizer.get_feature_names_out())

Unnamed: 0,bihar,depends,fun,gangs,great,movie,music,new,releasing,song,success,town,wasseypur,watch,week
0,0.341426,0.0,0.0,0.341426,0.341426,0.259663,0.0,0.0,0.0,0.0,0.0,0.341426,0.682852,0.0,0.0
1,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0
2,0.0,0.0,0.369772,0.0,0.0,0.562443,0.0,0.369772,0.369772,0.0,0.0,0.0,0.0,0.369772,0.369772


In [16]:
# def create_wordfreq(corpus):
#   wordfreq = {}
#   for sentence in corpus:
#       tokens = nltk.word_tokenize(sentence)
#       for token in tokens:
#           if token not in wordfreq.keys():
#               wordfreq[token] = 1
#           else:
#               wordfreq[token] += 1
#   return wordfreq


## **From Scratch**

In [17]:
documents = [preprocess(document) for document in documents]
documents

['gangs wasseypur great movie . wasseypur town bihar .',
 'success song depends music .',
 'new movie releasing week . movie fun watch .']

In [18]:
corpus_list = list(set([word for document in documents for word in document.split()]))
print(corpus_list)

['movie', 'releasing', 'watch', 'music', 'success', 'song', 'bihar', 'wasseypur', 'week', 'gangs', 'depends', 'new', 'great', 'fun', 'town', '.']


### 1) BOW

In [19]:
def bow_scratch(documents, corpus_list):
    '''
    Function to convert a collection of text documents to a matrix of token counts.
    
    Args:
      documents (list): List of documents
      corpus_list (list):  List of unique words

    Returns:
      np.array: Array
    '''

    sentence_vectors = [
        [1 if token in nltk.word_tokenize(sentence) else 0 for token in corpus_list] 
        for sentence in documents
    ]
    return np.asarray(sentence_vectors)

In [20]:
bow_vect = bow_scratch(documents, corpus_list)
bow_vect

array([[1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1],
       [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
       [1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1]])

In [21]:
df_bow = pd.DataFrame(bow_vect, columns=corpus_list)
df_bow

Unnamed: 0,movie,releasing,watch,music,success,song,bihar,wasseypur,week,gangs,depends,new,great,fun,town,.
0,1,0,0,0,0,0,1,1,0,1,0,0,1,0,1,1
1,0,0,0,1,1,1,0,0,0,0,1,0,0,0,0,1
2,1,1,1,0,0,0,0,0,1,0,0,1,0,1,0,1


### 2) TFIDF

In [22]:
def get_tf(documents, corpus_list):
  '''
  Function to get TF.
  
  Args:
    documents (list): List of documents
    corpus_list (list):  List of unique words

  Returns:
    Dataframe with TF
  '''

  n_docs = len(documents)
  n_words_set = len(corpus_list)

  df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=corpus_list)

  for i in range(n_docs):
      words = documents[i].split(' ') # Words in the document
      for w in words:
          df_tf[w][i] = df_tf[w][i] + (1 / len(words))
  return df_tf


def get_idf(documents, corpus_list):
  '''
  Function to get IDF.
  
  Args:
    documents (list): List of documents
    corpus_list (list):  List of unique words

  Returns:
    Dict with IDF
  '''

  idf = {}
  n_docs = len(documents)
  n_words_set = len(corpus_list)
 
  for w in corpus_list:
      k = 0
      for i in range(n_docs):
          if w in documents[i].split():
              k += 1
              
      idf[w] =  np.log10(n_docs / k)

  return idf


In [23]:
df_tf = get_tf(documents, corpus_list)
df_tf

Unnamed: 0,movie,releasing,watch,music,success,song,bihar,wasseypur,week,gangs,depends,new,great,fun,town,.
0,0.111111,0.0,0.0,0.0,0.0,0.0,0.111111,0.222222,0.0,0.111111,0.0,0.0,0.111111,0.0,0.111111,0.222222
1,0.0,0.0,0.0,0.2,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.2
2,0.222222,0.111111,0.111111,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.111111,0.0,0.111111,0.0,0.222222


In [24]:
idf = get_idf(documents, corpus_list)
idf

{'movie': 0.17609125905568124,
 'releasing': 0.47712125471966244,
 'watch': 0.47712125471966244,
 'music': 0.47712125471966244,
 'success': 0.47712125471966244,
 'song': 0.47712125471966244,
 'bihar': 0.47712125471966244,
 'wasseypur': 0.47712125471966244,
 'week': 0.47712125471966244,
 'gangs': 0.47712125471966244,
 'depends': 0.47712125471966244,
 'new': 0.47712125471966244,
 'great': 0.47712125471966244,
 'fun': 0.47712125471966244,
 'town': 0.47712125471966244,
 '.': 0.0}

In [25]:
df_tf_idf = df_tf*idf

In [26]:
df_tf_idf

Unnamed: 0,movie,releasing,watch,music,success,song,bihar,wasseypur,week,gangs,depends,new,great,fun,town,.
0,0.019566,0.0,0.0,0.0,0.0,0.0,0.053013,0.106027,0.0,0.053013,0.0,0.0,0.053013,0.0,0.053013,0.0
1,0.0,0.0,0.0,0.095424,0.095424,0.095424,0.0,0.0,0.0,0.0,0.095424,0.0,0.0,0.0,0.0,0.0
2,0.039131,0.053013,0.053013,0.0,0.0,0.0,0.0,0.0,0.053013,0.0,0.0,0.053013,0.0,0.053013,0.0,0.0


## **N-Grams**

In [27]:
def get_n_grams(sentence, gram=1):
  ngrams_list = []
  for i in range(len(sentence.split()) - (gram-1)):
    ngrams_list.append(" ".join(sentence.split()[i:i+gram]))
  return ngrams_list

In [28]:
sentence = "I will go to United States"
ngrams_list = get_n_grams(sentence, gram=2)
ngrams_list

['I will', 'will go', 'go to', 'to United', 'United States']

In [29]:
sentence = "I will go to United States"
ngrams_list = get_n_grams(sentence, gram=3)
ngrams_list

['I will go', 'will go to', 'go to United', 'to United States']

In [30]:
sentence = "I will go to United States"
ngrams_list = get_n_grams(sentence, gram=6)
ngrams_list

['I will go to United States']