<a href="https://colab.research.google.com/github/vishnuvardhan-jadava/BoW/blob/main/BoW_Fit_Transform_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
#importing libraries
import re
import pandas as pd
import sqlite3
from tqdm import tqdm

In [2]:
# Colab-only: mount Google Drive at /content/drive/ so the SQLite file
# under MyDrive/Dataset can be opened by the next cell.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
# Read a 10-row sample of the reviews table, skipping rows whose score is 3.
con = sqlite3.connect('/content/drive/MyDrive/Dataset/database.sqlite')
try:
    df = pd.read_sql_query('select * from reviews where score <> 3 limit 10',con)
finally:
    # The rows are fully materialised in `df`, so release the file handle;
    # sqlite3 connections are not closed automatically.
    con.close()

In [6]:
# Inspect which columns the query returned.
df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [7]:
# Helper functions that strip HTML tags, URLs, punctuation, and words containing digits, and collapse repeated whitespace.
def clean_html_tags(x):
    """Replace each HTML-style tag in ``x`` with a single space.

    A tag is the shortest ``<...>`` span on one line; ``.`` does not match a
    newline here, so a tag broken across lines is left untouched.
    """
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub(r' ', x)
def clean_punctuation(x):
    """Turn every run of characters other than ASCII letters/digits into one space.

    This also replaces whitespace, underscores and non-ASCII letters, not
    only punctuation marks.
    """
    non_alnum_run = re.compile(r'[^a-zA-Z0-9]+')
    return non_alnum_run.sub(r' ', x)
def remove_url(x):
    """Replace every URL-like token in ``x`` with a single space.

    A token is the literal text ``http`` followed by at least one
    non-whitespace character, extended up to the next whitespace; this covers
    both ``http://`` and ``https://`` links. Matching is case-sensitive.
    """
    # Raw string on purpose: in a plain literal '\S' is an invalid escape
    # sequence, which newer Python versions warn about at compile time.
    return re.sub(r'http\S+', r' ', x)
def remove_words_with_numbers(x):
    """Replace each digit run, plus any ASCII letters touching it, with one space.

    A token that mixes several digit runs (e.g. ``a1b2``) is consumed in
    more than one match, so it can leave more than one space behind.
    """
    digit_word = re.compile(r'[a-zA-Z]*[0-9]+[a-zA-Z]*')
    return digit_word.sub(r' ', x)
def remove_more_than_one_space(x):
    """Collapse every run of two or more whitespace characters into one space.

    A lone whitespace character (including a single leading or trailing one)
    is left as it is.
    """
    whitespace_run = re.compile(r'[\s]{2,}')
    return whitespace_run.sub(r' ', x)
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The irregular forms "won't" and "can't" are handled first, keeping the
    case of their first letter ("Won't" -> "Will not"). The generic suffix
    rules then run in a fixed order. Only straight apostrophes and lowercase
    suffixes are recognised.

    Known limitation: every ``'s`` becomes " is", so possessives such as
    "John's" are expanded as well.

    Returns the expanded string.
    """
    # specific: must run before the generic "n't" rule, which would otherwise
    # turn "won't"/"Won't" into "wo not"/"Wo not" and "can't" into "ca not".
    phrase = re.sub(r"([Ww])on't", r"\1ill not", phrase)
    phrase = re.sub(r"([Cc])an't", r"\1an not", phrase)

    # general: applied in this order; "n't" precedes the bare "'t" rule.
    suffix_rules = (
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, expansion in suffix_rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

def clean_text(text):
    """Clean ``text`` into lowercase words separated by single spaces.

    Steps, in order: lowercase, expand contractions, remove HTML tags, remove
    URLs, replace punctuation with spaces, remove words containing digits,
    and collapse runs of more than one space. A single leading or trailing
    space may remain in the result.
    """
    # Lowercase first: the contraction and URL regexes are case-sensitive, so
    # inputs like "Won't" or "HTTP://..." would otherwise not be handled.
    text = text.lower()
    text = decontracted(text)
    text = clean_html_tags(text)
    text = remove_url(text)
    text = clean_punctuation(text)
    text = remove_words_with_numbers(text)
    text = remove_more_than_one_space(text)
    return text

In [8]:
# Add a 'cleaned_text' column with clean_text applied to every 'Text' value.
df['cleaned_text'] = df['Text'].map(clean_text)

In [39]:
def fit(dataset):
  """Build a bag-of-words vocabulary from a list of sentences.

  Each sentence is split on whitespace; tokens shorter than two characters
  are dropped. The remaining unique tokens are sorted and numbered from 0.

  Returns a dict mapping token -> column index, or the string 'pass a list'
  when ``dataset`` is not a list.
  """
  if not isinstance(dataset, list):
    return 'pass a list'

  tokens = {tok for sentence in dataset for tok in sentence.split() if len(tok) >= 2}
  return {tok: position for position, tok in enumerate(sorted(tokens))}

In [23]:
# Build the vocabulary from the cleaned reviews (fit only accepts a list).
# Note: `vocab` is reassigned further down by the toy `strings` example.
vocab = fit(list(df['cleaned_text']))

In [44]:
from collections import Counter
from scipy.sparse import csr_matrix
def transform(dataset,vocab):
  """Convert a list of sentences into a dense document-term count matrix.

  Each sentence is split on whitespace and its tokens are counted. A token
  contributes only if it is at least two characters long and ``vocab`` maps
  it to a column index; all other tokens are ignored. Progress is shown
  with tqdm.

  Returns an array of shape (len(dataset), len(vocab)) with the per-sentence
  counts, or the string 'pass a list' when ``dataset`` is not a list.
  """
  if not isinstance(dataset, list):
    return 'pass a list'

  # Coordinate-format triplets for the sparse matrix constructor.
  row_ids, col_ids, counts = [], [], []
  for doc_no, doc in enumerate(tqdm(dataset)):
    term_counts = Counter(doc.split())
    for token in term_counts:
      # -1 marks "no column": token too short or missing from vocab.
      position = vocab.get(token, -1) if len(token) >= 2 else -1
      if position == -1:
        continue
      row_ids.append(doc_no)
      col_ids.append(position)
      counts.append(term_counts[token])

  matrix_shape = (len(dataset), len(vocab))
  sparse_counts = csr_matrix((counts, (row_ids, col_ids)), shape=matrix_shape)
  return sparse_counts.toarray()

In [45]:
# Toy two-sentence corpus to exercise the hand-written fit/transform.
strings = [
    "the method of lagrange multipliers is the economists workhorse for solving optimization problems",
    "the technique is a centerpiece of economic theory but unfortunately its usually taught poorly",
]
vocab = fit(strings)
# Iterating a dict yields its keys, i.e. the sorted vocabulary words.
print(list(vocab))
print(transform(strings, vocab))

100%|██████████| 2/2 [00:00<00:00, 7163.63it/s]

['but', 'centerpiece', 'economic', 'economists', 'for', 'is', 'its', 'lagrange', 'method', 'multipliers', 'of', 'optimization', 'poorly', 'problems', 'solving', 'taught', 'technique', 'the', 'theory', 'unfortunately', 'usually', 'workhorse']
[[0 0 0 1 1 1 0 1 1 1 1 1 0 1 1 0 0 2 0 0 0 1]
 [1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 1 1 1 1 1 0]]





In [49]:
from sklearn.feature_extraction.text import CountVectorizer

# Reference implementation to compare against. Its token_pattern (see the
# repr printed below) keeps tokens of two or more word characters, which is
# what the len(word) < 2 filter in fit/transform mirrors.
vec = CountVectorizer(analyzer='word')

In [51]:
# Learn the vocabulary from the same toy corpus passed to fit() above.
vec.fit(strings)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [52]:
# Densify scikit-learn's sparse count matrix so it can be compared with the
# array returned by the hand-written transform().
trans = vec.transform(strings).toarray()

In [53]:
# Display the scikit-learn counts; in the recorded output they match the
# matrix printed by the hand-written transform().
trans

array([[0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 0, 0, 0, 1],
       [1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0]])