<a href="https://colab.research.google.com/github/vishnuvardhan-jadava/BoW/blob/main/BoW_Amazon_Fine_Food_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#importing libraries
import re
import pandas as pd
import sqlite3
from tqdm import tqdm

In [3]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Read every review whose score is not 3 from the SQLite database into a DataFrame.
db_path = '/content/drive/MyDrive/Dataset/database.sqlite'
query = 'select * from reviews where score <>3'
con = sqlite3.connect(db_path)
df = pd.read_sql_query(query, con)

In [5]:
# (rows, columns) of the frame returned by the query.
df.shape

(525814, 10)

In [6]:
# Column labels available in the loaded frame.
df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [7]:
# Keep only rows where HelpfulnessNumerator does not exceed HelpfulnessDenominator.
consistent_votes = df['HelpfulnessNumerator'] <= df['HelpfulnessDenominator']
df = df[consistent_votes]

In [8]:
# (rows, columns) after the helpfulness filter.
df.shape

(525812, 10)

In [9]:
# Number of reviews per value of the Score column.
df.Score.value_counts()

5    363121
4     80654
1     52268
2     29769
Name: Score, dtype: int64

In [10]:
def neg_pos(x):
  """Map a numeric review score to a sentiment label.

  Returns 'negative' when ``x`` is below 3 and 'positive' for every
  other value.
  """
  return 'negative' if x < 3 else 'positive'

In [11]:
# Replace each numeric Score with the label returned by neg_pos.
df['Score'] = df['Score'].map(neg_pos)

In [12]:
# Number of reviews per Score label after the neg_pos mapping.
df.Score.value_counts()

positive    443775
negative     82037
Name: Score, dtype: int64

In [13]:
# Sort by 'ProductId' (ascending), then drop rows that repeat the same
# 'UserId', 'ProfileName', 'Time' and 'Text', keeping the first occurrence.
df = (
    df.sort_values(by='ProductId', ascending=True)
      .drop_duplicates(subset=['UserId', 'ProfileName', 'Time', 'Text'], keep='first')
)

In [14]:
# (rows, columns) after removing duplicate reviews.
df.shape

(364171, 10)

In [15]:
# Count the reviews whose text contains at least one basic HTML tag.
html_markers = ('<br />', '<br/>', '<html />', '<html/>')
j = sum(
    1
    for review in df.Text.values
    if any(marker in review for marker in html_markers)
)
print(j)

90294


In [16]:
# Functions to remove HTML tags, URLs, punctuation, words containing numbers, and runs of more than one space.
def clean_html_tags(x):
    """Replace every ``<...>`` tag in ``x`` with a single space.

    The match is non-greedy, so each tag is replaced individually.
    """
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub(' ', x)
def clean_punctuation(x):
    """Replace each run of characters outside a-z, A-Z and 0-9 with one space."""
    non_alnum_run = re.compile(r'[^a-zA-Z0-9]+')
    return non_alnum_run.sub(' ', x)
def remove_url(x):
    """Replace every URL-like token in ``x`` with a single space.

    A token is anything starting with ``http`` and running up to the next
    whitespace character, so both ``http://`` and ``https://`` links match.
    """
    # Raw string: '\S' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r'http\S+', ' ', x)
def remove_words_with_numbers(x):
    """Replace tokens that contain digits with a single space.

    Each match is a run of digits together with any ASCII letters directly
    attached before or after it.
    """
    digit_word = re.compile(r'[a-zA-Z]*[0-9]+[a-zA-Z]*')
    return digit_word.sub(' ', x)
def remove_more_than_one_space(x):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character on its own is left unchanged.
    """
    whitespace_run = re.compile(r'[\s]{2,}')
    return whitespace_run.sub(' ', x)
def decontracted(phrase):
    """Expand common English contractions into their full forms.

    Matching is case-insensitive, so capitalised contractions are expanded
    the same way as lower-case ones ("Won't" -> "Will not", "DON'T" ->
    "DO not"). With case-sensitive rules "Won't" would skip the specific
    rule and be turned into "Wo not" by the general "n't" rule.

    Note: "'s" is always expanded to " is", so possessives are rewritten
    as well ("dog's" -> "dog is").

    Args:
        phrase: text that may contain contractions.

    Returns:
        The text with the known contractions expanded.
    """
    # specific: irregular forms, handled before the general "n't" rule.
    # The captured first letter is reused so the original capitalisation stays.
    phrase = re.sub(r"(w)on't", r"\1ill not", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"(c)an't", r"\1an not", phrase, flags=re.IGNORECASE)

    # general: applied in this order
    general_rules = (
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, expansion in general_rules:
        phrase = re.sub(pattern, expansion, phrase, flags=re.IGNORECASE)
    return phrase

def clean_text(text):
    """Run the full cleaning pipeline on one review text.

    Steps, in order: expand contractions, strip HTML tags, strip URLs,
    replace punctuation, drop words containing numbers, lower-case, and
    collapse runs of more than one space.
    """
    pipeline = (
        decontracted,
        clean_html_tags,
        remove_url,
        clean_punctuation,
        remove_words_with_numbers,
        str.lower,
        remove_more_than_one_space,
    )
    for step in pipeline:
        text = step(text)
    return text

In [17]:
# Store the cleaned version of every review's Text in a new 'cleaned_text' column.
df['cleaned_text'] = df['Text'].map(clean_text)

In [18]:
def fit(dataset):
  """Build a vocabulary from a list of whitespace-separated sentences.

  Words shorter than two characters are ignored. The remaining unique
  words are sorted and each is mapped to its position in that order.

  Returns a dict of word -> index, or the string 'pass a list' when
  ``dataset`` is not a list.
  """
  if not isinstance(dataset, list):
    return 'pass a list'
  kept_words = {
    token
    for sentence in dataset
    for token in sentence.split()
    if len(token) >= 2
  }
  return {token: position for position, token in enumerate(sorted(kept_words))}
        

In [19]:
from collections import Counter
from scipy.sparse import csr_matrix

def transform(dataset,vocab,dense=True):
  """Turn sentences into a bag-of-words count matrix using ``vocab``.

  Row ``r`` and column ``vocab[word]`` hold how many times ``word`` occurs
  in ``dataset[r]``. Words shorter than two characters and words missing
  from ``vocab`` are ignored.

  Args:
    dataset: list of whitespace-separated sentences.
    vocab: dict mapping word -> column index (as produced by ``fit``).
    dense: when True (the default, matching the previous behaviour) the
      counts are returned as a dense array; when False the scipy CSR
      matrix is returned, which avoids allocating
      ``len(dataset) * len(vocab)`` cells.

  Returns:
    The count matrix of shape ``(len(dataset), len(vocab))``, or the
    string 'pass a list' when ``dataset`` is not a list.
  """
  if not isinstance(dataset,(list)):
    return 'pass a list'
  row=[]
  col=[]
  values=[]
  for row_num,sent in enumerate(tqdm(dataset)):
    # Counter already behaves like a dict, so it is iterated directly.
    for word,freq in Counter(sent.split()).items():
      if len(word)<2:
        continue
      list_idx = vocab.get(word,-1)
      if list_idx != -1:
        row.append(row_num)
        col.append(list_idx)
        values.append(freq)
  counts = csr_matrix((values, (row,col)), shape=(len(dataset),len(vocab)))
  return counts.toarray() if dense else counts

In [20]:
# Build the vocabulary and the count matrix from the first 10000 cleaned reviews.
sample_texts = df['cleaned_text'][:10000].tolist()
vocab = fit(sample_texts)
trans = transform(sample_texts, vocab)

100%|██████████| 10000/10000 [00:00<00:00, 18127.00it/s]


In [21]:
# Display the word -> index mapping returned by fit.
vocab

{'aa': 0,
 'aaaaa': 1,
 'aaaaaahhhhhyaaaaaa': 2,
 'aaaallll': 3,
 'aaah': 4,
 'aaahhhhhh': 5,
 'aacute': 6,
 'aafco': 7,
 'aahing': 8,
 'ab': 9,
 'abandon': 10,
 'abandoned': 11,
 'abattoir': 12,
 'abbey': 13,
 'abby': 14,
 'abdomen': 15,
 'abdominal': 16,
 'aberration': 17,
 'abhor': 18,
 'abhorrent': 19,
 'abide': 20,
 'abiding': 21,
 'abilities': 22,
 'ability': 23,
 'abj': 24,
 'abk': 25,
 'able': 26,
 'ablity': 27,
 'abnormal': 28,
 'abnormalities': 29,
 'abnormally': 30,
 'abomination': 31,
 'abominations': 32,
 'abound': 33,
 'abour': 34,
 'about': 35,
 'above': 36,
 'abowt': 37,
 'abraham': 38,
 'abrasions': 39,
 'abroad': 40,
 'abrupt': 41,
 'abruptly': 42,
 'absence': 43,
 'absent': 44,
 'absolute': 45,
 'absolutely': 46,
 'absolutey': 47,
 'absolutley': 48,
 'absolutly': 49,
 'absorb': 50,
 'absorbed': 51,
 'absorbency': 52,
 'absorbent': 53,
 'absorbing': 54,
 'absorbs': 55,
 'absorbtion': 56,
 'absorption': 57,
 'abstract': 58,
 'absurd': 59,
 'absurdity': 60,
 'abuelita':

In [22]:
# Display the count matrix returned by transform.
trans

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [25]:
# The vocabulary size should equal the number of columns in the count matrix.
print(len(vocab))
print(trans.shape)

22085
(10000, 22085)
