# Task 1

# Read Source Data

In [1]:
# import library
import pandas as pd

In [2]:
# read data - first 1000 rows
data = pd.read_csv("Reviews.csv", nrows=1000)

In [3]:
# check first 5 rows
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
# check no. of rows & columns
data.shape

(1000, 10)

# Remove Duplicates

In [5]:
# select Text column
data_t = data['Text']
data_t.head()

0    I have bought several of the Vitality canned d...
1    Product arrived labeled as Jumbo Salted Peanut...
2    This is a confection that has been around a fe...
3    If you are looking for the secret ingredient i...
4    Great taffy at a great price.  There was a wid...
Name: Text, dtype: object

In [6]:
# check no. of duplicates in Text column
data_t_duplicates = data_t.duplicated()
print(data_t_duplicates.sum())

3


In [7]:
# remove duplicates in Text column
data_rdup = data_t.drop_duplicates()
data_rdup.shape

(997,)

In [8]:
# check any row
print(data_rdup[18])

Twizzlers, Strawberry my childhood favorite candy, made in Lancaster Pennsylvania by Y & S Candies, Inc. one of the oldest confectionery Firms in the United States, now a Subsidiary of the Hershey Company, the Company was established in 1845 as Young and Smylie, they also make Apple Licorice Twists, Green Color and Blue Raspberry Licorice Twists, I like them all<br /><br />I keep it in a dry cool place because is not recommended it to put it in the fridge. According to the Guinness Book of Records, the longest Licorice Twist ever made measured 1.200 Feet (370 M) and weighted 100 Pounds (45 Kg) and was made by Y & S Candies, Inc. This Record-Breaking Twist became a Guinness World Record on July 19, 1998. This Product is Kosher! Thank You


# Data Cleaning & Standardization

In [9]:
# import library
import re
import string

In [10]:
#.strip() is to remove white spaces after you do substitution
def get_cleaned_textdata(sentence):
    modified_sentence = re.sub(r'<.*?>',' ', sentence)
    modified_sentence = ''.join([i if i not in string.punctuation else ' ' for i in modified_sentence])
    modified_sentence = re.sub(r'\d+', ' ', modified_sentence)
    modified_sentence = re.sub(r'\s+', ' ', modified_sentence)
    modified_sentence = re.sub(r'also', ' ', modified_sentence)
    modified_sentence = modified_sentence.strip().lower()
    return modified_sentence

In [11]:
data_rdupclean = data_rdup.apply(get_cleaned_textdata)

In [12]:
# check result of data cleaning
print(data_rdup[18],"\n")
print(data_rdupclean[18])

Twizzlers, Strawberry my childhood favorite candy, made in Lancaster Pennsylvania by Y & S Candies, Inc. one of the oldest confectionery Firms in the United States, now a Subsidiary of the Hershey Company, the Company was established in 1845 as Young and Smylie, they also make Apple Licorice Twists, Green Color and Blue Raspberry Licorice Twists, I like them all<br /><br />I keep it in a dry cool place because is not recommended it to put it in the fridge. According to the Guinness Book of Records, the longest Licorice Twist ever made measured 1.200 Feet (370 M) and weighted 100 Pounds (45 Kg) and was made by Y & S Candies, Inc. This Record-Breaking Twist became a Guinness World Record on July 19, 1998. This Product is Kosher! Thank You 

twizzlers strawberry my childhood favorite candy made in lancaster pennsylvania by y s candies inc one of the oldest confectionery firms in the united states now a subsidiary of the hershey company the company was established in as young and smylie th

# Tokenization

In [13]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Qeme\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
from nltk.tokenize import word_tokenize

In [15]:
data_token = data_rdupclean.apply(word_tokenize)

In [16]:
# check result of tokenization
print(data_rdupclean[18],"\n")
print(data_token[18])

twizzlers strawberry my childhood favorite candy made in lancaster pennsylvania by y s candies inc one of the oldest confectionery firms in the united states now a subsidiary of the hershey company the company was established in as young and smylie they   make apple licorice twists green color and blue raspberry licorice twists i like them all i keep it in a dry cool place because is not recommended it to put it in the fridge according to the guinness book of records the longest licorice twist ever made measured feet m and weighted pounds kg and was made by y s candies inc this record breaking twist became a guinness world record on july this product is kosher thank you 

['twizzlers', 'strawberry', 'my', 'childhood', 'favorite', 'candy', 'made', 'in', 'lancaster', 'pennsylvania', 'by', 'y', 's', 'candies', 'inc', 'one', 'of', 'the', 'oldest', 'confectionery', 'firms', 'in', 'the', 'united', 'states', 'now', 'a', 'subsidiary', 'of', 'the', 'hershey', 'company', 'the', 'company', 'was',

# Remove Stopwords

In [17]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Qeme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
stopwords = nltk.corpus.stopwords.words('english')

In [19]:
def remove_stopwords(text):
    output = []
    for i in text:
        if i not in stopwords:
            output.append(i)
    return output

In [20]:
data_xstopwords = data_token.apply(remove_stopwords)

In [21]:
# check result of stopwords removal
print(data_token[18],"\n")
print(data_xstopwords[18])

['twizzlers', 'strawberry', 'my', 'childhood', 'favorite', 'candy', 'made', 'in', 'lancaster', 'pennsylvania', 'by', 'y', 's', 'candies', 'inc', 'one', 'of', 'the', 'oldest', 'confectionery', 'firms', 'in', 'the', 'united', 'states', 'now', 'a', 'subsidiary', 'of', 'the', 'hershey', 'company', 'the', 'company', 'was', 'established', 'in', 'as', 'young', 'and', 'smylie', 'they', 'make', 'apple', 'licorice', 'twists', 'green', 'color', 'and', 'blue', 'raspberry', 'licorice', 'twists', 'i', 'like', 'them', 'all', 'i', 'keep', 'it', 'in', 'a', 'dry', 'cool', 'place', 'because', 'is', 'not', 'recommended', 'it', 'to', 'put', 'it', 'in', 'the', 'fridge', 'according', 'to', 'the', 'guinness', 'book', 'of', 'records', 'the', 'longest', 'licorice', 'twist', 'ever', 'made', 'measured', 'feet', 'm', 'and', 'weighted', 'pounds', 'kg', 'and', 'was', 'made', 'by', 'y', 's', 'candies', 'inc', 'this', 'record', 'breaking', 'twist', 'became', 'a', 'guinness', 'world', 'record', 'on', 'july', 'this', 'p

# Stemming

In [22]:
from nltk.stem.porter import PorterStemmer

In [23]:
porter_stemmer = PorterStemmer()

In [24]:
def porter_stemming(text):
    stem_text = []
    for word in text:
        stemmed_word = porter_stemmer.stem(word)
        stem_text.append(stemmed_word)
    return stem_text

In [25]:
data_porterstem = data_xstopwords.apply(porter_stemming)

In [26]:
# check result after Porter stemming
print(data_xstopwords[18], "\n\n")
print(data_porterstem[18])

['twizzlers', 'strawberry', 'childhood', 'favorite', 'candy', 'made', 'lancaster', 'pennsylvania', 'candies', 'inc', 'one', 'oldest', 'confectionery', 'firms', 'united', 'states', 'subsidiary', 'hershey', 'company', 'company', 'established', 'young', 'smylie', 'make', 'apple', 'licorice', 'twists', 'green', 'color', 'blue', 'raspberry', 'licorice', 'twists', 'like', 'keep', 'dry', 'cool', 'place', 'recommended', 'put', 'fridge', 'according', 'guinness', 'book', 'records', 'longest', 'licorice', 'twist', 'ever', 'made', 'measured', 'feet', 'weighted', 'pounds', 'kg', 'made', 'candies', 'inc', 'record', 'breaking', 'twist', 'became', 'guinness', 'world', 'record', 'july', 'product', 'kosher', 'thank'] 


['twizzler', 'strawberri', 'childhood', 'favorit', 'candi', 'made', 'lancast', 'pennsylvania', 'candi', 'inc', 'one', 'oldest', 'confectioneri', 'firm', 'unit', 'state', 'subsidiari', 'hershey', 'compani', 'compani', 'establish', 'young', 'smyli', 'make', 'appl', 'licoric', 'twist', 'gre

# Lemmatization

In [27]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Qeme\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [28]:
#importing the Lemmatizer function from nltk library
from nltk.stem import WordNetLemmatizer

In [29]:
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

In [30]:
#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = []
    for word in text:
        lemmatized_word = wordnet_lemmatizer.lemmatize(word)
        lemm_text.append(lemmatized_word)
    return lemm_text

In [31]:
#applying the lemmatizer function to data_porterstem
data_lemmatized1 = data_porterstem.apply(lemmatizer)
data_lemmatized2 = data_xstopwords.apply(lemmatizer)

In [32]:
# check result after Lemmatization
print(data_porterstem[18], "\n\n")
print(data_lemmatized1[18], "\n\n")
print(data_xstopwords[18], "\n\n")
print(data_lemmatized2[18], "\n\n")

['twizzler', 'strawberri', 'childhood', 'favorit', 'candi', 'made', 'lancast', 'pennsylvania', 'candi', 'inc', 'one', 'oldest', 'confectioneri', 'firm', 'unit', 'state', 'subsidiari', 'hershey', 'compani', 'compani', 'establish', 'young', 'smyli', 'make', 'appl', 'licoric', 'twist', 'green', 'color', 'blue', 'raspberri', 'licoric', 'twist', 'like', 'keep', 'dri', 'cool', 'place', 'recommend', 'put', 'fridg', 'accord', 'guin', 'book', 'record', 'longest', 'licoric', 'twist', 'ever', 'made', 'measur', 'feet', 'weight', 'pound', 'kg', 'made', 'candi', 'inc', 'record', 'break', 'twist', 'becam', 'guin', 'world', 'record', 'juli', 'product', 'kosher', 'thank'] 


['twizzler', 'strawberri', 'childhood', 'favorit', 'candi', 'made', 'lancast', 'pennsylvania', 'candi', 'inc', 'one', 'oldest', 'confectioneri', 'firm', 'unit', 'state', 'subsidiari', 'hershey', 'compani', 'compani', 'establish', 'young', 'smyli', 'make', 'appl', 'licoric', 'twist', 'green', 'color', 'blue', 'raspberri', 'licoric',

# Task 2
## Convert text data into numerical features

In [33]:
# import another library
import math
from collections import Counter

In [34]:
# create functions for calculating the tf, idf and tf-idf
def compute_tf(document):
    word_count = Counter(document)
    tf = {word: count/len(document) for word, count in word_count.items()}
    return tf

def compute_idf(documents):
    N = len(documents)
    idf = {}
    all_words = set(word for doc in documents for word in doc)
    for word in all_words:
        count = sum(1 for doc in documents if word in doc)
        idf[word] = math.log(N/count)
    return idf

def compute_tfidf(document, idf):
    tfidf = {}
    tf = compute_tf(document)
    for word, tf_value in tf.items():
        tfidf[word] = tf_value * idf[word]
    return tfidf

In [35]:
# Now compute the tf or the data that already lemmetised
tf_data1 = [compute_tf(doc) for doc in data_lemmatized1]
tf_data2 = [compute_tf(doc) for doc in data_lemmatized2]

In [36]:
# Create DataFrame for TF
tf_df1 = pd.DataFrame(tf_data1).fillna(0)
print("TF df1 Scores:")
print(tf_df1)

TF df1 Scores:
       bought     sever     vital       can       dog      food   product  \
0    0.043478  0.043478  0.043478  0.043478  0.043478  0.043478  0.130435   
1    0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.111111   
2    0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
3    0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4    0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
..        ...       ...       ...       ...       ...       ...       ...   
992  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
993  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
994  0.000000  0.000000  0.000000  0.000000  0.000000  0.040000  0.000000   
995  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
996  0.018868  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

        found      good   qualiti  ...  chef  habenero      

In [37]:
# Create DataFrame for TF
tf_df2 = pd.DataFrame(tf_data2).fillna(0)
print("TF df2 Scores:")
print(tf_df2)

TF df2 Scores:
       bought   several  vitality    canned       dog      food   product  \
0    0.043478  0.043478  0.043478  0.043478  0.043478  0.043478  0.130435   
1    0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.111111   
2    0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
3    0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4    0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
..        ...       ...       ...       ...       ...       ...       ...   
992  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
993  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
994  0.000000  0.000000  0.000000  0.000000  0.000000  0.040000  0.000000   
995  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
996  0.018868  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

        found      good   quality  ...  habenero      dude  

In [38]:
# Compute IDF - the number of columns should be the same as you calculate in tf
idf1 = compute_idf(data_lemmatized1)
idf_df1 = pd.DataFrame([idf1]).fillna(0)
print("\nIDF df1 Scores:")
print(idf_df1)


IDF df1 Scores:
      toxic   chedder       air   favorit      base    vendor     limit  \
0  6.211604  6.904751  4.825309  2.427414  4.132162  5.295313  4.602166   

   distinct   heavili       bet  ...    earthi       hid    potent     favor  \
0  5.295313  6.211604  5.806138  ...  6.211604  6.211604  6.904751  4.602166   

      mixer     becam      chop     bewar    arabia       ace  
0  6.904751  4.958841  4.958841  5.518456  6.904751  6.904751  

[1 rows x 4203 columns]


In [39]:
idf2 = compute_idf(data_lemmatized2)
idf_df2 = pd.DataFrame([idf2]).fillna(0)
print("\nIDF df2 Scores:")
print(idf_df2)


IDF df2 Scores:
       serf  restaurant   allowed  labeling  distinct    eludes  overheated  \
0  6.211604    4.419844  5.518456  6.904751  5.806138  6.904751    6.904751   

     target   whacked   hotline  ...    nicely        lo   suspect      tiny  \
0  5.518456  6.904751  6.904751  ...  4.825309  6.904751  5.806138  4.339801   

      buyer    potent  criticism   rapidly    arabia    mousie  
0  5.518456  6.904751   6.904751  6.904751  6.904751  6.904751  

[1 rows x 5133 columns]


In [40]:
# Compute TF-IDF for each document
tfidf_data1 = [compute_tfidf(doc, idf1) for doc in data_lemmatized1]

# Create DataFrame for TF-IDF
tfidf_df1 = pd.DataFrame(tfidf_data1).fillna(0)
print("\nTF-IDF df1 Scores:")
print(tfidf_df1)


TF-IDF df1 Scores:
       bought     sever     vital       can       dog      food   product  \
0    0.107562  0.153802  0.300207  0.200094  0.148184  0.080094  0.221121   
1    0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.188363   
2    0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
3    0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4    0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
..        ...       ...       ...       ...       ...       ...       ...   
992  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
993  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
994  0.000000  0.000000  0.000000  0.000000  0.000000  0.073686  0.000000   
995  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
996  0.046678  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

        found      good   qualiti  ...  chef  habenero 

In [41]:
# Compute TF-IDF for each document
tfidf_data2 = [compute_tfidf(doc, idf2) for doc in data_lemmatized2]

# Create DataFrame for TF-IDF
tfidf_df2 = pd.DataFrame(tfidf_data2).fillna(0)
print("\nTF-IDF df2 Scores:")
print(tfidf_df2)


TF-IDF df2 Scores:
       bought  several  vitality    canned       dog      food   product  \
0    0.107562  0.15855  0.300207  0.222304  0.148184  0.080094  0.221836   
1    0.000000  0.00000  0.000000  0.000000  0.000000  0.000000  0.188972   
2    0.000000  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000   
3    0.000000  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000   
4    0.000000  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000   
..        ...      ...       ...       ...       ...       ...       ...   
992  0.000000  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000   
993  0.000000  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000   
994  0.000000  0.00000  0.000000  0.000000  0.000000  0.073686  0.000000   
995  0.000000  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000   
996  0.046678  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000   

        found      good   quality  ...  habenero      dude   mailbo