In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

In [3]:
df = pd.read_csv('../archive/Reviews.csv')

# 1.Data Preprocessing 

## 1.1 Removing Null Entries

In [4]:
df.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

In [5]:
df = df[df['Summary'].notna()]

In [6]:
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


## 1.2 Keeping just Summary and Text

In [7]:
df=df[['Summary','Text']]

In [8]:
df.head(6)

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,"""Delight"" says it all",This is a confection that has been around a fe...
3,Cough Medicine,If you are looking for the secret ingredient i...
4,Great taffy,Great taffy at a great price. There was a wid...
5,Nice Taffy,I got a wild hair for taffy and ordered this f...


### Checking the data types of the row

In [9]:
df.dtypes

Summary    object
Text       object
dtype: object

## 1.3 Cleaning the data

In [10]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [12]:
#

summary = df['Summary']
text = df['Text']

print(text[0])
print(summary[0])

def func(text, flag):
    # convert to lower case
    text = text.apply(lambda x: x.lower())
    
    for i in range(len(text)):
        try:
            sent = text[i]
            words = []
            for j in sent.split(" "):

                if j in contractions:
                    words.append(contractions[j])
                else:
                    words.append(j)
            text[i] = ' '.join(words)
        except Exception:
            pass
        
        
        
    text = text.apply(lambda x: x.replace('[^\w\s]',''))
    text = text.apply(lambda x: re.sub(r'https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE))
    text = text.apply(lambda x:re.sub(r'\<a href', ' ', x))
    text = text.apply(lambda x:re.sub(r'&amp;', '', x))
    text = text.apply(lambda x:re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', x))
    text = text.apply(lambda x:re.sub(r'<br />', ' ', x))
    text = text.apply(lambda x:re.sub(r'\'', ' ', x))
    

    stop = set(stopwords.words('english'))
    
    if flag != 'S':
        text= text.apply(lambda x:' '.join([word for word in x.split(' ') if word not in (stop)]))
        
    
    text = text.apply(lambda x:re.sub("[\(\[].*?[\)\]]", "", x))
    
    return text
    
    

    

text = func(text, 'T')
print(text[0])

summary = func(summary, 'S')
print(summary[0])




I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.
Good Quality Dog Food
bought several vitality canned dog food products found good quality  product looks like stew processed meat smells better  labrador finicky appreciates product better  
good quality dog food


## 1.5 Word embedding

In [13]:
# counting the number of occurences of the words

words = dict()

def count_words(text):
    for sentence in text:
        for curr in sentence.split(' '):
            if curr in words:
                words[curr]+=1
            else:
                words[curr] = 1
        
        
count_words(text)
count_words(summary)

print(len(words))
    
    

132888


## 1.6 Compare with Numberbatch

In [14]:
nb_words = dict()

with open('../numberbatch-en.txt', encoding = 'utf-8') as file:
    for line in file:
        current = line.split(' ')
        nb_words[current[0]] = np.asarray(current[1:], dtype = np.float64 )
        

print(len(nb_words))


        

516783


In [20]:
threshold = 20
missing_words = 0
for word in words:
    if words[word]>threshold and (word not in nb_words):
        missing_words = missing_words+1
print(missing_words)

3846


### Number of missing words

In [22]:
print(" Number of words in the text data = ",len(words))
print(" Number of words in the CN data = ",len(nb_words))
print(" Number of missing words = ",missing_words)

 Number of words in the text data =  132888
 Number of words in the CN data =  516783
 Number of missing words =  3846


In [26]:
##### vocab to integer ###### for vocab to word
count = 0
word_to_int = dict()
for i in words:
    if words[i]>threshold or (i in nb_words):
        word_to_int[i]=count
        count = count+1

        
    

In [30]:
#### tokenising ####
tokens = ['*UNK*','*EOS*','*PAD*','*GO*']
for i in tokens:
    word_to_int[i]=count
    count = count+1
    

In [32]:
#### reverse dictionary #### for int to word
int_to_word = dict()
count = 0
for i in word_to_int:
    int_to_word[count]=i
    count = count+1

In [42]:
##### creating the embedding layer #####
embedding_layer = dict()
embedding_dim = 300
for word in word_to_int:
    if word in nb_words:
        embedding_layer[word] = np.array(nb_words[word])
    else:
        embedding_layer[word] = np.array(np.random.uniform(-1.0,1.0,embedding_dim))
                                         
    

In [57]:
#### EOS wali cheez pending hai ####
### here we have given number to each word in a sentence, for each sentence in a text ###
total_words = 0
total_missing_words = 0
def int_assign_to_sentence(text):
    global total_words
    global total_missing_words
    main_list = list()
    for sentence in text:
        sentence_list = list()
        for word in sentence.split(' '):
            if word in word_to_int:
                sentence_list.append(word_to_int[word])
            else:
                sentence_list.append(word_to_int['*UNK*'])
                total_missing_words = total_missing_words + 1
                
            total_words = total_words+1
        main_list.append(sentence_list)
    return main_list
summary_list = int_assign_to_sentence(summary)

print(total_words,total_missing_words)
print(total_missing_words/total_words *100)

text_list = int_assign_to_sentence(text)

2856054 17081
0.5980629217794902


In [56]:
print(total_words,total_missing_words)
print(total_missing_words/total_words *100)

33815654 193553
0.5723769234213244
