In [48]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

In [8]:
# Load the raw reviews export; the path is relative to the notebook's folder
df = pd.read_csv(filepath_or_buffer='../Abstract_Data/Reviews.csv')

# 1. Data Preprocessing

## 1.1 Removing Null Entries

In [9]:
# Count missing values per column (sum down the rows)
df.isnull().sum(axis=0)

Id                         0
ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

In [10]:
# Keep only the reviews that actually have a summary
df = df.dropna(subset=['Summary'])

In [11]:
# Peek at the first five rows after dropping the null summaries
df.head(n=5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


## 1.2 Keeping just Summary and Text

In [12]:
# Only the summary and the review body are needed from here on
df = df.loc[:, ['Summary', 'Text']]

In [18]:
# Show the first six rows of the two retained columns
df.head(n=6)

Unnamed: 0,Summary,Text
0,good quality dog food,i have bought several of the vitality canned d...
1,not as advertised,product arrived labeled as jumbo salted peanut...
2,delight says it all,this is a confection that has been around a fe...
3,cough medicine,if you are looking for the secret ingredient i...
4,great taffy,great taffy at a great price there was a wide...
5,nice taffy,i got a wild hair for taffy and ordered this f...


### Checking the data types of the columns

In [14]:
# Inspect the dtype of each remaining column (both are reported as `object`)
df.dtypes

Summary    object
Text       object
dtype: object

## 1.3 Changing to lower case and expanding contractions

In [23]:
# Lookup table used to expand English contractions (keys are lower-case and
# use a straight apostrophe) into their spelled-out form.
# "would've" and "you've" were missing even though their siblings
# ("could've", "should've", "i've", "we've", "they've") were present.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
}

In [None]:
# Pull the two columns out as standalone Series for cleaning
summary, text = df['Summary'], df['Text']

def func(text, flag, contraction_map=None, stop_words=None):
    """Lower-case and clean a Series of strings, returning a new Series.

    Steps, in order: lower-case, expand contractions token by token, strip
    URLs and HTML remnants, drop bracketed asides, replace punctuation with
    spaces and (unless ``flag == 'S'``) remove stop words.

    Parameters
    ----------
    text : pandas.Series
        Strings to clean. Missing values are treated as empty strings.
    flag : str
        ``'S'`` keeps stop words; any other value removes them.
    contraction_map : dict, optional
        Contraction -> expansion lookup. Defaults to the module-level
        ``contractions`` dict.
    stop_words : iterable of str, optional
        Words to drop when ``flag != 'S'``. Defaults to
        ``stopwords.words('english')`` and is only loaded when needed.

    Returns
    -------
    pandas.Series
        Cleaned strings on the same index as ``text``. The input Series is
        left untouched, so the caller has to keep the returned value.
    """
    if contraction_map is None:
        contraction_map = contractions

    def _clean(sent):
        # Expand contractions on space-separated tokens (exact matches only)
        sent = ' '.join(contraction_map.get(word, word) for word in sent.split(' '))
        # Removes everything from a URL to the end of its line
        sent = re.sub(r'https?:\/\/.*[\r\n]*', '', sent, flags=re.MULTILINE)
        sent = re.sub(r'\<a href', ' ', sent)
        sent = re.sub(r'&amp;', '', sent)
        # These two must run BEFORE the punctuation pass below: that pass
        # replaces '/', '(', ')', '[' and ']' with spaces, after which
        # neither pattern could ever match.
        sent = re.sub(r'<br />', ' ', sent)
        sent = re.sub(r"[\(\[].*?[\)\]]", "", sent)
        # Punctuation -> space. (A former `x.replace('[^\w\s]', '')` was a
        # literal str.replace that never matched and has been dropped.)
        sent = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', sent)
        sent = re.sub(r'\'', ' ', sent)
        return sent

    # convert to lower case; fillna/astype guard against non-string cells
    text = text.fillna('').astype(str).str.lower()

    # Apply element-wise instead of indexing with range(len(text)): the
    # Series index is label-based and may have gaps after rows were dropped.
    text = text.apply(_clean)

    if flag != 'S':
        if stop_words is None:
            stop_words = stopwords.words('english')
        # Set membership is O(1) per word, unlike scanning the list
        stop_set = set(stop_words)
        text = text.apply(
            lambda x: ' '.join(word for word in x.split(' ') if word not in stop_set)
        )

    return text
    
    

    
cleaned_text = func(text, 'T')

# Keep the cleaned result instead of discarding it; skip the write when
# func returned nothing so the column is never overwritten with None.
if cleaned_text is not None:
    df = df.assign(Text=cleaned_text)

# Summaries can be cleaned the same way while keeping their stop words
# (func takes two arguments, so the flag is required):
# cleaned_summary = func(summary, 'S')


## 1.4 Removing Punctuation

Unnamed: 0,Summary,Text
0,good quality dog food,i have bought several of the vitality canned d...
1,not as advertised,product arrived labeled as jumbo salted peanut...
2,delight says it all,this is a confection that has been around a fe...
3,cough medicine,if you are looking for the secret ingredient i...
4,great taffy,great taffy at a great price there was a wide...
...,...,...
568449,will not do without,great for sesame chickenthis is a good if not ...
568450,disappointed,im disappointed with the flavor the chocolate ...
568451,perfect for our maltipoo,these stars are small so you can give 1015 of ...
568452,favorite training and reward treat,these are the best treats for training and rew...


## 1.5 Removing Stopwords

In [None]:
# Preview the first five rows at this stage of the pipeline
df.head(n=5)

## 1.6 Converting Emoji and Emoticons to words

In [None]:
# Converting emojis to words
def convert_emojis(text, emoji_map=None):
    """Replace every emoji in ``text`` with an underscore-joined description.

    Parameters
    ----------
    text : str
        String to process.
    emoji_map : dict, optional
        Emoji -> description lookup. Commas and colons are stripped from
        each description and its words are joined with ``_``. Defaults to
        ``UNICODE_EMO``.

    Returns
    -------
    str
        ``text`` with all mapped emojis substituted.
    """
    if emoji_map is None:
        emoji_map = UNICODE_EMO
    for emot, description in emoji_map.items():
        text = text.replace(emot, "_".join(description.replace(",", "").replace(":", "").split()))
    # Return only after the whole map was applied; returning inside the loop
    # stopped after the very first emoji.
    return text
# Converting emoticons to words
def convert_emoticons(text, emoticon_map=None):
    """Replace every emoticon in ``text`` with an underscore-joined description.

    Parameters
    ----------
    text : str
        String to process.
    emoticon_map : dict, optional
        Regex pattern -> description lookup. Keys are used as regular
        expressions as-is; commas are stripped from each description and its
        words are joined with ``_``. Defaults to ``EMOTICONS``.

    Returns
    -------
    str
        ``text`` with all matching emoticons substituted.
    """
    if emoticon_map is None:
        emoticon_map = EMOTICONS
    for emot, description in emoticon_map.items():
        replacement = "_".join(description.replace(",", "").split())
        # A callable replacement is inserted verbatim, so a backslash in a
        # description can never be misread as a group reference.
        text = re.sub(u'(' + emot + ')', lambda _match, word=replacement: word, text)
    # Return only after the whole map was applied; returning inside the loop
    # stopped after the very first pattern.
    return text
# Run both converters over each text column, emojis first, then emoticons.
# NOTE(review): the cleaning function in section 1.3 replaces ':', '(', ')'
# and similar characters with spaces, so text cleaned there may no longer
# contain emoticons - confirm the intended order of these steps.
for column in ('Summary', 'Text'):
    df[column] = df[column].apply(convert_emojis)
    df[column] = df[column].apply(convert_emoticons)

## 1.7 Stemming the words

## 1.8 Spelling Correction

In [None]:
# df.to_csv('new.csv')