<a href="https://colab.research.google.com/github/syedabasmah/deceptive_reviews_detection/blob/main/project/NLP_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Building a Natural Language Processing (NLP) pipeline

**Removing punctuation:**

In [None]:
import string
# Display the characters that `string.punctuation` contains; these are the
# characters remove_punctuation() filters out.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
def remove_punctuation(txt):
  """Return ``txt`` with every character found in ``string.punctuation`` removed.

  Characters are deleted rather than replaced, so the text on either side of
  a punctuation mark is joined together (for example ``I'm`` becomes ``Im``).

  Args:
    txt: The text to clean; iterated character by character.

  Returns:
    A new string made of the characters that were kept, in their original order.
  """
  kept_chars = []
  for ch in txt:
    if ch in string.punctuation:
      continue
    kept_chars.append(ch)
  return "".join(kept_chars)

In [None]:
# Strip punctuation from every review; the function is passed directly, no wrapper needed.
df['text_no_punc'] = df['text'].apply(remove_punctuation)

In [None]:

# Preview the first rows to check the new text_no_punc column against the raw text.
df.head()

Unnamed: 0,text,label,text_no_punc
0,We stayed for a one night getaway with family ...,0,We stayed for a one night getaway with family ...
1,Triple A rate with upgrade to view room was le...,0,Triple A rate with upgrade to view room was le...
2,This comes a little late as I'm finally catchi...,0,This comes a little late as Im finally catchin...
3,The Omni Chicago really delivers on all fronts...,0,The Omni Chicago really delivers on all fronts...
4,I asked for a high floor away from the elevato...,0,I asked for a high floor away from the elevato...


**Tokenization:**

In [None]:
import re #regular expression (built in package)

def tokenize(txt):
  """Split ``txt`` into word tokens.

  A token is a maximal run of regex word characters (letters, digits and
  underscore); any other character acts as a separator. Case is not changed
  here, so lower-casing is left to the caller.

  Unlike a plain ``re.split`` on non-word runs, this never yields empty-string
  tokens: empty input, or text that starts or ends with whitespace, produces
  only real words.

  Args:
    txt: The text to tokenize.

  Returns:
    A list of token strings in order of appearance; empty if ``txt`` has no
    word characters.
  """
  # Raw string on purpose: a backslash-w inside a normal string literal is an
  # invalid escape sequence and triggers a warning on current Python versions.
  return re.findall(r'\w+', txt)

# Lower-case the punctuation-free text, then split each review into tokens.
df['text_tokenized'] = df['text_no_punc'].str.lower().apply(tokenize)

In [None]:
# Preview the first rows to check the new text_tokenized column (lists of lower-cased tokens).
df.head()

Unnamed: 0,text,label,text_no_punc,text_tokenized
0,We stayed for a one night getaway with family ...,0,We stayed for a one night getaway with family ...,"[we, stayed, for, a, one, night, getaway, with..."
1,Triple A rate with upgrade to view room was le...,0,Triple A rate with upgrade to view room was le...,"[triple, a, rate, with, upgrade, to, view, roo..."
2,This comes a little late as I'm finally catchi...,0,This comes a little late as Im finally catchin...,"[this, comes, a, little, late, as, im, finally..."
3,The Omni Chicago really delivers on all fronts...,0,The Omni Chicago really delivers on all fronts...,"[the, omni, chicago, really, delivers, on, all..."
4,I asked for a high floor away from the elevato...,0,I asked for a high floor away from the elevato...,"[i, asked, for, a, high, floor, away, from, th..."


**Removing stop words:**

In [None]:
import nltk
# Fetch the NLTK stopword corpus; the log below shows this is skipped when the
# package is already up to date.
nltk.download('stopwords')
# English stopwords as a plain list. The entries include apostrophe forms such
# as "you're", while the tokens in this notebook are built from
# punctuation-stripped text.
stopwords = nltk.corpus.stopwords.words('english')
# Show the first 20 entries as a sanity check.
stopwords[0:20]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his']

In [None]:
def remove_stopwords(text_tokenized):
  """Return the tokens of ``text_tokenized`` that are not in ``stopwords``.

  Matching is exact: a token is dropped only if it equals an entry of the
  module-level ``stopwords`` list. Tokens in this notebook come from
  punctuation-stripped text, so a contraction such as ``im`` does not match
  an apostrophe form in the list and is kept.

  Args:
    text_tokenized: Iterable of token strings for one document.

  Returns:
    A new list with the surviving tokens in their original order.
  """
  # `stopwords` is a list, so each membership test on it is a linear scan.
  # Building a set once per call makes every token lookup constant-time
  # without changing which tokens are kept.
  stopword_lookup = set(stopwords)
  return [word for word in text_tokenized if word not in stopword_lookup]

# Filter stopwords out of every token list; the function is passed directly.
df['text_no_stopword'] = df['text_tokenized'].apply(remove_stopwords)

In [None]:
# Preview the first rows to compare text_tokenized with the stopword-filtered text_no_stopword.
df.head()

Unnamed: 0,text,label,text_no_punc,text_tokenized,text_no_stopword
0,We stayed for a one night getaway with family ...,0,We stayed for a one night getaway with family ...,"[we, stayed, for, a, one, night, getaway, with...","[stayed, one, night, getaway, family, thursday..."
1,Triple A rate with upgrade to view room was le...,0,Triple A rate with upgrade to view room was le...,"[triple, a, rate, with, upgrade, to, view, roo...","[triple, rate, upgrade, view, room, less, 200,..."
2,This comes a little late as I'm finally catchi...,0,This comes a little late as Im finally catchin...,"[this, comes, a, little, late, as, im, finally...","[comes, little, late, im, finally, catching, r..."
3,The Omni Chicago really delivers on all fronts...,0,The Omni Chicago really delivers on all fronts...,"[the, omni, chicago, really, delivers, on, all...","[omni, chicago, really, delivers, fronts, spac..."
4,I asked for a high floor away from the elevato...,0,I asked for a high floor away from the elevato...,"[i, asked, for, a, high, floor, away, from, th...","[asked, high, floor, away, elevator, got, room..."


**Lemmatization:**

In [None]:
import nltk
# Download the WordNet data before creating the lemmatizer.
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()
# In the visible notebook the stemmer is used only for the stemming-vs-lemmatization
# comparison in the next cell; the pipeline itself uses `wn`.
ps = nltk.PorterStemmer()
# List the lemmatizer's attributes; `lemmatize` is the only non-dunder entry in the output.
dir(wn)

[nltk_data] Downloading package wordnet to /root/nltk_data...


['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'lemmatize']

In [None]:
nltk.download('omw-1.4')
# Compare the stemmer and the lemmatizer on 'goose' and its plural 'geese'.
# Output order: both stems first, then both lemmas, one result per line.
print(*(normalize(word)
        for normalize in (ps.stem, wn.lemmatize)
        for word in ('goose', 'geese')),
      sep='\n')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


goos
gees
goose
goose


In [None]:
def lemmatization(text_no_stopword):
  """Lemmatize every token of one document with the WordNet lemmatizer ``wn``.

  ``wn.lemmatize`` is called with the token only (no ``pos`` argument). In the
  output shown in this notebook, ``delivers`` is left unchanged and ``less``
  becomes ``le``.

  Args:
    text_no_stopword: Iterable of token strings for one document.

  Returns:
    A list with one lemmatized token per input token, in the same order.
  """
  return list(map(wn.lemmatize, text_no_stopword))

# Lemmatize every stopword-filtered token list; the function is passed directly.
df['text_lemmatized'] = df['text_no_stopword'].apply(lemmatization)

In [None]:
# Preview the first rows to compare text_no_stopword with the lemmatized tokens.
df.head()

Unnamed: 0,text,label,text_no_punc,text_tokenized,text_no_stopword,text_lemmatized
0,We stayed for a one night getaway with family ...,0,We stayed for a one night getaway with family ...,"[we, stayed, for, a, one, night, getaway, with...","[stayed, one, night, getaway, family, thursday...","[stayed, one, night, getaway, family, thursday..."
1,Triple A rate with upgrade to view room was le...,0,Triple A rate with upgrade to view room was le...,"[triple, a, rate, with, upgrade, to, view, roo...","[triple, rate, upgrade, view, room, less, 200,...","[triple, rate, upgrade, view, room, le, 200, a..."
2,This comes a little late as I'm finally catchi...,0,This comes a little late as Im finally catchin...,"[this, comes, a, little, late, as, im, finally...","[comes, little, late, im, finally, catching, r...","[come, little, late, im, finally, catching, re..."
3,The Omni Chicago really delivers on all fronts...,0,The Omni Chicago really delivers on all fronts...,"[the, omni, chicago, really, delivers, on, all...","[omni, chicago, really, delivers, fronts, spac...","[omni, chicago, really, delivers, front, spaci..."
4,I asked for a high floor away from the elevato...,0,I asked for a high floor away from the elevato...,"[i, asked, for, a, high, floor, away, from, th...","[asked, high, floor, away, elevator, got, room...","[asked, high, floor, away, elevator, got, room..."


In [None]:
# Remove the raw text and the intermediate pipeline columns in a single in-place
# call, leaving only the label and the final lemmatized tokens.
df.drop(
  columns=['text', 'text_no_punc', 'text_tokenized', 'text_no_stopword'],
  inplace=True,
)
df.head()

Unnamed: 0,label,text_lemmatized
0,0,"[stayed, one, night, getaway, family, thursday..."
1,0,"[triple, rate, upgrade, view, room, le, 200, a..."
2,0,"[come, little, late, im, finally, catching, re..."
3,0,"[omni, chicago, really, delivers, front, spaci..."
4,0,"[asked, high, floor, away, elevator, got, room..."


In [None]:
# Features are the lemmatized token lists; the target is the label column.
X, Y = df['text_lemmatized'], df['label']

In [None]:
# Cast every element to str. text_lemmatized holds Python lists of tokens, so
# each row presumably becomes the list's printed form (e.g. "['stayed', 'one', ...]")
# rather than space-joined text. TODO confirm this is what the downstream step expects.
X = X.astype(str)