**Preprocessing Of Reviews dataset**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
import re
import string
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [None]:
# Load the raw reviews; the preview below shows the columns deceptive, hotel, polarity, source and text.
da = pd.read_csv('deceptive-opinion.csv')

In [None]:
da.head(10)

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,truthful,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...
5,truthful,omni,positive,TripAdvisor,I stayed at the Omni for one night following a...
6,truthful,conrad,positive,TripAdvisor,We stayed in the Conrad for 4 nights just befo...
7,truthful,omni,positive,TripAdvisor,Just got back from 2 days up in Chicago shoppi...
8,truthful,omni,positive,TripAdvisor,We arrived at the Omni on 2nd September for a ...
9,truthful,hyatt,positive,TripAdvisor,"On our visit to Chicago, we chose the Hyatt du..."


In [None]:
# Keep only the label and the review text; hotel, polarity and source are discarded here.
da = da.drop(columns=["hotel", "polarity", "source"])

In [None]:
# Shuffle the rows. A fixed random_state keeps the row order (and therefore every
# displayed output that depends on it) reproducible under Restart & Run All.
da = da.sample(frac=1, random_state=42)

In [None]:
da.head()

Unnamed: 0,deceptive,text
1014,truthful,"I'd been searching for a cool, non-chain hotel..."
73,truthful,Booked this on priceline for an insanely low p...
145,truthful,"I booked this via Priceline, and was not sure ..."
1585,deceptive,Amalfi Hotel in Chicago is one of the worst ho...
157,truthful,I stayed for just one nite at the Sheraton Hot...


In [None]:
from sklearn import preprocessing

# LabelEncoder maps each distinct string label to an integer code.
label_encoder = preprocessing.LabelEncoder()

# Encode the labels in column 'deceptive', replacing the strings with their integer codes.
# Codes follow sorted label order, so 'deceptive' -> 0 and 'truthful' -> 1.
da['deceptive']= label_encoder.fit_transform(da['deceptive'])

da['deceptive'].unique()

array([1, 0])

In [None]:
da.head()

Unnamed: 0,deceptive,text
22,1,I actually booked this reservation with the ho...
936,1,Was one of the worst travel experiences of qui...
758,0,My husband and I visited this hotel on our way...
424,0,The Omni Chicago Hotel was a delight to stay i...
287,1,Stayed here with friends for a long weekend in...


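### Dataset description (label encoding: truthful = 1, deceptive = 0)

Note: the label column is still named `deceptive`, so a value of 1 in that column means the review is *truthful*.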

In [None]:

da.groupby('deceptive').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
deceptive,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,800,800,My husband and I visited this hotel on our way...,1
1,800,796,The Omni was chosen for it's location whichwor...,2


In [None]:
# Ordered (pattern, replacement) pairs applied by clean_text. Order matters:
# contractions must be expanded before the bare apostrophe is blanked out, and
# whitespace is collapsed last.
_CLEAN_TEXT_SUBSTITUTIONS = (
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # NOTE(review): "\0" is an octal escape (NUL), so this matches NUL followed by
    # "s" and can never fire after the first rule has blanked NUL out. It was
    # probably meant to be r"0s"; left unchanged to keep the output stable.
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    (r"\s{2,}", " "),
)


def clean_text(text):
    """Normalise a review into a space-joined string of stemmed, lower-case tokens.

    Steps, in order:
      1. lower-case the text and split it on whitespace;
      2. drop English stop words and tokens shorter than three characters;
      3. expand common contractions and pad or blank punctuation via regexes;
      4. stem every remaining token with the English Snowball stemmer.

    Parameters
    ----------
    text : str
        Review text to clean.

    Returns
    -------
    str
        The cleaned, stemmed tokens joined by single spaces.

    Notes
    -----
    Uses the NLTK ``stopwords`` corpus, which must be available before the
    first call.
    """
    # Punctuation is deliberately NOT stripped up front: the contraction rules
    # below rely on apostrophes still being present. The former
    # ``text.translate(string.punctuation)`` call did not delete anything; it used
    # the punctuation string as a lookup table indexed by code point, which
    # rewrote control characters (e.g. '\t' -> '*', '\n' -> '+') and thereby glued
    # neighbouring words together before the split below.

    ## Convert words to lower case and split them
    tokens = text.lower().split()

    ## Remove stop words and very short tokens
    stops = set(stopwords.words("english"))
    tokens = [w for w in tokens if w not in stops and len(w) >= 3]

    text = " ".join(tokens)

    ## Clean the text: apply every substitution in its defined order
    for pattern, replacement in _CLEAN_TEXT_SUBSTITUTIONS:
        text = re.sub(pattern, replacement, text)

    ## Stem each remaining token
    stemmer = SnowballStemmer('english')
    return " ".join(stemmer.stem(word) for word in text.split())

In [None]:
# Character-level preprocessing shared by all the text classification methods.

# Every character listed here is surrounded with spaces by clean_char.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…',
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─',
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞',
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]


def clean_char(x):
    """Return ``str(x)`` with every character from ``puncts`` padded by a space on each side."""
    padded = str(x)
    for mark in puncts:
        # str.replace leaves the text untouched when the mark is absent.
        padded = padded.replace(mark, ' ' + mark + ' ')
    return padded

In [None]:
def clean_numbers(x):
    """Mask digit runs in ``x`` with '#' characters.

    Runs of five or more digits become '#####'; remaining groups of four,
    three and two digits become '####', '###' and '##'. A lone digit is left
    as it is, and text without any digit is returned unchanged.
    """
    if re.search(r'\d', x) is None:
        return x

    masked = x
    # Longest pattern first so that shorter masks only see what is left over.
    for pattern, mask in (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    ):
        masked = re.sub(pattern, mask, masked)
    return masked

In [None]:
# Mask multi-digit numbers in every review.
da['text'] = da['text'].map(clean_numbers)

In [None]:
# Pad the listed punctuation and special characters with spaces in every review.
da['text'] = da['text'].map(clean_char)

In [None]:
import nltk

# clean_text reads the NLTK stop-word corpus, so fetch it before mapping.
nltk.download('stopwords')

da['text'] = da['text'].map(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
da['text']

Unnamed: 0,text
22,actual book reserv hotel phone got great rate ...
936,one worst travel experi quit time arriv constr...
758,husband visit hotel way home honeymoon nice ho...
424,omni chicago hotel delight stay second step pl...
287,stay friend long weekend august love hotel enj...
...,...
380,stay hotel one night hotel excel compar qualit...
1344,stay knickerbock hotel last time visit chicago...
436,hyatt regenc chicago hotel perfecti locat cent...
1351,stay millennium knickerbock hotel chicago stan...


In [None]:
da.describe()

Unnamed: 0,deceptive
count,1600.0
mean,0.5
std,0.500156
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [None]:
da.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1600 entries, 22 to 863
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   deceptive  1600 non-null   int64 
 1   text       1600 non-null   object
dtypes: int64(1), object(1)
memory usage: 37.5+ KB


In [None]:
# Feature series (cleaned review text) and target series (integer-encoded label).
x = da['text']
y = da['deceptive']

In [None]:
# Write the cleaned frame as CSV without the (shuffled) index column.
# NOTE(review): the output name has no ".csv" extension — confirm downstream readers expect exactly 'preprocessed_data'.
da.to_csv('preprocessed_data', index= False)