In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
df=pd.read_csv("FakeNews_dataset/tripadvisor_hotel_reviews.csv")

In [3]:
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


### Pre-Processing

In [4]:
df['lower_case'] = df['Review'].str.lower()

In [5]:
df.head()

Unnamed: 0,Review,Rating,lower_case
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not 4* experience hotel monaco seat...
3,"unique, great stay, wonderful time hotel monac...",5,"unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay, went seahawk game aweso..."


In [6]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

df["text_wo_stop"] = df["lower_case"].apply(lambda text: remove_stopwords(text))
df.head()

Unnamed: 0,Review,Rating,lower_case,text_wo_stop
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not 4* experience hotel monaco seat...,nice rooms 4* experience hotel monaco seattle ...
3,"unique, great stay, wonderful time hotel monac...",5,"unique, great stay, wonderful time hotel monac...","unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay, went seahawk game aweso...","great stay great stay, went seahawk game aweso..."


In [7]:
# drop the new column created in last cell
# df.drop(["text_lower"], axis=1, inplace=True)
import string
PUNCT_TO_REMOVE = string.punctuation
print (PUNCT_TO_REMOVE)
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

df["text_wo_punct"] = df["text_wo_stop"].apply(lambda text: remove_punctuation(text))
df.head()

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


Unnamed: 0,Review,Rating,lower_case,text_wo_stop,text_wo_punct
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not 4* experience hotel monaco seat...,nice rooms 4* experience hotel monaco seattle ...,nice rooms 4 experience hotel monaco seattle g...
3,"unique, great stay, wonderful time hotel monac...",5,"unique, great stay, wonderful time hotel monac...","unique, great stay, wonderful time hotel monac...",unique great stay wonderful time hotel monaco ...
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay, went seahawk game aweso...","great stay great stay, went seahawk game aweso...",great stay great stay went seahawk game awesom...


In [8]:
# Sample code to remove a regex pattern 
import re 

def remove_regex(input_text, regex_pattern):
    urls = re.finditer(regex_pattern, input_text) 
    for i in urls: 
        input_text = re.sub(i.group().strip(), '', input_text)
    return input_text

In [9]:
regex_pattern = "[0-9]+"

df["text_wo_punct"] = df["text_wo_punct"].apply(lambda text:remove_regex(text, regex_pattern))

In [10]:
from collections import Counter
cnt = Counter()
for text in df["text_wo_punct"].values:
    for word in text.split():
        cnt[word] += 1
        
cnt.most_common(10)

[('hotel', 48861),
 ('room', 34321),
 ('great', 21094),
 ('nt', 18997),
 ('good', 16985),
 ('staff', 16212),
 ('stay', 15158),
 ('nice', 12409),
 ('rooms', 12024),
 ('location', 11043)]

In [11]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

df["text_lemmatized"] = df["text_wo_punct"].apply(lambda text: lemmatize_words(text))
df.head()

Unnamed: 0,Review,Rating,lower_case,text_wo_stop,text_wo_punct,text_lemmatized
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not 4* experience hotel monaco seat...,nice rooms 4* experience hotel monaco seattle ...,nice rooms experience hotel monaco seattle go...,nice room experience hotel monaco seattle good...
3,"unique, great stay, wonderful time hotel monac...",5,"unique, great stay, wonderful time hotel monac...","unique, great stay, wonderful time hotel monac...",unique great stay wonderful time hotel monaco ...,unique great stay wonderful time hotel monaco ...
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay, went seahawk game aweso...","great stay great stay, went seahawk game aweso...",great stay great stay went seahawk game awesom...,great stay great stay went seahawk game awesom...


In [12]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

df["text_lemmatized"] = df["text_wo_punct"].apply(lambda text: lemmatize_words(text))
df.head()

Unnamed: 0,Review,Rating,lower_case,text_wo_stop,text_wo_punct,text_lemmatized
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...,nice hotel expensive parking get good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not 4* experience hotel monaco seat...,nice rooms 4* experience hotel monaco seattle ...,nice rooms experience hotel monaco seattle go...,nice room experience hotel monaco seattle good...
3,"unique, great stay, wonderful time hotel monac...",5,"unique, great stay, wonderful time hotel monac...","unique, great stay, wonderful time hotel monac...",unique great stay wonderful time hotel monaco ...,unique great stay wonderful time hotel monaco ...
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay, went seahawk game aweso...","great stay great stay, went seahawk game aweso...",great stay great stay went seahawk game awesom...,great stay great stay go seahawk game awesome ...


In [13]:
df.text_lemmatized[0]

'nice hotel expensive parking get good deal stay hotel anniversary arrive late evening take advice previous review valet park check quick easy little disappointed nonexistent view room room clean nice size bed comfortable woke stiff neck high pillow soundproof like heard music room night morning loud bang door open closing hear people talk hallway maybe noisy neighbor aveda bath product nice goldfish stay nice touch take advantage stay long location great walk distance shop overall nice experience pay parking night'

In [14]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def word_token(input_str):
    tokens = word_tokenize(input_str)
    result = [i for i in tokens if not i in stop_words]
    return " ".join(result)

In [15]:
df['text_token']=df['text_lemmatized'].apply(lambda x:word_token(x))

In [16]:
df.head()

Unnamed: 0,Review,Rating,lower_case,text_wo_stop,text_wo_punct,text_lemmatized,text_token
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...,nice hotel expensive parking get good deal sta...,nice hotel expensive parking get good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not 4* experience hotel monaco seat...,nice rooms 4* experience hotel monaco seattle ...,nice rooms experience hotel monaco seattle go...,nice room experience hotel monaco seattle good...,nice room experience hotel monaco seattle good...
3,"unique, great stay, wonderful time hotel monac...",5,"unique, great stay, wonderful time hotel monac...","unique, great stay, wonderful time hotel monac...",unique great stay wonderful time hotel monaco ...,unique great stay wonderful time hotel monaco ...,unique great stay wonderful time hotel monaco ...
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay, went seahawk game aweso...","great stay great stay, went seahawk game aweso...",great stay great stay went seahawk game awesom...,great stay great stay go seahawk game awesome ...,great stay great stay go seahawk game awesome ...


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfv= TfidfVectorizer()
final_X = tfv.fit_transform(df.text_token)
final_X.shape

(20491, 71506)

In [19]:
final_Y = df['Rating'].values

In [24]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(final_X,final_Y,test_size=.25,random_state=4)

In [29]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs',max_iter=2000,random_state=21,tol=.001,)
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.6138981065781769

In [30]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model,final_X,final_Y,cv=5)
print(scores)
print(scores.mean)


[0.61697975 0.60053685 0.5988287  0.60956564 0.60883358]
<built-in method mean of numpy.ndarray object at 0x7fe8160b4df0>


In [31]:
pred=model.predict(x_test)

In [34]:
from sklearn.metrics import confusion_matrix,classification_report
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

[[ 214   93   17   17   22]
 [  78  155   75   94   46]
 [  13   52  109  251   75]
 [   5   21   50  788  681]
 [   3    4    6  375 1879]]
              precision    recall  f1-score   support

           1       0.68      0.59      0.63       363
           2       0.48      0.35      0.40       448
           3       0.42      0.22      0.29       500
           4       0.52      0.51      0.51      1545
           5       0.70      0.83      0.76      2267

    accuracy                           0.61      5123
   macro avg       0.56      0.50      0.52      5123
weighted avg       0.59      0.61      0.60      5123

