**Perform bigram analysis on the text column of the dataset given below
and find the top 20 most frequent bigrams (word pairs).
Dataset: tripadvisor_hotel_reviews.csv**

In [39]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string 
import re
import warnings
warnings.filterwarnings('ignore')

In [40]:
# Read the TripAdvisor reviews CSV into a DataFrame and preview the first rows
trip = pd.read_csv(filepath_or_buffer='/kaggle/input/tripadvisorcsv/Tripadvisor.csv')
trip.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [41]:
# Keep only the review text for the analysis.
# The explicit .copy() makes trip_text an independent frame, so adding new
# columns to it later does not trigger pandas' SettingWithCopyWarning.
trip_text = trip[['Review']].copy()
trip_text.head()

Unnamed: 0,Review
0,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...
3,"unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso..."


In [42]:
# Dimensions of the working frame as (rows, columns)
trip_text.shape

(20491, 1)

In [43]:
# Summary of the working frame: column names, non-null counts, dtypes and memory usage
trip_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
dtypes: object(1)
memory usage: 160.2+ KB


In [44]:
# Lowercase the reviews and replace every character outside a-z (digits,
# punctuation, apostrophes, ...) with a single space.
# regex=True is passed explicitly: in pandas >= 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would leave the text unchanged.
trip_text['clean_text'] = trip_text['Review'].str.lower().str.replace(r'[^a-z]', ' ', regex=True)
trip_text.head()

Unnamed: 0,Review,clean_text
0,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,nice rooms not experience hotel monaco seat...
3,"unique, great stay, wonderful time hotel monac...",unique great stay wonderful time hotel monac...
4,"great stay great stay, went seahawk game aweso...",great stay great stay went seahawk game aweso...


In [45]:
# Load NLTK's English stop-word list.
# nltk.download fetches the 'stopwords' corpus; the recorded output shows it
# reporting the package as already up to date when it is present.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# `stop` is a plain Python list of lowercase stop words (some contain apostrophes)
stop = stopwords.words('english') 
stop

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [46]:
# Remove English stop words (not only conjunctions) from the cleaned review text.
# `stop` is a list, so testing membership against it scans the list for every
# token; a frozenset built once gives constant-time lookups with the same result.
_STOP_SET = frozenset(stop)

def stop_words(x):
    """Return `x` with all stop words removed.

    Parameters
    ----------
    x : str
        Text to filter; it is split on whitespace.

    Returns
    -------
    str
        The tokens of `x` that are not in the stop-word list, in their
        original order, joined by single spaces.
    """
    return ' '.join(word for word in x.split() if word not in _STOP_SET)

# Run the stop-word filter over every cleaned review and preview the result
trip_text['split_text'] = trip_text['clean_text'].map(stop_words)
trip_text.head()

Unnamed: 0,Review,clean_text,split_text
0,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,nice rooms not experience hotel monaco seat...,nice rooms experience hotel monaco seattle goo...
3,"unique, great stay, wonderful time hotel monac...",unique great stay wonderful time hotel monac...,unique great stay wonderful time hotel monaco ...
4,"great stay great stay, went seahawk game aweso...",great stay great stay went seahawk game aweso...,great stay great stay went seahawk game awesom...


In [47]:
# Build a bag-of-bigrams representation of the stop-word-filtered reviews
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(2,2) restricts the vocabulary to two-token sequences (bigrams only)
cnt_vec = CountVectorizer(ngram_range=(2,2))
# Learn the bigram vocabulary and count occurrences per review;
# the result is a sparse matrix with one row per review and one column per bigram
x = cnt_vec.fit_transform(trip_text['split_text'])
x 

<20491x931979 sparse matrix of type '<class 'numpy.int64'>'
	with 1953082 stored elements in Compressed Sparse Row format>

In [48]:
# Wrap the bigram counts in a DataFrame (rows = reviews, columns = bigrams).
# The data is kept sparse on purpose: x.toarray() would materialise every zero
# of the 20491 x 931979 int64 matrix, i.e. well over 100 GB of memory.
DTM = pd.DataFrame.sparse.from_spmatrix(x,
                                        columns=cnt_vec.get_feature_names_out())
DTM

Unnamed: 0,aa arena,aa center,aa flight,aa food,aa great,aa guide,aa remarkable,aa travel,aa want,aaa apprehension,...,zurich stayed,zurich want,zvago say,zwolle netherlands,zyrtec expectorant,zytec allergies,zz near,zzzt power,zzzzt middle,zzzzzs want
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20486,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20487,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20488,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20489,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Total number of occurrences of each bigram across all reviews.
# The column sums are taken directly on the sparse count matrix, which avoids
# reducing ~930k DataFrame columns one at a time (and any dense intermediate).
# np.asarray(...).ravel() flattens the 1 x n_bigrams result into a 1-D vector.
Word_frq_count = pd.Series(np.asarray(x.sum(axis=0)).ravel(),
                           index=cnt_vec.get_feature_names_out())
Word_frq_count

aa arena           1
aa center          4
aa flight          1
aa food            1
aa great           1
                  ..
zytec allergies    1
zz near            1
zzzt power         1
zzzzt middle       1
zzzzzs want        1
Length: 931979, dtype: int64

In [50]:
# Frequency table of bigrams, then the 20 most frequent ones
Word_freq_table = (
    Word_frq_count
    .to_frame()
    .reset_index()
    .rename(columns={"index": "Word's", 0: "Word's Count"})
)
Word_freq_table.sort_values(by="Word's Count", ascending=False).head(20)

Unnamed: 0,Word's,Word's Count
351097,great location,2210
766765,staff friendly,2163
631546,punta cana,1730
387789,hotel great,1491
350860,great hotel,1474
891797,walking distance,1468
323830,friendly helpful,1432
651772,recommend hotel,1360
390383,hotel staff,1269
690090,room service,1246


# THE END..!