In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the review dataset; 'data.csv' is a relative path, so it is resolved
# against the kernel's current working directory.
data=pd.read_csv('data.csv')

In [3]:
# Preview the first rows to see what each column contains.
data.head()

Unnamed: 0,Review,date,Location
0,I was very impressed with the resort.\n Great ...,2019/08/20,Sebastian
1,The rooms were nice the outside needs work als...,2019/08/20,Los Angeles
2,Great location! I have stayed at this hotel on...,2019/08/20,Georgia
3,The hotel was adequate for my stay. The strips...,2019/08/20,
4,"Great location, room was large and spacious. P...",2019/08/19,Palm Harbor


In [4]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(6448, 3)

# The dataset has 6448 rows and 3 columns: Review, date, and Location.

In [5]:
# Number of rows per distinct Location value. Rows with a missing Location do
# not appear in the result shown below.
data.value_counts('Location')

Location
United States of America    116
New York                     34
California                   19
San Jose                     19
Canada                       16
                           ... 
Manhattan                     1
Manhattan, NY                 1
Mansfield, Texas              1
Margate Florida               1
2186                          1
Length: 1082, dtype: int64

# In the Location column, the most frequent values are United States of America (116 reviews), New York (34 reviews), and California and San Jose (19 reviews each).

In [6]:
# Per-column dtype and non-null count, used to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6448 entries, 0 to 6447
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Review    6393 non-null   object
 1   date      6448 non-null   object
 2   Location  1711 non-null   object
dtypes: object(3)
memory usage: 151.2+ KB


In [7]:
# Percentage of missing values in each column (missing count / total rows).
100 * data.isna().sum() / data.shape[0]

Review       0.852978
date         0.000000
Location    73.464640
dtype: float64

# About 73.5% of the values in the Location column are missing.

In [8]:
# Absolute number of missing values in each column.
data.isnull().sum()

Review        55
date           0
Location    4737
dtype: int64

# 55 reviews are missing

In [9]:
# Number of rows per raw date string. NOTE(review): the output shows the column
# mixes month-level values ('Jun 2019') with full dates ('2018/12/01'), so these
# counts are not comparable period-to-period until the column is normalised.
data.value_counts('date')

date
Jun 2019      145
May 2019      121
Jul 2019       92
2018/12/01     66
Oct 2018       58
             ... 
2019/06/14      3
2019/05/31      2
Dec 2018        2
2019/06/06      2
2019/06/08      2
Length: 403, dtype: int64

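# The most frequent date value is Jun 2019 (145 reviews), followed by May 2019 (121) and Jul 2019 (92). Note that the column mixes month-level values (e.g. "Jun 2019") with full dates (e.g. "2018/12/01").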

In [10]:
# Sentiment scoring needs review text, so discard rows whose Review is missing.
# This rebinds `data`: cells above, if re-run afterwards, see the filtered frame.
data = data.dropna(subset=['Review'])

In [11]:
# Re-check missing counts after the drop; Review is expected to be 0 now.
data.isnull().sum()

Review         0
date           0
Location    4688
dtype: int64

# The 55 rows with a missing Review were dropped so that sentiment can be evaluated on every remaining review.

In [12]:
from textblob import TextBlob

In [13]:
def analyze_sentiment(Review):
    """Label a review text as 'Positive', 'Neutral' or 'Negative'.

    The label follows the sign of the TextBlob polarity score of ``Review``:
    a score above zero is 'Positive', a score of exactly zero is 'Neutral',
    and anything else is 'Negative'.
    """
    polarity = TextBlob(Review).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity == 0:
        return 'Neutral'
    return 'Negative'

# Using the TextBlob library to evaluate the sentiment of each review.

In [14]:
# Score every review and store the resulting label in a new 'Sentiment' column.
data['Sentiment'] = data['Review'].apply(analyze_sentiment)
data.head()

Unnamed: 0,Review,date,Location,Sentiment
0,I was very impressed with the resort.\n Great ...,2019/08/20,Sebastian,Positive
1,The rooms were nice the outside needs work als...,2019/08/20,Los Angeles,Positive
2,Great location! I have stayed at this hotel on...,2019/08/20,Georgia,Positive
3,The hotel was adequate for my stay. The strips...,2019/08/20,,Positive
4,"Great location, room was large and spacious. P...",2019/08/19,Palm Harbor,Positive


In [15]:
# Number of reviews carrying each sentiment label.
data.value_counts('Sentiment')

Sentiment
Positive    5616
Negative     744
Neutral       33
dtype: int64

# Here you can see the number of positive (5616), negative (744), and neutral (33) reviews.

In [16]:
# Location frequencies among reviews labelled 'Positive'.
data.loc[data['Sentiment'].eq('Positive'), 'Location'].value_counts()

United States of America    76
New York                    31
California                  18
San Jose                    17
Florida                     13
                            ..
Lake alfred, us              1
Reno, Nevada                 1
Mukilteo, WA                 1
Spokane WA                   1
India                        1
Name: Location, Length: 1001, dtype: int64

# Positive reviews by location (the Location values mix cities, states, and countries).

In [17]:
# Location frequencies among reviews labelled 'Negative'.
data.loc[data['Sentiment'].eq('Negative'), 'Location'].value_counts()

United States of America     22
Florida                       3
New York                      3
United Kingdom                3
Miami                         3
                             ..
Southwick us                  1
Jacksonville fl               1
wGreeashin\n \n Greendale     1
Birmingham, al                1
Trinidad and Tobago           1
Name: Location, Length: 136, dtype: int64

# Negative reviews by location (the Location values mix cities, states, and countries).

In [18]:
# Location frequencies among reviews labelled 'Neutral'.
data.loc[data['Sentiment'].eq('Neutral'), 'Location'].value_counts()

United States of America    18
Canada                       2
France                       1
El Salvador                  1
Puerto Rico                  1
Kuwait                       1
Greece                       1
Name: Location, dtype: int64

# Neutral reviews by location (here every listed value is a country or territory).

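# Among reviews that have a Location, "United States of America" is the most common value for positive, negative, and neutral reviews alike. Most rows have no Location, so this covers only a minority of the reviews.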