In [32]:
!pip install plotly

Collecting plotly
  Downloading plotly-6.0.0-py3-none-any.whl.metadata (5.6 kB)
Collecting narwhals>=1.15.1 (from plotly)
  Downloading narwhals-1.25.0-py3-none-any.whl.metadata (10 kB)
Downloading plotly-6.0.0-py3-none-any.whl (14.8 MB)
   ---------------------------------------- 0.0/14.8 MB ? eta -:--:--
    --------------------------------------- 0.3/14.8 MB ? eta -:--:--
   ------- -------------------------------- 2.6/14.8 MB 10.8 MB/s eta 0:00:02
   ----------------------------------- ---- 13.1/14.8 MB 29.3 MB/s eta 0:00:01
   ---------------------------------------- 14.8/14.8 MB 30.0 MB/s eta 0:00:00
Downloading narwhals-1.25.0-py3-none-any.whl (313 kB)
Installing collected packages: narwhals, plotly
Successfully installed narwhals-1.25.0 plotly-6.0.0


In [2]:
import pandas as pd

# Load the bookings export that sits next to this notebook.
bookings = pd.read_csv(
    './bookings.csv',
)

### Fix column types

In [3]:
# Convert the review columns to numeric dtypes.
#
# `number_of_reviews` is cast to str before the comma stripping so that this
# cell can be re-run safely: once the column is numeric (after a first run, or
# when the CSV contained no thousands separators) the `.str` accessor would
# otherwise raise.
reviews_without_commas = (
    bookings['number_of_reviews']
    .astype(str)
    .str.replace(',', '', regex=False)  # literal comma, no regex needed
)

# errors='coerce' turns every value that cannot be parsed (including the
# stringified missing values) into NaN instead of raising.
bookings['number_of_reviews'] = pd.to_numeric(reviews_without_commas, errors='coerce')
bookings['review_score'] = pd.to_numeric(bookings['review_score'], errors='coerce')

### Reviews Distribution
Group the review counts by hotel name and snapshot date. The `number_of_reviews` value is the same for every row of a given hotel within a given snapshot, so one value per group is enough.

In [4]:
# Collapse the bookings to one row per (hotel name, snapshot date), keeping
# the first review count found in each group.
number_of_reviews = (
    bookings
    .groupby(['name', 'snapshot_date'], as_index=False)
    ['number_of_reviews']
    .first()
)
number_of_reviews

Unnamed: 0,name,snapshot_date,number_of_reviews
0,1 Hotel Central Park,2025-02-02,556.0
1,1 Hotel Central Park,2025-02-03,556.0
2,17John,2025-02-02,209.0
3,17John,2025-02-03,209.0
4,2 Bedroom Cozy Central Apartment,2025-02-02,8.0
...,...,...,...
711,citizenM New York Times Square,2025-02-03,3999.0
712,"voco The Franklin New York, an IHG Hotel",2025-02-02,508.0
713,"voco The Franklin New York, an IHG Hotel",2025-02-03,508.0
714,"voco Times Square South New York, an IHG Hotel",2025-02-02,3803.0


In [5]:
# Distinct snapshot dates, in order of first appearance in `bookings`.
snapshot_dates = pd.unique(bookings['snapshot_date'])

In [37]:
import plotly.graph_objects as go

# Overlay one semi-transparent histogram of review counts per snapshot date.
fig = go.Figure()

for date in snapshot_dates:
    # Review counts of the hotels captured in this snapshot; hotels whose count
    # is missing are left out of the histogram.
    data = number_of_reviews.loc[
        number_of_reviews['snapshot_date'] == date, 'number_of_reviews'
    ].dropna()
    fig.add_trace(
        go.Histogram(
            x=data,
            name=str(date),
            opacity=0.5,
            # Overlaid traces are binned independently by default; a shared
            # bingroup gives every snapshot the same bin edges so the bars are
            # directly comparable. (The obsolete `autobinx` flag is dropped.)
            bingroup='number_of_reviews',
        )
    )

# Layout: axis labels, legend and a larger canvas.
fig.update_layout(
    title="Distribution of Number of Reviews for Each Snapshot Date",
    xaxis_title="Number of Reviews",
    yaxis_title="Frequency",
    barmode="overlay",  # draw the histograms on top of each other
    legend_title="Snapshot Date",
    width=1400,
    height=800,
)

# Render the interactive plot.
fig.show()

In [50]:
# Histogram of review scores per snapshot date (NaN scores are dropped before plotting)

import plotly.express as px

# One review score per (hotel name, snapshot date): the first one found in
# each group.
review_scores = bookings.groupby(['name', 'snapshot_date'], as_index=False)['review_score'].first()

# A single figure holding one histogram trace per snapshot date.
fig = go.Figure()

for date in snapshot_dates:
    # Scores of the hotels captured in this snapshot; missing scores are
    # dropped so they do not reach the histogram.
    data = review_scores.loc[
        review_scores['snapshot_date'] == date, 'review_score'
    ].dropna()
    fig.add_trace(
        go.Histogram(
            x=data,
            name=str(date),
            opacity=0.5,
            # bingroup is a group identifier, not a bin count: traces sharing
            # the same id get compatible bins. A descriptive string replaces
            # the previous bare number.
            bingroup='review_score',
        )
    )

# Layout: axis labels, legend and canvas size.
fig.update_layout(
    title="Distribution of Review Scores for Each Snapshot Date",
    xaxis_title="Review Score",
    yaxis_title="Frequency",
    barmode='overlay',  # draw the histograms on top of each other
    legend_title="Snapshot Date",
    width=1000,
    height=600,
)

# Render the interactive plot.
fig.show()