In [None]:
# 📦 Step 1: Import libraries
import pandas as pd

# 📥 Step 2: Load the Disneyland Reviews dataset, keeping every row
# (on_bad_lines is deliberately not set to 'skip', so no line is silently dropped)
file_path = "/content/DisneylandReviews.csv"  # Update path if needed
csv_options = {"encoding": "ISO-8859-1", "quotechar": '"'}
df = pd.read_csv(file_path, **csv_options)

# 🧾 Report the raw (rows, columns) and preview the first records
print("Original shape:", df.shape)
df.head()


Original shape: (42656, 6)


Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [None]:
# 🗓️ Step 3: Parse 'Year_Month' into datetimes on a copy of the raw frame.
# Values that do not match '%Y-%m' become NaT (errors='coerce'); the day is taken as 1.
parsed_months = pd.to_datetime(df['Year_Month'], format='%Y-%m', errors='coerce')
df_date = df.assign(Year_Month=parsed_months)
df_date.head()


Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-04-01,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-05-01,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-04-01,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-04-01,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-04-01,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [None]:
# 🔤 Step 4: Add Word_Count = number of whitespace-separated tokens in Review_Text
def _count_words(text):
    """Return how many whitespace-delimited tokens ``text`` has once coerced to str."""
    return len(str(text).split())


df_date['Word_Count'] = df_date['Review_Text'].map(_count_words)
df_date.head()


Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Word_Count
0,670772142,4,2019-04-01,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,59
1,670682799,4,2019-05-01,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,171
2,670623270,4,2019-04-01,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,169
3,670607911,4,2019-04-01,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,91
4,670607296,4,2019-04-01,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,31


In [None]:
# 😊 Step 5: Sentiment Analysis using TextBlob
!pip install -q textblob
from textblob import TextBlob

def _polarity(text):
    """Return the TextBlob sentiment polarity of ``text`` (coerced to str first)."""
    return TextBlob(str(text)).sentiment.polarity


# One polarity score per review
df_date['Sentiment_Score'] = df_date['Review_Text'].map(_polarity)
df_date.head()


Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Word_Count,Sentiment_Score
0,670772142,4,2019-04-01,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,59,0.243981
1,670682799,4,2019-05-01,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,171,0.236131
2,670623270,4,2019-04-01,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,169,0.160498
3,670607911,4,2019-04-01,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,91,0.189286
4,670607296,4,2019-04-01,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,31,0.266667


In [None]:
# 🏙️ Step 6: Extract Branch_Location and Branch_Country
df_branch = df_date.copy()

# Location is the text after the last underscore ("Disneyland_HongKong" -> "HongKong").
# The vectorized .str accessor passes missing Branch values through as NaN, whereas a
# per-row lambda calling x.split('_') would raise AttributeError on them.
df_branch['Branch_Location'] = df_branch['Branch'].str.rsplit('_', n=1).str[-1]

# Map to countries
location_to_country = {
    'California': 'United States',
    'Paris': 'France',
    'HongKong': 'China'
}
# Locations that are not keys of the mapping become NaN in Branch_Country.
df_branch['Branch_Country'] = df_branch['Branch_Location'].map(location_to_country)

# Preview unique values
df_branch[['Branch', 'Branch_Location', 'Branch_Country']].drop_duplicates()


Unnamed: 0,Branch,Branch_Location,Branch_Country
0,Disneyland_HongKong,HongKong,China
9620,Disneyland_California,California,United States
29026,Disneyland_Paris,Paris,France


In [None]:
# 🧹 Step 7: Keep only the rows whose Year_Month is present (i.e. not NaT)
has_month = df_branch['Year_Month'].notna()
df_cleaned = df_branch[has_month]

# ✅ Shape after the filter, plus a preview
print("Shape after dropping missing Year_Month:", df_cleaned.shape)
df_cleaned.head()


Shape after dropping missing Year_Month: (40043, 10)


Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Word_Count,Sentiment_Score,Branch_Location,Branch_Country
0,670772142,4,2019-04-01,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,59,0.243981,HongKong,China
1,670682799,4,2019-05-01,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,171,0.236131,HongKong,China
2,670623270,4,2019-04-01,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,169,0.160498,HongKong,China
3,670607911,4,2019-04-01,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,91,0.189286,HongKong,China
4,670607296,4,2019-04-01,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,31,0.266667,HongKong,China


In [None]:
# Print the (rows, columns) of each intermediate frame, one line per frame
print("📊 DATAFRAME SHAPES")
print("--------------------")
for label, frame in (
    ("df shape           :", df),          # Original loaded dataset
    ("df_date shape      :", df_date),     # After datetime conversion and feature extraction
    ("df_branch shape    :", df_branch),   # After extracting branch location/country
    ("df_cleaned shape   :", df_cleaned),  # After dropping missing Year_Month rows
):
    print(label, frame.shape)



📊 DATAFRAME SHAPES
--------------------
df shape           : (42656, 6)
df_date shape      : (42656, 8)
df_branch shape    : (42656, 10)
df_cleaned shape   : (40043, 10)


In [None]:
# 🔍 Null count for every column
print("🔎 Missing Values:")
null_counts = df_cleaned.isnull().sum()
print(null_counts)
print("\n")

# 🧯 Rows that exactly repeat an earlier row
duplicate_count = df_cleaned.duplicated().sum()
print(f"🧯 Duplicate Rows: {duplicate_count}\n")

# 🚫 For each object column, count string values that change when stripped
print("🚫 Whitespace Issues in Object Columns:")
object_columns = df_cleaned.select_dtypes(include='object').columns
for col in object_columns:
    padded = sum(
        isinstance(value, str) and value != value.strip()
        for value in df_cleaned[col]
    )
    print(f"{col}: {padded} entries with extra whitespace")

# 🌍 Most frequent Reviewer_Location values (to spot typos/abbreviations)
print("\n🌍 Top 20 Reviewer_Location entries:")
location_counts = df_cleaned['Reviewer_Location'].value_counts()
print(location_counts.head(20))


🔎 Missing Values:
Review_ID            0
Rating               0
Year_Month           0
Reviewer_Location    0
Review_Text          0
Branch               0
Word_Count           0
Sentiment_Score      0
Branch_Location      0
Branch_Country       0
dtype: int64


🧯 Duplicate Rows: 12

🚫 Whitespace Issues in Object Columns:
Reviewer_Location: 0 entries with extra whitespace
Review_Text: 5607 entries with extra whitespace
Branch: 0 entries with extra whitespace
Branch_Location: 0 entries with extra whitespace
Branch_Country: 0 entries with extra whitespace

🌍 Top 20 Reviewer_Location entries:
Reviewer_Location
United States           13522
United Kingdom           9115
Australia                4412
Canada                   2116
India                    1470
Philippines              1024
Singapore                 971
New Zealand               714
Malaysia                  562
Hong Kong                 515
Indonesia                 511
Ireland                   456
United Arab Emirates     

checking duplicate rows

In [None]:
# Keep the first occurrence of each row; rows identical to it in every column are removed
is_repeat = df_cleaned.duplicated()
df_cleaned = df_cleaned[~is_repeat]

# Report the shape after the removal
print("New shape after removing full duplicates:", df_cleaned.shape)


New shape after removing full duplicates: (40031, 10)


In [None]:
# Rebuild df_cleaned from df_branch, discarding only the rows whose Year_Month is missing.
# Duplicate rows are present again after this cell.
month_present = df_branch['Year_Month'].notna()
df_cleaned = df_branch.loc[month_present]

# Report the shape of the rebuilt frame
print("Restored df_cleaned shape:", df_cleaned.shape)


Restored df_cleaned shape: (40043, 10)


In [None]:
# 🔍 Collect every row that has an exact twin (keep=False marks all copies, not only the repeats)
in_duplicate_group = df_cleaned.duplicated(keep=False)
duplicate_rows = df_cleaned.loc[in_duplicate_group]

# 👀 How many rows are involved
print(f"Total fully duplicated rows: {len(duplicate_rows)}")

# 🧾 Display them
duplicate_rows


Total fully duplicated rows: 24


Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Word_Count,Sentiment_Score,Branch_Location,Branch_Country
7939,164862064,5,2013-06-01,Singapore,Great atmosphere... A place for everyone in th...,Disneyland_HongKong,19,0.8,HongKong,China
7949,164862064,5,2013-06-01,Singapore,Great atmosphere... A place for everyone in th...,Disneyland_HongKong,19,0.8,HongKong,China
8814,129231609,5,2012-04-01,United States,Let me just start off by saying that although ...,Disneyland_HongKong,45,0.37,HongKong,China
8815,129214104,3,2012-04-01,Malaysia,Like: Back to the time with Disney characters....,Disneyland_HongKong,45,0.0625,HongKong,China
8816,129207323,5,2011-09-01,Australia,Having never been to any Disneyland I was thri...,Disneyland_HongKong,45,0.0625,HongKong,China
8823,129231609,5,2012-04-01,United States,Let me just start off by saying that although ...,Disneyland_HongKong,45,0.37,HongKong,China
8824,129214104,3,2012-04-01,Malaysia,Like: Back to the time with Disney characters....,Disneyland_HongKong,45,0.0625,HongKong,China
8825,129207323,5,2011-09-01,Australia,Having never been to any Disneyland I was thri...,Disneyland_HongKong,45,0.0625,HongKong,China
9174,121586148,4,2011-05-01,Australia,Only a single day adventure. Not as good as d...,Disneyland_HongKong,22,0.265476,HongKong,China
9175,121580686,4,2011-03-01,United States,"Very small, they are expanding but at a slow p...",Disneyland_HongKong,21,-0.093571,HongKong,China


In [None]:
# 🧹 Drop only exact duplicates from df_cleaned (the first occurrence of each row is kept).
# .copy() makes df_cleaned an independent frame, so the column assignments made on it
# afterwards write to it directly and do not trigger pandas' SettingWithCopyWarning.
df_cleaned = df_cleaned.drop_duplicates().copy()

# ✅ Confirm the shape after dropping
print("New shape after dropping full duplicates:", df_cleaned.shape)


New shape after dropping full duplicates: (40031, 10)


Cleaning leading/trailing whitespace in Review_Text

In [None]:
# 🔍 Select rows whose Review_Text differs from its stripped form
def _has_outer_whitespace(value):
    """True when ``value`` (as str) starts or ends with whitespace."""
    text = str(value)
    return text != text.strip()


whitespace_rows = df_cleaned[df_cleaned['Review_Text'].map(_has_outer_whitespace)]

# 👀 How many rows are affected
print(f"Total rows with leading/trailing whitespace in Review_Text: {len(whitespace_rows)}")

# 🧾 Show the affected reviews
whitespace_rows[['Review_ID', 'Review_Text']]


Total rows with leading/trailing whitespace in Review_Text: 5607


Unnamed: 0,Review_ID,Review_Text
0,670772142,If you've ever been to Disneyland anywhere you...
8,670571027,"Feel so let down with this place,the Disneylan..."
10,670443403,Disneyland never cease to amaze me! I've been ...
14,670274554,This place is HUGE! Definately need more than ...
16,670199487,"Its huge , not enough to visit in one day. We ..."
...,...,...
41700,120778281,My son age 5 had a great time. He often was po...
41743,120009253,Excellent place for kids and grownups alike. O...
41855,117883312,"The Disney part it's great, the French organi..."
41978,115254578,We went to euro Disney while on a family holid...


In [None]:
# 🧼 Coerce Review_Text to str and remove leading/trailing whitespace
trimmed_text = df_cleaned['Review_Text'].astype(str).str.strip()
df_cleaned['Review_Text'] = trimmed_text

# 🔍 Re-run the check on the stored column to confirm nothing is left to strip
stored_text = df_cleaned['Review_Text']
remaining_ws = stored_text.ne(stored_text.str.strip()).sum()
print(f"Remaining entries with extra whitespace: {remaining_ws}")


Remaining entries with extra whitespace: 0


In [None]:
# 🔍 Repeat the whitespace scan: rows whose Review_Text (as str) changes when stripped
review_text = df_cleaned['Review_Text'].map(str)
needs_strip = review_text != review_text.str.strip()
whitespace_rows = df_cleaned[needs_strip]

# 👀 How many rows are affected
print(f"Total rows with leading/trailing whitespace in Review_Text: {len(whitespace_rows)}")

# 🧾 Show the affected reviews
whitespace_rows[['Review_ID', 'Review_Text']]

Total rows with leading/trailing whitespace in Review_Text: 0


Unnamed: 0,Review_ID,Review_Text


whitespace issue resolved

Adding a character-count column (Review_Length) computed from Review_Text

In [None]:
# Review_Length = number of characters in Review_Text (each value coerced to str first)
df_cleaned['Review_Length'] = df_cleaned['Review_Text'].map(str).map(len)

# Compare the two size measures side by side
size_preview = ['Review_Text', 'Word_Count', 'Review_Length']
df_cleaned[size_preview].head()


Unnamed: 0,Review_Text,Word_Count,Review_Length
0,If you've ever been to Disneyland anywhere you...,59,328
1,Its been a while since d last time we visit HK...,171,970
2,Thanks God it wasn t too hot or too humid wh...,169,938
3,HK Disneyland is a great compact park. Unfortu...,91,485
4,"the location is not in the city, took around 1...",31,163


In [None]:
# Print the (rows, columns) of each frame again, now that df_cleaned has been reworked
print("📊 DATAFRAME SHAPES")
print("--------------------")
for label, frame in (
    ("df shape           :", df),          # Original loaded dataset
    ("df_date shape      :", df_date),     # After datetime conversion and feature extraction
    ("df_branch shape    :", df_branch),   # After extracting branch location/country
    ("df_cleaned shape   :", df_cleaned),  # Working frame after the cleaning cells above
):
    print(label, frame.shape)


📊 DATAFRAME SHAPES
--------------------
df shape           : (42656, 6)
df_date shape      : (42656, 8)
df_branch shape    : (42656, 10)
df_cleaned shape   : (40031, 11)


In [None]:
# Show df_cleaned as the cell output to inspect the frame at this point in the notebook
df_cleaned

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Word_Count,Sentiment_Score,Branch_Location,Branch_Country,Review_Length
0,670772142,4,2019-04-01,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,59,0.243981,HongKong,China,328
1,670682799,4,2019-05-01,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,171,0.236131,HongKong,China,970
2,670623270,4,2019-04-01,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,169,0.160498,HongKong,China,938
3,670607911,4,2019-04-01,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,91,0.189286,HongKong,China,485
4,670607296,4,2019-04-01,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,31,0.266667,HongKong,China,163
...,...,...,...,...,...,...,...,...,...,...,...
42113,92198076,4,2011-01-01,United Kingdom,Although our pick up was prompt the taxi drive...,Disneyland_Paris,316,0.042000,Paris,France,1574
42114,92061774,4,2011-01-01,Germany,Just returned from a 4 days family trip to Dis...,Disneyland_Paris,647,0.198286,Paris,France,3593
42115,91995748,1,2010-12-01,United Kingdom,We spent the 20 Dec 2010 in the Disney park an...,Disneyland_Paris,440,0.020628,Paris,France,2537
42116,91984642,2,2010-12-01,United Kingdom,Well I was really looking forward to this trip...,Disneyland_Paris,314,0.108065,Paris,France,1758


In [None]:
# Write df_cleaned to CSV without the row index
export_path = "Disneyland_Reviews_Cleaned.csv"
df_cleaned.to_csv(export_path, index=False)

# Hand the written file to the browser (Colab only)
from google.colab import files
files.download(export_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

adding sentiment category

In [None]:
# Categorize sentiment scores into Positive, Neutral, Negative
def classify_sentiment(score, positive_threshold=0.1, negative_threshold=-0.1):
    """Bucket a polarity score into 'Positive', 'Negative' or 'Neutral'.

    Args:
        score: Numeric polarity score.
        positive_threshold: Scores strictly greater than this are 'Positive'.
        negative_threshold: Scores strictly less than this are 'Negative'.

    Returns:
        'Positive', 'Negative', or 'Neutral'. Scores equal to either threshold,
        or anywhere between them, are 'Neutral'. NaN is also 'Neutral' because
        both comparisons evaluate to False for NaN.
    """
    if score > positive_threshold:
        return 'Positive'
    if score < negative_threshold:
        return 'Negative'
    return 'Neutral'

# Label every review by passing its polarity score through classify_sentiment
df_cleaned['Sentiment_Category'] = df_cleaned['Sentiment_Score'].map(classify_sentiment)

# Reviews per category, followed by a few labelled examples
category_counts = df_cleaned['Sentiment_Category'].value_counts()
print(category_counts)
sentiment_preview = ['Review_Text', 'Sentiment_Score', 'Sentiment_Category']
df_cleaned[sentiment_preview].head()


Sentiment_Category
Positive    30573
Neutral      8169
Negative     1289
Name: count, dtype: int64


Unnamed: 0,Review_Text,Sentiment_Score,Sentiment_Category
0,If you've ever been to Disneyland anywhere you...,0.243981,Positive
1,Its been a while since d last time we visit HK...,0.236131,Positive
2,Thanks God it wasn t too hot or too humid wh...,0.160498,Positive
3,HK Disneyland is a great compact park. Unfortu...,0.189286,Positive
4,"the location is not in the city, took around 1...",0.266667,Positive


In [None]:
# Checkpoint: print df_cleaned's (rows, columns) after adding Sentiment_Category
print("df_cleaned shape   :", df_cleaned.shape)

df_cleaned shape   : (40031, 12)


Adding a Review_Subjectivity column

In [None]:
from textblob import TextBlob

def _subjectivity(text):
    """Return the TextBlob sentiment subjectivity of ``text`` (coerced to str first)."""
    return TextBlob(str(text)).sentiment.subjectivity


# One subjectivity score per review
df_cleaned['Review_Subjectivity'] = df_cleaned['Review_Text'].map(_subjectivity)

# Show the new column next to the text it was computed from
df_cleaned[['Review_Text', 'Review_Subjectivity']].head()


Unnamed: 0,Review_Text,Review_Subjectivity
0,If you've ever been to Disneyland anywhere you...,0.561481
1,Its been a while since d last time we visit HK...,0.434649
2,Thanks God it wasn t too hot or too humid wh...,0.422944
3,HK Disneyland is a great compact park. Unfortu...,0.512143
4,"the location is not in the city, took around 1...",0.4375


In [None]:
# Checkpoint: print df_cleaned's (rows, columns) after adding Review_Subjectivity
print("df_cleaned shape   :", df_cleaned.shape)

df_cleaned shape   : (40031, 13)


In [None]:
# Write the current df_cleaned to a second CSV, without the row index
export_path = "Disneyland_Reviews_Cleaned2.csv"
df_cleaned.to_csv(export_path, index=False)

# Hand the written file to the browser (Colab only)
from google.colab import files
files.download(export_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Helper that flags whether a text mentions any of the given keywords
def keyword_flag(text, keywords):
    """Return 1 if any keyword is a substring of the lower-cased text, else 0.

    ``text`` is coerced to str and lower-cased; the keywords are used as given,
    so they should already be lower-case. Matching is plain substring
    containment, not whole-word matching.
    """
    lowered = str(text).lower()
    for term in keywords:
        if term in lowered:
            return 1
    return 0

# Flag column name -> substrings whose presence in a review sets that flag to 1
keywords = dict(
    Mentions_Staff=['staff', 'employee', 'service'],
    Mentions_Crowd=['crowd', 'crowded', 'busy'],
    Mentions_Cost=['price', 'expensive', 'cheap', 'money'],
    Mentions_Cleanliness=['clean', 'dirty', 'hygiene'],
)

# Build one 0/1 column per entry; `terms` is bound as a default so each lambda keeps its own list
for flag_column, terms in keywords.items():
    df_cleaned[flag_column] = df_cleaned['Review_Text'].map(
        lambda text, terms=terms: keyword_flag(text, terms)
    )

# ✅ Show the text alongside the new flag columns
flag_columns = list(keywords)
df_cleaned[['Review_Text'] + flag_columns].head()


Unnamed: 0,Review_Text,Mentions_Staff,Mentions_Crowd,Mentions_Cost,Mentions_Cleanliness
0,If you've ever been to Disneyland anywhere you...,0,1,0,0
1,Its been a while since d last time we visit HK...,1,0,0,0
2,Thanks God it wasn t too hot or too humid wh...,0,1,0,0
3,HK Disneyland is a great compact park. Unfortu...,0,1,1,0
4,"the location is not in the city, took around 1...",0,1,0,0


In [None]:
# Checkpoint: print df_cleaned's (rows, columns) after adding the four Mentions_* flags
print("df_cleaned shape   :", df_cleaned.shape)

df_cleaned shape   : (40031, 17)


adding polarity extremes

In [None]:
# Create a new column for extreme sentiment polarity
def flag_extreme(score, threshold=0.9):
    """Label a polarity score as an extreme, or 'None' when it is not.

    Args:
        score: Numeric polarity score.
        threshold: Magnitude at or beyond which a score counts as extreme.

    Returns:
        'Positive_Extreme' when score >= threshold, 'Negative_Extreme' when
        score <= -threshold, otherwise the string 'None' (a str, not the
        None object). NaN yields 'None' because both comparisons are False.
    """
    if score >= threshold:
        return 'Positive_Extreme'
    if score <= -threshold:
        return 'Negative_Extreme'
    return 'None'

# Tag each review's polarity score with flag_extreme
df_cleaned['Polarity_Extreme'] = df_cleaned['Sentiment_Score'].map(flag_extreme)

# ✅ Distribution of the tags, then a few examples
extreme_counts = df_cleaned['Polarity_Extreme'].value_counts()
print(extreme_counts)
extreme_preview = ['Review_Text', 'Sentiment_Score', 'Polarity_Extreme']
df_cleaned[extreme_preview].head()


Polarity_Extreme
None                39907
Positive_Extreme      120
Negative_Extreme        4
Name: count, dtype: int64


Unnamed: 0,Review_Text,Sentiment_Score,Polarity_Extreme
0,If you've ever been to Disneyland anywhere you...,0.243981,
1,Its been a while since d last time we visit HK...,0.236131,
2,Thanks God it wasn t too hot or too humid wh...,0.160498,
3,HK Disneyland is a great compact park. Unfortu...,0.189286,
4,"the location is not in the city, took around 1...",0.266667,


In [None]:
# Checkpoint: print df_cleaned's (rows, columns) after adding Polarity_Extreme
print("df_cleaned shape   :", df_cleaned.shape)

df_cleaned shape   : (40031, 18)


continent from country

In [None]:
!pip install -q pycountry_convert


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.0/244.0 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pycountry_convert as pc

def get_continent(country_name):
    """Return the continent name for ``country_name``, or 'Unknown' if it cannot be resolved.

    The lookup goes country name -> alpha-2 code -> continent code -> continent
    name using pycountry_convert (imported as ``pc``).

    Args:
        country_name: Country name as text. Non-string values (None, NaN, ...)
            are treated as unresolvable.

    Returns:
        The continent name, or 'Unknown' when the input is not a string or a
        lookup step rejects it.
    """
    # Missing values and other non-text inputs cannot be looked up.
    if not isinstance(country_name, str):
        return 'Unknown'
    try:
        country_code = pc.country_name_to_country_alpha2(country_name)
        continent_code = pc.country_alpha2_to_continent_code(country_code)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except (KeyError, ValueError, TypeError):
        # Only lookup failures map to 'Unknown'; a bare `except:` would also hide
        # programming errors (e.g. NameError) and KeyboardInterrupt.
        # NOTE(review): pycountry_convert is expected to raise KeyError for unknown
        # names/codes; ValueError/TypeError are caught defensively — confirm.
        return 'Unknown'

# Resolve each branch's country to a continent name via get_continent
df_cleaned['Continent'] = df_cleaned['Branch_Country'].map(get_continent)

# Show each distinct (country, continent) pair once
country_continent = df_cleaned[['Branch_Country', 'Continent']]
country_continent.drop_duplicates()


Unnamed: 0,Branch_Country,Continent
0,China,Asia
9620,United States,North America
29026,France,Europe


In [None]:
# Checkpoint: print df_cleaned's (rows, columns) after adding Continent
print("df_cleaned shape   :", df_cleaned.shape)

df_cleaned shape   : (40031, 19)


In [None]:
# Show df_cleaned as the cell output to inspect all columns added so far
df_cleaned

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Word_Count,Sentiment_Score,Branch_Location,Branch_Country,Review_Length,Sentiment_Category,Review_Subjectivity,Mentions_Staff,Mentions_Crowd,Mentions_Cost,Mentions_Cleanliness,Polarity_Extreme,Continent
0,670772142,4,2019-04-01,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,59,0.243981,HongKong,China,328,Positive,0.561481,0,1,0,0,,Asia
1,670682799,4,2019-05-01,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,171,0.236131,HongKong,China,970,Positive,0.434649,1,0,0,0,,Asia
2,670623270,4,2019-04-01,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,169,0.160498,HongKong,China,938,Positive,0.422944,0,1,0,0,,Asia
3,670607911,4,2019-04-01,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,91,0.189286,HongKong,China,485,Positive,0.512143,0,1,1,0,,Asia
4,670607296,4,2019-04-01,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,31,0.266667,HongKong,China,163,Positive,0.437500,0,1,0,0,,Asia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42113,92198076,4,2011-01-01,United Kingdom,Although our pick up was prompt the taxi drive...,Disneyland_Paris,316,0.042000,Paris,France,1574,Neutral,0.477822,1,1,0,0,,Europe
42114,92061774,4,2011-01-01,Germany,Just returned from a 4 days family trip to Dis...,Disneyland_Paris,647,0.198286,Paris,France,3593,Positive,0.433570,0,0,1,0,,Europe
42115,91995748,1,2010-12-01,United Kingdom,We spent the 20 Dec 2010 in the Disney park an...,Disneyland_Paris,440,0.020628,Paris,France,2537,Neutral,0.493521,1,1,1,0,,Europe
42116,91984642,2,2010-12-01,United Kingdom,Well I was really looking forward to this trip...,Disneyland_Paris,314,0.108065,Paris,France,1758,Positive,0.479960,1,1,1,1,,Europe


Adding a Local_Reviewer flag (True when the reviewer's location matches the branch's country)

In [None]:
# Trim surrounding whitespace from Reviewer_Location (case is left unchanged)
df_cleaned['Reviewer_Location'] = df_cleaned['Reviewer_Location'].astype(str).str.strip()

# Reviewer_Location lists 'Hong Kong' as its own value, while this notebook maps the
# Hong Kong branch to Branch_Country 'China'. Fold 'Hong Kong' into 'China' for the
# comparison only, so Hong Kong residents count as local to Disneyland_HongKong.
# The stored Reviewer_Location values are not modified.
reviewer_country = df_cleaned['Reviewer_Location'].replace({'Hong Kong': 'China'})

# Create Local_Reviewer column: True when the reviewer's country equals the branch's country
df_cleaned['Local_Reviewer'] = reviewer_country == df_cleaned['Branch_Country']

# Preview it
df_cleaned[['Reviewer_Location', 'Branch_Country', 'Local_Reviewer']].head()


Unnamed: 0,Reviewer_Location,Branch_Country,Local_Reviewer
0,Australia,China,False
1,Philippines,China,False
2,United Arab Emirates,China,False
3,Australia,China,False
4,United Kingdom,China,False


In [None]:
# Checkpoint: print df_cleaned's (rows, columns) after adding Local_Reviewer
print("df_cleaned shape   :", df_cleaned.shape)

df_cleaned shape   : (40031, 20)


### Reviewer location and polarity extremes: check whether these are useful or not

temporal features for seasonal trend

In [None]:
# Derive calendar fields from the Year_Month datetime column
review_dates = df_cleaned['Year_Month'].dt
df_cleaned['Review_Year'] = review_dates.year
df_cleaned['Review_Month'] = review_dates.month
# Quarter is stored as text (the quarterly period converted with astype(str))
df_cleaned['Review_Quarter'] = review_dates.to_period('Q').astype(str)

# Show the source column next to the derived ones
temporal_preview = ['Year_Month', 'Review_Year', 'Review_Month', 'Review_Quarter']
df_cleaned[temporal_preview].head()


Unnamed: 0,Year_Month,Review_Year,Review_Month,Review_Quarter
0,2019-04-01,2019,4,2019Q2
1,2019-05-01,2019,5,2019Q2
2,2019-04-01,2019,4,2019Q2
3,2019-04-01,2019,4,2019Q2
4,2019-04-01,2019,4,2019Q2


In [None]:
# Checkpoint: print df_cleaned's (rows, columns) after adding Review_Year, Review_Month and Review_Quarter
print("df_cleaned shape   :", df_cleaned.shape)

df_cleaned shape   : (40031, 23)


adding review frequency per branch

In [None]:
# For every review, count the Review_IDs that share its Branch and Year_Month
branch_month_groups = df_cleaned.groupby(['Branch', 'Year_Month'])
df_cleaned['Monthly_Reviews_Per_Branch'] = branch_month_groups['Review_ID'].transform('count')

# Show the grouping keys next to the resulting count
monthly_preview = ['Branch', 'Year_Month', 'Monthly_Reviews_Per_Branch']
df_cleaned[monthly_preview].head()


Unnamed: 0,Branch,Year_Month,Monthly_Reviews_Per_Branch
0,Disneyland_HongKong,2019-04-01,61
1,Disneyland_HongKong,2019-05-01,1
2,Disneyland_HongKong,2019-04-01,61
3,Disneyland_HongKong,2019-04-01,61
4,Disneyland_HongKong,2019-04-01,61


per location

In [None]:
# For every review, count the Review_IDs that share its Reviewer_Location.
# Despite the column name, the grouping key is the location, not an individual reviewer.
location_groups = df_cleaned.groupby('Reviewer_Location')
df_cleaned['Total_Reviews_Per_Reviewer'] = location_groups['Review_ID'].transform('count')

# Show the location next to its count
location_preview = ['Reviewer_Location', 'Total_Reviews_Per_Reviewer']
df_cleaned[location_preview].head()


Unnamed: 0,Reviewer_Location,Total_Reviews_Per_Reviewer
0,Australia,4410
1,Philippines,1024
2,United Arab Emirates,339
3,Australia,4410
4,United Kingdom,9115


In [None]:
# Checkpoint: print df_cleaned's (rows, columns) after adding the two review-count columns
print("df_cleaned shape   :", df_cleaned.shape)

df_cleaned shape   : (40031, 25)


Adding length normalization (average word length) to capture reviewer behaviour patterns

In [None]:
# Add Avg_Word_Length = Review_Length / Word_Count, computed column-wise rather than
# with a row-by-row DataFrame.apply(axis=1). Rows where Word_Count is not greater
# than 0 get 0, so a division by zero never surfaces as inf/NaN.
# NOTE(review): if Review_Length counts every character of the text (spaces and
# punctuation included), this is characters per word rather than letters per word — confirm.
word_counts = df_cleaned['Word_Count']
df_cleaned['Avg_Word_Length'] = (
    (df_cleaned['Review_Length'] / word_counts).where(word_counts > 0, 0)
)

# Preview
df_cleaned[['Review_Text', 'Word_Count', 'Review_Length', 'Avg_Word_Length']].head()


Unnamed: 0,Review_Text,Word_Count,Review_Length,Avg_Word_Length
0,If you've ever been to Disneyland anywhere you...,59,328,5.559322
1,Its been a while since d last time we visit HK...,171,970,5.672515
2,Thanks God it wasn t too hot or too humid wh...,169,938,5.550296
3,HK Disneyland is a great compact park. Unfortu...,91,485,5.32967
4,"the location is not in the city, took around 1...",31,163,5.258065


| Use Case | Why It's Helpful |
|----------|------------------|
| 📏 Writing Style | Longer words may indicate more formal or descriptive writing |
| 📉 Spam Detection / Filtering | Extremely low or high values may flag unnatural reviews |
| 📊 Reviewer Profiling | Compare writing habits by country/branch/sentiment |
| 🧠 Text Quality | High-quality reviews often have richer vocabulary |

In [None]:
# Checkpoint: print df_cleaned's (rows, columns) after adding Avg_Word_Length
print("df_cleaned shape   :", df_cleaned.shape)

df_cleaned shape   : (40031, 26)


In [None]:
# Show df_cleaned as the cell output to inspect the fully featured frame
df_cleaned

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Word_Count,Sentiment_Score,Branch_Location,Branch_Country,...,Mentions_Cleanliness,Polarity_Extreme,Continent,Local_Reviewer,Review_Year,Review_Month,Review_Quarter,Monthly_Reviews_Per_Branch,Total_Reviews_Per_Reviewer,Avg_Word_Length
0,670772142,4,2019-04-01,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,59,0.243981,HongKong,China,...,0,,Asia,False,2019,4,2019Q2,61,4410,5.559322
1,670682799,4,2019-05-01,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,171,0.236131,HongKong,China,...,0,,Asia,False,2019,5,2019Q2,1,1024,5.672515
2,670623270,4,2019-04-01,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,169,0.160498,HongKong,China,...,0,,Asia,False,2019,4,2019Q2,61,339,5.550296
3,670607911,4,2019-04-01,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,91,0.189286,HongKong,China,...,0,,Asia,False,2019,4,2019Q2,61,4410,5.329670
4,670607296,4,2019-04-01,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,31,0.266667,HongKong,China,...,0,,Asia,False,2019,4,2019Q2,61,9115,5.258065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42113,92198076,4,2011-01-01,United Kingdom,Although our pick up was prompt the taxi drive...,Disneyland_Paris,316,0.042000,Paris,France,...,0,,Europe,False,2011,1,2011Q1,9,9115,4.981013
42114,92061774,4,2011-01-01,Germany,Just returned from a 4 days family trip to Dis...,Disneyland_Paris,647,0.198286,Paris,France,...,0,,Europe,False,2011,1,2011Q1,9,182,5.553323
42115,91995748,1,2010-12-01,United Kingdom,We spent the 20 Dec 2010 in the Disney park an...,Disneyland_Paris,440,0.020628,Paris,France,...,0,,Europe,False,2010,12,2010Q4,10,9115,5.765909
42116,91984642,2,2010-12-01,United Kingdom,Well I was really looking forward to this trip...,Disneyland_Paris,314,0.108065,Paris,France,...,1,,Europe,False,2010,12,2010Q4,10,9115,5.598726


In [None]:
# Checkpoint: print df_cleaned's (rows, columns) once more before the columns are reordered
print("df_cleaned shape   :", df_cleaned.shape)

df_cleaned shape   : (40031, 26)


ordering the extracted data

1. Core Review Info
Review_ID

Review_Text

Rating

📅 2. Time Features
Year_Month

Review_Year

Review_Month

Review_Quarter

🌍 3. Reviewer Details
Reviewer_Location

Local_Reviewer

Total_Reviews_Per_Reviewer

🏰 4. Branch Details
Branch

Branch_Location

Branch_Country

Continent

Monthly_Reviews_Per_Branch

📊 5. Textual Analysis
Word_Count

Review_Length (if present)

Avg_Word_Length

Sentiment_Score

Sentiment_Category

Review_Subjectivity

Polarity_Extreme

🏷️ 6. Keyword Flags
Mentions_Staff

Mentions_Crowd

Mentions_Cost

Mentions_Cleanliness

In [None]:
# Build the desired column order from the thematic groups, in presentation order
core_columns = ['Review_ID', 'Review_Text', 'Rating']
time_columns = ['Year_Month', 'Review_Year', 'Review_Month', 'Review_Quarter']
reviewer_columns = ['Reviewer_Location', 'Local_Reviewer', 'Total_Reviews_Per_Reviewer']
branch_columns = ['Branch', 'Branch_Location', 'Branch_Country', 'Continent', 'Monthly_Reviews_Per_Branch']
text_size_columns = ['Word_Count', 'Review_Length', 'Avg_Word_Length']
sentiment_columns = ['Sentiment_Score', 'Sentiment_Category', 'Review_Subjectivity', 'Polarity_Extreme']
keyword_columns = ['Mentions_Staff', 'Mentions_Crowd', 'Mentions_Cost', 'Mentions_Cleanliness']

column_order = (
    core_columns
    + time_columns
    + reviewer_columns
    + branch_columns
    + text_size_columns
    + sentiment_columns
    + keyword_columns
)

# Select the columns in that order (a KeyError is raised if any listed column is absent)
df1 = df_cleaned.loc[:, column_order]

# Preview new structure
df1.head()


Unnamed: 0,Review_ID,Review_Text,Rating,Year_Month,Review_Year,Review_Month,Review_Quarter,Reviewer_Location,Local_Reviewer,Total_Reviews_Per_Reviewer,...,Review_Length,Avg_Word_Length,Sentiment_Score,Sentiment_Category,Review_Subjectivity,Polarity_Extreme,Mentions_Staff,Mentions_Crowd,Mentions_Cost,Mentions_Cleanliness
0,670772142,If you've ever been to Disneyland anywhere you...,4,2019-04-01,2019,4,2019Q2,Australia,False,4410,...,328,5.559322,0.243981,Positive,0.561481,,0,1,0,0
1,670682799,Its been a while since d last time we visit HK...,4,2019-05-01,2019,5,2019Q2,Philippines,False,1024,...,970,5.672515,0.236131,Positive,0.434649,,1,0,0,0
2,670623270,Thanks God it wasn t too hot or too humid wh...,4,2019-04-01,2019,4,2019Q2,United Arab Emirates,False,339,...,938,5.550296,0.160498,Positive,0.422944,,0,1,0,0
3,670607911,HK Disneyland is a great compact park. Unfortu...,4,2019-04-01,2019,4,2019Q2,Australia,False,4410,...,485,5.32967,0.189286,Positive,0.512143,,0,1,1,0
4,670607296,"the location is not in the city, took around 1...",4,2019-04-01,2019,4,2019Q2,United Kingdom,False,9115,...,163,5.258065,0.266667,Positive,0.4375,,0,1,0,0


In [None]:
# Write the reordered frame to CSV without the row index
export_path = "Disneyland_Reviews_Organized.csv"
df1.to_csv(export_path, index=False)

# Hand the written file to the browser (Colab only)
from google.colab import files
files.download(export_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

About the data

# Column Descriptions and How They Help in Tableau

| Column Name                   | What It Means                                                                 | How It Helps in Tableau                                             |
|------------------------------|--------------------------------------------------------------------------------|---------------------------------------------------------------------|
| Review_ID                    | A special number for each review                                              | Helps show tooltips or filter each review separately               |
| Review_Text                  | The full message written by the visitor                                       | Can show in tooltips or use for reading real reviews               |
| Rating                       | A score (1–5) given by the visitor                                            | Used to show average ratings, trends, or satisfaction charts       |
| Year_Month                   | The review's year and month, stored as a date on the 1st of that month         | Great for monthly trend graphs or timelines                        |
| Review_Year                  | The year when the review was written                                          | Good for comparing reviews year by year                            |
| Review_Month                 | The month number (1–12) when the review was written                           | Helps see which months are busier or have better reviews           |
| Review_Quarter               | The year and quarter of the review (e.g. 2019Q2 = April–June 2019)             | Useful for making quarterly dashboards                             |
| Reviewer_Location            | The country of the person who wrote the review                                | Can make world maps or country-based charts                        |
| Local_Reviewer               | Shows if the visitor is from the same country as the park                     | Helps compare local vs international visitors                      |
| Total_Reviews_Per_Reviewer   | Number of reviews in the data from the same reviewer country, not per person   | Helps find which countries send the most reviewers                  |
| Branch                       | The full name of the Disneyland park                                          | Lets you compare different Disneyland locations                    |
| Branch_Location              | The city where the Disneyland park is                                         | More detailed view than country, helpful in maps                   |
| Branch_Country               | The country where the park is                                                 | Used for comparing countries' reviews                              |
| Continent                    | The continent where the park is                                               | Used for high-level map or bar charts                              |
| Monthly_Reviews_Per_Branch   | How many reviews a park got each month                                        | Shows which parks are busy in which months                         |
| Word_Count                   | How many words are in the review                                              | Helps show how detailed the reviews are                            |
| Review_Length                | How many characters (letters) are in the review                              | Similar to word count; used in review size analysis                |
| Avg_Word_Length              | Average characters per word (Review_Length divided by Word_Count)              | Shows if reviews use simple or complex language                    |
| Sentiment_Score              | A number showing how positive or negative the review is                      | Can color graphs based on mood of reviews                          |
| Sentiment_Category           | Shows if review is Positive, Negative, or Neutral                            | Great for pie charts or filters by sentiment                       |
| Review_Subjectivity          | Shows if review is more opinion or fact-based                                | Helps understand tone of reviews                                   |
| Polarity_Extreme             | Flags very happy or very unhappy reviews (TODO: fix; blank in the preview rows) | Can be used to highlight extreme reviews                           |
| Mentions_Staff               | 1 if the review talks about staff, else 0                                     | Helpful for topic-specific analysis                                |
| Mentions_Crowd               | 1 if the review talks about crowds or lines, else 0                            | Useful for crowd concern charts                                    |
| Mentions_Cost                | 1 if the review mentions prices or cost, else 0                                | Shows how often people complain or talk about prices               |
| Mentions_Cleanliness         | 1 if the review mentions cleanliness or hygiene, else 0                        | Helps track cleanliness feedback over time                         |
