In [1]:
import subprocess, sys

def ensure_spacy_model(model_name="en_core_web_sm"):
    try:
        import spacy
        spacy.load(model_name)
    except Exception:
        print(f"Installing spaCy model: {model_name}")
        subprocess.check_call([sys.executable, "-m", "spacy", "download", model_name])

ensure_spacy_model()
print("spaCy model is ready.")


spaCy model is ready.


In [2]:
# ------------------------------------------------------------------------------
# Imports and basic setup
# ------------------------------------------------------------------------------

from pathlib import Path
import os
import re

import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm


In [3]:
# ------------------------------------------------------------------------------
# Load raw data
# ------------------------------------------------------------------------------

# Path to the raw CSV
RAW_DATA = Path("../data/raw_data") / "Hotel_Reviews copy 2.csv"

# Read CSV
df_raw = pd.read_csv(RAW_DATA, low_memory=False)

# Quick shape check
print("Raw shape:", df_raw.shape)

Raw shape: (515738, 17)


In [4]:
# ------------------------------------------------------------------------------
# label creation
# ------------------------------------------------------------------------------

df = df_raw.copy()

# Replace placeholders with empty strings (avoid "No Positive"/"No Negative" in text)
df["Positive_Review"] = df["Positive_Review"].replace("No Positive", "", regex=False)
df["Negative_Review"] = df["Negative_Review"].replace("No Negative", "", regex=False)

# Combine into one review text; fill missing with empty strings
df["Total_Review"] = df["Negative_Review"].fillna("") + " " + df["Positive_Review"].fillna("")

# Binary target: 1 if score >= 7, else 0
df["encoded_review"] = (df["Reviewer_Score"] >= 7).astype(int)

# Keep only what we need
df = df[["Total_Review", "encoded_review"]]

# Drop exact duplicate rows, reset index
df = df.drop_duplicates().reset_index(drop=True)

print("After clean shape:", df.shape)
df.head(3)

After clean shape: (499845, 2)


Unnamed: 0,Total_Review,encoded_review
0,I am so angry that i made this post available...,0
1,No real complaints the hotel was great great...,1
2,Rooms are nice but for elderly a bit difficul...,1


In [5]:
# ------------------------------------------------------------------------------
# sample a subset for faster processing
# ------------------------------------------------------------------------------

N_SAMPLES = 200_000  # change to None to use all rows

if N_SAMPLES is not None and N_SAMPLES < len(df):
    df_sample = df.sample(n=N_SAMPLES, random_state=123)
else:
    df_sample = df.copy()

print("Working set shape:", df_sample.shape)

Working set shape: (200000, 2)


In [6]:
# ------------------------------------------------------------------------------
# Text preprocessing
#   - clean_text() does lowercase + simple regex cleanup
# ------------------------------------------------------------------------------

# Settings
USE_LEMMATIZATION = True   # set False if you want faster, no-lemmatization run
BATCH_SIZE        = 1000
N_PROC            = None   # None -> use ~half your CPU cores

# Precompiled regex for speed
RE_CONTRACTIONS = [
    (re.compile(r"\b(can't)\b", re.I), "cannot"),
    (re.compile(r"\b(won't)\b", re.I), "will not"),
    (re.compile(r"n't\b", re.I), " not"),
    (re.compile(r"'re\b", re.I), " are"),
    (re.compile(r"'ve\b", re.I), " have"),
    (re.compile(r"'ll\b", re.I), " will"),
    (re.compile(r"'d\b", re.I), " would"),
    (re.compile(r"\bI'm\b", re.I), "I am"),
    (re.compile(r"'s\b", re.I), " is"),
    (re.compile(r"'t\b", re.I), " not"),
]
RE_PUNCT    = re.compile(r"[^\w\s]")
RE_DIGIT    = re.compile(r"\b\d+\b")
RE_NUMWORDS = re.compile(
    r"\b(one|two|three|four|five|six|seven|eight|nine|ten|"
    r"first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth)\b", re.I
)
RE_SINGLE   = re.compile(r"\b[a-zA-Z]\b")
RE_SPACE    = re.compile(r"\s+")

def clean_text(x: str) -> str:
    """Lowercase, expand contractions, strip punctuation/digits/noise."""
    if not isinstance(x, str):
        return ""
    for patt, repl in RE_CONTRACTIONS:
        x = patt.sub(repl, x)
    x = x.lower()
    x = RE_PUNCT.sub(" ", x)      # remove punctuation
    x = RE_DIGIT.sub(" ", x)      # remove standalone digits
    x = RE_NUMWORDS.sub(" ", x)   # remove spelled-out numbers (optional)
    x = RE_SINGLE.sub(" ", x)     # remove single letters
    x = RE_SPACE.sub(" ", x).strip()
    return x

def preprocess_text_column(df_in: pd.DataFrame, text_col: str) -> pd.Series:
    """Apply clean_text()"""
    texts = df_in[text_col].fillna("").astype(str).map(clean_text).tolist()

    if not USE_LEMMATIZATION:
        return pd.Series(texts, index=df_in.index)

    # Load spaCy English model
    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "attribute_ruler"])
    stop_words = nlp.Defaults.stop_words

    # Decide parallel workers
    n_proc = max(1, (os.cpu_count() or 2) // 2) if N_PROC is None else N_PROC

    out = []
    for doc in tqdm(nlp.pipe(texts, batch_size=BATCH_SIZE, n_process=n_proc),
                    total=len(texts), desc="lemmatizing"):
        # keep alphabetic lemmas, drop spaces/punct/numbers, drop stopwords
        kept = [t.lemma_ for t in doc
                if (not t.is_space) and (not t.is_punct) and (not t.like_num)
                and t.lemma_ and (t.lemma_ not in stop_words)]
        out.append(" ".join(kept))

    return pd.Series(out, index=df_in.index)

In [7]:
# ------------------------------------------------------------------------------
# Apply the preprocessing to the working dataframe
# ------------------------------------------------------------------------------

df_sample["preprocessed_review"] = preprocess_text_column(df_sample, "Total_Review")

df_sample[["Total_Review", "preprocessed_review", "encoded_review"]].head(5)

lemmatizing: 100%|██████████| 200000/200000 [01:21<00:00, 2450.38it/s]


Unnamed: 0,Total_Review,preprocessed_review,encoded_review
429328,Only a short stay Reception was extremely he...,short stay reception extremely helpful advice ...,1
481203,It location far from historic center The park...,location far historic center parking free room...,1
344085,Nothing Perfect Location and staff,perfect location staff,1
181804,More english tv channels The croissants are ...,english tv channels croissants amazing omelettes,1
321093,Room looked into an internal courtyard and wa...,room looked internal courtyard dark issue loca...,1


In [8]:
# ------------------------------------------------------------------------------
# Save to clean_data (CSV)
# ------------------------------------------------------------------------------

OUT_DIR = Path("../data/clean_data")
OUT_DIR.mkdir(parents=True, exist_ok=True)

OUT_PATH = OUT_DIR / "clean_preprocessed_reviews.csv"
df_sample.to_csv(OUT_PATH, index=False)

print("Saved preprocessed dataset to:", OUT_PATH.resolve())

Saved preprocessed dataset to: /Users/lucasvercauteren/Desktop/gehaalde vakken/Master eur/seminar/final paper/hotel_reviews_sent_python_notebook/data/clean_data/clean_preprocessed_reviews.csv
