In [2]:
import hashlib
import os
import re

import pandas as pd

In [3]:
# CSV locations used by this notebook, keyed by a short dataset name
paths = dict(
    cleaned_model_fulltxt="../data/cleaned_for_modeling_fulltxt.csv",
    cleaned_model_sentences="../data/cleaned_for_modeling_sentences.csv",
    cleaned_for_tfidf="../data/cleaned_for_tfidf.csv",
)

# Read in cleaned data

In [None]:
# Load CSV
# Reads the whole cleaned review file into memory as `df`; the cells below
# filter this frame and add columns to it.
df = pd.read_csv('../data/cleaned_for_modeling.csv')
# Print the starting row count so the effect of later filters can be compared.
print(f"Loaded {len(df):,} rows\n")

# Process Data

In [None]:
# Define functions

def is_likely_english(text, threshold: float = 0.8) -> bool:
    """Fast heuristic: text counts as English when most characters are ASCII.

    This is a character-set check, not real language detection: text written
    in another Latin-script language is mostly ASCII and will also pass.

    Parameters
    ----------
    text : object
        Value to test. Missing values (``pd.isna``) and empty or
        whitespace-only strings are rejected; anything else is converted
        with ``str`` first.
    threshold : float, default 0.8
        The share of ASCII characters (code point < 128) must be strictly
        greater than this value for the text to pass.

    Returns
    -------
    bool
        True when the ASCII share exceeds ``threshold``, otherwise False.
    """
    if pd.isna(text):
        return False
    text = str(text)
    if not text.strip():
        return False
    # The ratio is taken over the unstripped string, so whitespace counts as ASCII.
    ascii_count = sum(1 for ch in text if ord(ch) < 128)
    return (ascii_count / len(text)) > threshold

def filter_english_only(df):
    """Filter to English-only reviews.

    Keeps the rows of ``df`` whose ``text`` value passes
    ``is_likely_english`` and prints the row count before and after.

    The rows are selected with a boolean mask rather than a scratch column,
    so the input frame is not deep-copied first and an existing column
    named ``is_english`` is neither overwritten nor dropped.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``text`` column.

    Returns
    -------
    pd.DataFrame
        A new frame holding only the rows that passed; ``df`` is unchanged.
    """
    print(f"Before English filter: {len(df):,} rows")
    # astype(bool) keeps the mask boolean even when the frame has no rows.
    keep = df['text'].apply(is_likely_english).astype(bool)
    df_clean = df.loc[keep].copy()
    print(f"  After: {len(df_clean):,} rows ({len(df) - len(df_clean):,} removed)\n")
    return df_clean

def filter_original_tag(df):
    """Drop reviews whose text contains Google's "(Original)" translation tag.

    Prints the row count before and after. Rows with missing text do not
    match the tag and are therefore kept. Returns a new frame.
    """
    rows_in = len(df)
    print(f"Before (Original) tag filter: {rows_in:,} rows")

    # Literal "(Original)" match; missing text counts as "no tag".
    has_tag = df['text'].str.contains(r'\(Original\)', na=False)
    kept = df[~has_tag].copy()

    rows_out = len(kept)
    print(f"  After: {rows_out:,} rows ({rows_in - rows_out:,} removed)\n")
    return kept

In [None]:
# Apply the English filtering steps in order, then renumber the remaining rows

df = filter_english_only(df)
df = filter_original_tag(df)
df = df.reset_index(drop=True)

print(f"Final dataset: {len(df):,} rows")
df.info()

# Add review_id (unique per review)

In [None]:
# Define review_id generating function
def make_review_id(df):
    """Return ``df`` with a deterministic ``review_id`` as its first column.

    The id is the SHA-256 hex digest of ``user_id``, ``gmap_id``, ``rating``
    and ``text`` (each cast to ``str``) joined with ``"||"``. Rows with the
    same four values therefore get the same id.

    The caller's frame is not modified. If ``df`` already has a
    ``review_id`` column it is replaced, so the function can be applied
    again to its own output without raising.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``user_id``, ``gmap_id``, ``rating`` and ``text`` columns.

    Returns
    -------
    pd.DataFrame
        New frame with ``review_id`` first, followed by the other columns.
    """
    # Combine fields into one string
    combined = (
        df["user_id"].astype(str) + "||" +
        df["gmap_id"].astype(str) + "||" +
        df["rating"].astype(str) + "||" +
        df["text"].astype(str)
    )

    # Create SHA256 hash of each combined string
    review_ids = combined.map(
        lambda value: hashlib.sha256(value.encode("utf-8")).hexdigest()
    )

    # Shallow copy: a new frame object over the same column data, so the
    # insert below does not add a column to the caller's frame.
    out = df.copy(deep=False)
    if "review_id" in out.columns:
        out = out.drop(columns="review_id")
    out.insert(0, "review_id", review_ids)

    return out


In [None]:
# Add the review_id column to the working dataframe
df = make_review_id(df)

In [None]:
# Preview the first 10 rows after adding review_id
df.head(10)

In [None]:
# Drop duplicates (found 5 when exploring the data)
# Rows that share a review_id are treated as duplicates; the first one is kept.
before = len(df)  # row count prior to deduplication
df = df.drop_duplicates(subset="review_id", keep="first")
after = len(df)  # row count once duplicates are removed

# Report how many rows the deduplication removed
print(f"Rows before deduplication: {before:,}")
print(f"Rows after deduplication:  {after:,}")
print(f"Duplicates removed:        {before - after:,}")

In [None]:
# Save data
# Write the filtered, de-duplicated frame to the full-text CSV path, without the index.
df.to_csv(paths['cleaned_model_fulltxt'], index=False)
print(f"Saved to {paths['cleaned_model_fulltxt']}\n")
# Show column dtypes and non-null counts for the frame that was just written.
df.info()

# For Baseline Model
Logistic regression or Naive Bayes on TF-IDF features. Preprocessing steps:
* Lowercasing
* Remove punctuation
* Remove stop words, except for negations like not/no (in model pipeline)
* N-grams (bigrams) (in model pipeline)
* Lemmatization (maybe)
* Sentence-level tokens (maybe)

In [None]:
def baseline_text_chunked_csv(in_path: str, out_path: str, chunksize: int, text_col: str, new_col: str):
    """
    Memory-safe baseline text processing.

    Reads `in_path` in chunks and creates `new_col` from `text_col` by:
      - lowercasing
      - replacing every character other than a-z, 0-9 and whitespace with a
        space (punctuation, apostrophes and non-ASCII letters are all
        dropped, e.g. "Don't" becomes "don t")
      - collapsing runs of whitespace into a single space
      - stripping leading/trailing spaces

    All input columns are kept. The result is written incrementally to
    `out_path`, which is overwritten if it already exists.

    Parameters
    ----------
    in_path : str
        Input CSV path (e.g., "reviews_with_review_id.csv")
    out_path : str
        Output CSV path (e.g., "reviews_with_text_classical.csv").
        Must not be the same file as `in_path`.
    chunksize : int
        Number of rows per chunk. Lower if you still crash (e.g., 20_000).
    text_col : str
        Name of the raw text column in the input CSV.
    new_col : str
        Name of the output processed text column.

    Raises
    ------
    ValueError
        If `in_path` and `out_path` resolve to the same file.
    KeyError
        If `text_col` is not a column of the input CSV.
    """
    # Writing the first chunk truncates `out_path`; if that is also the file
    # being read, the remaining input would be lost.
    both_are_paths = (
        isinstance(in_path, (str, os.PathLike))
        and isinstance(out_path, (str, os.PathLike))
    )
    if both_are_paths and os.path.realpath(in_path) == os.path.realpath(out_path):
        raise ValueError(
            f"in_path and out_path must be different files, got: {in_path}"
        )

    first = True

    # Use keep_default_na=False so empty strings don't become NaN unexpectedly.
    # The context manager closes the input file even if a chunk raises.
    with pd.read_csv(in_path, chunksize=chunksize, keep_default_na=False) as reader:
        for i, chunk in enumerate(reader, start=1):
            if text_col not in chunk.columns:
                raise KeyError(
                    f"Column '{text_col}' not found. Available columns: {list(chunk.columns)}"
                )

            s = chunk[text_col].astype("string")

            chunk[new_col] = (
                s.str.lower()
                 .str.replace(r"[^a-z0-9\s]", " ", regex=True)
                 .str.replace(r"\s+", " ", regex=True)
                 .str.strip()
            )

            # Write chunk to output CSV (append after first chunk)
            chunk.to_csv(out_path, mode="w" if first else "a", index=False, header=first)
            first = False

            # Lightweight progress
            print(f"✅ Processed chunk {i:,} (rows: {len(chunk):,})")

    print(f"🎉 Done. Wrote cleaned CSV to: {out_path}")


In [None]:
# Read the full-text CSV saved earlier in 500,000-row chunks, add a
# `text_baseline` column of cleaned text, and write the result to the TF-IDF path.
baseline_text_chunked_csv(
    in_path=paths["cleaned_model_fulltxt"],
    out_path=paths["cleaned_for_tfidf"],
    chunksize=500_000,
    text_col="text",
    new_col="text_baseline",
)

In [8]:
# Load CSV
# Read back only review_id, rating and text_baseline from the TF-IDF file,
# with explicit dtypes, and preview the first 10 rows.
# NOTE(review): unlike the chunked writer, this read uses pandas' default NA
# parsing, so an empty text_baseline (or one that reads "nan"/"null") would
# load as <NA> - confirm that is intended before vectorizing.
df = pd.read_csv(
    paths["cleaned_for_tfidf"],
    usecols=["review_id", "rating", "text_baseline"],
    dtype={
        "review_id": "string",
        "rating": "int8",
        "text_baseline": "string"
    }
)
df.head(10)

Unnamed: 0,review_id,rating,text_baseline
0,456e420929727f933dbaed63eff45cde53c7b92438cf0d...,5,easy process extremely friendly helpful staff ...
1,ea2ad448a8b443c1c42c5d4ca9dd84d02fe9f2f110b993...,5,my girlfriends and i took a weekend ski trip t...
2,77efbe6a6f4d27512b59bb2f878b0ac8b533aa03a11fb7...,5,the team at black tie never disappoints our se...
3,ba742a26b57396fde7a05136bbbf551906f5c6d9e66008...,5,they were awesome people first timer they help...
4,57787d7d11fc5e75d7a7643fd966534de03fdc72cf764f...,5,great service fast responsive they came and pi...
5,6cd920c382945084ed2537452eef5d2603e3c19e4c26d6...,5,awesome incredible customer service profession...
6,275023c6626a913042ec003365906cbf377ca45cfc0acf...,5,unforgettable remarkable customer for life i j...
7,1bc8c890f35919ec310e65b64701a07a249989f0adcdd3...,5,good service affordable
8,56a745e1bb00e7475f21f1fdd5051b5d328c54c74747ef...,5,nice people good service
9,32d6a0dcb40c36f484a2dfc47b50ae9b6eb86b0cc143dd...,5,awesome


# For Primary Model
Transformer-based embeddings (e.g., DistilBERT) + simple classifier (e.g., logistic regression)
* Sentence-level tokens

# For Advanced Model
Large Language Model-based ABSA using prompting
* No preprocessing: the LLM should receive the raw text, with its original capitalization, punctuation, etc.