In [1]:
import pandas as pd
import hashlib

In [2]:
# Destination files, keyed by a short dataset label
paths = dict(
    cleaned_model_fulltxt="../data/cleaned_for_modeling_fulltxt.csv",
)

# Processing chunk size (not referenced by any cell visible in this notebook)
chunk_size = 10 ** 6

# Read in cleaned data

In [3]:
# Read the cleaned reviews from disk and report how many rows were loaded.
# NOTE(review): the df.info() output later in this notebook shows user_id as
# float64 with values around 1e20, which is beyond float64's exact-integer
# range. If the IDs are integers in the file, reading them with
# dtype={'user_id': str} would avoid precision loss -- TODO confirm upstream.
source_csv = '../data/cleaned_for_modeling.csv'
df = pd.read_csv(source_csv)
row_total = len(df)
print(f"Loaded {row_total:,} rows\n")

Loaded 24,189,110 rows



# Process Data

In [4]:
# Define functions

def is_likely_english(text):
    """Cheap language guess: True when more than 80% of the characters are ASCII.

    Missing values and empty or whitespace-only input yield False. The ratio
    is taken over the full string form of ``text`` (surrounding whitespace
    included), and exactly 80% ASCII is NOT enough -- the comparison is strict.
    """
    if pd.isna(text):
        return False

    as_str = str(text)
    if not as_str.strip():
        return False

    # Code points below 128 are the ASCII range.
    n_ascii = 0
    for ch in as_str:
        if ord(ch) < 128:
            n_ascii += 1

    return n_ascii / len(as_str) > 0.8

def filter_english_only(df):
    """Keep only the rows whose ``text`` passes ``is_likely_english``.

    Prints the row count before and after filtering.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` and the original index
        labels of the rows that were kept.
    """
    print(f"Before English filter: {len(df):,} rows")
    # Build the mask as a standalone Series instead of copying the whole frame
    # to attach (and then drop) a temporary column: this avoids a full-frame
    # copy and leaves any caller column named 'is_english' untouched.
    # astype(bool) matters for an empty frame, where .apply returns a Series
    # with the text column's dtype rather than bool, which is not a valid mask.
    keep = df['text'].apply(is_likely_english).astype(bool)
    df_clean = df.loc[keep].copy()
    print(f"  After: {len(df_clean):,} rows ({len(df) - len(df_clean):,} removed)\n")
    return df_clean

def filter_original_tag(df):
    """Drop reviews whose text carries Google's "(Original)" translation tag.

    Rows with missing text are kept (a missing value counts as "no tag").
    Prints the row count before and after; returns a new frame and leaves
    ``df`` untouched.
    """
    n_before = len(df)
    print(f"Before (Original) tag filter: {n_before:,} rows")

    # Escaped parentheses: match the literal "(Original)" marker.
    has_tag = df['text'].str.contains(r'\(Original\)', na=False)
    df_clean = df.loc[~has_tag].copy()

    n_after = len(df_clean)
    print(f"  After: {n_after:,} rows ({n_before - n_after:,} removed)\n")
    return df_clean

In [5]:
# Run both text filters in order (English heuristic first, then the
# "(Original)" tag filter) and renumber the surviving rows from 0.
df = filter_original_tag(filter_english_only(df)).reset_index(drop=True)

print(f"Final dataset: {len(df):,} rows")
df.info()

Before English filter: 24,189,110 rows
  After: 24,075,432 rows (113,678 removed)

Before (Original) tag filter: 24,075,432 rows
  After: 22,641,041 rows (1,434,391 removed)

Final dataset: 22,641,041 rows
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22641041 entries, 0 to 22641040
Data columns (total 4 columns):
 #   Column   Dtype  
---  ------   -----  
 0   user_id  float64
 1   rating   float64
 2   text     object 
 3   gmap_id  object 
dtypes: float64(2), object(2)
memory usage: 690.9+ MB


# Add review_id (unique per review)

In [6]:
# Define review_id generating function
def make_review_id(df):
    """Add a ``review_id`` column holding a SHA-256 hex digest of each row.

    The digest is computed over the string forms of ``user_id``, ``gmap_id``,
    ``rating`` and ``text`` joined with ``"||"``. It is deterministic, so rows
    that are identical in all four fields receive the same id.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``gmap_id``, ``rating`` and ``text`` columns.
        The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``review_id`` as its first column.
    """
    # Combine fields into one string
    combined = (
        df["user_id"].astype(str) + "||" +
        df["gmap_id"].astype(str) + "||" +
        df["rating"].astype(str) + "||" +
        df["text"].astype(str)
    )

    digests = combined.apply(lambda x: hashlib.sha256(x.encode("utf-8")).hexdigest())

    # DataFrame.insert raises ValueError when the column already exists, which
    # made re-running the cell on an already-tagged frame fail. Drop the stale
    # column first so the call is repeatable.
    if "review_id" in df.columns:
        del df["review_id"]

    # Insert as first column
    df.insert(0, "review_id", digests)

    return df


In [7]:
# Attach the hashed review_id column to the filtered frame
df = make_review_id(df)

In [8]:
# Preview the first 10 rows; review_id should appear as the leading column
df.head(10)

Unnamed: 0,review_id,user_id,rating,text,gmap_id
0,456e420929727f933dbaed63eff45cde53c7b92438cf0d...,1.067134e+20,5.0,"Easy process, extremely friendly, helpful staf...",0x80960c29f2e3bf29:0x4b291f0d275a5699
1,ea2ad448a8b443c1c42c5d4ca9dd84d02fe9f2f110b993...,1.024963e+20,5.0,My girlfriends and I took a weekend ski trip t...,0x80960c29f2e3bf29:0x4b291f0d275a5699
2,77efbe6a6f4d27512b59bb2f878b0ac8b533aa03a11fb7...,1.102407e+20,5.0,The team at Black Tie never disappoints our se...,0x80960c29f2e3bf29:0x4b291f0d275a5699
3,ba742a26b57396fde7a05136bbbf551906f5c6d9e66008...,1.116481e+20,5.0,They were awesome people! First timer they hel...,0x80960c29f2e3bf29:0x4b291f0d275a5699
4,57787d7d11fc5e75d7a7643fd966534de03fdc72cf764f...,1.169394e+20,5.0,"Great service, fast, responsive, they came and...",0x80960c29f2e3bf29:0x4b291f0d275a5699
5,6cd920c382945084ed2537452eef5d2603e3c19e4c26d6...,1.03519e+20,5.0,Awesome! Incredible customer service. Profess...,0x80960c29f2e3bf29:0x4b291f0d275a5699
6,275023c6626a913042ec003365906cbf377ca45cfc0acf...,1.021517e+20,5.0,"Unforgettable, remarkable, customer for life. ...",0x80960c29f2e3bf29:0x4b291f0d275a5699
7,1bc8c890f35919ec310e65b64701a07a249989f0adcdd3...,1.110766e+20,5.0,"Good service, affordable",0x80960c29f2e3bf29:0x4b291f0d275a5699
8,56a745e1bb00e7475f21f1fdd5051b5d328c54c74747ef...,1.089911e+20,5.0,"Nice people, good service.",0x80960c29f2e3bf29:0x4b291f0d275a5699
9,32d6a0dcb40c36f484a2dfc47b50ae9b6eb86b0cc143dd...,1.070499e+20,5.0,Awesome!!!!!,0x80960c29f2e3bf29:0x4b291f0d275a5699


In [9]:
# Write the filtered, id-tagged frame to CSV (row index omitted), then
# summarise what was written.
output_csv = paths['cleaned_model_fulltxt']
df.to_csv(output_csv, index=False)
print(f"Saved to {output_csv}\n")
df.info()

Saved to ../data/cleaned_for_modeling_fulltxt.csv

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22641041 entries, 0 to 22641040
Data columns (total 5 columns):
 #   Column     Dtype  
---  ------     -----  
 0   review_id  object 
 1   user_id    float64
 2   rating     float64
 3   text       object 
 4   gmap_id    object 
dtypes: float64(2), object(3)
memory usage: 863.7+ MB
