In [16]:
# Import necessary libraries
import re
from datetime import datetime

import numpy as np
import pandas as pd

In [30]:
# Load the bank-review CSV into the notebook-wide DataFrame `df`.
# (pd.read_csv is called directly here; no custom loader function is involved.)
df = pd.read_csv("clean_bank_reviews (2).csv")

# Preview the first rows of the raw data before any cleaning is applied
print("Original Data:")
display(df.head())


Original Data:


Unnamed: 0,review,rating,date,bank,source
0,This application is very important and advanta...,5,11/27/2025,CBE,Google Play Store
1,why didn't work this app?,1,11/27/2025,CBE,Google Play Store
2,The app makes our life easier. Thank you CBE!,5,11/27/2025,CBE,Google Play Store
3,this app very bad 👎,1,11/27/2025,CBE,Google Play Store
4,the most advanced app. but how to stay safe?,5,11/27/2025,CBE,Google Play Store


In [32]:
# 1. Remove duplicates
# Rebinds `df` to the de-duplicated frame; rows are compared across all columns
# because no `subset` is given.
df = df.drop_duplicates()
# Report (rows, columns) so the effect of this step is visible in the output
print(f"Data shape after removing duplicates: {df.shape}")

Data shape after removing duplicates: (1187, 5)


In [36]:
# 2. Handle missing values
# Rebinds `df` to the result of dropna() called with its default arguments,
# i.e. rows containing a missing value are discarded rather than imputed.
df = df.dropna()
# Print (rows, columns) to compare against the shape reported after de-duplication
print(df.shape)

(1187, 5)


In [38]:
# 3. Normalize / clean text columns
def clean_text(text):
    """Normalize a free-text value for analysis.

    Strings are lowercased, stripped of every character that is not an ASCII
    letter, a digit or whitespace (punctuation, emoji and non-Latin script are
    all removed), and have runs of whitespace collapsed to a single space.
    Leading and trailing whitespace is removed from the final result.

    Parameters
    ----------
    text : object
        The value to clean. Only ``str`` instances are modified.

    Returns
    -------
    object
        The cleaned string, or ``text`` unchanged when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text

    text = text.lower()  # lowercase
    text = re.sub(r'[^a-z0-9\s]', '', text)  # remove special characters
    text = re.sub(r'\s+', ' ', text)  # replace multiple spaces with single space
    # Strip last: deleting a trailing emoji or punctuation mark can expose
    # whitespace at the edges that an up-front strip() would miss.
    return text.strip()

In [40]:
# Run clean_text over every value of each object-dtype (text) column.
# Iterating the selected sub-frame yields its column labels one by one.
for col in df.select_dtypes(include='object'):
    df[col] = df[col].map(clean_text)

In [42]:
# 4. Clean column names
# Trim surrounding whitespace, lowercase, and turn inner spaces into underscores
# using the vectorised string methods of the column Index.
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_', regex=False)

In [44]:
# Show cleaned data
# Preview the first rows after text cleaning and column renaming so the result
# can be compared with the "Original Data" preview.
print("Cleaned Data:")
display(df.head())

Cleaned Data:


Unnamed: 0,review,rating,date,bank,source
0,this application is very important and advanta...,5,11272025,cbe,google play store
1,why didnt work this app,1,11272025,cbe,google play store
2,the app makes our life easier thank you cbe,5,11272025,cbe,google play store
3,this app very bad,1,11272025,cbe,google play store
4,the most advanced app but how to stay safe,5,11272025,cbe,google play store


In [50]:
from dateutil import parser
import re

# Normalize text and dates
# Two reference datetimes that differ in year, month and day; used by
# _parse_complete_date to detect strings that only partially specify a date.
_DATE_PROBE_A = datetime(2000, 1, 1)
_DATE_PROBE_B = datetime(2001, 2, 2)


def _parse_complete_date(text):
    """Parse ``text`` with dateutil; return None unless it names a full date.

    dateutil takes any component missing from the string from its ``default``
    argument (today's date when none is given). A string such as a month name
    or a lone number would therefore parse "successfully" into a date that
    depends on the day the notebook is run. Parsing against two defaults that
    disagree on every date component and requiring identical results accepts
    only strings that spell out year, month and day themselves.

    Raises whatever ``parser.parse`` raises for unparseable input.
    """
    first = parser.parse(text, fuzzy=False, default=_DATE_PROBE_A)
    second = parser.parse(text, fuzzy=False, default=_DATE_PROBE_B)
    return first if first == second else None


def clean_text(text):
    """Normalize a cell value as either a date or free text.

    Strings that dateutil recognises as a complete calendar date are returned
    in ``YYYY-MM-DD`` form. Any other string is lowercased, stripped of all
    characters except ASCII letters, digits and whitespace, has whitespace
    runs collapsed to one space, and is trimmed at both ends.

    Parameters
    ----------
    text : object
        The value to clean. Only ``str`` instances are modified.

    Returns
    -------
    object
        The normalized string, or ``text`` unchanged when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text

    text = text.strip()  # remove leading/trailing spaces

    # Try to parse as date
    try:
        parsed_date = _parse_complete_date(text)
    except (ValueError, OverflowError):
        parsed_date = None  # not a date, continue cleaning as text
    if parsed_date is not None:
        return parsed_date.strftime('%Y-%m-%d')  # normalize to YYYY-MM-DD

    # Clean as regular text
    text = text.lower()  # lowercase
    text = re.sub(r'[^a-z0-9\s]', '', text)  # remove special characters
    text = re.sub(r'\s+', ' ', text)  # replace multiple spaces with single space
    # Strip again: removing edge characters (emoji, punctuation) can leave
    # whitespace at the ends that the initial strip() could not see.
    return text.strip()


In [54]:
# Ensure the date column is string
# normalize_date (defined below) slices its argument by character position,
# so every value is cast to str before it is applied.
df['date'] = df['date'].astype(str)

# Function to convert MMDDYYYY to YYYY-MM-DD
def normalize_date(date_str):
    """Convert a date string to ISO ``YYYY-MM-DD`` form.

    Accepted inputs are exactly eight ASCII digits in ``MMDDYYYY`` order, or a
    string that is already ``YYYY-MM-DD`` (returned unchanged if it is a real
    date, so re-running the cell that applies this function does not wipe the
    column). Surrounding whitespace is ignored.

    Parameters
    ----------
    date_str : object
        The value to convert; anything that is not a ``str`` is treated as
        missing.

    Returns
    -------
    str or float
        The ISO date string, or ``np.nan`` when the input has the wrong
        shape or does not denote a valid calendar date.
    """
    if not isinstance(date_str, str):
        return np.nan

    candidate = date_str.strip()
    # Fix the layout strictly before parsing: strptime alone would accept
    # unpadded fields and silently guess where month, day and year split.
    if re.fullmatch(r'[0-9]{8}', candidate):
        layout = '%m%d%Y'
    elif re.fullmatch(r'[0-9]{4}-[0-9]{2}-[0-9]{2}', candidate):
        layout = '%Y-%m-%d'
    else:
        return np.nan  # if format is wrong, mark as missing

    try:
        parsed = datetime.strptime(candidate, layout)
    except ValueError:
        # Right shape but impossible date (e.g. month 13 or February 30)
        return np.nan
    return parsed.date().isoformat()

# Rewrite every entry of the date column through normalize_date
df['date'] = df['date'].map(normalize_date)

# Bare last expression: the notebook renders the first rows as the cell output
df.head()


Unnamed: 0,review,rating,date,bank,source
0,this application is very important and advanta...,5,2025-11-27,cbe,google play store
1,why didnt work this app,1,2025-11-27,cbe,google play store
2,the app makes our life easier thank you cbe,5,2025-11-27,cbe,google play store
3,this app very bad,1,2025-11-27,cbe,google play store
4,the most advanced app but how to stay safe,5,2025-11-27,cbe,google play store


In [52]:
# Show cleaned data
# NOTE(review): this cell's execution count (In [52]) is lower than that of the
# date-normalization cell above it (In [54]), and its saved output still shows
# dates as MMDDYYYY. Re-run it after normalization to refresh the preview.
print("Cleaned Data:")
display(df.head())

Cleaned Data:


Unnamed: 0,review,rating,date,bank,source
0,this application is very important and advanta...,5,11272025,cbe,google play store
1,why didnt work this app,1,11272025,cbe,google play store
2,the app makes our life easier thank you cbe,5,11272025,cbe,google play store
3,this app very bad,1,11272025,cbe,google play store
4,the most advanced app but how to stay safe,5,11272025,cbe,google play store


In [56]:
# Save the cleaned DataFrame to a CSV file
# index=False keeps the DataFrame's row index out of the written file.
df.to_csv('cleaned_bank_reviews.csv', index=False)

# Only reached if to_csv did not raise
print("File saved successfully!")


File saved successfully!
