# Import Packages

Before starting the cleaning process, import all required packages.

In [1]:
import pandas as pd
import re

# Import the Dataset

Load the previously analyzed dataset so that it can be cleaned.

In [2]:
# Read the source CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer="datasets/Dataset_small.csv")

# Create Cleaner Function
Create a function that selectively removes unwanted tokens such as `TXN363507` or `#7300` from a transaction description. The function performs the following steps:
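- Converts the text to lowercase
- Removes predictable patterns (transaction IDs such as `TXN363507` and hash-prefixed numbers such as `#7300`)
- Removes any remaining digits and special characters
- Collapses repeated whitespace into single spaces and trims both ends
- Returns the cleaned text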

In [3]:
def clean_text(text) -> str:
    """Normalize a raw transaction description into plain lowercase words.

    Steps, in order:
      1. Lowercase the text.
      2. Drop transaction IDs of the form ``txn<digits>`` (e.g. ``TXN363507``).
      3. Delete every character that is not ``a-z`` or whitespace. This single
         pass also removes ``#<digits>`` tags, standalone numbers and
         punctuation, so no separate passes are needed for them.
      4. Collapse whitespace runs to one space and trim both ends.

    Characters are deleted, not replaced by a space, so ``"X-Ray"`` becomes
    ``"xray"`` and ``"Zaxby's"`` becomes ``"zaxbys"``.

    Parameters
    ----------
    text : str
        Raw description. Any non-string value (for example ``None`` or a
        float ``NaN`` standing in for a missing cell) is treated as missing.

    Returns
    -------
    str
        The cleaned description, or ``""`` when the input is missing or
        nothing survives the cleaning.
    """
    # Non-string values (None, float NaN) have no `.lower()`; return an empty
    # string instead of letting `.apply` fail with AttributeError.
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # Must run before the character filter: it removes the letters "txn",
    # which the filter below would otherwise keep.
    text = re.sub(r'txn\d+', '', text)

    # Keep only lowercase letters and whitespace.
    text = re.sub(r'[^a-z\s]', '', text)

    # Deletions above can leave doubled or edge spaces behind.
    return re.sub(r'\s+', ' ', text).strip()

In [4]:
# Sanity-check clean_text on a few representative descriptions

test_samples = [
    "Mobile Center TXN797664",
    "Hulu #5061 TXN363507",
    "PNC Bank - INDIA (Digital Wallet)",
    "Zaxby's #3457 (Digital Wallet)",
]

for raw in test_samples:
    cleaned = clean_text(raw)
    print("Original:", raw)
    # end="\n\n" leaves one blank line between samples
    print("Cleaned :", cleaned, end="\n\n")

Original: Mobile Center TXN797664
Cleaned : mobile center

Original: Hulu #5061 TXN363507
Cleaned : hulu

Original: PNC Bank - INDIA (Digital Wallet)
Cleaned : pnc bank india digital wallet

Original: Zaxby's #3457 (Digital Wallet)
Cleaned : zaxbys digital wallet



# Perform Cleaning

In [5]:
# Write the cleaned text to a new column so the raw description stays available for comparison
df['transaction_description_c'] = df['transaction_description'].map(clean_text)

In [6]:
# Preview the first 10 rows: raw description, cleaned description and category
df.head(10)[['transaction_description', 'transaction_description_c', 'category']]

Unnamed: 0,transaction_description,transaction_description_c,category
0,Mobile Center TXN797664,mobile center,Utilities & Services
1,Megabus Online,megabus online,Transportation
2,Mobile Hotspot Online - Weekday,mobile hotspot online weekday,Utilities & Services
3,PNC Bank - INDIA (Digital Wallet),pnc bank india digital wallet,Financial Services
4,Cinema - UK - Holiday,cinema uk holiday,Entertainment & Recreation
5,NYU Langone #9471 Hospital,nyu langone hospital,Healthcare & Medical
6,United,united,Transportation
7,Transfer,transfer,Financial Services
8,Hulu #5061 TXN363507,hulu,Entertainment & Recreation
9,X-Ray #8565,xray,Healthcare & Medical


# Verify Cleaning

In [7]:
# Count cleaned descriptions that still contain a digit
has_digit = df['transaction_description_c'].str.contains(r'\d')
has_digit.sum()

np.int64(0)

In [8]:
# Count cleaned descriptions that still contain anything other than a-z or whitespace
has_special = df['transaction_description_c'].str.contains(r'[^a-z\s]')
has_special.sum()

np.int64(0)

In [9]:
# Visually compare raw and cleaned descriptions on a random sample of rows.
# The fixed random_state makes the same 20 rows appear on every
# Restart & Run All, so the displayed output is reproducible.
df[['transaction_description', 'transaction_description_c']].sample(20, random_state=42)

Unnamed: 0,transaction_description,transaction_description_c
61976,Stanford Health,stanford health
106084,Consolidated Communications,consolidated communications
146319,KFC #2563,kfc
181600,Fair #7164 Online Shopping Center,fair online shopping center
46401,Transfer,transfer
45911,Cardiologist - UK,cardiologist uk
113582,Credit Union,credit union
79859,WWF Center TXN554957,wwf center
64171,Eye Care,eye care
130464,Chevron #8786 TXN782099,chevron


In [10]:
# Drop the raw description column; the cleaned column remains
df = df.drop(labels='transaction_description', axis='columns')

In [11]:
# Save the resulting frame as CSV, omitting the index column
df.to_csv(path_or_buf='datasets/Dataset_cleaned.csv', index=False)