# Import Packages

Before starting the cleaning process, import all required packages.

In [7]:
import pandas as pd
import re

# Import the Dataset

Import the analyzed dataset so that it can be processed.

In [8]:
# Load the raw transactions; the relative path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer="datasets/Dataset_small.csv")

# Create Cleaner Function
Create a function that removes unwanted tokens such as `TXN363507` or `#7300` from a transaction description. The function does the following:
- Converts the text to lowercase
- Removes predictable patterns (`TXN` followed by digits, `#` followed by digits)
- Removes any remaining digits and special characters
- Collapses repeated whitespace into single spaces
- Returns the cleaned text

In [9]:
def clean_text(text):
    """Normalize a transaction description to lowercase letters and single spaces.

    Steps, in order:
      1. Lowercase the text.
      2. Remove transaction references such as ``txn363507``.
      3. Remove numbered tags such as ``#7300``.
      4. Remove every remaining digit.
      5. Remove every character that is not ``a-z`` or whitespace. The
         character is deleted, not replaced by a space, so ``X-Ray`` becomes
         ``xray`` and ``BB&T`` becomes ``bbt``.
      6. Collapse whitespace runs into one space and trim both ends.

    Parameters
    ----------
    text : str
        Raw description. Any non-string value (for example ``None`` or the
        float NaN that pandas uses for a missing cell) is treated as having
        no description.

    Returns
    -------
    str
        The cleaned description; the empty string when nothing is left or
        when ``text`` is not a string.
    """
    # Missing cells reach this function as None / NaN, which have no .lower();
    # return an empty description instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    # Lowercase text
    text = text.lower()

    # Remove TXN patterns (must run before digit removal, otherwise a bare "txn" is left behind)
    text = re.sub(r'txn\d+', '', text)

    # Remove hashtag patterns
    text = re.sub(r'#\d+', '', text)

    # Remove all remaining digits (not only standalone numbers)
    text = re.sub(r'\d+', '', text)

    # Remove special characters (anything other than a-z and whitespace)
    text = re.sub(r'[^a-z\s]', '', text)

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [22]:
# Testing the function: print each sample next to its cleaned form and
# assert that the result matches the expected value.

test_samples = [
    "Mobile Center TXN797664",
    "Hulu #5061 TXN363507",
    "PNC Bank - INDIA (Digital Wallet)",
    "Zaxby's #3457 (Digital Wallet)"
]

# Expected result for each entry of test_samples, in the same order.
expected_cleaned = [
    "mobile center",
    "hulu",
    "pnc bank india digital wallet",
    "zaxbys digital wallet"
]

for t, expected in zip(test_samples, expected_cleaned):
    print("Original:", t)
    cleaned = clean_text(t)
    print("Cleaned :", cleaned)
    print()
    # Fail loudly if the cleaner regresses instead of relying on visual inspection.
    assert cleaned == expected, f"clean_text({t!r}) returned {cleaned!r}, expected {expected!r}"

Original: Mobile Center TXN797664
Cleaned : mobile center

Original: Hulu #5061 TXN363507
Cleaned : hulu

Original: PNC Bank - INDIA (Digital Wallet)
Cleaned : pnc bank india digital wallet

Original: Zaxby's #3457 (Digital Wallet)
Cleaned : zaxbys digital wallet



# Perform Cleaning

In [10]:
# Run the cleaner on every raw description and keep the result in a new column next to the original.
df['transaction_description_c'] = df['transaction_description'].map(clean_text)

In [12]:
# Show the first ten rows: raw description, cleaned description and category side by side.
(
    df[['transaction_description', 'transaction_description_c', 'category']]
    .head(10)
)

Unnamed: 0,transaction_description,transaction_description_c,category
0,Mobile Center TXN797664,mobile center,Utilities & Services
1,Megabus Online,megabus online,Transportation
2,Mobile Hotspot Online - Weekday,mobile hotspot online weekday,Utilities & Services
3,PNC Bank - INDIA (Digital Wallet),pnc bank india digital wallet,Financial Services
4,Cinema - UK - Holiday,cinema uk holiday,Entertainment & Recreation
5,NYU Langone #9471 Hospital,nyu langone hospital,Healthcare & Medical
6,United,united,Transportation
7,Transfer,transfer,Financial Services
8,Hulu #5061 TXN363507,hulu,Entertainment & Recreation
9,X-Ray #8565,xray,Healthcare & Medical


# Verify Cleaning

In [18]:
# Count cleaned descriptions that still contain a digit (expected: 0)
(
    df['transaction_description_c']
    .str.contains(r'\d')
    .sum()
)

np.int64(0)

In [19]:
# Count cleaned descriptions that still contain anything other than a-z or whitespace (expected: 0)
(
    df['transaction_description_c']
    .str.contains(r'[^a-z\s]')
    .sum()
)

np.int64(0)

In [20]:
# Visually compare raw and cleaned descriptions on a random sample.
# random_state fixes the draw so that re-running the notebook shows the same rows;
# min() avoids the ValueError sample() raises when asked for more rows than the frame has.
df[['transaction_description', 'transaction_description_c']].sample(min(20, len(df)), random_state=42)

Unnamed: 0,transaction_description,transaction_description_c
141460,Panera Bread,panera bread
129625,Satellite Internet,satellite internet
154050,Qdoba #7405,qdoba
127593,City Hall #2709 - CANADA Store Center (Bank Tr...,city hall canada store center bank transfer
108065,BB&T,bbt
21005,Turo #6697,turo
190761,Frontier Online TXN890821,frontier online
158706,Consolidated Communications Store,consolidated communications store
124796,Mayo Clinic Branch,mayo clinic branch
177427,Verizon #5414,verizon


In [23]:
# Drop the raw description now that the cleaned column replaces it.
# errors='ignore' keeps this cell safe to re-run: without it a second run raises
# KeyError because the column has already been removed from df.
df = df.drop(columns=['transaction_description'], errors='ignore')

In [24]:
# Write the cleaned frame to disk.
# NOTE(review): to_csv defaults to index=True, so the row index is written as an
# unnamed first column - confirm that downstream readers expect it.
df.to_csv(path_or_buf='datasets/Dataset_cleaned.csv')