In [9]:
import pandas as pd
import re

def process_string_column(df, column_name, new_column_name):
    # Copy the dataframe to avoid modifying the original
    df_processed = df.copy()

    # Step 1: Replace all punctuations with whitespace
    df_processed[new_column_name] = df_processed[column_name].str.replace(r'[^\w\s]', ' ')

    # Step 2: Replace 'the' with ' the ' (case-insensitive)
    df_processed[new_column_name] = df_processed[new_column_name].str.replace(r'\bthe\b', ' the ', flags=re.IGNORECASE)

    # Step 3: Replace words containing 'colleague' with 'coll' (case-insensitive)
    df_processed[new_column_name] = df_processed[new_column_name].str.replace(r'\b\w*colleague\w*\b', 'coll', flags=re.IGNORECASE)

    # Step 4: Replace words containing 'communit' with 'comm' (case-insensitive)
    df_processed[new_column_name] = df_processed[new_column_name].str.replace(r'\b\w*communit\w*\b', 'comm', flags=re.IGNORECASE)

    # Remove extra spaces between words
    df_processed[new_column_name] = df_processed[new_column_name].str.replace(r'\s+', ' ')

    return df_processed


In [12]:
# Example dataframe
data = {'text_column': ['Hello, the Colleagues    community!', 'This  is the test string.', ' this one is empty communities']}
df = pd.DataFrame(data)

# Process the string column
# df_processed = process_string_column(df, 'text_column')
df_processed = process_string_column(df, 'text_column', 'processed_column')


# Print the processed dataframe
print(df_processed)


                           text_column          processed_column
0  Hello, the Colleagues    community!      Hello the coll comm 
1            This  is the test string.  This is the test string 
2        this one is empty communities    this one is empty comm


  df_processed[new_column_name] = df_processed[column_name].str.replace(r'[^\w\s]', ' ')
  df_processed[new_column_name] = df_processed[new_column_name].str.replace(r'\bthe\b', ' the ', flags=re.IGNORECASE)
  df_processed[new_column_name] = df_processed[new_column_name].str.replace(r'\b\w*colleague\w*\b', 'coll', flags=re.IGNORECASE)
  df_processed[new_column_name] = df_processed[new_column_name].str.replace(r'\b\w*communit\w*\b', 'comm', flags=re.IGNORECASE)
  df_processed[new_column_name] = df_processed[new_column_name].str.replace(r'\s+', ' ')
