## Data Cleaning for Wordcloud

### This script contains the following:

#### 1. Importing libraries and data
#### 2. Data cleaning for wordcloud

### 1. Importing libraries and data

In [5]:
import pandas as pd
import numpy as np
import os
from fuzzywuzzy import process
from rapidfuzz import process, fuzz
from collections import Counter
from multiprocessing import Pool

In [6]:
# Define path
# Root folder of the analysis project; the input CSV is located relative to it.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
path = r'C:\Users\16307\Desktop\Tasks - DA Immersion\Gun Violence Analysis'

In [7]:
# Load the cleaned dataset from the project's '02 Data' folder
data_file = os.path.join(path, '02 Data', 'gun_violence_cleaned4.csv')
df = pd.read_csv(data_file)

In [8]:
# Work on an independent single-column frame that holds only 'location_description'
location_df = df.loc[:, ['location_description']].copy()

In [None]:
# Replace missing values with '' first, then cast everything to string.
# Casting first would turn NaN into the literal text 'nan', which fillna can
# no longer detect and which would later be treated as a real description.
location_df['location_description'] = location_df['location_description'].fillna('').astype(str)

# Preprocessing: tag every row with a bucket key taken from the start of its name
def bucket_by_initials(df, column, bucket_size=3):
    """Add a 'bucket' column with the lower-cased leading characters of ``column``.

    ``bucket_size`` is the number of leading characters kept (3 by default).
    The frame is modified in place and is also returned.
    """
    prefixes = df[column].str.slice(stop=bucket_size)
    df['bucket'] = prefixes.str.lower()
    return df

# Function to find similar location descriptions in chunks
def find_similar_in_chunk(names_chunk, all_names, threshold=80):
    """Collect, for each name in a chunk, the candidate names that resemble it.

    Parameters:
        names_chunk: iterable of strings to look up.
        all_names: candidate strings every name is compared against.
        threshold: minimum ``fuzz.ratio`` score for a candidate to count as similar.

    Returns:
        A list with one entry per non-blank name in ``names_chunk``; each entry
        is the list of candidates scoring at least ``threshold``, in the order
        returned by ``process.extract``. Blank names are skipped, so the result
        can be shorter than the chunk.
    """
    similar_names_chunk = []
    for name in names_chunk:
        # Empty / whitespace-only descriptions carry no information to match on.
        if not name.strip():
            continue
        # score_cutoff lets the library discard weak candidates itself instead
        # of returning every candidate for filtering in Python afterwards.
        matches = process.extract(
            name,
            all_names,
            scorer=fuzz.ratio,
            limit=None,
            score_cutoff=threshold,
        )
        similar_names_chunk.append([match[0] for match in matches])
    return similar_names_chunk

# Split data into chunks and process in parallel
def parallel_find_similar_names(location_df, column, num_workers=4, threshold=80):
    """Find groups of similar names among the unique values of ``column``.

    Parameters:
        location_df: DataFrame containing ``column``.
        column: name of the column whose unique values are compared.
        num_workers: upper bound on the number of worker processes.
        threshold: minimum similarity score, forwarded to ``find_similar_in_chunk``.

    Returns:
        The concatenated results of ``find_similar_in_chunk`` over all unique
        names, in the order of ``location_df[column].unique()``.
    """
    all_names = location_df[column].unique()

    # Never start more workers than there are names: the surplus workers would
    # only receive empty chunks.
    worker_count = min(num_workers, len(all_names))

    # With zero or one name (or a single worker) there is nothing to run in
    # parallel, so skip the cost of starting a process pool.
    if worker_count <= 1:
        return find_similar_in_chunk(all_names, all_names, threshold)

    name_chunks = np.array_split(all_names, worker_count)
    with Pool(worker_count) as pool:
        results = pool.starmap(
            find_similar_in_chunk,
            [(chunk, all_names, threshold) for chunk in name_chunks],
        )

    # Flatten results from all workers (starmap keeps the chunk order)
    return [group for chunk_result in results for group in chunk_result]

# Standardize near-duplicate location descriptions, bucket by bucket.
# The __main__ guard matters because parallel_find_similar_names uses a
# multiprocessing Pool: where workers are spawned (e.g. on Windows) they
# re-import the main module, and unguarded code would run again in each worker.
if __name__ == '__main__':
    # Preprocess by bucketing based on first few characters
    location_df = bucket_by_initials(location_df, 'location_description', bucket_size=3)

    # Number of rows carrying each description. The similarity groups contain
    # every name only once, so the real frequencies must come from the data.
    name_counts = location_df['location_description'].value_counts().to_dict()

    # For each bucket (in order of first appearance), find similar names within it
    name_mapping = {}
    for _, bucket_df in location_df.groupby('bucket', sort=False):
        similar_names = parallel_find_similar_names(bucket_df, 'location_description', num_workers=4, threshold=80)

        # Map every name of a group to the group's most frequent spelling
        for group in similar_names:
            if not group:
                continue
            # On equal counts max() keeps the earliest entry of the group.
            most_common_name = max(group, key=name_counts.__getitem__)
            for name in group:
                # A name can belong to several groups; keep the most frequent
                # candidate so the result does not depend on processing order.
                current = name_mapping.get(name)
                if current is None or name_counts[most_common_name] > name_counts[current]:
                    name_mapping[name] = most_common_name

    # Replace the original names with standardized names in the DataFrame;
    # names without a mapping keep their original value
    location_df['standardized_location_description'] = (
        location_df['location_description']
        .map(name_mapping)
        .fillna(location_df['location_description'])
    )

    # Drop the bucket column and save the DataFrame with standardized names to a new CSV file
    location_df.drop('bucket', axis=1, inplace=True)
    location_df.to_csv('standardized_data.csv', index=False)

    # Display the results
    print(location_df[['location_description', 'standardized_location_description']])