### Similar Station Names (with Reservoir) script: threshold at 0.7; outputs similarity-matrix CSV files and the "Things with similar names" CSV

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Function to preprocess names
def preprocess_name(name):
    """Normalize a station name so spelling variants compare equal.

    The name is upper-cased, the abbreviations ``RES``/``RES.`` and
    ``RSVR``/``RSVR.`` are expanded to ``RESERVOIR``, parenthesised text and
    every character other than A-Z, 0-9 and whitespace is removed, and runs
    of whitespace are collapsed to a single space.

    Args:
        name: The raw name. Missing values (``None``/NaN) and non-string
            values such as numbers are accepted.

    Returns:
        The normalized name as a ``str``; ``''`` for a missing value.
    """
    # Empty CSV cells are read by pandas as NaN (a float), which has no
    # .upper(); treat them as an empty name instead of crashing the run.
    if not isinstance(name, str):
        if pd.isna(name):
            return ''
        name = str(name)
    name = name.upper()
    name = re.sub(r'\bRES\.?\b', 'RESERVOIR', name)
    name = re.sub(r'\bRSVR\.?\b', 'RESERVOIR', name)
    # Drop "(...)" annotations first, then any remaining punctuation.
    name = re.sub(r'\(.*?\)|[^A-Z0-9\s]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

# Two names share an identifier when their cosine similarity is strictly
# greater than this value.
similarity_threshold = 0.7  # Adjust this value as needed

# Load the CSV file
file_path = 'Things.csv'
data = pd.read_csv(file_path)

# Ensure required columns exist
required_columns = ['thing_name', 'thing_siteType', 'system_name']
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"The CSV file must contain a '{col}' column.")

# Apply preprocessing to all names
data['processed_name'] = data['thing_name'].apply(preprocess_name)

# Initialize an identifier column
data['identifier'] = None

# identifier -> generalized name shared by every row with that identifier
generalized_names_map = {}

# site type -> DataFrame of pairwise name similarities for that site type
similarity_matrices = {}

# Names are only compared within the same 'thing_siteType'.
# dropna=False keeps rows whose site type is missing; otherwise they would
# get no identifier and silently disappear from both output files.
current_id = 0
for site_type, group in data.groupby('thing_siteType', dropna=False):
    # Compute TF-IDF vectors (unigrams + bigrams) for the names in this group
    vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2))
    tfidf_matrix = vectorizer.fit_transform(group['processed_name'])

    # Compute cosine similarity matrix
    cosine_sim = cosine_similarity(tfidf_matrix)

    # Keep the matrix that the threshold is applied to. It has to be captured
    # here because 'processed_name' is overwritten with the generalized names
    # further down.
    similarity_matrix_df = pd.DataFrame(
        cosine_sim,
        index=group['thing_name'],
        columns=group['thing_name']
    )
    similarity_matrices[site_type] = similarity_matrix_df

    # Greedy clustering: each still-unassigned row starts a new identifier and
    # pulls in every later unassigned row that is similar enough to it.
    group_names = group['processed_name'].tolist()
    group_identifiers = [None] * len(group_names)
    group_index = group.index.tolist()  # Original row labels of this group
    for i, seed_name in enumerate(group_names):
        if group_identifiers[i] is not None:
            continue  # Already part of an earlier cluster
        current_id += 1
        group_identifiers[i] = current_id
        similar_names = [seed_name]
        for j in range(i + 1, len(group_names)):
            # Never take a row away from the cluster it already belongs to;
            # that would leave its name in the other cluster's generalized
            # name while the row itself carries a different identifier.
            if group_identifiers[j] is None and cosine_sim[i, j] > similarity_threshold:
                group_identifiers[j] = current_id
                similar_names.append(group_names[j])
        # De-duplicate while keeping first-seen order so the generalized name
        # is the same on every run (set iteration order is not stable).
        generalized_name = " ".join(dict.fromkeys(similar_names))
        generalized_names_map[current_id] = generalized_name

    # Update the identifiers in the original DataFrame
    data.loc[group_index, 'identifier'] = group_identifiers

# Replace processed_name with the generalized name based on identifier
data['processed_name'] = data['identifier'].map(generalized_names_map)

# Split data into two categories: identifiers shared by several rows versus
# identifiers used by exactly one row
identifier_counts = data['identifier'].value_counts()

similar_data = data[data['identifier'].isin(identifier_counts[identifier_counts > 1].index)]
unique_data = data[data['identifier'].isin(identifier_counts[identifier_counts == 1].index)]

# Sort the subsets
sort_columns = ['thing_name', 'identifier', 'system_name']
similar_data = similar_data.sort_values(by=sort_columns, ascending=True)
unique_data = unique_data.sort_values(by=sort_columns, ascending=True)

# Save the subsets to separate CSV files
similar_output_path = 'Things_with_similar_names.csv'
unique_output_path = 'Things_with_unique_names.csv'

similar_data.to_csv(similar_output_path, index=False)
unique_data.to_csv(unique_output_path, index=False)

print(f"Processed files saved: \n - {similar_output_path}\n - {unique_output_path}")

# Count rows in each subset
num_similar_names = len(similar_data)
num_unique_names = len(unique_data)

print("Number of similar names:", num_similar_names)
print("Number of unique names:", num_unique_names)

# Save one similarity matrix CSV per site type
for site_type, similarity_matrix_df in similarity_matrices.items():
    # A path separator inside the site type would point into a directory
    # that does not exist, so replace it in the file name only.
    safe_site_type = re.sub(r'[\\/]', '_', str(site_type))
    output_path = f"Similarity_Matrix_{safe_site_type}.csv"
    similarity_matrix_df.to_csv(output_path)
    print(f"Saved similarity matrix for site type '{site_type}' to {output_path}")

### Similar Station Names (without Reservoir) script: threshold at 0.7; outputs similarity-matrix CSV files and the "Things with similar names, no reservoir" CSV

In [None]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Function to preprocess names
def preprocess_name(name):
    """Normalize a station name so spelling variants compare equal.

    The name is upper-cased, the abbreviations ``RES``/``RES.`` and
    ``RSVR``/``RSVR.`` are expanded to ``RESERVOIR``, parenthesised text and
    every character other than A-Z, 0-9 and whitespace is removed, and runs
    of whitespace are collapsed to a single space.

    Args:
        name: The raw name. Missing values (``None``/NaN) and non-string
            values such as numbers are accepted.

    Returns:
        The normalized name as a ``str``; ``''`` for a missing value.
    """
    # Empty CSV cells are read by pandas as NaN (a float), which has no
    # .upper(); treat them as an empty name instead of crashing the run.
    if not isinstance(name, str):
        if pd.isna(name):
            return ''
        name = str(name)
    name = name.upper()
    name = re.sub(r'\bRES\.?\b', 'RESERVOIR', name)
    name = re.sub(r'\bRSVR\.?\b', 'RESERVOIR', name)
    # Drop "(...)" annotations first, then any remaining punctuation.
    name = re.sub(r'\(.*?\)|[^A-Z0-9\s]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

# Two names share an identifier when their cosine similarity is strictly
# greater than this value.
similarity_threshold = 0.7

# Load the CSV file
file_path = 'Things.csv'
data = pd.read_csv(file_path)

# Ensure required columns exist
required_columns = ['thing_name', 'thing_siteType', 'system_name']
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"The CSV file must contain a '{col}' column.")

# Apply preprocessing to all names
data['processed_name'] = data['thing_name'].apply(preprocess_name)

# Initialize an identifier column
data['identifier'] = None
generalized_names_map = {}  # identifier -> generalized name of its rows
similarity_matrices = {}    # site type -> pairwise similarity DataFrame
current_id = 0

# Grouping and similarity calculation; names are only compared within the
# same 'thing_siteType'. dropna=False keeps rows whose site type is missing;
# otherwise they would get no identifier and drop out of every output file.
for site_type, group in data.groupby('thing_siteType', dropna=False):
    vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2))
    tfidf_matrix = vectorizer.fit_transform(group['processed_name'])
    cosine_sim = cosine_similarity(tfidf_matrix)

    # Keep the matrix that the threshold is applied to. It has to be captured
    # here because 'processed_name' is overwritten with the generalized names
    # further down.
    similarity_matrix_df = pd.DataFrame(
        cosine_sim,
        index=group['thing_name'],
        columns=group['thing_name']
    )
    similarity_matrices[site_type] = similarity_matrix_df

    group_names = group['processed_name'].tolist()
    group_identifiers = [None] * len(group_names)
    group_index = group.index.tolist()

    # Greedy clustering: each still-unassigned row starts a new identifier and
    # pulls in every later unassigned row that is similar enough to it.
    for i, seed_name in enumerate(group_names):
        if group_identifiers[i] is not None:
            continue  # Already part of an earlier cluster
        current_id += 1
        group_identifiers[i] = current_id
        similar_names = [seed_name]
        for j in range(i + 1, len(group_names)):
            # Never take a row away from the cluster it already belongs to;
            # that would leave its name in the other cluster's generalized
            # name while the row itself carries a different identifier.
            if group_identifiers[j] is None and cosine_sim[i, j] > similarity_threshold:
                group_identifiers[j] = current_id
                similar_names.append(group_names[j])
        # De-duplicate while keeping first-seen order so the generalized name
        # is the same on every run (set iteration order is not stable).
        generalized_name = " ".join(dict.fromkeys(similar_names))
        generalized_names_map[current_id] = generalized_name

    data.loc[group_index, 'identifier'] = group_identifiers

# Replace processed_name with generalized name based on identifier
data['processed_name'] = data['identifier'].map(generalized_names_map)

# Split into similar (identifier shared by several rows) and unique groups
identifier_counts = data['identifier'].value_counts()
similar_data = data[data['identifier'].isin(identifier_counts[identifier_counts > 1].index)]
unique_data = data[data['identifier'].isin(identifier_counts[identifier_counts == 1].index)]

# Filter out names with 'Reservoir' in original thing_name (case-insensitive).
# na=False treats a missing thing_name as "does not contain", so the mask
# stays boolean and can be negated; regex=False matches the literal word.
has_reservoir = similar_data['thing_name'].str.upper().str.contains("RESERVOIR", na=False, regex=False)
similar_data_no_reservoir = similar_data[~has_reservoir]

# Sort
sort_columns = ['thing_name', 'identifier', 'system_name']
similar_data = similar_data.sort_values(by=sort_columns)
unique_data = unique_data.sort_values(by=sort_columns)
similar_data_no_reservoir = similar_data_no_reservoir.sort_values(by=sort_columns)

# Output file paths
similar_output_path = 'Things_with_similar_names.csv'
unique_output_path = 'Things_with_unique_names.csv'
no_reservoir_output_path = 'Things_with_similar_names_no_reservoir.csv'

# Save files
similar_data.to_csv(similar_output_path, index=False)
unique_data.to_csv(unique_output_path, index=False)
similar_data_no_reservoir.to_csv(no_reservoir_output_path, index=False)

print(f"Processed files saved:\n - {similar_output_path}\n - {unique_output_path}\n - {no_reservoir_output_path}")
print("Number of similar names:", len(similar_data))
print("Number of unique names:", len(unique_data))
print("Number of similar names without 'Reservoir':", len(similar_data_no_reservoir))

# Save similarity matrices per site_type
for site_type, similarity_matrix_df in similarity_matrices.items():
    # A path separator inside the site type would point into a directory
    # that does not exist, so replace it in the file name only.
    safe_site_type = re.sub(r'[\\/]', '_', str(site_type))
    output_path = f"Similarity_Matrix_{safe_site_type}.csv"
    similarity_matrix_df.to_csv(output_path)
    print(f"Saved similarity matrix for site type '{site_type}' to {output_path}")