# DESCRIPTION

In [None]:
import pandas as pd
import re
import numpy as np
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
from tqdm import tqdm

# Step 1: Define Replacement Dictionary
replacement_dict = {
    "streeet space": "street space",
    "street spacemta meters": "street space",
    "street sace": "street space",
    "street spacemta": "street space",
}

# Step 2: Regex Normalization
def apply_regex(text):
    if re.search(r"\bstreet\s*space\b", text):
        return "street space"
    elif re.search(r"\binspection\b", text):
        return "inspections"
    elif re.search(r"\breroofing\b", text):
        return "reroofing"
    elif re.search(r"\breroof\b", text):
        return "reroofing"
    elif re.search(r"\bsoft story retrofit\b", text):
        return "soft story retrofit"
    elif re.search(r"provide.*?(sprinkler|sprinklers).*?monitoring system.*?water flow.*?(valve monitoring|tamper monitoring)", text):
        return "provide sprinkler system monitoring"
    return text

# Step 3: Text Cleaning and Normalization
def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def normalize_text(text, replacement_dict):
    for old_term, new_term in replacement_dict.items():
        text = text.replace(old_term, new_term)
    return apply_regex(text)

# Step 4: Dimensionality Reduction with t-SNE
def compute_tsne_groups(texts, threshold=0.8, perplexity=30, random_state=42):
    print("Reducing Dimensionality with t-SNE...")
    
    # Create embeddings using character-level vectors
    vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(2, 3), stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(texts)
    
    # Convert sparse matrix to dense matrix
    dense_matrix = tfidf_matrix.toarray()
    
    # Standardize the features
    scaler = StandardScaler()
    standardized_matrix = scaler.fit_transform(dense_matrix)

    # Apply t-SNE for dimensionality reduction
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=random_state, init="random")
    tsne_embeddings = tsne.fit_transform(standardized_matrix)

    # Compute cosine similarity matrix
    sim_matrix = cosine_similarity(tsne_embeddings)
    groups = defaultdict(list)

    print("Creating Groups Using Cosine Similarity...")
    for i in tqdm(range(len(sim_matrix)), desc="Assigning Groups"):
        similar_indices = np.where(sim_matrix[i] > threshold)[0]
        for j in similar_indices:
            if i != j:  # Avoid self-matching
                groups[i].append(j)
    return groups

# Step 5: Assign Groups to DataFrame
def assign_groups(df, groups):
    group_labels = {}
    group_id = 1

    print("Assigning Groups to DataFrame...")
    for leader, members in tqdm(groups.items(), desc="Processing Groups"):
        for member in members:
            group_labels[member] = f"Group_{group_id}"
        group_labels[leader] = f"Group_{group_id}"
        group_id += 1

    df["group_label"] = df.index.map(group_labels).fillna("No Group")
    return df

# Step 6: Main Pipeline
def main_pipeline(file_path, column, threshold=0.8, batch_size=5000, shuffle=True):
    print("Loading Dataset...")
    df = pd.read_csv(file_path)

    # Shuffle Dataset
    if shuffle:
        print("Shuffling Dataset...")
        df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    print("Cleaning and Normalizing Text...")
    df["cleaned_description"] = df[column].apply(clean_text)
    df["normalized_description"] = df["cleaned_description"].apply(lambda x: normalize_text(x, replacement_dict))

    print("Deduplicating Data...")
    unique_texts = df["normalized_description"].drop_duplicates().reset_index(drop=True)

    # Process in batches to save memory
    num_batches = (len(unique_texts) // batch_size) + 1
    combined_groups = defaultdict(list)

    print(f"Processing in {num_batches} batches...")
    for batch_id in tqdm(range(num_batches), desc="Processing Batches"):
        start_idx = batch_id * batch_size
        end_idx = min((batch_id + 1) * batch_size, len(unique_texts))
        batch_texts = unique_texts[start_idx:end_idx]

        batch_groups = compute_tsne_groups(batch_texts.tolist(), threshold=threshold)
        for key, values in batch_groups.items():
            combined_groups[start_idx + key] = [start_idx + v for v in values]

    print("Assigning Groups to Dataset...")
    grouped_df = assign_groups(df, combined_groups)

    print("Saving Results...")
    output_file = f"grouped_descriptions_tsne_{threshold}.csv"
    grouped_df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")
    return grouped_df

# Run the Pipeline
if __name__ == "__main__":
    file_path = "/Users/satvikbisht/Documents/Polimi/Semester 3/Data Quality /Project/diq/data/raw/building_permits.csv"
    threshold = 0.8
    batch_size = 10000
    df = main_pipeline(file_path, column="Description", threshold=threshold, batch_size=batch_size)
    print("Processing Complete!")

In [None]:
# Display unique group labels
unique_groups = df["group_label"].unique()
print("Unique Group Labels:")
print(unique_groups)

In [None]:
# Filter rows belonging to Group_1
group_1_df = df[df["group_label"] == "Group_121257"]

# Display descriptions in Group_1
print("Descriptions in Group_1:")
print(group_1_df["Description"].tolist())

# EXISTING USE

In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
from tqdm import tqdm

# Step 1: Define Replacement Dictionary
replacement_dict = {
    "streeet space": "street space",
    "street spacemta meters": "street space",
    "street sace": "street space",
    "street spacemta": "street space",
}

# Step 2: Regex Normalization
def apply_regex(text):
    if re.search(r"\bstreet\s*space\b", text):
        return "street space"
    elif re.search(r"\binspection\b", text):
        return "inspections"
    elif re.search(r"\breroofing\b", text):
        return "reroofing"
    elif re.search(r"\breroof\b", text):
        return "reroofing"
    elif re.search(r"\bsoft story retrofit\b", text):
        return "soft story retrofit"
    elif re.search(r"provide.*?(sprinkler|sprinklers).*?monitoring system.*?water flow.*?(valve monitoring|tamper monitoring)", text):
        return "provide sprinkler system monitoring"
    return text

# Step 3: Text Cleaning and Normalization
def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def normalize_text(text, replacement_dict):
    for old_term, new_term in replacement_dict.items():
        text = text.replace(old_term, new_term)
    return apply_regex(text)

# Step 4: Dimensionality Reduction with t-SNE
def compute_tsne_groups(texts, threshold=0.8, perplexity=30, random_state=42):
    print("Reducing Dimensionality with t-SNE...")
    
    # Create embeddings using character-level vectors
    vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(2, 3), stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(texts)
    
    # Convert sparse matrix to dense matrix
    dense_matrix = tfidf_matrix.toarray()
    
    # Standardize the features
    scaler = StandardScaler()
    standardized_matrix = scaler.fit_transform(dense_matrix)

    # Apply t-SNE for dimensionality reduction
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=random_state, init="random")
    tsne_embeddings = tsne.fit_transform(standardized_matrix)

    # Compute cosine similarity matrix
    sim_matrix = cosine_similarity(tsne_embeddings)
    groups = defaultdict(list)

    print("Creating Groups Using Cosine Similarity...")
    for i in tqdm(range(len(sim_matrix)), desc="Assigning Groups"):
        similar_indices = np.where(sim_matrix[i] > threshold)[0]
        for j in similar_indices:
            if i != j:  # Avoid self-matching
                groups[i].append(j)
    return groups

# Step 5: Assign Groups to DataFrame
def assign_groups(df, groups):
    group_labels = {}
    group_id = 1

    print("Assigning Groups to DataFrame...")
    for leader, members in tqdm(groups.items(), desc="Processing Groups"):
        for member in members:
            group_labels[member] = f"Group_{group_id}"
        group_labels[leader] = f"Group_{group_id}"
        group_id += 1

    df["group_label"] = df.index.map(group_labels).fillna("No Group")
    return df

# Step 6: Main Pipeline
def main_pipeline(file_path, column, threshold=0.8, batch_size=5000, shuffle=True):
    print("Loading Dataset...")
    df = pd.read_csv(file_path)

    # Shuffle Dataset
    if shuffle:
        print("Shuffling Dataset...")
        df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    print("Cleaning and Normalizing Text...")
    df["cleaned_Existing_Use"] = df[column].apply(clean_text)
    df["normalized__Existing_Use"] = df["cleaned_Existing_Use"].apply(lambda x: normalize_text(x, replacement_dict))

    print("Deduplicating Data...")
    unique_texts = df["normalized__Existing_Use"].drop_duplicates().reset_index(drop=True)

    # Process in batches to save memory
    num_batches = (len(unique_texts) // batch_size) + 1
    combined_groups = defaultdict(list)

    print(f"Processing in {num_batches} batches...")
    for batch_id in tqdm(range(num_batches), desc="Processing Batches"):
        start_idx = batch_id * batch_size
        end_idx = min((batch_id + 1) * batch_size, len(unique_texts))
        batch_texts = unique_texts[start_idx:end_idx]

        batch_groups = compute_tsne_groups(batch_texts.tolist(), threshold=threshold)
        for key, values in batch_groups.items():
            combined_groups[start_idx + key] = [start_idx + v for v in values]

    print("Assigning Groups to Dataset...")
    grouped_df = assign_groups(df, combined_groups)

    print("Saving Results...")
    output_file = f"grouped_Existing_Use_tsne_{threshold}.csv"
    grouped_df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")
    return grouped_df

# Run the Pipeline
if __name__ == "__main__":
    file_path = "/Users/satvikbisht/Documents/Polimi/Semester 3/Data Quality /Project/diq/data/raw/building_permits.csv"
    threshold = 0.8
    batch_size = 10000
    df = main_pipeline(file_path, column="Existing Use", threshold=threshold, batch_size=batch_size)
    print("Processing Complete!")

Loading Dataset...


  df = pd.read_csv(file_path)


Shuffling Dataset...
Cleaning and Normalizing Text...
Deduplicating Data...
Processing in 1 batches...


Processing Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Reducing Dimensionality with t-SNE...




Creating Groups Using Cosine Similarity...


Assigning Groups: 100%|██████████| 93/93 [00:00<00:00, 264275.25it/s]
Processing Batches: 100%|██████████| 1/1 [00:00<00:00,  2.18it/s]


Assigning Groups to Dataset...
Assigning Groups to DataFrame...


Processing Groups: 100%|██████████| 93/93 [00:00<00:00, 397625.15it/s]

Saving Results...





Results saved to grouped_Existing_Use_tsne_0.8.csv
Processing Complete!


In [2]:
# Display unique group labels
unique_groups = df["group_label"].unique()
print("Unique Group Labels:")
print(unique_groups)

Unique Group Labels:
['Group_87' 'Group_92' 'Group_79' 'Group_93' 'Group_89' 'Group_82'
 'Group_91' 'No Group']


In [9]:
# Filter rows belonging to Group_1
group_1_df = df[df["group_label"] == "Group_93"]

# Display descriptions in Group_1
print("Descriptions in Group_1:")
print(group_1_df["Description"].tolist())

Descriptions in Group_1:
['1)interior t.i. 1st & 2nd floor. demo old restrooms at both flrs & add new restrooms, shower, & kchn at 1st flr. 2)new store front system with new entry door on 7th street. 3)nw swing dr on gilbert st. 4)voluntary seismic upgrade. elect. mech. under seperate permit.n/a for the maher.', 'kitchen cabinet, install 4 cabinets and counter top, hook up stove, same location, install 2 counter plugs and hood. no structural work.', 'street space permit  - 201705126445', 'street space permit  (renewal of permit # 1319329)', 'new natural gas powered emergency generator  at lower level.', 'install a fire alarm system. ref pa 2015-0410-3363', 'like for like replacement of one chiller. bms upgrade.', 'replace 11 windows on back and sides of house.  no structural changes.  max u-factor .40.  2 windows on right side of house to be anderson paintable fibrex, all others vinyl. all at existing openings at upper level.', 'revision to bpa no. 201404183624  replacing 34 windows. 1