### Purpose
Load all street names from the input CSV, remove duplicate names, categorise each street by its type, and drop entries that match no known type.

In [12]:
import re
from typing import Iterable, Optional

import pandas as pd

# Settings
# Raw CSV that pd.read_csv loads further down.
INPUT_PATH = (
    r"C:\Users\TheAiro\Desktop\Machine Learning Projects"
    r"\Visuem\data\kyiv_types.csv"
)

# Lower-case street-type keywords. The order of this list is the order in
# which the per-type counts are printed by the report below.
TARGET_TYPES = [
    "вулиця", "провулок", "алея", "узвіз", "площа",
    "проспект", "бульвар", "шосе", "тракт",
    # Both spellings are listed explicitly: extract_type compares whole words
    # for equality and applies no stemming, so other inflected forms are not
    # matched by these entries.
    "мост", "міст",
    "мікрорайон", "проїзд", "дорога", "набережна",
    "узвоз", "спуск", "переїзд", "доріжка", "узлісся",
    "пров",
]

# Number of rows of the "unknown/other tokens" report to print.
TOP_UNKNOWN = 10


def extract_type(
    name: Optional[str],
    target_types: Optional[Iterable[str]] = None,
) -> Optional[str]:
    """Return the first word of ``name`` that is exactly one of the target types.

    The name is lower-cased and split into words (runs of word characters,
    apostrophes and hyphens). Words are checked left to right and compared for
    equality, so a type keyword embedded in a longer word does not match.

    Args:
        name: Street name to classify. ``None``, non-string values (for
            example a float NaN) and empty strings all yield ``None``.
        target_types: Lower-case type keywords to look for. When omitted, the
            module-level ``TARGET_TYPES`` list is used.

    Returns:
        The matching lower-cased word, or ``None`` when no word matches.
    """
    # Non-strings (None, NaN, numbers) have no .lower(); treat them as "no type".
    if not isinstance(name, str) or not name:
        return None
    # Materialise a caller-supplied iterable so it can be probed once per word
    # even if it is a one-shot iterator.
    allowed = TARGET_TYPES if target_types is None else frozenset(target_types)
    for word in re.findall(r"[\wʼ''-]+", name.lower()):
        if word in allowed:
            return word
    return None


# Load the raw table. keep_default_na=False stops pandas from converting empty
# cells (or strings such as "NA") to NaN, so names are kept as written.
df = pd.read_csv(INPUT_PATH, sep=",", keep_default_na=False)
# Drop rows whose "name" exactly repeats an earlier row; the first one is kept.
df = df.drop_duplicates(subset=["name"], keep="first")

# Tag every row with the street type found in its name (None when none matches).
df["name_type"] = df["name"].astype(str).apply(extract_type)

# Unknown / other tokens.
# These must be collected BEFORE the unmatched rows are dropped: once the
# frame is filtered, every remaining name_type is a member of TARGET_TYPES and
# the "unknown" report would always be empty.
known_set = set(TARGET_TYPES)
unmatched_names = df.loc[df["name_type"].isna(), "name"].astype(str)
unknown_counts = (
    unmatched_names.str.lower()
    .str.findall(r"[\wʼ''-]+")  # same tokenisation as extract_type
    .explode()
    .dropna()  # a name that yields no tokens explodes to NaN
    .value_counts()
)
# Guard: keep the report limited to tokens that are not target types.
unknown_counts = unknown_counts[~unknown_counts.index.isin(known_set)]

# Delete instances that don't belong to any target type
df = df[df["name_type"].notna()]

# Distribution of the detected street types among the kept rows
overall_counts = df["name_type"].value_counts(dropna=True)

# Focus on specific target types (ordered by our list); absent types count as 0
target_counts = {
    t: int(overall_counts.get(t, 0)) for t in TARGET_TYPES
}

print("Total rows:", len(df))
print("\nTarget type counts:")
for t, c in target_counts.items():
    print(f"  {t}: {c}")

print("\nTop unknown/other tokens (any position):")
print(unknown_counts.head(TOP_UNKNOWN))

Total rows: 2639

Target type counts:
  вулиця: 1911
  провулок: 506
  алея: 17
  узвіз: 15
  площа: 57
  проспект: 34
  бульвар: 22
  шосе: 10
  тракт: 0
  мост: 0
  міст: 14
  мікрорайон: 0
  проїзд: 15
  дорога: 9
  набережна: 23
  узвоз: 0
  спуск: 4
  переїзд: 1
  доріжка: 1
  узлісся: 0
  пров: 0

Top unknown/other tokens (any position):
Series([], Name: count, dtype: int64)


In [14]:
# Write the filtered frame to a new CSV in the same folder as the input file.
output_path = (
    r"C:\Users\TheAiro\Desktop\Machine Learning Projects"
    r"\Visuem\data\kyiv_types_cleaned.csv"
)
# index=False keeps the pandas row index out of the written file.
df.to_csv(output_path, index=False, sep=",", encoding="utf-8")
print(
    f"Cleaned dataset saved to: {output_path}",
    f"Total rows saved: {len(df)}",
    sep="\n",
)

Cleaned dataset saved to: C:\Users\TheAiro\Desktop\Machine Learning Projects\Visuem\data\kyiv_types_cleaned.csv
Total rows saved: 2639
