# **NUS DATATHON 2026**

***2 Sons 2 Daughters***

In [None]:
!pip -q uninstall -y numpy
!pip -q install --no-cache-dir "numpy==2.0.2"



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m251.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.[0m[31m
[0m

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import MultipleLocator, AutoMinorLocator
import matplotlib.patches as patches
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px


# read data

In [None]:
from google.colab import files

# files.upload() takes an optional target *directory*. Passing the workbook name
# there created a folder called "champions_group_data.xlsx" and nested the
# uploaded file inside it, so the upload is left in the working directory instead.
uploaded = files.upload()
if not uploaded:
    # Cancelling the widget returns an empty dict; fail with a readable message
    # instead of a bare StopIteration from next().
    raise RuntimeError("No file was uploaded; re-run this cell and choose the Excel workbook.")
fname = next(iter(uploaded))   # gets the uploaded filename
df_raw = pd.read_excel(fname)
# Report the frame that was just loaded (`df` does not exist yet at this point).
print("Loaded:", fname, "shape:", df_raw.shape)



Saving champions_group_data.xlsx to champions_group_data.xlsx/champions_group_data (2).xlsx
Loaded: champions_group_data.xlsx/champions_group_data (2).xlsx shape: (8559, 72)


# Clean data


In [None]:
import re
import numpy as np
import pandas as pd

def _normalise_colname(s: object) -> str:
    """Return a snake_case version of a column header.

    The header is stripped and lower-cased, every run of characters outside
    ``a-z0-9`` becomes a single underscore, and leading/trailing underscores
    are removed (``"Revenue (USD)"`` -> ``"revenue_usd"``).

    Non-string headers (for example a year that Excel stored as a number) are
    converted with ``str()`` first instead of failing on ``.strip()``.
    """
    text = str(s).strip().lower()
    # One pass is enough: punctuation, whitespace and existing underscores are
    # all outside a-z0-9, so each run collapses to exactly one "_".
    return re.sub(r"[^a-z0-9]+", "_", text).strip("_")

def _clean_string_series(x: pd.Series) -> pd.Series:
    """Tidy a text column and turn placeholder values into missing values.

    The series is cast to pandas' ``string`` dtype, internal whitespace runs
    are collapsed to one space and the ends are trimmed. Values that are then
    exactly one of the placeholder tokens below become ``pd.NA``. Matching is
    case-sensitive: only these lower-case spellings are treated as missing.
    """
    placeholder_tokens = (
        "",
        "na", "n/a", "none", "null",
        "unknown", "not available", "not applicable",
    )
    tidy = x.astype("string")
    tidy = tidy.str.replace(r"\s+", " ", regex=True)
    tidy = tidy.str.strip()
    return tidy.replace(dict.fromkeys(placeholder_tokens, pd.NA))

def _to_float(x: pd.Series) -> pd.Series:
    """Convert a column to float, parsing number-like text where needed.

    Integer and float columns are simply cast. Anything else is cleaned with
    ``_clean_string_series`` and then stripped of currency symbols, commas and
    whitespace. Accounting-style ``(123)`` is rewritten to ``-123``. Values
    that do not look like a plain decimal number end up as NaN.
    """
    if x.dtype.kind in ("i", "f"):
        return x.astype("float")

    text = _clean_string_series(x)
    rewrites = (
        (r"[$£€,\s]", ""),          # drop currency symbols, thousands separators, spaces
        (r"^\((.+)\)$", r"-\1"),    # parentheses denote a negative amount
    )
    for pattern, replacement in rewrites:
        text = text.str.replace(pattern, replacement, regex=True)

    # Only optional minus, digits and an optional fractional part survive.
    looks_numeric = text.str.match(r"^-?\d+(\.\d+)?$", na=False)
    return text.where(looks_numeric, pd.NA).astype("float")

def _bucket_midpoint(val: object):
    """Turn a size bucket into a single representative number.

    Handles:
      '1 to 10', '11 - 50', '11 – 50'   -> midpoint of the range
      '1,001 to 5,000'                  -> thousands separators are ignored
      '100000+' or '100,000+'           -> the lower bound
      '25', '25.0'                      -> the number itself
      25, 25.0 (already numeric)        -> the number itself

    Returns a float, or NaN for missing values, placeholders, booleans,
    negative or non-finite numbers, and text that matches none of the forms.
    """
    if val is None or val is pd.NA:
        return np.nan
    if isinstance(val, (bool, np.bool_)):
        return np.nan

    # Cells that were already numeric used to be stringified ("5.0") and then
    # rejected by the integer-only patterns; pass them through directly.
    if isinstance(val, (int, float, np.integer, np.floating)):
        number = float(val)
        return number if np.isfinite(number) and number >= 0 else np.nan

    s = str(val).strip().lower()
    if s in {"", "na", "n/a", "none", "null", "unknown"}:
        return np.nan

    s = s.replace(",", "")
    s = re.sub(r"\s+", " ", s)

    number_pat = r"\d+(?:\.\d+)?"

    # single value or open-ended "N+" bucket
    m = re.match(rf"^({number_pat})\s*\+?$", s)
    if m:
        return float(m.group(1))

    # ranges written with 'to', a hyphen, or an en/em dash
    m = re.match(rf"^({number_pat})\s*(?:to|-|\u2013|\u2014)\s*({number_pat})$", s)
    if m:
        low = float(m.group(1))
        high = float(m.group(2))
        return (low + high) / 2.0

    return np.nan

def clean_company_data(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw company table; ``df_raw`` is not modified.

    Steps, in order:
      1. normalise column names to snake_case;
      2. tidy every text column (whitespace, placeholder values -> missing);
      3. force known identifier columns to string dtype;
      4. parse known numeric columns to float;
      5. convert device-count buckets ('1 to 10') to their midpoint;
      6. blank out implausible years and negative employees/revenue;
      7. add ``revenue_per_employee`` and ``it_spend_to_revenue``;
      8. drop columns that are at least 99% missing;
      9. de-duplicate on the first identifier column that exists.

    Raises:
        ValueError: if two headers normalise to the same column name.
    """
    df = df_raw.copy()

    # 1) normalise column names
    # str() keeps non-text headers (e.g. a year stored as a number) from crashing.
    df.columns = [_normalise_colname(str(c)) for c in df.columns]
    clashes = df.columns[df.columns.duplicated()].unique().tolist()
    if clashes:
        # Duplicate names make df[name] a DataFrame and break every step below.
        raise ValueError(f"Column names collide after normalisation: {clashes}")

    # 2) standardise obvious string columns (object or pandas string dtype)
    for c in df.columns:
        if pd.api.types.is_object_dtype(df[c]) or isinstance(df[c].dtype, pd.StringDtype):
            df[c] = _clean_string_series(df[c])

    # 3) ensure key IDs are strings if present
    for c in ["company_id", "id", "duns", "sic_code", "naics_code", "nace_code", "isic_code"]:
        if c in df.columns:
            df[c] = _clean_string_series(df[c]).astype("string")

    # 4) numeric conversions where expected
    numeric_candidates = [
        "employees_total",
        "revenue_usd",
        "year_found",
        "corporate_family_members",
        "it_budget",
        "it_spend",
        "latitude",
        "longitude",
    ]
    for c in numeric_candidates:
        if c in df.columns:
            df[c] = _to_float(df[c])

    # 5) bucket to midpoint conversion for device footprint columns
    bucket_cols = [
        "no_of_pc",
        "no_of_desktops",
        "no_of_laptops",
        "no_of_routers",
        "no_of_servers",
        "no_of_storage_devices",
    ]
    for c in bucket_cols:
        if c in df.columns:
            df[c] = df[c].map(_bucket_midpoint).astype("float")

    # 6) basic sanity fixes
    if "year_found" in df.columns:
        df.loc[(df["year_found"] < 1700) | (df["year_found"] > 2030), "year_found"] = np.nan

    if "employees_total" in df.columns:
        df.loc[df["employees_total"] < 0, "employees_total"] = np.nan

    if "revenue_usd" in df.columns:
        df.loc[df["revenue_usd"] < 0, "revenue_usd"] = np.nan

    # 7) derived features that help segmentation
    # Division by zero yields inf (or NaN for 0/0); both are stored as missing.
    if "revenue_usd" in df.columns and "employees_total" in df.columns:
        df["revenue_per_employee"] = df["revenue_usd"] / df["employees_total"]
        df.loc[~np.isfinite(df["revenue_per_employee"]), "revenue_per_employee"] = np.nan

    if "it_spend" in df.columns and "revenue_usd" in df.columns:
        df["it_spend_to_revenue"] = df["it_spend"] / df["revenue_usd"]
        df.loc[~np.isfinite(df["it_spend_to_revenue"]), "it_spend_to_revenue"] = np.nan

    # 8) drop columns that are basically empty (tune threshold if needed)
    missing_rate = df.isna().mean()
    mostly_empty = missing_rate[missing_rate >= 0.99].index.tolist()
    df = df.drop(columns=mostly_empty)

    # 9) de duplicate rows if an obvious unique key exists
    for key in ["company_id", "id", "duns", "company_name"]:
        if key in df.columns:
            # Missing keys compare equal to each other, so a plain
            # drop_duplicates would keep only one of all the rows without a
            # key. Only rows that actually carry a repeated key are removed.
            repeated = df[key].notna() & df.duplicated(subset=[key], keep="first")
            df = df.loc[~repeated]
            break

    return df


# Clean the uploaded table and report its shape before and after.
df_clean = clean_company_data(df_raw)
print(f"{df_raw.shape} -> {df_clean.shape}")
df_clean.head()


(8559, 72) -> (8559, 72)


Unnamed: 0,duns_number,company_sites,website,address_line_1,city,state,state_or_province_abbreviation,postal_code,country,phone_number,...,it_budget,it_spend,no_of_pc,no_of_desktops,no_of_laptops,no_of_routers,no_of_servers,no_of_storage_devices,revenue_per_employee,it_spend_to_revenue
0,639677726,"Zyf Lopsking Material Technology Co., Ltd. No....",,"No.2777 Taidong Road, Panyang Industrial Park,...",Suzhou,Jiangsu,JS,215000.0,CHINA,,...,0.0,0.0,5.5,5.5,5.5,5.5,,,0.0,
1,547756179,"Beijing Kaishi Lide Commerce And Trade Co., Ltd.",,"No.306, Building 8, Shuangyushudongli Jia, Hai...",Beijing,Beijing,BJ,100086.0,CHINA,1082129000.0,...,5601.0,3472.0,5.5,5.5,5.5,5.5,5.5,5.5,15558.3,0.011158
2,728834216,Keshan Shengren Potato Industry Processing Co....,,"South of Siduan Road, Xi Street, Keshan Town, ...",Qiqihar,Heilongjiang,HL,161000.0,CHINA,,...,86905.0,53881.0,30.5,5.5,5.5,5.5,5.5,5.5,27124.0,0.02232
3,728791839,Zuoquan County Yuanfeng Agriculture Technology...,,"Songaoliang Village, Liaoyang Town, Zuoquan Co...",Jinzhong,Shanxi,SX,30600.0,CHINA,,...,11630.0,7210.0,,,,,,,,0.023557
4,728889244,Zuoquan County Tianxin Real Estate Development...,,"No.14, Chengyadao Alley, Zuoquan County",Jinzhong,Shanxi,SX,30600.0,CHINA,3548653000.0,...,94564.0,58629.0,5.5,5.5,5.5,5.5,5.5,5.5,583728.0,0.050219


In [None]:
# === Step 1: Build the clustering feature table (X) ===
# Requires df_clean (the cleaned dataframe) from the cleaning step.

import numpy as np
import pandas as pd

df = df_clean.copy()

# --- Candidate columns per theme (v1) ---
industry_cols = ["sic_code", "sic_description", "8_digit_sic_code", "8_digit_sic_description"]
size_cols = ["employees_total", "revenue_usd", "revenue_per_employee"]
structure_cols = ["entity_type", "corporate_family_members", "is_headquarters", "is_domestic_ultimate"]
it_cols = [
    "it_budget", "it_spend",
    "no_of_pc", "no_of_desktops", "no_of_laptops",
    "no_of_routers", "no_of_servers", "no_of_storage_devices",
    "it_spend_to_revenue",
]
geo_cols = ["country", "region"]  # coarse on purpose, to limit the number of categories

# Industry: at most one code column and one description column, each taken
# from the 8-digit variant when it exists and from the plain SIC one otherwise.
industry_preferences = (
    ("8_digit_sic_code", "sic_code"),
    ("8_digit_sic_description", "sic_description"),
)
preferred_industry = []
for candidates in industry_preferences:
    chosen = next((name for name in candidates if name in df.columns), None)
    if chosen is not None:
        preferred_industry.append(chosen)

# Keep only the wanted columns that are actually present in the data.
wanted_cols = preferred_industry + size_cols + structure_cols + it_cols + geo_cols
core_cols = [col for col in wanted_cols if col in df.columns]

# --- Derived hierarchy flags ---
# Company names themselves are not used as features; each flag only records
# whether the corresponding name is present (False when the column is absent).
name_cols_for_flags = ["parent_company", "global_ultimate_company", "domestic_ultimate_company"]
flag_sources = {
    "has_parent_company": "parent_company",
    "has_global_ultimate": "global_ultimate_company",
    "has_domestic_ultimate_company": "domestic_ultimate_company",
}
for flag_name, source_col in flag_sources.items():
    if source_col in df.columns:
        df[source_col] = df[source_col].astype("string").str.strip()
        df[flag_name] = df[source_col].notna()
    else:
        df[flag_name] = False

derived_flag_cols = list(flag_sources)

# --- Build X ---
X = df[core_cols + derived_flag_cols].copy()

# --- Quick sanity check summary ---
print("X shape:", X.shape)
print("\nColumns used:")
print(X.columns.tolist())

missing_share = X.isna().mean()
missing_pct = (missing_share.sort_values(ascending=False) * 100).round(1)
print("\nTop 15 columns by missing %:")
print(missing_pct.head(15))

# Optional: view a few rows
X.head(5)


X shape: (8559, 23)

Columns used:
['8_digit_sic_code', '8_digit_sic_description', 'employees_total', 'revenue_usd', 'revenue_per_employee', 'entity_type', 'corporate_family_members', 'is_headquarters', 'is_domestic_ultimate', 'it_budget', 'it_spend', 'no_of_pc', 'no_of_desktops', 'no_of_laptops', 'no_of_routers', 'no_of_servers', 'no_of_storage_devices', 'it_spend_to_revenue', 'country', 'region', 'has_parent_company', 'has_global_ultimate', 'has_domestic_ultimate_company']

Top 15 columns by missing %:
is_headquarters            82.9
is_domestic_ultimate       82.3
8_digit_sic_code           62.0
8_digit_sic_description    62.0
no_of_storage_devices      40.1
no_of_servers              39.1
it_spend_to_revenue        38.7
no_of_routers              38.5
revenue_per_employee       36.6
no_of_laptops               7.5
no_of_desktops              4.0
no_of_pc                    3.3
entity_type                 0.0
revenue_usd                 0.0
employees_total             0.0
dtype: float64

Unnamed: 0,8_digit_sic_code,8_digit_sic_description,employees_total,revenue_usd,revenue_per_employee,entity_type,corporate_family_members,is_headquarters,is_domestic_ultimate,it_budget,...,no_of_laptops,no_of_routers,no_of_servers,no_of_storage_devices,it_spend_to_revenue,country,region,has_parent_company,has_global_ultimate,has_domestic_ultimate_company
0,50510000.0,Metals service centers and offices,1.0,0.0,0.0,Branch,24.0,0.0,0.0,0.0,...,5.5,5.5,,,,CHINA,Asia,True,True,True
1,,,20.0,311166.0,15558.3,Subsidiary,2.0,,,5601.0,...,5.5,5.5,5.5,5.5,0.011158,CHINA,Asia,True,True,True
2,20370000.0,Frozen fruits and vegetables,89.0,2414036.0,27124.0,Subsidiary,9.0,,,86905.0,...,5.5,5.5,5.5,5.5,0.02232,CHINA,Asia,True,True,True
3,,,0.0,306060.0,,Parent,3.0,,,11630.0,...,,,,,0.023557,CHINA,Asia,True,True,True
4,,,2.0,1167456.0,583728.0,Parent,2.0,,,94564.0,...,5.5,5.5,5.5,5.5,0.050219,CHINA,Asia,True,True,True


In [None]:
import numpy as np
import pandas as pd

X2 = X.copy()

# Derive the numeric/categorical split here. This cell runs before the cell
# that prints the two lists, so relying on that later cell raised a NameError
# on a top-to-bottom run. The rule is the same one that cell uses.
num_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

# Ensure numeric columns are truly numeric (float) and replace pd.NA with np.nan
for col in num_cols:
    # errors="coerce" turns anything that cannot be parsed as a number into NaN.
    X2[col] = pd.to_numeric(X2[col], errors='coerce')

# Ensure categorical columns are 'object' dtype and replace any pd.NA with np.nan
for col in cat_cols:
    # Convert pandas 'string' dtype to Python 'object' dtype if present
    if pd.api.types.is_string_dtype(X2[col]) or X2[col].dtype.name == "string":
        X2[col] = X2[col].astype("object")
    # Replace any pd.NA that might still exist in object columns with np.nan
    X2[col] = X2[col].replace({pd.NA: np.nan})

# Final global replacement so that no pd.NA remains anywhere in X2 before it
# is handed to the scikit-learn preprocessing step.
X2 = X2.replace({pd.NA: np.nan})

X2.head()

Unnamed: 0,8_digit_sic_code,8_digit_sic_description,employees_total,revenue_usd,revenue_per_employee,entity_type,corporate_family_members,is_headquarters,is_domestic_ultimate,it_budget,...,no_of_laptops,no_of_routers,no_of_servers,no_of_storage_devices,it_spend_to_revenue,country,region,has_parent_company,has_global_ultimate,has_domestic_ultimate_company
0,50510000.0,Metals service centers and offices,1.0,0.0,0.0,Branch,24.0,0.0,0.0,0.0,...,5.5,5.5,,,,CHINA,Asia,True,True,True
1,,,20.0,311166.0,15558.3,Subsidiary,2.0,,,5601.0,...,5.5,5.5,5.5,5.5,0.011158,CHINA,Asia,True,True,True
2,20370000.0,Frozen fruits and vegetables,89.0,2414036.0,27124.0,Subsidiary,9.0,,,86905.0,...,5.5,5.5,5.5,5.5,0.02232,CHINA,Asia,True,True,True
3,,,0.0,306060.0,,Parent,3.0,,,11630.0,...,,,,,0.023557,CHINA,Asia,True,True,True
4,,,2.0,1167456.0,583728.0,Parent,2.0,,,94564.0,...,5.5,5.5,5.5,5.5,0.050219,CHINA,Asia,True,True,True


In [None]:
# Split X's columns by dtype: numeric and boolean columns vs. everything else.
numeric_frame = X.select_dtypes(include=["number", "bool"])
num_cols = list(numeric_frame.columns)
cat_cols = [name for name in X.columns if name not in num_cols]

print("Numeric cols:", num_cols)
print("Categorical cols:", cat_cols)


Numeric cols: ['8_digit_sic_code', 'employees_total', 'revenue_usd', 'revenue_per_employee', 'corporate_family_members', 'is_headquarters', 'is_domestic_ultimate', 'it_budget', 'it_spend', 'no_of_pc', 'no_of_desktops', 'no_of_laptops', 'no_of_routers', 'no_of_servers', 'no_of_storage_devices', 'it_spend_to_revenue', 'has_parent_company', 'has_global_ultimate', 'has_domestic_ultimate_company']
Categorical cols: ['8_digit_sic_description', 'entity_type', 'country', 'region']


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pipe = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipe = Pipeline(steps=categorical_steps)

# Route each column list to its branch.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ]
)

X_mat = preprocess.fit_transform(X2)
print("Transformed shape:", X_mat.shape)

Transformed shape: (8559, 464)


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Fit KMeans for every k in 3..10 and record the silhouette score of each fit.
ks = range(3, 11)
scores = []

for k in ks:
    km = KMeans(n_clusters=k, random_state=42, n_init="auto")
    labels = km.fit_predict(X_mat)
    s = silhouette_score(X_mat, labels)
    print(f"k={k}  silhouette={s:.4f}")
    scores.append(s)

# Pick the k with the highest score; on a tie the smallest such k wins.
best_position = scores.index(max(scores))
best_k = ks[best_position]
print("Best k:", best_k)


k=3  silhouette=0.3478
k=4  silhouette=0.3471
k=5  silhouette=0.3473
k=6  silhouette=0.3461
k=7  silhouette=0.3430
k=8  silhouette=0.3712
k=9  silhouette=0.3713
k=10  silhouette=0.3684
Best k: 9


In [None]:
# Refit with the selected k and attach each row's cluster label to a copy of df.
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init="auto")
segments = kmeans.fit_predict(X_mat)

df_segmented = df.assign(segment=segments)

# Number of rows per segment, ordered by segment number.
segment_sizes = df_segmented["segment"].value_counts()
segment_sizes.sort_index()


Unnamed: 0_level_0,count
segment,Unnamed: 1_level_1
0,2253
1,5517
2,13
3,1
4,180
5,6
6,2
7,574
8,13


# New clustering

In [None]:
# === PLAN A: Rule-based segmentation (Industry + Size + Structure + IT + Geo) ===
# Assumes you already have df_clean

import numpy as np
import pandas as pd
import re

# Start from a fresh copy so the columns derived below never modify df_clean.
df = df_clean.copy()

# ---------- helpers ----------
def _digits_only(x):
    """Return the digits contained in *x* as a string, or NaN if there are none.

    Whole-number floats (``8011.0``) and their text form (``"8011.0"``) are
    treated as the integer they represent. Otherwise the ``.0`` that appears
    when a code column is read as float would add a spurious trailing zero.
    """
    if pd.isna(x):
        return np.nan
    if isinstance(x, (float, np.floating)) and float(x).is_integer():
        x = int(x)
    s = str(x).strip()
    s = re.sub(r"^(\d+)\.0+$", r"\1", s)  # "8011.0" -> "8011"
    s = re.sub(r"\D+", "", s)  # keep digits only
    return s if s != "" else np.nan

def sic_prefix(x, n=2, width=None):
    """Return the first *n* digits of an industry code as a string.

    Args:
        x: the code, as text or number; missing values give NaN.
        n: number of leading digits to keep.
        width: optional full length of the code. When given, the digits are
            left-padded with zeros to this length first, which restores
            leading zeros lost when codes were stored as numbers. With the
            default ``None`` no such padding is applied.

    Codes shorter than *n* digits are left-padded with zeros to *n*.
    """
    s = _digits_only(x)
    if pd.isna(s):
        return np.nan
    if width is not None:
        s = s.zfill(width)
    if len(s) >= n:
        return s[:n]
    return s.zfill(n)

def safe_qcut(series, q=4, labels=None):
    """Quantile binning that still works when many values are tied.

    Args:
        series: values to bin; anything non-numeric is coerced to NaN.
        q: number of quantile bins. Falls back to 3 (and the first three
            labels) when fewer than ``q * 5`` values are present.
        labels: one label per bin, lowest first, or None for interval bins.

    Returns:
        A categorical Series aligned to ``series.index``. Missing inputs stay
        missing. When tied values make neighbouring quantile edges identical,
        the affected bins are merged and the merged bin keeps the label of the
        lowest quantile it covers. The other bins keep their own labels, so
        the top label still marks the top quantile.
        An all-missing ``string`` Series is returned when no binning is
        possible: no data, a constant column, or a label count that does not
        match ``q``.
    """
    s = pd.to_numeric(series, errors="coerce")
    if s.notna().sum() < q * 5:
        # too few points -> fallback to 3 bins
        q = 3
        labels = labels[:3] if labels is not None else None

    unknown = pd.Series(pd.NA, index=series.index, dtype="string")

    if labels is None:
        try:
            return pd.qcut(s, q=q, labels=None, duplicates="drop")
        except (ValueError, IndexError, TypeError):
            return unknown

    labels = list(labels)
    if len(labels) != q:
        return unknown

    # pd.qcut(..., labels=labels, duplicates="drop") raises as soon as tied
    # edges are dropped, because the label count no longer matches the bins.
    # That made every tier fall back to "unknown". Compute the edges here so
    # the label list can be reduced together with them.
    raw_edges = s.quantile(np.linspace(0, 1, q + 1)).to_numpy(dtype="float")
    if np.isnan(raw_edges).any():
        return unknown
    edges, first_pos = np.unique(raw_edges, return_index=True)
    if len(edges) < 2:
        return unknown

    # Bin i starts at edges[i]; first_pos[i] is the lowest quantile bin that
    # starts at that edge, hence the label the (possibly merged) bin keeps.
    kept_labels = [labels[i] for i in first_pos[:-1]]
    return pd.cut(s, bins=edges, labels=kept_labels, include_lowest=True)

# ---------- 1) Industry bucket ----------
sic_col = "8_digit_sic_code" if "8_digit_sic_code" in df.columns else "sic_code"

# Codes stored as numbers lose their leading zeros (SIC major groups 01-09),
# so a 7-digit value would otherwise be bucketed by its first two digits.
# Rebuild fixed-width digit strings for everything that parses as a number and
# keep the original value for anything that does not.
sic_width = 8 if sic_col == "8_digit_sic_code" else 4  # assumes plain SIC codes are 4 digits — TODO confirm
sic_numeric = pd.to_numeric(df[sic_col], errors="coerce")
sic_padded = sic_numeric.round().astype("Int64").astype("string").str.zfill(sic_width)
sic_source = sic_padded.astype("object").where(sic_numeric.notna(), df[sic_col].astype("object"))
df["sic_2digit"] = sic_source.map(lambda x: sic_prefix(x, n=2))

# ---------- 2) Size tiers (employees + revenue) ----------
for c in ["employees_total", "revenue_usd", "it_spend", "it_budget"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# log1p keeps zero values defined (log1p(0) == 0).
df["log_employees"] = np.log1p(df["employees_total"])
df["log_revenue"] = np.log1p(df["revenue_usd"])

df["size_emp_tier"] = safe_qcut(
    df["log_employees"], q=4, labels=["emp_s", "emp_m", "emp_l", "emp_xl"]
).astype("string")

df["size_rev_tier"] = safe_qcut(
    df["log_revenue"], q=4, labels=["rev_s", "rev_m", "rev_l", "rev_xl"]
).astype("string")

# ---------- 3) Corporate structure tier ----------
# Make sure boolean-like columns behave: text spellings are mapped explicitly,
# anything unrecognised or missing counts as False.
text_to_bool = {
    "true": True, "false": False,
    "yes": True, "no": False,
    "y": True, "n": False,
    "t": True, "f": False,
    "1": True, "0": False,
}
for b in ["is_headquarters", "is_domestic_ultimate"]:
    if b in df.columns:
        if df[b].dtype.name in ["string", "object"]:
            df[b] = df[b].astype("string").str.strip().str.lower().map(text_to_bool)
        df[b] = df[b].fillna(False).astype(bool)
    else:
        df[b] = False

# derived presence flags: True where the related company name is present,
# False everywhere when the column does not exist
presence_flags = {
    "has_parent_company": "parent_company",
    "has_global_ultimate": "global_ultimate_company",
    "has_domestic_ultimate_company": "domestic_ultimate_company",
}
for flag_name, source_col in presence_flags.items():
    df[flag_name] = df[source_col].notna() if source_col in df.columns else False

def structure_tier(row):
    """Classify one company row into a corporate-structure tier.

    ``row`` is anything with a dict-style ``.get`` (a DataFrame row or a dict).
    The first matching rule wins:

      1. ``is_headquarters`` set                       -> "hq"
      2. ``is_domestic_ultimate`` set                  -> "domestic_ultimate"
      3. ``entity_type`` contains "subsidi"            -> "subsidiary"
      4. ``entity_type`` contains "branch"             -> "branch"
      5. ``entity_type`` contains "parent"             -> "parent"
      6. ``has_parent_company`` set                    -> "subsidiary_like"
      7. ``has_global_ultimate`` or
         ``has_domestic_ultimate_company`` set         -> "member_of_group"
      8. otherwise                                     -> "standalone_like"

    Missing flag values (None / NaN / pd.NA) count as not set.
    """
    def _is_set(name):
        # NaN is truthy and pd.NA raises on bool(); both mean "no information".
        value = row.get(name, False)
        return bool(value) if pd.notna(value) else False

    if _is_set("is_headquarters"):
        return "hq"
    if _is_set("is_domestic_ultimate"):
        return "domestic_ultimate"

    # entity_type is the explicit signal and is checked before the presence flags
    et = str(row.get("entity_type", "")).lower()
    if "subsidi" in et:
        return "subsidiary"
    if "branch" in et:
        return "branch"
    if "parent" in et:
        # An entity typed "Parent" used to fall through to "subsidiary_like"
        # whenever a parent-company name was filled in.
        return "parent"

    if _is_set("has_parent_company"):
        return "subsidiary_like"
    if _is_set("has_global_ultimate") or _is_set("has_domestic_ultimate_company"):
        return "member_of_group"
    return "standalone_like"

structure_labels = df.apply(structure_tier, axis=1)
df["structure_tier"] = structure_labels.astype("string")

# ---------- 4) IT footprint tiers (spend + device footprint) ----------
if "it_spend" not in df.columns:
    # No spend column: every row gets a missing tier.
    df["it_spend_tier"] = pd.Series(pd.NA, index=df.index, dtype="string")
else:
    df["log_it_spend"] = np.log1p(df["it_spend"])
    it_spend_bins = safe_qcut(
        df["log_it_spend"], q=4, labels=["it_low", "it_mid", "it_high", "it_top"]
    )
    df["it_spend_tier"] = it_spend_bins.astype("string")

possible_device_cols = [
    "no_of_pc", "no_of_desktops", "no_of_laptops", "no_of_routers", "no_of_servers", "no_of_storage_devices"
]
device_cols = [c for c in possible_device_cols if c in df.columns]

# min_count=1 leaves the total missing when a row has no device count at all.
df["device_total"] = df[device_cols].sum(axis=1, min_count=1)
df["log_device_total"] = np.log1p(df["device_total"])
device_bins = safe_qcut(
    df["log_device_total"], q=4, labels=["dev_low", "dev_mid", "dev_high", "dev_top"]
)
df["device_tier"] = device_bins.astype("string")

# ---------- 5) Geography tiers ----------
# Coarse on purpose: region when the column exists, otherwise country.
geo_source = "region" if "region" in df.columns else "country"
df["geo_tier"] = df[geo_source].astype("string")

# ---------- 6) Build final segment label + id ----------
seg_parts = ["sic_2digit", "size_emp_tier", "size_rev_tier", "structure_tier", "it_spend_tier", "device_tier", "geo_tier"]
for part in seg_parts:
    df[part] = df[part].fillna("Unknown").astype("string")

# The label is the pipe-joined concatenation of the seven parts.
df["segment_label"] = df[seg_parts].agg("|".join, axis=1)

# numeric id for convenience: 0 is the most frequent label, 1 the next, ...
seg_order = df["segment_label"].value_counts().index.tolist()
seg_map = dict(zip(seg_order, range(len(seg_order))))
df["segment_id"] = df["segment_label"].map(seg_map).astype(int)

# ---------- 7) Quick sanity outputs ----------
counts = df["segment_id"].value_counts()
small_segments = int((counts < 10).sum())
print("Rows:", df.shape[0])
print("Num segments:", counts.shape[0])
print("Top 10 segment sizes:\n", counts.head(10))
print("Segments with <10 companies:", small_segments)

df[["segment_id", "segment_label"]].head(10)


Rows: 8559
Num segments: 219
Top 10 segment sizes:
 segment_id
0    2941
1    1283
2    1085
3     336
4     214
5     208
6     136
7     124
8     117
9     116
Name: count, dtype: int64
Segments with <10 companies: 156


Unnamed: 0,segment_id,segment_label
0,10,50|Unknown|Unknown|branch|Unknown|Unknown|Asia
1,0,Unknown|Unknown|Unknown|subsidiary|Unknown|Unk...
2,53,20|Unknown|Unknown|subsidiary|Unknown|Unknown|...
3,2,Unknown|Unknown|Unknown|subsidiary_like|Unknow...
4,2,Unknown|Unknown|Unknown|subsidiary_like|Unknow...
5,2,Unknown|Unknown|Unknown|subsidiary_like|Unknow...
6,0,Unknown|Unknown|Unknown|subsidiary|Unknown|Unk...
7,3,59|Unknown|Unknown|branch|Unknown|Unknown|Asia
8,0,Unknown|Unknown|Unknown|subsidiary|Unknown|Unk...
9,4,73|Unknown|Unknown|branch|Unknown|Unknown|Asia


# Get Secret Key

In [None]:
from google.colab import userdata

# Keep the token in a variable. As a bare expression on the cell's last line,
# the value would be echoed into the notebook output (and saved with the
# notebook), and it would not be available to later cells.
HF_TOKEN = userdata.get('HF_TOKEN')

In [None]:
!pip install streamlit