In [4]:
from pathlib import Path
import pandas as pd

# Raw CSV inputs are read from DATA; cleaned parquet files are written to OUT.
DATA = Path("data")
OUT = DATA / "clean"
OUT.mkdir(parents=True, exist_ok=True)  # idempotent, so the cell can be re-run

def normalize_orgnr(s: pd.Series) -> pd.Series:
    """Reduce raw organisation-number values to a bare 9-digit string.

    Each value is cast to text, stripped of every non-digit character and cut
    to its last nine digits. Values that do not end up with exactly nine
    digits (including missing values) come back as missing.

    A trailing ".0" is removed before the digits are extracted, so numbers that
    were parsed or exported as floats ("123456789.0") are not corrupted by the
    extra zero being counted as a digit.

    Args:
        s: Series of raw organisation numbers (text or numeric).

    Returns:
        Series aligned with ``s`` holding 9-digit strings, with a missing
        value wherever the input could not be normalised.
    """
    # Drop a float-style ".0" suffix first; otherwise "123456789.0" would
    # become the ten digits "1234567890" and the last-9 slice would silently
    # return the wrong number.
    text = s.astype(str).str.replace(r"\.0\s*$", "", regex=True)
    digits = text.str.replace(r"\D", "", regex=True).str[-9:]
    # Anything that is not exactly nine digits is treated as missing.
    return digits.where(digits.str.len() == 9)

# --- HubSpot ---
# HubSpot export header -> canonical column name. Only these columns are kept.
HUBSPOT_KEEP = {
    "Company name": "company_name",
    "Organisasjonsnummer": "orgnr",
    "Last Activity Date": "last_activity_date",
    "Record ID": "record_id",
}

# Read every column as pandas' nullable string dtype; typed conversions are
# applied explicitly below.
hs = pd.read_csv(DATA / "hubspot.csv", dtype="string")
hs = hs.rename(columns=HUBSPOT_KEEP)
# Keep the canonical columns that are actually present in this export.
hs = hs[[c for c in HUBSPOT_KEEP.values() if c in hs.columns]]
if "orgnr" in hs:
    hs["orgnr"] = normalize_orgnr(hs["orgnr"])
if "company_name" in hs:
    # The column is already "string" dtype, so strip it directly. Casting with
    # astype(str) first can turn missing names into the literal text "<NA>",
    # which would then be persisted as if it were a real company name.
    hs["company_name"] = hs["company_name"].str.strip()
if "last_activity_date" in hs:
    # Unparseable or missing dates become NaT instead of raising.
    hs["last_activity_date"] = pd.to_datetime(hs["last_activity_date"], errors="coerce")

hs.to_parquet(OUT / "hubspot_clean.parquet", index=False)

# --- Brønnøysund ---
# brreg.csv header -> canonical column name. Only these columns are loaded.
BRREG_KEEP = {
    "navn": "company_name",
    "organisasjonsnummer": "orgnr",
    "naeringskode1.kode": "nace",
    "antallAnsatte": "employees",
}

# Read only the header row so that columns missing from the file are skipped
# instead of making read_csv(usecols=...) fail.
header = pd.read_csv(DATA / "brreg.csv", nrows=0).columns.tolist()
usecols = [c for c in BRREG_KEEP.keys() if c in header]

# Stream the file in chunks; every kept column is read as nullable string.
chunks = pd.read_csv(
    DATA / "brreg.csv",
    usecols=usecols,
    dtype="string",
    chunksize=200_000,
    low_memory=True,
)

parts = []
for ch in chunks:
    ch = ch.rename(columns=BRREG_KEEP)
    if "orgnr" in ch:
        ch["orgnr"] = normalize_orgnr(ch["orgnr"])
    if "company_name" in ch:
        # Already "string" dtype: strip directly. Casting with astype(str)
        # first can turn missing names into the literal text "<NA>".
        ch["company_name"] = ch["company_name"].str.strip()
    parts.append(ch)

br = pd.concat(parts, ignore_index=True)
br.to_parquet(OUT / "brreg_clean.parquet", index=False)


In [None]:
from pathlib import Path
import math, gzip
import pandas as pd

# Raw CSV inputs are read from DATA; cleaned parquet files are written to OUT.
DATA = Path("data")
OUT = DATA / "clean"
OUT.mkdir(parents=True, exist_ok=True)  # idempotent, so the cell can be re-run

# --- shared helpers ---
def normalize_orgnr(s: pd.Series) -> pd.Series:
    """Reduce raw organisation-number values to a bare 9-digit string.

    Each value is cast to text, stripped of every non-digit character and cut
    to its last nine digits. Values that do not end up with exactly nine
    digits (including missing values) come back as missing.

    A trailing ".0" is removed before the digits are extracted, so numbers that
    were parsed or exported as floats ("123456789.0") are not corrupted by the
    extra zero being counted as a digit.

    Args:
        s: Series of raw organisation numbers (text or numeric).

    Returns:
        Series aligned with ``s`` holding 9-digit strings, with a missing
        value wherever the input could not be normalised.
    """
    # Drop a float-style ".0" suffix first; otherwise "123456789.0" would
    # become the ten digits "1234567890" and the last-9 slice would silently
    # return the wrong number.
    text = s.astype(str).str.replace(r"\.0\s*$", "", regex=True)
    digits = text.str.replace(r"\D", "", regex=True).str[-9:]
    # Anything that is not exactly nine digits is treated as missing.
    return digits.where(digits.str.len() == 9)

# Which columns from brreg.csv we keep (source header -> canonical name).
BRREG_KEEP = {
    "navn": "company_name",
    "organisasjonsnummer": "orgnr",
    "naeringskode1.kode": "nace",
    "antallAnsatte": "employees",
}

# Read only the header so that missing columns are tolerated.
hdr = pd.read_csv(DATA/"brreg.csv", nrows=0).columns.tolist()
usecols = [c for c in BRREG_KEEP.keys() if c in hdr]

# Stream the file in chunks and clean each one; every kept column is read as
# nullable string.
parts = []
for ch in pd.read_csv(
    DATA/"brreg.csv",
    usecols=usecols,
    dtype="string",
    chunksize=200_000,
    low_memory=True,
):
    ch = ch.rename(columns=BRREG_KEEP)
    if "orgnr" in ch:
        ch["orgnr"] = normalize_orgnr(ch["orgnr"])
    if "company_name" in ch:
        # Already "string" dtype: strip directly. Casting with astype(str)
        # first can turn missing names into the literal text "<NA>", which
        # would then take part in name-based dedup and sharding as if it were
        # a real company name.
        ch["company_name"] = ch["company_name"].str.strip()
    parts.append(ch)

df = pd.concat(parts, ignore_index=True)

# Dedup on orgnr; rows that lack an orgnr are deduped on company name instead.
# The first occurrence wins in both groups, and rows with an orgnr are placed
# ahead of rows without one in the result.
if "orgnr" in df:
    missing_orgnr = df["orgnr"].isna()
    by_orgnr = df[~missing_orgnr].drop_duplicates("orgnr", keep="first")
    by_name = df[missing_orgnr].drop_duplicates("company_name", keep="first")
    df = pd.concat([by_orgnr, by_name], ignore_index=True)

# -----------------------------
# OPTION A: Parquet sharding
# -----------------------------
# Number of shards. Increase until every file is < 100 MB.
SHARDS = 12
shard_dir = OUT / "brreg_parquet_shards"
shard_dir.mkdir(parents=True, exist_ok=True)

# Sharding key: orgnr when present, otherwise the company name.
key = df["orgnr"].fillna(df["company_name"]).astype(str)
# Use pandas' deterministic hash rather than the built-in hash(): str hashes
# are salted per Python process, so hash(x) % SHARDS would put the same row in
# a different shard on every kernel restart and rewrite every file each run.
bucket = pd.util.hash_pandas_object(key, index=False) % SHARDS

written = set()
for b in range(SHARDS):
    part = df[bucket == b]
    if len(part) == 0:
        continue
    outp = shard_dir / f"brreg_clean.part{b:02d}.parquet"
    part.to_parquet(outp, index=False)
    written.add(outp.name)

# Empty buckets are skipped above, so shards from an earlier run (for example
# with a larger SHARDS) would otherwise survive and be loaded as duplicate rows
# by readers that glob the directory. Clean up only after all writes succeeded.
for old in shard_dir.glob("brreg_clean.part*.parquet"):
    if old.name not in written:
        old.unlink()

# Example of reading the shards in the app:
# import glob, pandas as pd
# files = sorted(glob.glob("data/clean/brreg_parquet_shards/*.parquet"))
# df_br = pd.concat((pd.read_parquet(f) for f in files), ignore_index=True)

