Uses Sentence-BERT embeddings for both `Description` and `Permit_Content`

Fuses them with the numeric features and keyword flags

Trains a logistic regression (easy to swap for LightGBM/XGBoost)

Runs cross-validation, pseudo-labels high-confidence unlabeled rows, and exports the results

## Inputs and Helper

In [1]:
# Numeric / tabular stack and filesystem helpers used by the cells below.
import numpy as np
import pandas as pd
from pathlib import Path
import os
# Looks up the current working directory; the return value is not stored.
# NOTE(review): presumably meant as a quick check of where relative paths resolve — confirm.
os.getcwd()
from IPython.core.interactiveshell import InteractiveShell
# IPython display setting: "all" makes every top-level bare expression in a cell
# produce output (the default shows only the last one).
InteractiveShell.ast_node_interactivity = "all"


In [2]:

# --- Configuration -----------------------------------------------------------
# Processed permit table, relative to the notebook's working directory.
DATA_PATH = Path('02ProcessedData/Heat_Pump_Classification1104.csv')

# REQUIRED columns (adjust names if different)
TEXT_COL_SHORT   = 'Description'
TEXT_COL_LONG    = 'Permit_Content'
TARGET_COL       = 'coverage_classification_final'  # your label
NUM_COLS = [
    'Stories', 'n_unit_ratio', 'compressors_per_units', 'tonnage_per_units',
    'tonnage_cover_ratio'  # add/remove as appropriate
]

# Optional domain keyword flags (add/modify as needed).
# Each value is a regex; the flag name becomes a 0/1 feature column.
KEYWORDS = {
    'has_hp'     : r'\bheat\s*pump\b|\bhp\b',
    'has_vrf'    : r'\bvrf\b|\bvrv\b|\bcity\s*multi\b|\brxyq\b',
    'has_od'     : r'\bodu\b|\boutdoor\s+unit\b',
    'has_id'     : r'\bidu\b|\bindoor\s+unit\b|\bair\s*handler\b|\bahu\b|\battic\b',
    'has_refer'  : r'\brefer\b|\brefrigerat',
    'has_cond'   : r'\bcondens(?:er|ing)\b',
    'has_chiller': r'\bchiller\b',
}

# --- Load --------------------------------------------------------------------
df = pd.read_csv(DATA_PATH)  # <- load your dataframe

# Fail fast: without this check a misspelled column name only surfaces as a
# KeyError in a later cell, after the slow text-embedding step has already run.
_required_cols = [TEXT_COL_SHORT, TEXT_COL_LONG, TARGET_COL, *NUM_COLS]
_missing_cols = [col for col in _required_cols if col not in df.columns]
if _missing_cols:
    raise KeyError(f"Columns missing from {DATA_PATH}: {_missing_cols}")

df.head()

Unnamed: 0,Permit Number,Permit_Content,addr_clean_x,BuildingName,StarRating,Submarket,City,ZIPCode,Stories,YearBuilt,...,compressors_per_units,tonnage_per_units,compressors_final,compressors_final_cumsum,coverage_classification_tonnage,HP_related,common_area_only,Number_final_cumsum,coverage_classification_units,coverage_classification_final
0,6691699-RF,: 6691699-RF : Refrigeration Permit Record Sta...,100 denny way,Alexan 100,4,Lower Queen Anne,Seattle,98109,7.0,2019.0,...,0.006098,0.0,1,1,Partial Coverage,1,,90.0,Partial Coverage,Partial Coverage
1,6908623-RF,: 6908623-RF : Refrigeration Permit Record Sta...,100 denny way,Alexan 100,4,Lower Queen Anne,Seattle,98109,7.0,2019.0,...,0.109756,0.204268,17,18,Common Areas Only,1,,,Common Areas Only,Common Areas Only
2,7052479-RF,: 7052479-RF : Refrigeration Permit Record Sta...,100 denny way,Alexan 100,4,Lower Queen Anne,Seattle,98109,7.0,2019.0,...,0.115854,0.216463,1,19,Common Areas Only,1,,91.0,Uncertain,Common Areas Only
3,6172962-RF,: 6172962-RF : Refrigeration Permit Record Sta...,100 taylor ave n,Taylor 28,4,South Lake Union,Seattle,98109,6.0,2008.0,...,0.015228,0.0,3,3,Partial Coverage,1,,3.0,Uncertain,Partial Coverage
4,6515780-RF,: 6515780-RF : Refrigeration Permit Record Sta...,1000 8th ave,The Mill at First Hill,3,First Hill,Seattle,98104,14.0,1949.0,...,0.005587,0.0,2,2,Partial Coverage,1,,1.0,Common Areas Only,Partial Coverage


## Load and Split Labeled/unlabeled


In [3]:
# Both free-text columns must be present in the loaded frame.
assert {TEXT_COL_SHORT, TEXT_COL_LONG}.issubset(df.columns)

# Store the target as a pandas categorical.
df[TARGET_COL] = df[TARGET_COL].astype('category')

# A row counts as labeled only when its final coverage class is one of the three
# concrete classes below; every other value (e.g. 'Uncertain') is treated as unlabeled.
is_labeled = df[TARGET_COL].isin(['Whole Building', 'Common Areas Only', 'Partial Coverage'])
df_labeled, df_unlabeled = df.loc[is_labeled].copy(), df.loc[~is_labeled].copy()
print(df_labeled.shape, df_unlabeled.shape)


(751, 102) (170, 102)


## Text Embedding (two fields --> dual embedding)

In [4]:
# import torch, sentence_transformers as st
# print(torch.__version__, torch.cuda.is_available())
# model = st.SentenceTransformer("all-MiniLM-L6-v2")


In [5]:
import re
from sentence_transformers import SentenceTransformer

# General-purpose Sentence-BERT encoder. all-mpnet-base-v2 emits 768-dim vectors;
# the smaller/faster all-MiniLM-L6-v2 alternative emits 384-dim vectors.
sbert = SentenceTransformer('all-mpnet-base-v2') # all-MiniLM-L6-v2

def embed_text(series):
    """Encode a pandas Series of text into L2-normalised sentence embeddings.

    Missing values are replaced by '' and every entry is cast to ``str`` before
    encoding.

    Returns
    -------
    numpy.ndarray
        One row per entry of ``series``. An empty ``series`` yields a
        ``(0, dim)`` array, where ``dim`` is the model's embedding size, without
        calling the encoder, so the result can always be ``hstack``-ed.
    """
    texts = series.fillna('').astype(str).tolist()
    if not texts:
        # Ask the model for its width instead of hard-coding it: the width
        # differs between models (768 for mpnet, 384 for MiniLM).
        return np.empty((0, sbert.get_sentence_embedding_dimension()))
    return sbert.encode(texts, normalize_embeddings=True)

# Labeled rows: embed both text fields and place them side by side.
desc_emb_l = embed_text(df_labeled[TEXT_COL_SHORT])
cont_emb_l = embed_text(df_labeled[TEXT_COL_LONG])
X_text_l   = np.hstack([desc_emb_l, cont_emb_l])

# Unlabeled rows: same treatment. embed_text handles the empty case itself, so
# the column count always matches X_text_l.
desc_emb_u = embed_text(df_unlabeled[TEXT_COL_SHORT])
cont_emb_u = embed_text(df_unlabeled[TEXT_COL_LONG])
X_text_u   = np.hstack([desc_emb_u, cont_emb_u])


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


## Numeric and keyword features


In [6]:
import re
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric features: median-impute, then standardise. Both transformers are
# fitted on the labeled rows only and re-applied unchanged to the unlabeled rows.
num_imputer = SimpleImputer(strategy='median')
num_scaler  = StandardScaler()

X_num_l = num_scaler.fit_transform(num_imputer.fit_transform(df_labeled[NUM_COLS]))

if len(df_unlabeled):
    X_num_u = num_scaler.transform(num_imputer.transform(df_unlabeled[NUM_COLS]))
else:
    # No unlabeled rows: zero-row placeholder with one column per numeric feature.
    X_num_u = np.empty((0, len(NUM_COLS)))

# Keyword features on concatenated text for robustness
def make_keyword_frame(df):
    """Build 0/1 keyword-flag columns from the two text fields of ``df``.

    The short and long text columns (module-level ``TEXT_COL_SHORT`` and
    ``TEXT_COL_LONG``) are joined with a space and lower-cased, then each regex
    in the module-level ``KEYWORDS`` mapping is searched in the combined text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both text columns.

    Returns
    -------
    pandas.DataFrame
        One integer column per ``KEYWORDS`` entry (1 = pattern found), indexed
        like ``df``.
    """
    # Cast to str after filling NaN, as embed_text does, so that non-string
    # cells (e.g. a purely numeric description) do not break the concatenation.
    short_text = df[TEXT_COL_SHORT].fillna('').astype(str)
    long_text  = df[TEXT_COL_LONG].fillna('').astype(str)
    text = (short_text + ' ' + long_text).str.lower()

    flags = {
        name: text.str.contains(pattern, regex=True).astype(int)
        for name, pattern in KEYWORDS.items()
    }
    return pd.DataFrame(flags, index=df.index)

# Keyword-flag matrices as plain arrays, one row per permit.
X_kw_l = make_keyword_frame(df_labeled).to_numpy()
if len(df_unlabeled):
    X_kw_u = make_keyword_frame(df_unlabeled).to_numpy()
else:
    # No unlabeled rows: zero-row placeholder with one column per keyword flag.
    X_kw_u = np.empty((0, len(KEYWORDS)))


## Final feature matrix & training labels


In [7]:
# Labeled design matrix, column-wise: [text embeddings | scaled numerics | keyword flags].
X_l = np.concatenate([X_text_l, X_num_l, X_kw_l], axis=1)
y_l = df_labeled[TARGET_COL].values

# Same column layout for the unlabeled rows; fall back to a zero-row matrix of
# matching width when there are none.
if len(df_unlabeled):
    X_u = np.concatenate([X_text_u, X_num_u, X_kw_u], axis=1)
else:
    X_u = np.empty((0, X_l.shape[1]))


## Baseline model + CV (can swap for LightGBM)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Baseline classifier; class_weight='balanced' re-weights classes by inverse frequency.
clf = LogisticRegression(max_iter=200, class_weight='balanced', n_jobs=None)

# 5-fold stratified CV with a fixed shuffle seed, scored by macro-averaged F1.
# NOTE(review): X_l was imputed/scaled on all labeled rows before this split, so
# the folds share preprocessing statistics — treat the score as slightly optimistic.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(estimator=clf, X=X_l, y=y_l, cv=cv, scoring='f1_macro')
print("CV macro-F1:", np.round(scores.mean(), 3), "±", np.round(scores.std(), 3))


CV macro-F1: 0.743 ± 0.02


## Fit on all labeled rows --> predict unlabeled rows with confidence

In [9]:
# Refit the baseline on every labeled row.
clf.fit(X_l, y_l)

if not len(df_unlabeled):
    # Nothing to score.
    proba_u = np.array([])
else:
    # Class probabilities for each unlabeled row; the most probable class is the
    # prediction and its probability is stored as the confidence.
    proba_u = clf.predict_proba(X_u)
    pred_u  = clf.classes_[proba_u.argmax(axis=1)]
    conf_u  = proba_u.max(axis=1)

    df_unlabeled['predicted_label'] = pred_u
    df_unlabeled['confidence']      = conf_u


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,200


## Pseudo-label high-confidence unlabeled rows and retrain

In [12]:
THRESH = 0.85  # tune on validation

if len(df_unlabeled):
    # Unlabeled rows whose top-class probability reaches the threshold.
    take = df_unlabeled['confidence'].ge(THRESH)
    print("Adding pseudo-labels:", int(take.sum()))

if len(df_unlabeled) and take.any():
    # Append the confident rows, with their predicted labels, to the training set.
    X_aug = np.vstack([X_l, X_u[take]])
    y_aug = np.concatenate([y_l, df_unlabeled.loc[take, 'predicted_label'].values])

    clf.fit(X_aug, y_aug)  # retrain on augmented set

    # optional: re-score the rows that were NOT pseudo-labeled with the retrained model
    remain = ~take
    if remain.any():
        proba_rem = clf.predict_proba(X_u[remain])
        df_unlabeled.loc[remain, 'predicted_label'] = clf.classes_[proba_rem.argmax(axis=1)]
        df_unlabeled.loc[remain, 'confidence']      = proba_rem.max(axis=1)


Adding pseudo-labels: 1


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,200


## Export results & quick diagnostics

In [13]:
from sklearn.metrics import classification_report

# Labeled rows carry their known label as the "prediction" with confidence 1.0;
# unlabeled rows keep the model's prediction and confidence.
labeled_out   = df_labeled.assign(source='labeled', predicted_label=df_labeled[TARGET_COL], confidence=1.0)
unlabeled_out = df_unlabeled.assign(source='unlabeled')
out = pd.concat([labeled_out, unlabeled_out], axis=0)

# Order by source, then by ascending confidence: labeled rows come first, and
# within the unlabeled rows the weakest predictions are listed first.
out_sorted = out.sort_values(by=['source', 'confidence'], ascending=True)
Path('outputs').mkdir(exist_ok=True)
out_sorted.to_csv('outputs/predictions_with_confidence_1112.csv', index=False)

# Report on the labeled rows themselves (training fit, sanity check only); the
# cross-validated macro-F1 printed earlier is the generalisation estimate.
print(classification_report(y_l, clf.predict(X_l)))


                   precision    recall  f1-score   support

Common Areas Only       0.83      0.76      0.80       388
 Partial Coverage       0.74      0.80      0.77       308
   Whole Building       0.85      0.95      0.90        55

         accuracy                           0.79       751
        macro avg       0.81      0.84      0.82       751
     weighted avg       0.80      0.79      0.79       751

