In [15]:


import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.multioutput import MultiOutputClassifier

# === Load correct files ===
X = pd.read_excel("cleaned_features (1).xlsx")
Y = pd.read_excel("target_new_resources.xlsx")
# === Clean feature matrix ===
X_numeric = X.select_dtypes(include=["number", "bool"]).astype(float)

# Drop rows with NA in either X or Y
valid_idx = X_numeric.dropna().index.intersection(Y.dropna().index)

X_numeric = X_numeric.loc[valid_idx].reset_index(drop=True)
Y = Y.loc[valid_idx].reset_index(drop=True)

# Ensure Y is integer (0/1)
Y = Y.astype(int)
# Keep only targets that have at least 2 of each class
valid_targets = [col for col in Y.columns if Y[col].nunique() > 1]

# Optionally: require at least 3 positives for stability
stable_targets = [col for col in valid_targets if Y[col].sum() >= 3]

Y = Y[stable_targets]

print("Targets kept:", stable_targets)
print("Targets removed:", set(Y.columns) - set(stable_targets))


# === Train/test split ===
X_train, X_test, Y_train, Y_test = train_test_split(
    X_numeric, Y, test_size=0.2, random_state=42
)

# === Build model ===
logreg = LogisticRegression(max_iter=1000)
multi_logreg = MultiOutputClassifier(logreg)

multi_logreg.fit(X_train, Y_train)

print("Y_train class counts per column:")
for col in Y_train.columns:
    print(col, Y_train[col].value_counts().to_dict())


# === Predictions ===
Y_pred = multi_logreg.predict(X_test)

# === Overall accuracy ===
print("Overall Accuracy:", accuracy_score(Y_test, Y_pred))

# === Per-target classification reports ===
for i, col in enumerate(Y.columns):
    print(f"\n=== {col} ===")
    print(classification_report(Y_test[col], Y_pred[:, i]))

# === Important features per target ===
for i, col in enumerate(Y.columns):
    coefs = multi_logreg.estimators_[i].coef_[0]
    top_features = np.argsort(abs(coefs))[-10:]

    print(f"\nTop features for {col}:")
    print(X_train.columns[top_features].tolist())


Targets kept: ['target_Boss Up NYCHA Competition', 'target_Brooklyn Public Library Cision Communications Resource', 'target_Brooklyn Public Library PowerUP Business Plan Competition', 'target_Cambio Labs', 'target_REES Business Development Resource', 'target_REES Dreamers', 'target_REES Go-getters', 'target_REES Home-Based Business', 'target_SBS BE NYC Startup Intensive', 'target_SBS FastTrac NewVenture', 'target_SBS FastTrac® GrowthVenture™', 'target_SBS FastTrac® NewVenture™ 50+', 'target_SBS FastTrac® NewVenture™ for the Female Entrepreneur', 'target_Brooklyn Fashion Incubator', 'target_NYC M/WBE Mentors']
Targets removed: set()
Y_train class counts per column:
target_Boss Up NYCHA Competition {0: 13, 1: 8}
target_Brooklyn Public Library Cision Communications Resource {0: 18, 1: 3}
target_Brooklyn Public Library PowerUP Business Plan Competition {0: 18, 1: 3}
target_Cambio Labs {1: 19, 0: 2}
target_REES Business Development Resource {1: 17, 0: 4}
target_REES Dreamers {1: 14, 0: 7}
t

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [8]:
from google.colab import files
uploaded = files.upload()
import os
os.listdir()


Saving target_new_resources.xlsx to target_new_resources (2).xlsx
Saving cleaned_features (1).xlsx to cleaned_features (1) (2).xlsx


['.config',
 'target_new_resources (1).xlsx',
 'cleaned_features (1).xlsx',
 'cleaned_features (1) (1).xlsx',
 'target_new_resources.xlsx',
 'cleaned_features (1) (2).xlsx',
 'target_new_resources (2).xlsx',
 'sample_data']

In [14]:
print("Y class counts per column:")
for col in Y.columns:
    try:
        print(col, Y[col].value_counts().to_dict())
    except Exception as e:
        print(col, "error:", e)


Y class counts per column:
target_BKPL PowerUP Business Plan Competition {0: 26, 1: 1}
target_Boss Up NYCHA Competition {0: 16, 1: 11}
target_Boss Up Vetern competition {0: 26, 1: 1}
target_Brooklyn Public Library Cision Communications Resource {0: 24, 1: 3}
target_Brooklyn Public Library PowerUP Business Plan Competition {0: 24, 1: 3}
target_Cambio Labs {1: 24, 0: 3}
target_Just the common BKPL Resources {0: 26, 1: 1}
target_REES Business Development Resource {1: 22, 0: 5}
target_REES Champions {0: 26, 1: 1}
target_REES Dreamers {1: 18, 0: 9}
target_REES Go-getters {0: 23, 1: 4}
target_REES Home-Based Business {0: 20, 1: 7}
target_REES Idealists {0: 25, 1: 2}
target_SBS BE NYC Startup Intensive {1: 17, 0: 10}
target_SBS FastTrac NewVenture {0: 18, 1: 9}
target_SBS FastTrac® GrowthVenture™ {0: 24, 1: 3}
target_SBS FastTrac® NewVenture™ 50+ {0: 22, 1: 5}
target_SBS FastTrac® NewVenture™ for the Female Entrepreneur {1: 16, 0: 11}
target_SBS FastTrac® TechVenture™ {0: 27}
target_Start Sma