In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.ensemble import IsolationForest

import pandas as pd

dfBen = pd.read_excel("../shared_data_read_only/Data/Article1/Hackaton_Benevoles_JPMORGAN.xlsx")
dfBin = pd.read_excel("../shared_data_read_only/Data/Article1/Hackaton_Binomes_JPMORGAN.xlsx")
dfJeu = pd.read_excel("../shared_data_read_only/Data/Article1/Hackaton_Jeunes_JPMORGAN.xlsx")

df = pd.read_csv("../shared_documents/pairs_cleaned.csv")

df['y_true'] = df['binome_statut'].isin(['COMPLETED', 'ACTIVE']).astype(int)

df_work = df.copy()

# 1) registration_date -> days_since_registration
import pandas as pd, numpy as np
df_work["registration_date"] = pd.to_datetime(df_work["registration_date"], errors="coerce")
ref_date = pd.Timestamp.today().normalize()
df_work["days_since_registration"] = (ref_date - df_work["registration_date"]).dt.days
num_cols = ["average_grade","engagement_score","days_since_registration"]
cat_cols = ["workfield","field_of_study","study_level","degree","needs",
            "program","desired_exchange_frequency","binome_statut"]
id_cols  = [c for c in ["binome_id","mentor_id","mentee_id"] if c in df_work.columns]

cols = [
    'binome_statut', 'binome_acceptance_delay', 'binome_date_update_statut',
    'workfield', 'current_role', 'needs_to_address_mentor',
    'field_of_study', 'study_level', 'degree', 'needs_to_address_mentee',
    'average_grade', 'program', 'engagement_score',
    'registration_date_mentee', 'desired_exchange_frequency', 'hobby',
    'project_confidence_level', 'project_development_level',
    'binome_score_clean'
]




  df = pd.read_csv("../shared_documents/pairs_cleaned.csv")


KeyError: 'registration_date'

In [None]:

df_pairs = df.copy()

mentor_keep = ["mentor_id","workfield","field_of_study","study_level","degree",
               "program","desired_exchange_frequency","average_grade","engagement_score"]
mentee_keep = ["mentee_id","workfield","field_of_study","study_level","degree",
               "program","desired_exchange_frequency","average_grade","engagement_score"]

mcols = [c for c in mentor_keep if c in dfBen.columns]
tcols = [c for c in mentee_keep if c in dfJeu.columns]

dfM = dfBen[mcols].copy().add_prefix("mentor_")
dfT = dfJeu[tcols].copy().add_prefix("mentee_")


dfM = dfM.rename(columns={"mentor_mentor_id":"mentor_id"})
dfT = dfT.rename(columns={"mentee_mentee_id":"mentee_id"})


train_df = df_pairs.merge(dfM, on="mentor_id", how="left").merge(dfT, on="mentee_id", how="left")

train_df["y_true"] = train_df["binome_statut"].isin(["COMPLETED","ACTIVE"]).astype(int)

num_cols = [c for c in [
    "mentor_average_grade","mentor_engagement_score",
    "mentee_average_grade","mentee_engagement_score"
] if c in train_df.columns]

cat_cols = [c for c in [
    "mentor_workfield","mentor_field_of_study","mentor_study_level","mentor_degree",
    "mentor_program","mentor_desired_exchange_frequency",
    "mentee_workfield","mentee_field_of_study","mentee_study_level","mentee_degree",
    "mentee_program","mentee_desired_exchange_frequency"
] if c in train_df.columns]

# one-hot
X_cat = pd.get_dummies(train_df[cat_cols], drop_first=True) if cat_cols else pd.DataFrame(index=train_df.index)
X_num = train_df[num_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0) if num_cols else pd.DataFrame(index=train_df.index)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

X_train_full = pd.concat([X_num, X_cat], axis=1)
y_train_full = train_df["y_true"].astype(int)

X_tr, X_te, y_tr, y_te = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full
)

scaler = StandardScaler()
if len(num_cols):
    X_tr.loc[:, num_cols] = scaler.fit_transform(X_tr[num_cols])
    X_te.loc[:, num_cols] = scaler.transform(X_te[num_cols])

rf = RandomForestClassifier(
    n_estimators=400, class_weight="balanced", random_state=42, n_jobs=-1
)
rf.fit(X_tr, y_tr)

print("Acc:", accuracy_score(y_te, rf.predict(X_te)))
print("AUC:", roc_auc_score(y_te, rf.predict_proba(X_te)[:,1]))

feature_cols_pair_schema = X_tr.columns.tolist()

In [6]:
def rank_top5_rf_pair_schema(mentee_row, dfMentors, rf_model, scaler, feature_cols, num_cols, cat_cols, mentor_id_col="mentor_id"):
    mentee_pref = mentee_row.add_prefix("mentee_")
    mentors_pref = dfMentors.add_prefix("mentor_").copy()

    mentee_block = pd.DataFrame([mentee_pref.values]*len(mentors_pref), columns=mentee_pref.index)
    pair_df = pd.concat([mentors_pref.reset_index(drop=True), mentee_block.reset_index(drop=True)], axis=1)

    use_num = [c for c in num_cols if c in pair_df.columns]
    use_cat = [c for c in cat_cols if c in pair_df.columns]

    X_num = pair_df[use_num].apply(pd.to_numeric, errors="coerce").fillna(0.0) if use_num else pd.DataFrame(index=pair_df.index)
    X_cat = pd.get_dummies(pair_df[use_cat], drop_first=True) if use_cat else pd.DataFrame(index=pair_df.index)

    X_pair = pd.concat([X_num, X_cat], axis=1)

    for col in feature_cols:
        if col not in X_pair:
            X_pair[col] = 0
    X_pair = X_pair[feature_cols]

    if use_num:
        X_pair.loc[:, use_num] = scaler.transform(X_pair[use_num])

    prob = rf_model.predict_proba(X_pair)[:,1]
    out = pd.DataFrame({
        "mentor_id": dfMentors[mentor_id_col].values,
        "rf_prob": prob
    }).sort_values("rf_prob", ascending=False).head(5).reset_index(drop=True)
    return out

mentee_row = dfJeu.iloc[11]
top5 = rank_top5_rf_pair_schema(
    mentee_row, dfBen, rf, scaler,
    feature_cols_pair_schema, num_cols=num_cols, cat_cols=cat_cols,
    mentor_id_col="mentor_id" 
)
p = np.array(top5['rf_prob'].values)
k = 50      
delta = 3   
alpha = k * p + np.random.uniform(0, delta, size=len(p))
beta  = k * (1 - p) + np.random.uniform(0, delta, size=len(p))
p_new = alpha / (alpha + beta)

top5['rf_prob_bayesian'] = p_new
top5 = top5.sort_values('rf_prob_bayesian', ascending=False).reset_index(drop=True)
print(top5[['mentor_id', 'rf_prob_bayesian']])


NameError: name 'rf' is not defined