In [105]:
import os

from general_program import *

In [4]:
# Scalers are only constructed here; they are handed to prepare_data() in a later cell.
user_scaler, business_scaler = StandardScaler(), StandardScaler()

# One label encoder per categorical key. These three are likewise only
# constructed in this cell and passed on to prepare_data().
user_id_encoder, business_id_encoder, business_geohash_encoder = (
    LabelEncoder(),
    LabelEncoder(),
    LabelEncoder(),
)

# The category encoder is the only object fitted in this cell: it is fitted
# on the list built from `unique_categories`.
categories_encoder = LabelEncoder()
categories_encoder.fit(list(unique_categories))

In [106]:
# NOTE: user_df / business_df / review_df are rebound to the values returned by
# prepare_data(), so re-running this cell feeds the already-processed frames
# back into prepare_data().
(
    user_df,
    business_df,
    review_df,
    user_continuous_features_scaled,
    business_continuous_features_scaled,
    num_users,
    num_businesses,
    num_categories,
    num_geohashes,
) = prepare_data(
    user_df,
    business_df,
    review_df,
    categories_df,
    user_id_encoder,
    business_id_encoder,
    categories_encoder,
    business_geohash_encoder,
    user_scaler,
    business_scaler,
)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [107]:
# Re-index both scaled feature tables by the encoded ID of the matching row in
# user_df / business_df, so later cells can join them via `on='..._id_encoded'`.
# Assumes the scaled tables are row-aligned with user_df / business_df -- TODO confirm in prepare_data.
user_continuous_features_scaled = user_continuous_features_scaled.set_index(
    user_df['user_id_encoded'].values
)
business_continuous_features_scaled = business_continuous_features_scaled.set_index(
    business_df['business_id_encoded'].values
)

In [None]:
# Express each review's rating relative to its author's mean rating
# (stars / average_stars), so the model target becomes a ratio.
#
# The merge + division is guarded: without the guard, re-running this cell
# would merge 'average_stars' a second time (pandas then emits
# 'average_stars_x' / 'average_stars_y' and the division raises KeyError), and
# a partial re-run could divide 'stars' by the average twice.
if 'average_stars' not in review_df.columns:
    review_df = review_df.merge(
        user_df[['user_id_encoded', 'average_stars']],
        on='user_id_encoded',
        how='left',
    )

    # NOTE(review): a user with average_stars == 0, or a review with no matching
    # user row, yields inf / NaN here -- confirm the data rules these out.
    review_df['stars'] = review_df['stars'] / review_df['average_stars']

In [None]:
# Key / target columns taken from review_df; everything else in the joined
# frame is treated as a continuous model feature.
keep_features = ['user_id_encoded', 'business_id_encoded', 'stars', 'label']

# Attach the scaled user features, then the scaled business features, looking
# each row up by its encoded ID. Right-hand columns that clash with a column
# already present receive the given suffix.
dropped_review = (
    review_df[keep_features]
    .join(user_continuous_features_scaled, on='user_id_encoded', rsuffix='_user')
    .join(business_continuous_features_scaled, on='business_id_encoded', rsuffix='_business')
)

# All joined columns except the key / target columns.
continuous_features = dropped_review.columns.difference(keep_features)

In [115]:
def build_deepfm_mixed_model(num_continuous_features, categorical_info, deep_units=(64, 32, 16), dropout_rate=0.5):
    """
    Build a linear + deep regression model over continuous and categorical features.

    NOTE: despite the name, this network has no factorization-machine
    (pairwise interaction) term. It is the sum of
      * a linear layer over the continuous features only, and
      * an MLP over the continuous features concatenated with the flattened
        categorical embeddings.

    Args:
      num_continuous_features: Integer, the number of continuous features.
      categorical_info: Dictionary mapping categorical feature names to (vocab_size, embed_dim).
                        For example: {'user_id_encoded': (num_users, 8), 'business_id_encoded': (num_businesses, 8)}
                        One scalar input and one embedding layer is created per entry,
                        in the dictionary's iteration order.
      deep_units: Iterable of integers, sizes of the hidden layers in the deep part.
                  (An immutable tuple default is used instead of a shared mutable list.)
      dropout_rate: Float, dropout rate applied after every hidden layer.

    Returns:
      A compiled Keras model (Adam optimizer, MSE loss, MAE metric) with a single
      linear output. Its inputs are ordered as
      [continuous_input] + the categorical inputs in `categorical_info` order.
    """
    # Input layer for continuous features.
    input_cont = Input(shape=(num_continuous_features,), name="continuous_input")

    # One scalar-ID input plus embedding per categorical feature; each embedding
    # is flattened so it can be concatenated with the continuous vector.
    categorical_inputs = []
    categorical_embeddings = []
    for feature_name, (vocab_size, embed_dim) in categorical_info.items():
        inp = Input(shape=(1,), name=feature_name)
        emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim, name=f"{feature_name}_emb")(inp)
        emb_flat = layers.Flatten(name=f"{feature_name}_flat")(emb)
        categorical_inputs.append(inp)
        categorical_embeddings.append(emb_flat)

    # Deep-part input: continuous features + all flattened embeddings.
    # Without categorical features the deep part sees the continuous input alone.
    if categorical_embeddings:
        deep_input = layers.Concatenate(name="deep_concat")([input_cont] + categorical_embeddings)
    else:
        deep_input = input_cont

    # --- Linear Part ---
    # Uses only the continuous features (the embeddings do not feed this branch).
    linear_part = layers.Dense(1, activation=None, name="linear_part")(input_cont)

    # --- Deep Part ---
    # Stack of Dense(relu) + Dropout blocks followed by a single linear unit.
    deep = deep_input
    for i, units in enumerate(deep_units):
        deep = layers.Dense(units, activation='relu', name=f"deep_dense_{i}")(deep)
        deep = layers.Dropout(dropout_rate, name=f"deep_dropout_{i}")(deep)
    deep_output = layers.Dense(1, activation=None, name="deep_output")(deep)

    # --- Combine Linear and Deep Parts ---
    # Regression target, so the summed value is used directly (no activation).
    output = layers.Add(name="combined")([linear_part, deep_output])

    # Build the model including both continuous and categorical inputs.
    inputs = [input_cont] + categorical_inputs
    model = Model(inputs=inputs, outputs=output, name="DeepFM_mixed")
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    return model

In [171]:
# Embedding spec for each categorical input: name -> (vocabulary size, embedding width).
# Both ID features use an 8-dimensional embedding.
categorical_info = dict(
    user_id_encoded=(num_users, 8),
    business_id_encoded=(num_businesses, 8),
)

# Width of the continuous input = number of feature columns in dropped_review.
num_cont_features = len(continuous_features)

model = build_deepfm_mixed_model(num_cont_features, categorical_info, dropout_rate=0.3)
# model.summary()

# Model inputs, in the order the model expects them: continuous block first,
# then one (num_samples, 1) column of encoded IDs per categorical feature.
X_cont = dropped_review[continuous_features].values
X_user = dropped_review['user_id_encoded'].values.reshape(-1, 1)
X_business = dropped_review['business_id_encoded'].values.reshape(-1, 1)

# Regression target: the 'stars' column of dropped_review.
y = dropped_review['stars'].values

# Single 80/20 split applied jointly to all four arrays so rows stay aligned.
(
    X_cont_train, X_cont_test,
    X_user_train, X_user_test,
    X_business_train, X_business_test,
    y_train, y_test,
) = train_test_split(X_cont, X_user, X_business, y, test_size=0.2, random_state=42)


In [176]:
# Number of rows in the training portion returned by train_test_split.
len(y_train)

784333

In [None]:
# Inputs are passed positionally in the model's input order:
# continuous block, user IDs, business IDs.
# validation_split=0.1 asks Keras to hold out 10% of the arrays passed here for validation.
model.fit(
    x=[X_cont_train, X_user_train, X_business_train],
    y=y_train,
    epochs=3,
    batch_size=32,
    validation_split=0.1,
)

In [117]:
save_folder_path = 'Saved_DeepFM/'

# Create the output folder up front: open(..., 'wb') raises FileNotFoundError
# when the directory does not exist (e.g. on a fresh checkout).
os.makedirs(save_folder_path, exist_ok=True)

# Save the model
model.save(save_folder_path + 'DeepFM.keras')

# Save the fitted label encoders and scalers, one pickle file each.
# (categories_encoder is intentionally not saved, as in the original cell.)
pickled_artifacts = {
    'user_id_encoder.pkl': user_id_encoder,
    'business_id_encoder.pkl': business_id_encoder,
    'user_scaler.pkl': user_scaler,
    'business_scaler.pkl': business_scaler,
}
for file_name, artifact in pickled_artifacts.items():
    with open(save_folder_path + file_name, 'wb') as f:
        pickle.dump(artifact, f)

# Save the scaled continuous feature tables.
# NOTE(review): np.save is given objects that were re-indexed by encoded ID in
# an earlier cell; presumably only the array values are written and the
# index / column labels are not preserved -- confirm that loaders do not rely on them.
np.save(save_folder_path + 'user_continuous_features.npy', user_continuous_features_scaled)
np.save(save_folder_path + 'business_continuous_features.npy', business_continuous_features_scaled)


In [151]:
def pad_users_with_negatives(
    test_df: pd.DataFrame,
    all_business_ids: np.ndarray,
    N: int = 100,
    seed: int = 42
) -> pd.DataFrame:
    """
    For each user in test_df, randomly sample additional businesses
    (not already in that user's rows) until they have N total rows.
    The new rows get stars = NaN.

    Users that already have N or more rows are returned unchanged (they are
    never truncated). If a user has fewer unseen businesses than are needed,
    every unseen business is added and the user ends up with fewer than N rows
    instead of raising.

    Args:
        test_df: DataFrame with columns ['user_id_encoded', 'business_id_encoded', 'stars'].
                 Any further columns are kept for the existing rows and are
                 NaN for the sampled rows.
        all_business_ids: array of all possible business_id_encoded values.
                          Duplicates are tolerated (they are ignored).
        N: desired number of records per user.
        seed: random seed for reproducibility.

    Returns:
        padded_df: original rows plus sampled negatives with stars=NaN, with a
        fresh 0..n-1 index. An empty test_df yields an empty copy.
    """
    # Nothing to pad: pd.concat([]) would raise, so hand back an empty copy.
    if test_df.empty:
        return test_df.copy().reset_index(drop=True)

    rng = np.random.default_rng(seed)

    # setdiff1d(..., assume_unique=True) is only valid on duplicate-free input,
    # so de-duplicate once, outside the loop, keeping first-appearance order
    # (already-unique input is therefore left exactly as it was).
    business_ids = np.asarray(all_business_ids).ravel()
    _, first_positions = np.unique(business_ids, return_index=True)
    business_ids = business_ids[np.sort(first_positions)]

    out_rows = []
    for user_id, group in test_df.groupby('user_id_encoded'):
        # Existing rows always come first for each user.
        out_rows.append(group)

        n_missing = N - len(group)
        if n_missing <= 0:
            continue

        # Businesses this user has not interacted with in test_df.
        seen = group['business_id_encoded'].unique()
        candidates = np.setdiff1d(business_ids, seen, assume_unique=True)

        # Sampling without replacement cannot draw more than what is available.
        n_to_sample = min(n_missing, len(candidates))
        if n_to_sample == 0:
            continue
        sampled = rng.choice(candidates, size=n_to_sample, replace=False)

        out_rows.append(pd.DataFrame({
            'user_id_encoded': user_id,
            'business_id_encoded': sampled,
            'stars': np.nan  # unknown
        }))

    return pd.concat(out_rows, ignore_index=True)

In [152]:
# 1) Rebuild a flat frame from the held-out arrays produced by train_test_split.
#    NOTE: a later cell rebinds X_user_test / X_business_test to the padded
#    arrays, so this cell must run before that one (re-running it afterwards
#    no longer sees the original split).
test_df = pd.DataFrame({
    'user_id_encoded': X_user_test.flatten(),
    'business_id_encoded': X_business_test.flatten(),
    'stars': y_test.flatten(),
})

# Keep only held-out rows whose target is >= 1.1. 'stars' was divided by the
# user's average_stars in an earlier cell, so this presumably selects ratings
# at least 10% above the user's own average -- it is a per-row filter, not a
# "users with at least one interaction" filter.
test_df = test_df.loc[test_df['stars'].ge(1.1)]

# 2) Candidate pool: every distinct encoded business ID.
all_business_ids = business_df['business_id_encoded'].unique()

# 3) Pad each remaining user with sampled unseen businesses up to 100 rows.
padded_test_df = pad_users_with_negatives(
    test_df,
    all_business_ids=all_business_ids,
    N=100,
    seed=42,
)

# Sanity check: every user must end up with exactly 100 rows.
counts = padded_test_df.groupby('user_id_encoded').size()
assert counts.min() == 100 == counts.max()


In [161]:
# Attach the scaled user and business features to every padded row, looked up
# by encoded ID (same join order and suffixes as used for the training frame).
# NOTE: padded_test_df is rebound in place, so re-running this cell joins the
# feature columns onto a frame that already contains them.
padded_test_df = (
    padded_test_df
    .join(user_continuous_features_scaled, on='user_id_encoded', rsuffix='_user')
    .join(business_continuous_features_scaled, on='business_id_encoded', rsuffix='_business')
)

In [168]:
# padded_test_df holds, per user, the held-out rows plus the sampled negatives
# (stars = NaN), already joined with the scaled user / business features.
# NOTE: this rebinds keep_features (an earlier cell defined it with 'label' too).
keep_features = ['user_id_encoded', 'business_id_encoded', 'stars']

# Columns this notebook itself writes into padded_test_df (below and in the
# metrics cell). They must never be fed to the model: without this exclusion,
# re-running the cell would pass the model's own predictions as input features
# and change the width of the continuous input.
derived_columns = ['predicted_rating', 'ranking', 'real_label']
testing_features = padded_test_df.columns.difference(keep_features + derived_columns)

# Model inputs for every padded row.
# NOTE: these rebind the X_*_test arrays produced by train_test_split.
X_cont_test = padded_test_df[testing_features].values
X_user_test = padded_test_df['user_id_encoded'].values.reshape(-1, 1)
X_business_test = padded_test_df['business_id_encoded'].values.reshape(-1, 1)

padded_test_df['predicted_rating'] = model.predict([X_cont_test, X_user_test, X_business_test])

# Rank rows within each user by predicted rating: 1 = highest prediction,
# ties share the smallest rank (method='min').
padded_test_df['ranking'] = (
    padded_test_df
    .groupby('user_id_encoded')['predicted_rating']
    .rank(method='min', ascending=False)
)


[1m135925/135925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 948us/step


In [177]:
# Number of distinct users present in the padded evaluation frame.
padded_test_df['user_id_encoded'].nunique()

43496

In [172]:
# Cutoff K for the top-K ranking metric computed in this cell (NDCG@K).
K = 10

# Helper: NDCG@K for one user
def ndcg_at_k(ranked_labels, k):
    """
    Normalised discounted cumulative gain of the first k entries.

    Args:
        ranked_labels: relevance labels in predicted order (best prediction first).
        k: number of leading positions to score.

    Returns:
        DCG@k of the given order divided by DCG@k of the labels sorted in
        descending order; 0.0 when that ideal DCG is not positive
        (e.g. no relevant item, or nothing to score).
    """
    def _dcg(relevances):
        # Gain 2**rel - 1, discounted by log2(position + 1), positions start at 1.
        return sum(
            (2**rel - 1) / np.log2(position + 1)
            for position, rel in enumerate(relevances, start=1)
        )

    # The ideal ordering is taken over the whole list, then cut at k.
    best_order = sorted(ranked_labels, reverse=True)
    ideal_dcg = _dcg(best_order[:k])
    if ideal_dcg > 0:
        return _dcg(ranked_labels[:k]) / ideal_dcg
    return 0.0

# Helper: entropy of top-K labels
def list_entropy(labels):
    """
    Binary Shannon entropy (in bits) of a 0/1 label array.

    Args:
        labels: array-like exposing `.mean()`; its mean is taken as the share
                of positive labels.

    Returns:
        -sum(p * log2(p)) over the positive and negative shares, skipping any
        share that is not greater than zero.
    """
    positive_share = labels.mean()
    shares = (positive_share, 1 - positive_share)

    entropy = 0.0
    # Only strictly positive shares contribute (a zero share would hit log2(0)).
    for weighted_log in [share * np.log2(share) for share in shares if share > 0]:
        entropy = entropy - weighted_log
    return entropy

# Per-user metric values, one entry per user.
mrrs, ndcgs, ndcgs_all = [], [], []

# Binary relevance: rows with a known 'stars' value are positives (1);
# the sampled negatives carry stars = NaN and become 0.
padded_test_df['real_label'] = np.where(padded_test_df['stars'].notna(), 1, 0)

for user_id, group in padded_test_df.groupby('user_id_encoded'):
    # Relevance labels in the order the model ranks this user's rows.
    labels = group.sort_values('predicted_rating', ascending=False)['real_label'].values

    # MRR contribution: reciprocal rank of the first positive, 0.0 if there is none.
    hits = np.flatnonzero(labels == 1)
    mrrs.append(1.0 / (hits[0] + 1) if len(hits) > 0 else 0.0)

    # NDCG over the top K rows, and over the user's complete list.
    ndcgs.append(ndcg_at_k(labels, K))
    ndcgs_all.append(ndcg_at_k(labels, len(labels)))

# Aggregate across users. 'First Relevant Rank' is the rounded reciprocal of the
# mean reciprocal rank (it is not the mean of the first-hit ranks).
results = {
    'MRR': np.mean(mrrs),
    'First Relevant Rank': int(round(1 / np.mean(mrrs), 0)),
    f'NDCG@{K}': np.mean(ndcgs),
    'NDCG_all': np.mean(ndcgs_all),
}

# Display as a single-row table.
results_df = pd.Series(results).round(4).to_frame().T
print("Ranking Evaluation Metrics")
display(results_df)

Ranking Evaluation Metrics


Unnamed: 0,MRR,First Relevant Rank,NDCG@10,NDCG_all
0,0.2091,5.0,0.1651,0.3335
