In [2]:
from general_program import *

Loaded 78059 rows from business_details table.
Loaded 360656 rows from business_categories table.
Loaded 980418 rows from review table.
Loaded 229447 rows from user table.
Loaded 173085 rows from tip table.


In [3]:
categories_encoder = LabelEncoder()
categories_encoder.fit(list(unique_categories))
user_id_encoder = LabelEncoder()
business_id_encoder = LabelEncoder()

user_scaler = StandardScaler()
business_scaler = StandardScaler()

In [4]:
user_df, business_df, review_df, user_continuous_features_scaled, business_continuous_features_scaled, num_users, num_businesses, num_categories, user_id_encoder, business_id_encoder, categories_encoder, user_scaler, business_scaler = prepare_data(user_df, business_df, review_df, categories_df, user_id_encoder, business_id_encoder, categories_encoder, user_scaler, business_scaler)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [5]:
keep_features = ['user_id_encoded', 'business_id_encoded', 'stars', 'label']

dropped_review = review_df[keep_features]
# join the business_continuous_features_scaled and user_continuous_features_scaled to the review_df based on the user_id_encoded and business_id_encoded, which are the same as the index of the user_continuous_features_scaled and business_continuous_features_scaled
dropped_review = dropped_review.join(user_continuous_features_scaled, on='user_id_encoded', rsuffix='_user')
dropped_review = dropped_review.join(business_continuous_features_scaled, on='business_id_encoded', rsuffix='_business')

# return the columns of dropped_review that are not label, user_id_encoded, business_id_encoded
continuous_features = dropped_review.columns.difference(keep_features)

# drop any row with any NaN value
dropped_review = dropped_review.dropna()

In [6]:
def build_deepfm_mixed_model(num_continuous_features, categorical_info, deep_units=[64, 32, 16], dropout_rate=0.5):
    """
    Build a DeepFM model that uses both continuous and categorical features.
    
    Args:
      num_continuous_features: Integer, the number of continuous features.
      categorical_info: Dictionary mapping categorical feature names to (vocab_size, embed_dim).
                        For example: {'user_id_encoded': (num_users, 8), 'business_id_encoded': (num_businesses, 8)}
      deep_units: List of integers, sizes of the hidden layers in the deep part.
      dropout_rate: Float, dropout rate for the deep layers.
    
    Returns:
      A compiled Keras model with a regression output using MSE loss.
    """
    # Input layer for continuous features.
    input_cont = Input(shape=(num_continuous_features,), name="continuous_input")
    
    # Process categorical features: create an input and embedding layer for each.
    categorical_inputs = []
    categorical_embeddings = []
    for feature_name, (vocab_size, embed_dim) in categorical_info.items():
        inp = Input(shape=(1,), name=feature_name)
        emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim, name=f"{feature_name}_emb")(inp)
        emb_flat = layers.Flatten(name=f"{feature_name}_flat")(emb)
        categorical_inputs.append(inp)
        categorical_embeddings.append(emb_flat)
    
    # Combine continuous features with flattened categorical embeddings.
    if categorical_embeddings:
        deep_input = layers.Concatenate(name="deep_concat")([input_cont] + categorical_embeddings)
    else:
        deep_input = input_cont
    
    # --- Linear Part ---
    # For simplicity, the linear part uses only the continuous features.
    linear_part = layers.Dense(1, activation=None, name="linear_part")(input_cont)
    
    # --- Deep Part ---
    deep = deep_input
    for i, units in enumerate(deep_units):
        deep = layers.Dense(units, activation='relu', name=f"deep_dense_{i}")(deep)
        deep = layers.Dropout(dropout_rate, name=f"deep_dropout_{i}")(deep)
    deep_output = layers.Dense(1, activation=None, name="deep_output")(deep)
    
    # --- Combine Linear and Deep Parts ---
    combined_logit = layers.Add(name="combined")([linear_part, deep_output])
    
    # For regression (rating prediction), we use a linear output.
    output = combined_logit
    
    # Build the model including both continuous and categorical inputs.
    inputs = [input_cont] + categorical_inputs
    model = Model(inputs=inputs, outputs=output, name="DeepFM_mixed")
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    
    return model

In [13]:
# Example usage:
# Suppose:
#   - The number of continuous features is determined by your DataFrame, e.g., len(continuous_features)
#   - categorical_info includes:
#       'user_id_encoded': (num_users, 8)
#       'business_id_encoded': (num_businesses, 8)
categorical_info = {
    'user_id_encoded': (num_users, 8), 
    'business_id_encoded': (num_businesses, 8)
}

num_cont_features = len(continuous_features)  # Your continuous feature columns from dropped_review

model = build_deepfm_mixed_model(num_cont_features, categorical_info)
model.summary()

# For continuous features:
X_cont = dropped_review[continuous_features].values

# For categorical features (ensuring they are arrays of shape (num_samples, 1)):
X_user = dropped_review['user_id_encoded'].values.reshape(-1, 1)
X_business = dropped_review['business_id_encoded'].values.reshape(-1, 1)

# Prepare labels, e.g., the 'stars' column.
y = dropped_review['stars'].values

X_cont_train, X_cont_test, \
X_user_train, X_user_test, \
X_business_train, X_business_test, \
y_train, y_test = train_test_split(X_cont, X_user, X_business, y, test_size=0.2, random_state=42)


model.fit([X_cont_train, X_user_train, X_business_train], y_train, epochs=3, batch_size=32, validation_split=0.1)


Epoch 1/3
[1m22060/22060[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 9ms/step - loss: 3.6993 - mae: 1.2977 - val_loss: 1.9597 - val_mae: 1.0162
Epoch 2/3
[1m22060/22060[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 9ms/step - loss: 1.5841 - mae: 0.9687 - val_loss: 1.8069 - val_mae: 0.9754
Epoch 3/3
[1m22060/22060[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 9ms/step - loss: 1.3856 - mae: 0.9143 - val_loss: 1.4453 - val_mae: 0.9441


<keras.src.callbacks.history.History at 0x259eff423d0>

In [30]:
save_folder_path = 'Saved_DeepFM/'

# Save the models
model.save(save_folder_path + 'DeepFM.keras')

# Save the label encoders
with open(save_folder_path + 'user_id_encoder.pkl', 'wb') as f:
    pickle.dump(user_id_encoder, f)

with open(save_folder_path + 'business_id_encoder.pkl', 'wb') as f:
    pickle.dump(business_id_encoder, f)

# with open(save_folder_path + 'categories_encoder.pkl', 'wb') as f:
#     pickle.dump(categories_encoder, f)
    
# Save the scalers
with open(save_folder_path + 'user_scaler.pkl', 'wb') as f:
    pickle.dump(user_scaler, f)

with open(save_folder_path + 'business_scaler.pkl', 'wb') as f:
    pickle.dump(business_scaler, f)

In [24]:
db_path = '../Retrieval Result/Retrieval.db'
conn = sqlite3.connect(db_path)
query = "SELECT * FROM recommendations WHERE model = 'DSSM' "

# initialize an empty dataframe to store the results with "model", "user_id", "business_id", "real_label"
df = pd.DataFrame(columns=["model", "user_id", "business_id", "real_label"])


for chunk in pd.read_sql_query(query, conn, chunksize=10000):
    # Process each DataFrame chunk here
    df = pd.concat([df, chunk])

conn.close()

In [25]:
# remove user without any real_label=1
user_ids = df[df['real_label'] == 1]['user_id'].unique()

# get the new df with only the user_ids that have real_label=1
df_filtered = df[df['user_id'].isin(user_ids)]

# encode the user_id and business_id in the df_filtered
df_filtered['user_id_encoded'] = user_id_encoder.transform(df_filtered['user_id'])
df_filtered['business_id_encoded'] = business_id_encoder.transform(df_filtered['business_id'])

df_filtered = df_filtered.join(user_continuous_features_scaled, on='user_id_encoded', rsuffix='_user')
df_filtered = df_filtered.join(business_continuous_features_scaled, on='business_id_encoded', rsuffix='_business')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['user_id_encoded'] = user_id_encoder.transform(df_filtered['user_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['business_id_encoded'] = business_id_encoder.transform(df_filtered['business_id'])


In [26]:
# Assume 'df_filtered' contains your test set with all users.
keep_features = ['model', 'user_id', 'business_id', 'real_label', 'user_id_encoded', 'business_id_encoded']
testing_features = df_filtered.columns.difference(keep_features)

# Prepare features and predict ratings for all rows.
X_cont_test = df_filtered[testing_features].values
X_user_test = df_filtered['user_id_encoded'].values.reshape(-1, 1)
X_business_test = df_filtered['business_id_encoded'].values.reshape(-1, 1)

df_filtered['predicted_rating'] = model.predict([X_cont_test, X_user_test, X_business_test])

# Compute ranking within each user group.
df_filtered['ranking'] = df_filtered.groupby('user_id_encoded')['predicted_rating'] \
                                      .rank(method='min', ascending=False)

# Function to compute ranking metrics using mean rank for positive items.
def compute_user_metrics(group):
    positives = group[group['real_label'] == 1]
    if positives.empty:
        return None  # Skip users with no positive items.
    
    # Calculate the mean rank for all positive items.
    mean_rank = positives['ranking'].mean()
    
    # Normalize the mean rank by dividing by the total number of candidates for that user.
    normalized_mean_rank = mean_rank / len(group)
    
    # Optionally, compute the reciprocal of the mean rank.
    reciprocal_mean_rank = 1.0 / mean_rank
    
    return pd.Series({
        'mean_rank': mean_rank,
        'normalized_mean_rank': normalized_mean_rank,
        'reciprocal_mean_rank': reciprocal_mean_rank
    })

# Apply the function to each user.
user_metrics = df_filtered.groupby('user_id_encoded').apply(compute_user_metrics).dropna()

# Compute overall ranking metrics.
mean_overall_mean_rank = user_metrics['mean_rank'].mean()
mean_overall_normalized_rank = user_metrics['normalized_mean_rank'].mean()
MRR_mean = user_metrics['reciprocal_mean_rank'].mean()

print("Mean of Mean Ranks:", mean_overall_mean_rank)
print("Mean of Normalized Mean Ranks:", mean_overall_normalized_rank)
print("Mean Reciprocal of Mean Ranks:", MRR_mean)

[1m16719/16719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 932us/step
Mean of Mean Ranks: 1562.2162512980271
Mean of Normalized Mean Ranks: 0.3124432502596054
Mean Reciprocal of Mean Ranks: 0.004504342693593237


  user_metrics = df_filtered.groupby('user_id_encoded').apply(compute_user_metrics).dropna()
