In [None]:
# Install core libraries
!pip install pandas matplotlib scikit-learn --quiet

# Recommender system (MF_SVD)
!pip install scikit-surprise --quiet

# SynthCity and all needed plugins
!pip install torch==2.2.0 --index-url https://download.pytorch.org/whl/cu121 --quiet
!pip install synthcity==0.2.11 --quiet

# Required extra libraries
!pip install optuna lifelines nflows pycox opacus pgmpy geomloss xgboost xgbse rdt deepecho copulas ctgan pytorch-lightning monai tsai loguru tqdm --quiet

# Enable Jupyter kernel support (optional, for working in VS Code / Jupyter)
!pip install ipykernel --quiet


In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

from synthcity.plugins import Plugins
from synthcity.benchmark import Benchmarks

from synthcity.plugins.core.dataloader import GenericDataLoader
from surprise import Dataset, Reader, SVD, accuracy

from sklearn.model_selection import train_test_split
from surprise.model_selection import train_test_split as surprise_split


In [None]:
# Step 1: Load real data
# Read the tab-separated ratings file, discard the timestamp column, and keep
# a reproducible 10,000-row sample with a fresh 0..n-1 index.
df = (
    pd.read_csv(
        "/kaggle/input/movies100k/u.data",
        sep="\t",
        names=["user_id", "movie_id", "rating", "timestamp"],
    )
    .drop(columns=["timestamp"])
    .sample(n=10000, random_state=42)
    .reset_index(drop=True)
)

# Hold out 20% of the sampled ratings as the real test set.
train_real_df, test_real_df = train_test_split(df, random_state=42, test_size=0.2)

In [None]:
# Preview the first rows of the sampled ratings table
df.head()

In [None]:
# Sanity-check the sizes of the real train/test partitions
real_train_shape, real_test_shape = train_real_df.shape, test_real_df.shape
print(real_train_shape, real_test_shape)

In [None]:
# Step 2: Wrap training data with SynthCity's DataLoader.
# Only the real training split is wrapped; "rating" is declared as the target column.
loader = GenericDataLoader(train_real_df, target_column="rating")

In [None]:


# Step 3: Benchmark generators
# Each entry is (display name, plugin name, plugin keyword arguments); every
# generator is trained with the same iteration budget.
generator_specs = [
    (label, label.lower(), {"n_iter": 1000}) for label in ("CTGAN", "TVAE", "DPGAN")
]

# Metric groups requested from the benchmark, keyed by category.
benchmark_metrics = {
    "performance": ["linear_model"],
    "privacy": ["k-anonymization", "distinct l-diversity", "identifiability_score"],
    "stats": ["feature_corr", "inv_kl_divergence", "prdc"],
}

benchmark_results = Benchmarks.evaluate(
    generator_specs,
    loader,
    synthetic_size=100,
    metrics=benchmark_metrics,
    repeats=3,
    synthetic_reuse_if_exists=False,
)

In [None]:
# Display the raw benchmark output (one report per benchmarked generator)
benchmark_results

In [None]:
# Step 4: Pick best generator and apply hyperparameter optimization
# Rank generators by a single performance metric
# (e.g., syn_id = in-distribution performance).
perf_metric = "performance.linear_model.syn_id"

# Mean value of the chosen metric for every benchmarked generator.
plugin_scores = dict(
    (label, report.loc[perf_metric]["mean"])
    for label, report in benchmark_results.items()
)

# One row per generator, highest mean first.
score_df = pd.DataFrame.from_dict(
    plugin_scores, orient="index", columns=["mean_performance"]
).sort_values("mean_performance", ascending=False)

# The top row is the winner; its benchmark label is lower-cased to obtain the
# name used for plugin lookup.
best_plugin_name = score_df.index[0].lower()
print("Best plugin:", best_plugin_name)

In [None]:
# Show the ranking table under a heading
print("\nPlugin Ranking:", score_df, sep="\n")

In [None]:

# Step 5: Run hyperparameter optimization on best generator
import optuna
from synthcity.utils.optuna_sample import suggest_all

# Look up the winning plugin by name and keep its class, so it can be queried
# for its hyperparameter space and instantiated again later.
plugin_cls = type(Plugins().get(best_plugin_name))

# Define training loader for Optuna objective
loader = GenericDataLoader(train_real_df, target_column="rating")
train_loader = loader.train()

def objective(trial: optuna.Trial):
    """Optuna objective: benchmark the best plugin with trial-sampled hyperparameters.

    Args:
        trial: The Optuna trial used to sample a value for each hyperparameter
            in the plugin's search space.

    Returns:
        The average of the "mean" column over all benchmark rows whose
        direction is "maximize".

    Raises:
        optuna.TrialPruned: If the benchmark run raises any exception; the
            error is printed before the trial is pruned.
    """
    search_space = plugin_cls.hyperparameter_space()

    # (Optional) Cap the upper bound of n_iter so each trial trains quickly
    for dist in search_space:
        if dist.name == "n_iter":
            dist.high = 100

    sampled_params = suggest_all(trial, search_space)
    run_name = f"trial_{trial.number}"

    try:
        report = Benchmarks.evaluate(
            [(run_name, best_plugin_name, sampled_params)],
            train_loader,
            repeats=1,
            metrics={"performance": ["mlp"]},  # You can also add privacy/stats here
        )
    except Exception as e:
        print(f"⚠️ Trial {trial.number} failed: {e}")
        raise optuna.TrialPruned()

    # Score = average over every metric that is meant to be maximized
    maximized_rows = report[run_name].query('direction == "maximize"')
    return maximized_rows["mean"].mean()

# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# across kernel restarts (plugin training itself may still be stochastic).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)

# The objective prunes a trial whenever the benchmark raises, so it is possible
# for no trial to finish; fail with a clear message instead of a lookup error.
completed_trials = [
    t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE
]
if not completed_trials:
    raise RuntimeError(
        "No Optuna trial completed successfully; cannot select hyperparameters."
    )

# Copy the dict: a later cell overrides n_iter on it in place.
best_params = dict(study.best_params)
print("Best hyperparameters found:", best_params)


In [None]:
# Step 6: Generate synthetic data from optimized model
# Override the tuned n_iter with a longer budget for the final training run.
best_params.update(n_iter=2000)

# Instantiate the winning plugin with the tuned settings and fit it on the
# real training split.
best_plugin = plugin_cls(**best_params)
best_plugin.fit(train_real_df)

# Draw 2,000 synthetic records from the fitted generator.
best_synth_df = best_plugin.generate(2000)

In [None]:
# Step 7: Prepare data for evaluation
# Unwrap the underlying table when the generator returned an object exposing
# a `.data` attribute; otherwise keep the value as it is.
best_synth_df = getattr(best_synth_df, "data", best_synth_df)

# 80/20 split of the synthetic data, using the same seed as the real split.
train_best_synth, test_best_synth = train_test_split(
    best_synth_df, random_state=42, test_size=0.2
)


In [None]:
# Preview the first rows of the generated synthetic data
best_synth_df.head()

In [None]:
# Sanity-check the sizes of the synthetic train/test partitions
synth_train_shape, synth_test_shape = train_best_synth.shape, test_best_synth.shape
print(synth_train_shape, synth_test_shape)

In [None]:

# Step 8: Evaluation function
def train_and_evaluate(train_df, test_df, label, random_state=42):
    """Fit an SVD recommender on ``train_df`` and report its RMSE on ``test_df``.

    Args:
        train_df: Ratings used for training. Passed unchanged to
            ``Dataset.load_from_df``, so it should hold the user, item and
            rating columns in that order.
        test_df: Ratings used for evaluation, in the same column layout.
        label: Experiment name printed next to the score.
        random_state: Seed handed to the SVD model so repeated runs of the same
            experiment give the same RMSE. Pass ``None`` for unseeded training.

    Returns:
        The RMSE of the fitted model's predictions over every row of ``test_df``.
    """
    reader = Reader(rating_scale=(1, 5))

    trainset = Dataset.load_from_df(train_df, reader).build_full_trainset()
    model = SVD(random_state=random_state)
    model.fit(trainset)

    # Every test rating must be scored, so convert the whole frame into a
    # testset directly rather than going through a shuffled 100% "split".
    testset = Dataset.load_from_df(test_df, reader).build_full_trainset().build_testset()

    rmse = accuracy.rmse(model.test(testset), verbose=False)
    print(f"{label}: RMSE = {rmse:.4f}")
    return rmse

In [None]:
# Step 9: Run MF_SVD experiments
# Upper-cased generator name used inside the experiment labels.
synth_tag = best_plugin_name.upper()

# (label, training frame, test frame) for each experiment, in run order.
experiment_specs = [
    ("Real → Real", train_real_df, test_real_df),
    (f"{synth_tag} → Real", train_best_synth, test_real_df),
    (
        f"Real + {synth_tag} → Real",
        pd.concat([train_real_df, train_best_synth]),
        test_real_df,
    ),
    (f"{synth_tag} → {synth_tag}", train_best_synth, test_best_synth),
    (f"Real → {synth_tag}", train_real_df, test_best_synth),
    (
        "Small Real → Real",
        train_real_df.sample(n=1600, random_state=42),
        test_real_df,
    ),
]

# Map each label to the RMSE returned for that train/test pairing.
results = {
    name: train_and_evaluate(train_part, test_part, name)
    for name, train_part, test_part in experiment_specs
}


In [None]:
# Step 10: Return results
# One row per experiment: the dict keys become the "Experiment" column and the
# scores become the "RMSE" column.
results_df = (
    pd.DataFrame.from_dict(results, orient="index", columns=["RMSE"])
    .rename_axis("Experiment")
    .reset_index()
)


In [None]:
# Display the experiment/RMSE table
results_df

In [None]:
# Step 11: Plot results
import matplotlib.pyplot as plt
import seaborn as sns

# Rebuild the experiment/RMSE table from the `results` dict.
results_df = (
    pd.DataFrame.from_dict(results, orient="index", columns=["RMSE"])
    .reset_index()
    .rename(columns={"index": "Experiment"})
)

# One bar per experiment; labels are slanted so the long names stay readable.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=results_df, x="Experiment", y="RMSE", ax=ax)
plt.xticks(rotation=45, ha="right")
ax.set_title("MF_SVD Performance with Optimized Synthetic Data")
fig.tight_layout()
plt.show()

In [None]:
# Final check: re-benchmark the tuned generator on the full metric suite.
# The plugin name must be the one that was tuned, because `best_params` holds
# hyperparameters sampled from that plugin's search space.
bench_report = Benchmarks.evaluate(
    [("FinalCheck", best_plugin_name, best_params)],
    loader,
    metrics={
        "performance": ["linear_model"],
        "privacy": ["k-anonymization", "distinct l-diversity", "identifiability_score"],
        "stats": ["feature_corr", "inv_kl_divergence", "prdc"],
    },
    # Generate as many synthetic records as the evaluated synthetic dataset has.
    synthetic_size=len(best_synth_df),
    repeats=1,
)

Benchmarks.print(bench_report)

In [None]:
# Print the report stored under the "FinalCheck" run name
print(bench_report["FinalCheck"])


In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns

real_data = train_real_df[["user_id", "movie_id", "rating"]]
synthetic_data = best_synth_df[["user_id", "movie_id", "rating"]]

# Combine both real and synthetic data
combined_data = pd.concat([real_data, synthetic_data], axis=0)

# PCA follows the directions of largest variance, so columns with a larger
# numeric range (presumably the ID columns here) would otherwise dominate the
# projection. Standardize each column to zero mean / unit variance first.
combined_scaled = StandardScaler().fit_transform(combined_data)

pca = PCA(n_components=2)
principal_components = pca.fit_transform(combined_scaled)

# Rows keep the concat order: all real rows first, then all synthetic rows.
pca_df = pd.DataFrame(data=principal_components, columns=['PCA1', 'PCA2'])
pca_df['Type'] = ['Real'] * len(real_data) + ['Synthetic'] * len(synthetic_data)

plt.figure(figsize=(10, 6))
sns.scatterplot(x='PCA1', y='PCA2', hue='Type', data=pca_df, palette=['blue', 'red'], alpha=0.6)
plt.title('PCA Visualization of Real vs Synthetic Data')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Data Type')
plt.show()


In [None]:
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# t-SNE is built on pairwise distances, so columns with a larger numeric range
# would otherwise dominate the embedding. Standardize each column first.
tsne_input = StandardScaler().fit_transform(combined_data)

tsne = TSNE(n_components=2, random_state=42)
tsne_components = tsne.fit_transform(tsne_input)

# Rows keep the order of `combined_data`: real rows first, then synthetic rows.
tsne_df = pd.DataFrame(data=tsne_components, columns=['tSNE1', 'tSNE2'])
tsne_df['Type'] = ['Real'] * len(real_data) + ['Synthetic'] * len(synthetic_data)

plt.figure(figsize=(10, 6))
sns.scatterplot(x='tSNE1', y='tSNE2', hue='Type', data=tsne_df, palette=['blue', 'red'], alpha=0.6)
plt.title('t-SNE Visualization of Real vs Synthetic Data')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend(title='Data Type')
plt.show()
