In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

# Seed for reproducibility
# Both generators are seeded: the stdlib `random` module is used by
# generate_balanced_choices, and `np.random` is used by every NumPy draw below.
random.seed(42)
np.random.seed(42)

# Total samples
n_samples = 25000

# Categorical options
# Each list holds the allowed labels for one categorical column.
# NOTE: several lists use the literal string 'None' as a real category.
# pandas' CSV reader treats the text "None" as a missing-value marker by
# default, so files written from this data should be read back with
# keep_default_na=False or that category turns into NaN.
diabetes_opts = ['Type 1', 'Type 2', 'None']
hypertension_opts = ['Yes', 'No']
cardiovascular_opts = ['Present', 'Absent']
digestive_opts = ['IBS', 'Celiac', 'Non-IBS']
food_allergies_opts = ['Nuts', 'Dairy', 'Shellfish', 'None']
bmi_categories = ['Underweight', 'Normal', 'Overweight', 'Obese']
weight_history_opts = ['Stable', 'Fluctuating']
exercise_type_opts = ['Cardio', 'Strength', 'Mixed', 'None']
job_activity_opts = ['Sedentary', 'Moderate', 'Active']
work_schedule_opts = ['Regular', 'Shift', 'Flexible']
sleep_quality_opts = ['Poor', 'Fair', 'Good']
stress_levels = ['Low', 'Medium', 'High']
meal_timing_opts = ['Regular', 'Irregular']
cooking_skills_opts = ['Basic', 'Intermediate', 'Advanced']
food_budget_opts = ['Low', 'Medium', 'High']
alcohol_opts = ['None', 'Occasional', 'Regular']
smoking_opts = ['Non-smoker', 'Smoker', 'Former']
snacking_opts = ['Regular', 'Average', 'Irregular']
travel_opts = ['Rarely', 'Monthly', 'Weekly']
diet_type_opts = ['Vegetarian', 'Vegan', 'Non-spicy', 'Pescatarian']
meal_size_opts = ['Small frequent', 'Regular 3 meals', 'Large infrequent']
spice_tolerance_opts = ['Low', 'Medium', 'High']
cuisine_opts = ['Asian', 'Western', 'Mediterranean']
texture_opts = ['Soft', 'Crunchy', 'Mixed']
portion_control_opts = ['Good', 'Fair', 'Poor']
prev_diet_success_opts = ['Yes', 'No']
food_intolerances_opts = ['Lactose', 'Gluten', 'None']
meal_complexity_opts = ['Simple', 'Moderate', 'Complex']
seasonal_pref_opts = ['Yes', 'No']

# Helper function to generate balanced data
def generate_balanced_choices(options, total):
    """Return a shuffled list of ``total`` labels spread evenly over ``options``.

    Every option appears ``total // len(options)`` times. The leftover
    ``total % len(options)`` slots are handed to *distinct* options (sampled
    without replacement), so the final per-option counts never differ by more
    than one.

    Parameters
    ----------
    options : sequence
        Candidate labels (any sequence; it is copied into a list).
    total : int
        Number of labels to produce; must be non-negative.

    Returns
    -------
    list
        ``total`` labels in random order.

    Raises
    ------
    ValueError
        If ``options`` is empty or ``total`` is negative.

    Notes
    -----
    Randomness comes from the stdlib ``random`` module's global state, so
    seed it with ``random.seed`` for reproducible output.
    """
    options = list(options)
    if not options:
        raise ValueError("options must contain at least one choice")
    if total < 0:
        raise ValueError("total must be non-negative")

    base, rem = divmod(total, len(options))
    # random.sample draws without replacement, so no option can receive more
    # than one of the leftover slots.
    values = options * base + random.sample(options, k=rem)
    random.shuffle(values)
    return values

# Balanced features: one evenly distributed label list per attribute.
# The option pools are processed in this fixed order so the seeded `random`
# stream is consumed the same way on every run.
(
    diabetes,
    hypertension,
    cardiovascular,
    food_allergies,
    bmi_distribution,
) = [
    generate_balanced_choices(choice_pool, n_samples)
    for choice_pool in (
        diabetes_opts,
        hypertension_opts,
        cardiovascular_opts,
        food_allergies_opts,
        bmi_categories,
    )
]

# Random fields
def random_choices(opts, size=None):
    """Draw labels uniformly at random (with replacement) from ``opts``.

    Parameters
    ----------
    opts : sequence
        Candidate labels passed to ``np.random.choice``.
    size : int, optional
        Number of labels to draw. When omitted, the module-level
        ``n_samples`` is used, which matches the original behavior.

    Returns
    -------
    list
        The drawn labels as a plain Python list.

    Notes
    -----
    Uses NumPy's global random state; seed it with ``np.random.seed`` for
    reproducible output.
    """
    if size is None:
        # Resolved at call time so the default follows the notebook's
        # current n_samples setting.
        size = n_samples
    return np.random.choice(opts, size).tolist()

# Draw a height (cm) for every row, then back out a weight from a BMI value
# sampled inside the range of the BMI category already assigned to that row.
height = np.random.normal(170, 10, n_samples).astype(int)
weight = np.zeros(n_samples)

# (low, high) BMI bounds per category. Any label not listed here falls back
# to the obese range, mirroring a final catch-all branch.
bmi_bounds = {
    'Underweight': (16, 18.4),
    'Normal': (18.5, 24.9),
    'Overweight': (25, 29.9),
}
fallback_bounds = (30, 35)

for idx in range(n_samples):
    low, high = bmi_bounds.get(bmi_distribution[idx], fallback_bounds)
    sampled_bmi = np.random.uniform(low, high)
    metres = height[idx] / 100
    # weight = BMI * height(m)^2, kept to one decimal place
    weight[idx] = round(sampled_bmi * metres ** 2, 1)

# The stored category column is a row-for-row copy of the assigned labels.
bmi_category = list(bmi_distribution)

# Assemble dataset
# One row per synthetic user, one column per attribute.
# The dict values are evaluated top to bottom and every random_choices /
# np.random call consumes the seeded global NumPy RNG, so reordering or
# inserting entries changes the data generated for all later columns.
# np.random.randint excludes its upper bound, e.g. randint(0, 8) yields 0..7.
df = pd.DataFrame({
    # --- Medical conditions (the balanced lists built earlier, plus one random field) ---
    'Diabetes': diabetes,
    'Hypertension': hypertension,
    'Cardiovascular': cardiovascular,
    'Digestive Disorders': random_choices(digestive_opts),
    'Food Allergies': food_allergies,
    # --- Body measurements ---
    'Height (cm)': height,
    'Weight (kg)': weight,
    'BMI Category': bmi_category,
    # Current weight plus Gaussian noise (mean 0, std 5), rounded to 0.1
    'Target Weight (kg)': (weight + np.random.normal(0, 5, n_samples)).round(1),
    'Weight Change History': random_choices(weight_history_opts),
    # --- Activity ---
    'Exercise Frequency': np.random.randint(0, 8, n_samples),
    'Exercise Duration (min)': np.random.randint(0, 121, n_samples),
    'Exercise Type': random_choices(exercise_type_opts),
    'Daily Steps Count': np.random.randint(1000, 20000, n_samples),
    'Physical Job Activity Level': random_choices(job_activity_opts),
    # --- Lifestyle ---
    'Work Schedule': random_choices(work_schedule_opts),
    'Sleep Duration (hrs)': np.random.randint(4, 10, n_samples),
    'Sleep Quality': random_choices(sleep_quality_opts),
    'Stress Level': random_choices(stress_levels),
    'Meal Timing Regularity': random_choices(meal_timing_opts),
    'Cooking Skills': random_choices(cooking_skills_opts),
    'Available Cooking Time (min)': np.random.randint(10, 121, n_samples),
    'Food Budget': random_choices(food_budget_opts),
    'Alcohol Consumption': random_choices(alcohol_opts),
    'Smoking Status': random_choices(smoking_opts),
    'Water Intake (cups)': np.random.randint(4, 15, n_samples),
    'Eating Out Frequency': np.random.randint(0, 8, n_samples),
    'Snacking Behavior': random_choices(snacking_opts),
    'Food Prep Time Availability (min)': np.random.randint(5, 120, n_samples),
    'Travel Frequency': random_choices(travel_opts),
    # --- Food preferences ---
    'Diet Type': random_choices(diet_type_opts),
    'Meal Size Preference': random_choices(meal_size_opts),
    'Spice Tolerance': random_choices(spice_tolerance_opts),
    'Cuisine Preferences': random_choices(cuisine_opts),
    'Food Texture Preferences': random_choices(texture_opts),
    'Portion Control Ability': random_choices(portion_control_opts),
    'Previous Diet Success History': random_choices(prev_diet_success_opts),
    'Food Intolerances': random_choices(food_intolerances_opts),
    'Meal Complexity Preference': random_choices(meal_complexity_opts),
    'Seasonal Diet Preference': random_choices(seasonal_pref_opts)
})

# Split dataset: 5,000 rows are held out for testing, the rest form the
# training set (20,000 rows when n_samples is 25,000).
train_df, test_df = train_test_split(df, test_size=5000, random_state=42)

# Save to CSV files
train_path = "train_user_parameters.csv"
test_path = "test_user_parameters.csv"
train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

# Report the paths that were actually written; the messages are built from
# the same variables used for saving so they cannot drift out of sync.
print(f"Training set saved as '{train_path}' ({len(train_df)} rows)")
print(f"Test set saved as '{test_path}' ({len(test_df)} rows)")



Training set saved as 'diet_train_20000.csv'
Test set saved as 'diet_test_5000.csv'


In [None]:
import pandas as pd

# Load the dataset
# Several columns use the literal label 'None' as a real category. pandas'
# default NA parsing would turn that text into NaN, so default NA markers are
# disabled and only truly empty cells are treated as missing.
df = pd.read_csv(
    "train_user_parameters.csv",
    keep_default_na=False,
    na_values=[""],
)

# Check counts for key categorical features
key_columns = [
    'Diabetes',
    'Hypertension',
    'Cardiovascular',
    'Food Allergies',
    'BMI Category',
]
for position, column in enumerate(key_columns):
    # Blank line between reports, but not before the first one.
    lead = "" if position == 0 else "\n"
    # dropna=False keeps any missing values visible instead of silently
    # hiding a whole class from the report.
    print(f"{lead}{column} Distribution:\n", df[column].value_counts(dropna=False))


Diabetes Distribution:
 Diabetes
Type 1    6674
Type 2    6667
Name: count, dtype: int64

Hypertension Distribution:
 Hypertension
No     10023
Yes     9977
Name: count, dtype: int64

Cardiovascular Distribution:
 Cardiovascular
Present    10076
Absent      9924
Name: count, dtype: int64

Food Allergies Distribution:
 Food Allergies
Nuts         5019
Dairy        4993
Shellfish    4954
Name: count, dtype: int64

BMI Category Distribution:
 BMI Category
Normal         5049
Underweight    5006
Overweight     4991
Obese          4954
Name: count, dtype: int64
