In [3]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Generate sample data
data = {
    'Age': np.random.randint(30, 70, size=1000),
    'Gender': np.random.choice(['Male', 'Female'], size=1000),
    'Ingredient Name': np.random.choice(
        ['Olive Oil', 'Chicken', 'Rice', 'Broccoli', 'Potato', 'Cheese', 'Bread', 'Fish', 'Beef', 'Pasta'],
        size=1000
    ),
    'Category': np.random.choice(
        ['Fats', 'Proteins', 'Carbohydrates', 'Vegetables', 'Dairy'],
        size=1000
    ),
    'Quantity Consumed': np.random.uniform(1, 100, size=1000),
    'Frequency of Consumption': np.random.choice(['Daily', 'Weekly', 'Monthly'], size=1000),
    'Processed Food': np.random.choice([0, 1], size=1000),
    'Cooking Method': np.random.choice(['Frying', 'Boiling', 'Grilling', 'Baking'], size=1000),
}

# Create DataFrame
df = pd.DataFrame(data)

# Simulate health impacts based on simplistic rules
def determine_health_impacts(row):
    diabetes_impact = 'Medium'
    heart_disease_impact = 'Medium'
    hypertension_impact = 'Medium'

    if row['Quantity Consumed'] > 50 and row['Frequency of Consumption'] == 'Daily':
        diabetes_impact = 'High'
        heart_disease_impact = 'High'
        hypertension_impact = 'High'
    elif row['Quantity Consumed'] <= 50 and row['Frequency of Consumption'] == 'Weekly':
        diabetes_impact = 'Medium'
        heart_disease_impact = 'Medium'
        hypertension_impact = 'Low'
    elif row['Quantity Consumed'] < 20 and row['Frequency of Consumption'] == 'Monthly':
        diabetes_impact = 'Low'
        heart_disease_impact = 'Low'
        hypertension_impact = 'Low'
        
    return pd.Series([diabetes_impact, heart_disease_impact, hypertension_impact])

# Apply the function to generate health impacts
df[['Diabetes Impact', 'Heart Disease Impact', 'Hypertension Impact']] = df.apply(determine_health_impacts, axis=1)

# Save to CSV
df.to_csv('synthetic_food_ingredient_health_impacts.csv', index=False)

print("Synthetic dataset created with 1000 entries including health impacts.")


Synthetic dataset created with 1000 entries including health impacts.


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [5]:
df = pd.read_csv('synthetic_food_ingredient_health_impacts.csv')

In [27]:
# Create a dictionary to hold the LabelEncoders
label_encoders = {}

# Encode categorical variables
categorical_columns = ['Gender', 'Ingredient Name', 'Category', 'Frequency of Consumption', 'Cooking Method']
# Update the LabelEncoder fitting to include 'Unknown' in the classes
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    # Add 'Unknown' to the classes
    label_encoders[col].fit(np.append(df[col].unique(), 'Unknown'))
    df[col] = label_encoders[col].transform(df[col])

    joblib.dump(label_encoders[col], f'{col.lower().replace(" ", "_")}_encoder.joblib')


# Fit and transform the target variables using separate LabelEncoders
label_encoder_diabetes = LabelEncoder()
df['Diabetes Impact'] = label_encoder_diabetes.fit_transform(df['Diabetes Impact'])

label_encoder_heart = LabelEncoder()
df['Heart Disease Impact'] = label_encoder_heart.fit_transform(df['Heart Disease Impact'])

label_encoder_hypertension = LabelEncoder()
df['Hypertension Impact'] = label_encoder_hypertension.fit_transform(df['Hypertension Impact'])


In [13]:
# Define features (X) and target variables (y)
X = df[['Age', 'Gender', 'Ingredient Name', 'Category', 'Quantity Consumed', 'Frequency of Consumption', 'Processed Food', 'Cooking Method']]
y_diabetes = df['Diabetes Impact']
y_heart = df['Heart Disease Impact']
y_hypertension = df['Hypertension Impact']

# Split the data into training and testing sets
X_train, X_test, y_train_diabetes, y_test_diabetes = train_test_split(X, y_diabetes, test_size=0.3, random_state=42)
X_train, X_test, y_train_heart, y_test_heart = train_test_split(X, y_heart, test_size=0.3, random_state=42)
X_train, X_test, y_train_hypertension, y_test_hypertension = train_test_split(X, y_hypertension, test_size=0.3, random_state=42)


In [14]:
# Train Random Forest Classifier for each health impact
model_diabetes = RandomForestClassifier(random_state=42)
model_diabetes.fit(X_train, y_train_diabetes)

model_heart = RandomForestClassifier(random_state=42)
model_heart.fit(X_train, y_train_heart)

model_hypertension = RandomForestClassifier(random_state=42)
model_hypertension.fit(X_train, y_train_hypertension)


In [9]:
def predict_health_impacts(new_data):
    # Create a DataFrame for the new data
    new_df = pd.DataFrame(new_data)

    # Handle unseen values in categorical variables
    for col in categorical_columns:
        # Replace unseen labels with 'Unknown' for handling
        unseen_labels = set(new_df[col]) - set(label_encoders[col].classes_)
        new_df[col] = new_df[col].replace(unseen_labels, 'Unknown')

        # Transform using the fitted LabelEncoder
        # Ensure 'Unknown' is handled correctly
        new_df[col] = label_encoders[col].transform(new_df[col])

    # Make predictions using the trained models
    diabetes_pred = model_diabetes.predict(new_df)
    heart_pred = model_heart.predict(new_df)
    hypertension_pred = model_hypertension.predict(new_df)

    # Convert numerical predictions back to original labels
    diabetes_impact = label_encoder_diabetes.inverse_transform(diabetes_pred)
    heart_disease_impact = label_encoder_heart.inverse_transform(heart_pred)
    hypertension_impact = label_encoder_hypertension.inverse_transform(hypertension_pred)

    return diabetes_impact, heart_disease_impact, hypertension_impact


In [17]:
# Example new data for prediction
new_data = {
    'Age': [30],
    'Gender': ['F'],
    'Ingredient Name': ['Unknown Ingredient'],  # This will be an unseen value
    'Category': ['Fats'],
    'Quantity Consumed': [15.0],
    'Frequency of Consumption': ['Weekly'],
    'Processed Food': [0],  # Assume 0 means no
    'Cooking Method': ['Frying']
}

# Make predictions
predictions = predict_health_impacts(new_data)
print("Predicted Health Impacts:")
print("Diabetes Impact:", predictions[0])
print("Heart Disease Impact:", predictions[1])
print("Hypertension Impact:", predictions[2])


Predicted Health Impacts:
Diabetes Impact: [2]
Heart Disease Impact: [2]
Hypertension Impact: [1]


In [16]:
# Example new data for prediction
new_data = [
    {
        'Age': 30,
        'Gender': 'F',
        'Ingredient Name': 'Avocado',
        'Category': 'Fats',
        'Quantity Consumed': 20.0,
        'Frequency of Consumption': 'Weekly',
        'Processed Food': 0,  # Assume 0 means no
        'Cooking Method': 'Raw'
    },
    {
        'Age': 45,
        'Gender': 'M',
        'Ingredient Name': 'Pasta',
        'Category': 'Carbohydrates',
        'Quantity Consumed': 150.0,
        'Frequency of Consumption': 'Daily',
        'Processed Food': 1,  # Assume 1 means yes
        'Cooking Method': 'Boiling'
    },
    {
        'Age': 25,
        'Gender': 'F',
        'Ingredient Name': 'Canned Beans',
        'Category': 'Proteins',
        'Quantity Consumed': 100.0,
        'Frequency of Consumption': 'Weekly',
        'Processed Food': 1,
        'Cooking Method': 'Baking'
    },
    {
        'Age': 60,
        'Gender': 'M',
        'Ingredient Name': 'Red Meat',
        'Category': 'Proteins',
        'Quantity Consumed': 250.0,
        'Frequency of Consumption': 'Monthly',
        'Processed Food': 1,
        'Cooking Method': 'Grilling'
    },
    {
        'Age': 35,
        'Gender': 'F',
        'Ingredient Name': 'Broccoli',
        'Category': 'Vegetables',
        'Quantity Consumed': 50.0,
        'Frequency of Consumption': 'Daily',
        'Processed Food': 0,
        'Cooking Method': 'Steaming'
    },
    {
        'Age': 50,
        'Gender': 'M',
        'Ingredient Name': 'Sugary Drink',
        'Category': 'Sugars',
        'Quantity Consumed': 300.0,
        'Frequency of Consumption': 'Daily',
        'Processed Food': 1,
        'Cooking Method': 'N/A'
    },
    {
        'Age': 28,
        'Gender': 'F',
        'Ingredient Name': 'Unknown Ingredient',  # Unseen value
        'Category': 'Unknown Category',
        'Quantity Consumed': 50.0,
        'Frequency of Consumption': 'Weekly',
        'Processed Food': 0,
        'Cooking Method': 'Boiling'
    },
    {
        'Age': 65,
        'Gender': 'M',
        'Ingredient Name': 'Fried Chicken',
        'Category': 'Meat',
        'Quantity Consumed': 300.0,
        'Frequency of Consumption': 'Weekly',
        'Processed Food': 1,
        'Cooking Method': 'Frying'
    },
    {
        'Age': 20,
        'Gender': 'F',
        'Ingredient Name': 'Almonds',
        'Category': 'Nuts',
        'Quantity Consumed': 30.0,
        'Frequency of Consumption': 'Daily',
        'Processed Food': 0,
        'Cooking Method': 'Raw'
    },
    {
        'Age': 40,
        'Gender': 'M',
        'Ingredient Name': 'White Bread',
        'Category': 'Carbohydrates',
        'Quantity Consumed': 100.0,
        'Frequency of Consumption': 'Daily',
        'Processed Food': 1,
        'Cooking Method': 'Toasting'
    },
    {
        'Age': 38,
        'Gender': 'F',
        'Ingredient Name': 'Fish',
        'Category': 'Proteins',
        'Quantity Consumed': 200.0,
        'Frequency of Consumption': 'Weekly',
        'Processed Food': 0,
        'Cooking Method': 'Grilling'
    }
]

# Make predictions for each entry in new_data
for data in new_data:
    predictions = predict_health_impacts(data)
    print("Predicted Health Impacts for:", data)
    print("Diabetes Impact:", predictions[0])
    print("Heart Disease Impact:", predictions[1])
    print("Hypertension Impact:", predictions[2])
    print('-' * 40)


ValueError: If using all scalar values, you must pass an index

In [21]:
import pandas as pd

# Define the function to make predictions
def predict_health_impacts(data):
    # Create a DataFrame for the new data (should be a single dictionary)
    new_df = pd.DataFrame([data])  # Wrap the data in a list to create a DataFrame
    categorical_columns = ['Gender', 'Ingredient Name', 'Category', 'Frequency of Consumption', 'Cooking Method']

    # Handle unseen values in categorical variables
    for col in categorical_columns:
        # Replace unseen labels with 'Unknown' for handling
        unseen_labels = set(new_df[col]) - set(label_encoders[col].classes_)
        new_df[col] = new_df[col].replace(unseen_labels, 'Unknown')
        
        # Transform using the fitted LabelEncoder, ignoring errors for NaN values
        new_df[col] = label_encoders[col].transform(new_df[col].fillna('Unknown'))

    # Make predictions using the trained models
    diabetes_pred = model_diabetes.predict(new_df)
    heart_disease_pred = model_heart.predict(new_df)
    hypertension_pred = model_hypertension.predict(new_df)

    return diabetes_pred[0], heart_disease_pred[0], hypertension_pred[0]

# Example new data for prediction
new_data = [
    {
        'Age': 30,
        'Gender': 'F',
        'Ingredient Name': 'Avocado',
        'Category': 'Fats',
        'Quantity Consumed': 20.0,
        'Frequency of Consumption': 'Weekly',
        'Processed Food': 0,  # Assume 0 means no
        'Cooking Method': 'Raw'
    },
    {
        'Age': 45,
        'Gender': 'M',
        'Ingredient Name': 'Pasta',
        'Category': 'Carbohydrates',
        'Quantity Consumed': 150.0,
        'Frequency of Consumption': 'Daily',
        'Processed Food': 1,  # Assume 1 means yes
        'Cooking Method': 'Boiling'
    },
    {
        'Age': 25,
        'Gender': 'F',
        'Ingredient Name': 'Canned Beans',
        'Category': 'Proteins',
        'Quantity Consumed': 100.0,
        'Frequency of Consumption': 'Weekly',
        'Processed Food': 1,
        'Cooking Method': 'Baking'
    },
    {
        'Age': 60,
        'Gender': 'M',
        'Ingredient Name': 'Red Meat',
        'Category': 'Proteins',
        'Quantity Consumed': 250.0,
        'Frequency of Consumption': 'Monthly',
        'Processed Food': 1,
        'Cooking Method': 'Grilling'
    },
    {
        'Age': 35,
        'Gender': 'F',
        'Ingredient Name': 'Broccoli',
        'Category': 'Vegetables',
        'Quantity Consumed': 50.0,
        'Frequency of Consumption': 'Daily',
        'Processed Food': 0,
        'Cooking Method': 'Steaming'
    },
    {
        'Age': 50,
        'Gender': 'M',
        'Ingredient Name': 'Sugary Drink',
        'Category': 'Sugars',
        'Quantity Consumed': 300.0,
        'Frequency of Consumption': 'Daily',
        'Processed Food': 1,
        'Cooking Method': 'N/A'
    },
    {
        'Age': 28,
        'Gender': 'F',
        'Ingredient Name': 'Unknown Ingredient',  # Unseen value
        'Category': 'Unknown Category',
        'Quantity Consumed': 50.0,
        'Frequency of Consumption': 'Weekly',
        'Processed Food': 0,
        'Cooking Method': 'Boiling'
    },
    {
        'Age': 65,
        'Gender': 'M',
        'Ingredient Name': 'Fried Chicken',
        'Category': 'Meat',
        'Quantity Consumed': 300.0,
        'Frequency of Consumption': 'Weekly',
        'Processed Food': 1,
        'Cooking Method': 'Frying'
    },
    {
        'Age': 20,
        'Gender': 'F',
        'Ingredient Name': 'Almonds',
        'Category': 'Nuts',
        'Quantity Consumed': 30.0,
        'Frequency of Consumption': 'Daily',
        'Processed Food': 0,
        'Cooking Method': 'Raw'
    },
    {
        'Age': 40,
        'Gender': 'M',
        'Ingredient Name': 'White Bread',
        'Category': 'Carbohydrates',
        'Quantity Consumed': 100.0,
        'Frequency of Consumption': 'Daily',
        'Processed Food': 1,
        'Cooking Method': 'Toasting'
    },
    {
        'Age': 38,
        'Gender': 'female',
        'Ingredient Name': 'Fish',
        'Category': 'Proteins',
        'Quantity Consumed': 200.0,
        'Frequency of Consumption': 'Weekly',
        'Processed Food': 0,
        'Cooking Method': 'Grilling'
    }
]


# Make predictions for each entry in new_data
for data in new_data:
    predictions = predict_health_impacts(data)
    print("Predicted Health Impacts for:", data)
    print("Diabetes Impact:", predictions[0])
    print("Heart Disease Impact:", predictions[1])
    print("Hypertension Impact:", predictions[2])
    print('-' * 40)


Predicted Health Impacts for: {'Age': 30, 'Gender': 'F', 'Ingredient Name': 'Avocado', 'Category': 'Fats', 'Quantity Consumed': 20.0, 'Frequency of Consumption': 'Weekly', 'Processed Food': 0, 'Cooking Method': 'Raw'}
Diabetes Impact: 2
Heart Disease Impact: 2
Hypertension Impact: 1
----------------------------------------
Predicted Health Impacts for: {'Age': 45, 'Gender': 'M', 'Ingredient Name': 'Pasta', 'Category': 'Carbohydrates', 'Quantity Consumed': 150.0, 'Frequency of Consumption': 'Daily', 'Processed Food': 1, 'Cooking Method': 'Boiling'}
Diabetes Impact: 2
Heart Disease Impact: 2
Hypertension Impact: 2
----------------------------------------
Predicted Health Impacts for: {'Age': 25, 'Gender': 'F', 'Ingredient Name': 'Canned Beans', 'Category': 'Proteins', 'Quantity Consumed': 100.0, 'Frequency of Consumption': 'Weekly', 'Processed Food': 1, 'Cooking Method': 'Baking'}
Diabetes Impact: 2
Heart Disease Impact: 2
Hypertension Impact: 2
----------------------------------------
P

In [25]:
import joblib

joblib.dump(model_diabetes, 'model_diabetes.joblib')
joblib.dump(model_heart, 'model_heart_disease.joblib')
joblib.dump(model_hypertension, 'model_hypertension.joblib')

# Save the label encoders
joblib.dump(label_encoders, 'label_encoders.joblib')
joblib.dump(label_encoder_diabetes, 'label_encoder_diabetes.joblib')



# Loading the models later
loaded_model_diabetes = joblib.load('model_diabetes.joblib')
loaded_model_heart_disease = joblib.load('model_heart_disease.joblib')
loaded_model_hypertension = joblib.load('model_hypertension.joblib')
loaded_label_encoders = joblib.load('label_encoders.joblib')
loaded_label_encoder_diabetes = joblib.load('label_encoder_diabetes.joblib')

In [26]:
joblib.dump(label_encoder_heart, 'label_encoder_heart.joblib')
joblib.dump(label_encoder_hypertension ,'label_encode_hypertension.joblib')

['label_encode_hypertension.joblib']