# preparing data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Previously:  음 식 명,에너지(kcal),탄수화물(g),단백질(g),지방(g),고기,채소,해산물
# Two files were merged into this CSV: food_calories.csv and food_calories2.csv (more diabetes-friendly foods).

# Load the food table as UTF-8; the cells below read its nutrient and category columns from `df`.
df = pd.read_csv('food_calories.csv', encoding='utf-8')

# Add a diabetes-risk class column to the DataFrame (0: risky, 1: moderate, 2: safe) -- multiclass classification

# Carbohydrate cut-offs, in grams.
high_carb_threshold = 50
moderate_carb_threshold = (30, 50)  # inclusive (low, high) band
low_carb_threshold = 30

# Energy cut-offs, in kcal.
high_calorie_threshold = 300
moderate_calorie_threshold = (100, 200)  # inclusive (low, high) band
low_calorie_threshold = 100

# Protein band, in grams, that counts as "balanced".
balanced_protein_range = (5, 15)


def classify_diabetes_friendliness(row):
    """
    Assign a diabetes-risk class to one food row using the module-level thresholds.

    The rules are tried in this order:

    - 0 (High Risk): carbs above ``high_carb_threshold`` or energy above ``high_calorie_threshold``.
    - 1 (Moderate Risk): carbs inside ``moderate_carb_threshold`` and energy inside
      ``moderate_calorie_threshold`` (both bands inclusive).
    - 2 (Low Risk): carbs below ``low_carb_threshold``, energy below ``low_calorie_threshold``
      and protein inside ``balanced_protein_range`` (inclusive).
    - 1 (Moderate Risk): anything that matched none of the rules above.

    :param row: mapping-like object (e.g. a DataFrame row) with the keys
        '탄수화물(g)', '에너지(kcal)' and '단백질(g)'
    :return: 0 for High Risk, 1 for Moderate Risk, 2 for Low Risk
    """
    # Values are looked up only when a rule needs them, one rule at a time.
    carbs = row['탄수화물(g)']
    if carbs > high_carb_threshold:
        return 0  # High Risk (least diabetes-friendly)

    energy = row['에너지(kcal)']
    if energy > high_calorie_threshold:
        return 0  # High Risk (least diabetes-friendly)

    carb_low, carb_high = moderate_carb_threshold
    energy_low, energy_high = moderate_calorie_threshold
    if carb_low <= carbs <= carb_high and energy_low <= energy <= energy_high:
        return 1  # Moderate Risk (moderately diabetes-friendly)

    if carbs < low_carb_threshold and energy < low_calorie_threshold:
        protein_low, protein_high = balanced_protein_range
        if protein_low <= row['단백질(g)'] <= protein_high:
            return 2  # Low Risk (most diabetes-friendly)

    # Criteria only partially met: fall back to moderate risk.
    return 1


# Label every row with the class returned by classify_diabetes_friendliness (0, 1 or 2).
df['Diabetes_Friendliness'] = df.apply(classify_diabetes_friendliness, axis=1)

X = df[['에너지(kcal)', '탄수화물(g)', '단백질(g)', '지방(g)', '고기', '채소', '해산물']]  # open question: do the 고기/채소/해산물 (meat/vegetable/seafood) columns make sense as features?
# Earlier targets, kept for reference:
# y = df['당뇨 위험 분류']
# y = df['음 식 명'].index
y = df['Diabetes_Friendliness']

# preparation for training and testing
# 34% of the rows are held out; random_state pins the shuffle. No `stratify` argument is passed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.34, random_state=42)


# 'least dangerous food for diabetes' prediction

In [3]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Scale the features (this is important!): the scaler is fitted on the training
# split only and then reused, unchanged, on the test split.
# scaler = MinMaxScaler()
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train
# Alternatives tried earlier:
# model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
# model = LogisticRegression(max_iter=1000, random_state=42)
# random_state pins the weight initialisation and batch shuffling so that a
# rerun of this cell reproduces the same model and the same metrics.
model = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)

# Evaluate
y_pred = model.predict(X_test_scaled)
# The label is derived from the estimator in use, so it stays correct when the
# model above is swapped (it used to read "Random Forest" for the MLP).
print(f"{type(model).__name__} Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Demo input: four rows of X chosen by random position. X is the full feature
# table (training and test rows alike) and a position may be drawn twice.
import random

random.seed(0)

picked_positions = []
for _ in range(4):
    picked_positions.append(random.randint(0, len(X) - 1))
input_data = X.iloc[picked_positions]

# Show which foods were drawn.
print("Input Data:")
print(df.loc[input_data.index, '음식명'], '\n')
# print(df.iloc[input_data.index, 0], '\n')  # changing to iloc due to name with space on the first column

# Same scaling as during training, then per-class probabilities.
input_data_scaled = scaler.transform(input_data)
predictions = model.predict_proba(input_data_scaled)

# Locate the probability column that belongs to class 2 (most diabetes-friendly)
# and take the sampled row with the highest value in that column.
friendly_class_index = (model.classes_ == 2).argmax()
class2_probabilities = predictions[:, friendly_class_index]
least_dangerous_food_index = class2_probabilities.argmax()

# Report the nutrient values of the winning row.
least_dangerous_food = input_data.iloc[least_dangerous_food_index]
print("Least Dangerous Food for Diabetes:")
print(least_dangerous_food)

Random Forest Accuracy: 0.9352941176470588
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.97        73
           1       0.92      0.95      0.94        87
           2       0.75      0.60      0.67        10

    accuracy                           0.94       170
   macro avg       0.88      0.84      0.86       170
weighted avg       0.93      0.94      0.93       170

Input Data:
432      돼지고기 편육
197          닭갈비
388           쑥떡
455    토마토 계란 볶음
Name: 음식명, dtype: object 

Least Dangerous Food for Diabetes:
에너지(kcal)    150.0
탄수화물(g)        7.0
단백질(g)         8.0
지방(g)         10.0
고기             0.0
채소             1.0
해산물            0.0
Name: 455, dtype: float64


In [4]:
## begin save the model
import joblib

# The classifier was fitted on StandardScaler output, so anything that loads it
# must transform raw nutrient values with the same fitted scaler first.
# The scaler goes to its own file so existing loaders of recommend_food.joblib
# keep receiving a bare model.
joblib.dump(scaler, './recommend_food_scaler.joblib')
joblib.dump(model, './recommend_food.joblib')
## end save the model

['./recommend_food.joblib']

# closest food match 

In [4]:
from scipy.spatial.distance import cdist

# Columns that define similarity between foods.
match_features = ['에너지(kcal)', '탄수화물(g)', '단백질(g)', '지방(g)', '고기', '채소', '해산물']

# Normalize the dataset and the user input to the same scale.
# A dedicated scaler name is used on purpose: rebinding `scaler` here would
# replace the one fitted on the training split, and re-running the classifier
# cells afterwards would silently scale with statistics from the whole table.
match_scaler = StandardScaler()
normalized_data = match_scaler.fit_transform(df[match_features])

# find food within this dataset that is closest to a food where it...
user_input2 = {
    '에너지(kcal)': 1000,  # has 1000 kcal
    '탄수화물(g)': 50,  # has 50g of carbs
    '단백질(g)': 30,  # has 30g of protein
    '지방(g)': 20,  # has 20g of fat
    '고기': 1.0,  # is a meat dish
    '채소': 1.0,  # is a vegetable dish
    '해산물': 0.0  # is not a seafood dish
}
# `columns=` pins the column order to the one the scaler was fitted with.
user_df = pd.DataFrame([user_input2], columns=match_features)
normalized_user_input = match_scaler.transform(user_df)

# Euclidean distance between the single normalized query row and every food
# (the (1, n_features) query broadcasts against the (n_foods, n_features) table).
distances = np.linalg.norm(normalized_data - normalized_user_input, axis=1)

# Position of the closest food item
closest_food_index = distances.argmin()

# Retrieve the closest food item information
closest_food = df.iloc[closest_food_index]
print("Closest Food Match:")
print(closest_food[['음식명'] + match_features])


Closest Food Match:
음식명          육회비빔밥
에너지(kcal)    588.0
탄수화물(g)       82.0
단백질(g)        28.0
지방(g)         16.0
고기             1.0
채소             1.0
해산물            0.0
Name: 19, dtype: object


# food recommendation

In [5]:
# User input for remaining nutritional intake and food category counts
user_input = {
    'remaining_calories': 520,  # in kcal
    'remaining_carbs': 50,  # in grams
    'remaining_protein': 20,  # in grams
    'remaining_fat': 15,  # in grams

    'meat_count': 2,  # number of meat servings consumed
    'veg_count': 1,  # number of vegetable servings consumed
    'seafood_count': 0  # number of seafood servings consumed
}

# based on the manual.txt...
max_count = max(user_input['meat_count'], user_input['veg_count'], user_input['seafood_count'])
total_count = user_input['meat_count'] + user_input['veg_count'] + user_input['seafood_count']

# Category weights (meat, veg, seafood): the less a category has been eaten
# relative to the most-eaten one, the larger its weight. With no servings
# recorded at all, every category gets weight 1.
meat_weight = (max_count - user_input['meat_count']) / total_count if total_count != 0 else 1
veg_weight = (max_count - user_input['veg_count']) / total_count if total_count != 0 else 1
seafood_weight = (max_count - user_input['seafood_count']) / total_count if total_count != 0 else 1


def _closeness(actual, target):
    """
    Score how close a food's nutrient amount is to the remaining allowance.

    Returns ``1 - abs(actual - target) / target``: 1 for an exact match, falling
    linearly with the relative error (negative once the error exceeds the target).

    When nothing is left (``target <= 0``) the relative error is undefined for 0
    and sign-flipped for negative targets (which would reward overshooting), so a
    food scores 1 only if it adds none of that nutrient and 0 otherwise.
    """
    if target <= 0:
        return 1.0 if actual <= 0 else 0.0
    return 1 - abs(actual - target) / target


def calculate_score(row, nutrition_factor=0.32, category_factor=0.68):
    """
    Calculate a recommendation score for a food item based on its nutritional content and category preferences.

    First, a closeness score is computed for calories, carbs, protein and fat against the
    user's remaining intake (see ``_closeness``):

    cal_score = 1 - abs(calories - remaining_calories) / remaining_calories
    carb_score = 1 - abs(carbs - remaining_carbs) / remaining_carbs
    protein_score = 1 - abs(protein - remaining_protein) / remaining_protein
    fat_score = 1 - abs(fat - remaining_fat) / remaining_fat

    Second, the category preference weight is computed from the food's category columns:

    category_weight = 고기 * meat_weight + 채소 * veg_weight + 해산물 * seafood_weight

    Finally, both parts are blended:

    score = nutrition_factor * (cal_score + carb_score + protein_score + fat_score) / 4
            + category_factor * category_weight

    :param row: mapping-like food row with the keys '에너지(kcal)', '탄수화물(g)', '단백질(g)',
        '지방(g)', '고기', '채소' and '해산물'
    :param nutrition_factor: importance of the nutritional fit (default 0.32)
    :param category_factor: importance of the category preference (default 0.68)
    :return: the recommendation score for the food item
    """
    # Nutritional score
    cal_score = _closeness(row['에너지(kcal)'], user_input['remaining_calories'])
    carb_score = _closeness(row['탄수화물(g)'], user_input['remaining_carbs'])
    protein_score = _closeness(row['단백질(g)'], user_input['remaining_protein'])
    fat_score = _closeness(row['지방(g)'], user_input['remaining_fat'])

    # Category preference weight
    category_weight = (
            row['고기'] * meat_weight +
            row['채소'] * veg_weight +
            row['해산물'] * seafood_weight
    )

    return nutrition_factor * (cal_score + carb_score + protein_score + fat_score) / 4 + category_factor * category_weight


# Score each food row with calculate_score and keep the result on the frame.
df['Recommendation_Score'] = df.apply(calculate_score, axis='columns')

# Best-scoring foods first.
recommended_foods = df.sort_values('Recommendation_Score', ascending=False)

# Show the name, the nutrient and category columns, and the score.
recommended_foods.loc[:, ['음식명', '에너지(kcal)', '탄수화물(g)', '단백질(g)', '지방(g)', '고기', '채소', '해산물', 'Recommendation_Score']]


Unnamed: 0,음식명,에너지(kcal),탄수화물(g),단백질(g),지방(g),고기,채소,해산물,Recommendation_Score
457,아보카도 명란 덮밥,350.0,40.0,15.0,18.0,0.0,1.0,1.0,0.921846
452,연어 샐러드,250.0,10.0,20.0,15.0,0.0,1.0,1.0,0.894462
498,훈제 연어 샐러드,200.0,0.0,22.0,12.0,0.0,1.0,1.0,0.846769
231,파전,187.0,25.0,5.0,8.0,0.0,1.0,1.0,0.811436
413,애호박새우전,115.0,9.0,6.0,6.0,0.0,1.0,1.0,0.768092
...,...,...,...,...,...,...,...,...,...
164,뼈해장국,624.0,23.0,61.0,33.0,1.0,0.0,0.0,0.000800
96,짜장라면,982.0,153.0,29.0,36.0,0.0,0.0,0.0,-0.063877
197,닭갈비,749.0,33.0,70.0,39.0,1.0,0.0,0.0,-0.070431
90,크림소스스파게티,1238.0,129.0,29.0,68.0,0.0,0.0,0.0,-0.235528


# food recommendation with constraints

In [None]:
remaining_calories = user_input['remaining_calories']
remaining_carbs = user_input['remaining_carbs']
remaining_protein = user_input['remaining_protein']
remaining_fat = user_input['remaining_fat']

# Visit foods by recommendation score, best first; ties go to the lower-calorie food.
sorted_foods = df.sort_values(by=['Recommendation_Score', '에너지(kcal)'], ascending=[False, True])

# Initialize list to store selected foods and track total nutrients
selected_foods = []
total_calories, total_carbs, total_protein, total_fat = 0, 0, 0, 0

# greedy approach: walk the ranked list once and take every food that still fits
for _, row in sorted_foods.iterrows():
    food_calories = row['에너지(kcal)']
    food_carbs = row['탄수화물(g)']
    food_protein = row['단백질(g)']
    food_fat = row['지방(g)']

    # A food fits only if ALL four running totals stay within the remaining
    # allowances. (The previous `a and b or c and d` form accepted a food when
    # just the calorie+carb pair or just the protein+fat pair fitted.)
    fits_budget = (
            total_calories + food_calories <= remaining_calories and
            total_carbs + food_carbs <= remaining_carbs and
            total_protein + food_protein <= remaining_protein and
            total_fat + food_fat <= remaining_fat
    )
    if not fits_budget:
        continue

    # Add food to selected list and update totals
    selected_foods.append(row['음식명'])
    total_calories += food_calories
    total_carbs += food_carbs
    total_protein += food_protein
    total_fat += food_fat

print("Selected Foods for User:")
print(selected_foods)
print(
    f"Total Calories: {total_calories}, Total Carbs: {total_carbs}, Total Protein: {total_protein}, Total Fat: {total_fat}")

# TODO: approach to recommend matching foods -> need to define the categories.. but how?

# unused failed attempts

In [None]:


# Baseline classifiers, each fitted on the unscaled training split and scored
# on the unscaled test split. The model variables stay bound for later cells.
svm_model = SVC(decision_function_shape='ovr')  # 'ovr' is one-vs-rest, which works for multi-class
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)
knn_model = KNeighborsClassifier(n_neighbors=5)
mlp_model = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500)

# Fit, predict and report one estimator at a time, in the listed order.
for label, estimator in (
        ("SVM", svm_model),
        ("Logistic Regression", log_reg_model),
        ("KNN", knn_model),
        ("MLP", mlp_model),
):
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    print(f"{label} Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Earlier version of the query, kept for reference; its keys use different
# names (칼로리, 탄수화물, 단백질, 지방) from the column names used in user_input2 below.
# user_input = {
#     '칼로리': 1000,
#     '탄수화물': 50,
#     '단백질': 30,
#     '지방': 43,
#     '고기': 1.0,
#     '채소': 1.0,
#     '해산물': 0.0
# }
# Query food keyed by the feature column names used for X.
user_input2 = {
    '에너지(kcal)': 1000,
    '탄수화물(g)': 50,
    '단백질(g)': 30,
    '지방(g)': 20,
    '고기': 1.0,
    '채소': 1.0,
    '해산물': 0.0
}
# One-row DataFrame built from the query; the cells below pass it to predict().
user_df = pd.DataFrame([user_input2])

In [None]:
# svm_model was fitted on the Diabetes_Friendliness labels, so predict() yields
# a class (0/1/2), not a row label. The class is shown directly; the previous
# df.loc[..., '음 식 명'] lookup also relied on the old, space-separated column name.
svm_model.predict(user_df)[0]

In [None]:
# log_reg_model was fitted on the Diabetes_Friendliness labels, so predict()
# yields a class (0/1/2), not a row label. The class is shown directly; the previous
# df.loc[..., '음 식 명'] lookup also relied on the old, space-separated column name.
log_reg_model.predict(user_df)[0]

In [None]:
# knn_model was fitted on the Diabetes_Friendliness labels, so predict() yields
# a class (0/1/2), not a row label. The class is shown directly; the previous
# df.loc[..., '음 식 명'] lookup also relied on the old, space-separated column name.
knn_model.predict(user_df)[0]

In [None]:
# mlp_model was fitted on the Diabetes_Friendliness labels, so predict() yields
# a class (0/1/2), not a row label. The class is shown directly; the previous
# df.loc[..., '음 식 명'] lookup also relied on the old, space-separated column name.
mlp_model.predict(user_df)[0]

In [None]:
import joblib

# Save the model
joblib.dump(model, 'recommend_food.joblib')
# Load the model back from the same file
loaded_model = joblib.load('recommend_food.joblib')


In [None]:


# 해당 사람은 이미 800칼로리 섭취하고 고기 1번, 채소 1번 해산물 3번 섭취함

# 데이터프레임으로 변환

In [None]:
# Define the feature variables and the target variable.
# NOTE(review): these column names (칼로리, 탄수화물, 단백질, 지방, 식단 이름) differ from
# the ones the earlier cells read from df (에너지(kcal), 탄수화물(g), ..., 음식명) --
# presumably written for an older dataset; confirm before running.
x = df[['칼로리', '탄수화물', '단백질', '지방', '고기', '채소', '해산물']]
y = df['식단 이름']

In [None]:


# The assignment had lost its right-hand side, which is a syntax error. The notes
# at the end of this notebook describe this system as a decision tree trained on
# the food list, so that estimator is restored here.
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(random_state=42)
# The target is the row label of each food, so predict() returns a label that
# can be looked up in df.
model.fit(x, y.index)

In [None]:
# Use the first predicted value as a row label and read that row's '식단 이름' value.
# NOTE(review): presumably relies on `model` having been fitted on row labels (y.index) -- confirm.
df.loc[model.predict(user_df)[0], '식단 이름']

In [None]:
# Save the model
import joblib

# Save the model
joblib.dump(model, 'recommend_food.joblib')
# Load the model
loaded_model = joblib.load('recommend_food.joblib')

# Predict with the loaded model
predictions = loaded_model.predict(user_df)

In [None]:
# Use the first predicted value as a row label and read that row's '식단 이름' value.
# NOTE(review): presumably relies on `model` having been fitted on row labels (y.index) -- confirm.
df.loc[model.predict(user_df)[0], '식단 이름']

```text
- recommend_food_list : 
당뇨병 환자들을 위한 건강식 목록

- 식단 추천 시스템 : 
decision tree에 recommend_food_list를 학습시킴

- user_input 
칼로리 : 사용자가 먹을 수 있는 잔여 칼로리
ex) 하루 권장 섭취 칼로리 : 1800
이미 먹은 칼로리가 800이면 1800-800 -> 1000 
- 탄단지도 똑같이 남은 것으로 계산
- 고기, 채소, 해산물 : 기존에 사용자가 고기를 많이 먹었으면
  추천 식단은 해산물 위주로 하기 위해서 설정함
->고기 2번, 채소 1번, 해산물 0번 먹었으면 
max(고기, 채소, 해산물)에서 각각 먹은 횟수를 뺀 값

-> 사용자가 얼마나 고기 채소 해산물을 먹었는지는
food_calories.csv 데이터를 반영해서 적용함
-> db에 고기, 채소, 해산물정보 추가 필요
```


