# 비만 데이터와 운동데이터를 이용한 1차 추천점수

In [78]:
# 비만데이터 불러오기

import pandas as pd
import numpy as np

df_ob = pd.read_csv('./Obesti_WHA.csv', encoding='cp949')
df_ob.head()

Unnamed: 0,Weight,Age,Height,level
0,64.0,21.0,1.62,2
1,56.0,21.0,1.52,2
2,77.0,23.0,1.8,2
3,87.0,27.0,1.8,3
4,89.8,22.0,1.78,3


In [79]:
# 운동데이터 불러오기

df_ex = pd.read_csv('./exercise_level.csv')
df_ex.head()

Unnamed: 0,exercise,Calories per kg,level
0,"Cycling, mountain bike, bmx",1.75073,3
1,"Cycling, <10 mph, leisure bicycling",0.823236,1
2,"Cycling, >20 mph, racing",3.294974,4
3,"Cycling, 10-11.9 mph, light",1.234853,3
4,"Cycling, 12-13.9 mph, moderate",1.647825,3


In [80]:
# user_item_matrix 생성
user_item_matrix = []

for _, user in df_ob.iterrows():
    user_scores = []
    
    for _, exercise in df_ex.iterrows():
        score = abs(user['level'] - exercise['level'])
        score = max(4 - score ,0) # 점수가 음수가 되는 것을 방지하기 위해 최소값을 0으로 설정합니다.
        user_scores.append(score)
        
    user_item_matrix.append(user_scores)

user_item_matrix_df = pd.DataFrame(user_item_matrix,
                                   columns=df_ex['exercise'],
                                   index=df_ob.index)

print(user_item_matrix_df)


exercise  Cycling, mountain bike, bmx  Cycling, <10 mph, leisure bicycling   
0                                 3.0                                  3.0  \
1                                 3.0                                  3.0   
2                                 3.0                                  3.0   
3                                 4.0                                  2.0   
4                                 4.0                                  2.0   
...                               ...                                  ...   
2106                              3.0                                  1.0   
2107                              3.0                                  1.0   
2108                              3.0                                  1.0   
2109                              3.0                                  1.0   
2110                              3.0                                  1.0   

exercise  Cycling, >20 mph, racing  Cycling, 10-11.9 mph, light

In [81]:
# 사후확률 user_item_matrix를 위한 copy
import copy

# user_item_matrix 복사
user_item_matrix_2 = copy.deepcopy(user_item_matrix_df)

# user_item_matrix_2 초기화: 모든 확률 동일하게 설정
for exercise in user_item_matrix_2.columns:
    user_item_matrix_2[exercise] = 1 / len(user_item_matrix_2.columns)

print(user_item_matrix_2)

exercise  Cycling, mountain bike, bmx  Cycling, <10 mph, leisure bicycling   
0                            0.004032                             0.004032  \
1                            0.004032                             0.004032   
2                            0.004032                             0.004032   
3                            0.004032                             0.004032   
4                            0.004032                             0.004032   
...                               ...                                  ...   
2106                         0.004032                             0.004032   
2107                         0.004032                             0.004032   
2108                         0.004032                             0.004032   
2109                         0.004032                             0.004032   
2110                         0.004032                             0.004032   

exercise  Cycling, >20 mph, racing  Cycling, 10-11.9 mph, light

# SVD(특이값 분해 알고리즘)사용하여 특정 ID당 상위 10%와 하위 10%의 운동중 1가지씩 랜덤으로 추천하게끔 나머지는 그냥 랜덤
# 처음 추천 시

In [82]:
# Python Surprise 라이브러리를 사용하여 SVD 알고리즘을 적용하려면, 먼저 데이터를 Surprise 라이브러리가 인식할 수 있는 형태로 변환해야 함


from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
import random

# Surprise 라이브러리에서 요구하는 데이터 형식으로 변환하기 위한 과정입니다.
# userID와 ExerciseName을 문자열로 변환
data = []
for user_id in user_item_matrix_df.index:
    for exercise_name in user_item_matrix_df.columns:
        score = user_item_matrix_df.loc[user_id, exercise_name]
        data.append((user_id, str(exercise_name), score))

# Rating Scale은 추천 점수의 범위인 0~4입니다.
reader = Reader(rating_scale=(0, 4))
data = Dataset.load_from_df(pd.DataFrame(data, columns=["UserID", "ExerciseName", "Score"]), reader)

trainset = data.build_full_trainset()

# SVD 모델 학습
model = SVD()
model.fit(trainset)

# user-item matrix)을 Surprise 패키지가 요구하는 (user id, item id, rating) 형태의 리스트로 변환
# 그 후 Reader 클래스를 이용하여 점수의 범위를 설정
# Dataset.load_from_df() 함수로 데이터 프레임을 로드
# 전체 데이터셋에 대해 학습을 진행

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2636dda4bd0>

In [83]:
print('Number of users: ', trainset.n_users)
print('Number of items: ', trainset.n_items)

Number of users:  2111
Number of items:  248


In [84]:
# userID가 6인 사용자에 대한 예측 결과
testset = trainset.build_testset()
testset_user6 = [x for x in testset if x[0] == 6]
predictions = model.test(testset_user6)

In [85]:
# 상위 및 하위 10% 계산 
top_10_percent_threshold = np.percentile([pred.est for pred in predictions], 90)
bottom_10_percent_threshold = np.percentile([pred.est for pred in predictions], 10)

# 상위 및 하위 예측 선택 
top_preds = [pred for pred in predictions if pred.est >= top_10_percent_threshold]
bottom_preds= [pred for pred in predictions if pred.est <= bottom_10_percent_threshold]

random.shuffle(top_preds)
random.shuffle(bottom_preds)

recommendation_top1= top_preds[:1]
recommendation_bottom1 = bottom_preds[:1]

# 전체 예측 중에서 랜덤으로 두 개 선택
random.shuffle(predictions)
recommendations_random2 = predictions[:2]

# 추천 목록 합치기
recommendations = recommendation_top1 + recommendation_bottom1 + recommendations_random2

for pred in recommendations:
    print(f"Recommended exercise: {pred.iid}, Estimated score: {pred.est}")

Recommended exercise: Martial arts, kick boxing, Estimated score: 3.7771559494657105
Recommended exercise: Running, 8 mph (7.5 min mile), Estimated score: 2.0383583202073767
Recommended exercise: Tai chi, Estimated score: 3.008815926441795
Recommended exercise: Walking, under 2.0 mph, very slow, Estimated score: 3.0135897815248747


In [86]:
a = []
for pred in recommendations:
    # list에 차례로 집어넣기
    a.append(pred.iid)
print(a)

['Martial arts, kick boxing', 'Running, 8 mph (7.5 min mile)', 'Tai chi', 'Walking, under 2.0 mph, very slow']


In [87]:
# 사용자로부터 운동 선택 받기
def get_choices(user_id):
    choices = {}
    for exercise in a:
        choice = input(f"Did you choose the exercise {exercise}? (yes/no): ")
        if choice.lower() == 'yes':
            if exercise in choices:
                choices[exercise] += 1
            else:
                choices[exercise] = 1
    return choices

In [88]:
choices = {}
choses_sum = 0
for exercise in a:
    choice = input(f"Did you choose the exercise {exercise}? (yes/no): ")
    if choice.lower() == 'yes':
        choses_sum += 1
        if exercise in choices:
            choices[exercise] += 1
        else:
            choices[exercise] = 1

In [89]:
print(choices.values())

dict_values([1, 1, 1])


In [90]:
print(choses_sum)

3


In [91]:
print(choices.items())

dict_items([('Martial arts, kick boxing', 1), ('Running, 8 mph (7.5 min mile)', 1), ('Walking, under 2.0 mph, very slow', 1)])


# user_item_matrix_2 업데이트: 선택된 운동에 대해 점수 증가
def update_scores(user_id, choices):
    total_choices = choses_sum
    for exercise, count in choices.items():
        old_score = user_item_matrix_2.loc[user_id, exercise]
        print(old_score)
        new_score = old_score + (count / total_choices)  # 비율에 따라 점수 증가
        print(new_score)
        user_item_matrix_2.loc[user_id, exercise] = new_score

In [92]:
def update_scores(user_id, choices):
    total_choices = sum(choices.values())
    for exercise, count in choices.items():
        # 사후확률 계산하여 업데이트
        new_score = count / total_choices
        user_item_matrix_2.loc[user_id, exercise] = new_score

In [93]:
update_scores(6, choices)

# 1번째 이후 추천

In [94]:
min_rating = user_item_matrix_2.min().min()
max_rating = user_item_matrix_2.max().max()
print(min_rating, max_rating)

0.004032258064516129 0.3333333333333333


In [96]:
#user_item_matrix_2 인덱스 확인
print(user_item_matrix_2.index)

RangeIndex(start=0, stop=2111, step=1)


In [97]:
print(user_item_matrix.index)

<built-in method index of list object at 0x000002637D880440>


In [109]:
print(user_item_matrix_df.columns[user_item_matrix_df.loc[10] == 1])

Index(['Cycling, <10 mph, leisure bicycling', 'Unicycling',
       'Stationary cycling, very light', 'Calisthenics, light',
       'Weight lifting, light workout', 'Rowing machine, light',
       'Aerobics, low impact', 'Stretching, hatha yoga', 'Mild stretching',
       'Water aerobics',
       ...
       'Gardening, general', 'Bagging grass, leaves',
       'Watering lawn or garden', 'Weeding, cultivating garden',
       'Carpentry, general', 'General cleaning', 'Cleaning, dusting',
       'Taking out trash', 'Walking, pushing a wheelchair',
       'Teach physical education,exercise class'],
      dtype='object', name='exercise', length=105)


In [101]:
print(user_item_matrix_df.loc[6])

exercise
Cycling, mountain bike, bmx                3.0
Cycling, <10 mph, leisure bicycling        3.0
Cycling, >20 mph, racing                   2.0
Cycling, 10-11.9 mph, light                3.0
Cycling, 12-13.9 mph, moderate             3.0
                                          ... 
General cleaning                           3.0
Cleaning, dusting                          3.0
Taking out trash                           3.0
Walking, pushing a wheelchair              3.0
Teach physical education,exercise class    3.0
Name: 6, Length: 248, dtype: float64


In [121]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split


data_2 = []
for user_id in user_item_matrix_2.index:
    for exercise_name in user_item_matrix_2.columns:
        score = user_item_matrix_2.loc[user_id, exercise_name]
        data_2.append((user_id, str(exercise_name), score))
# 데이터 로딩
reader = Reader(rating_scale=(min_rating, max_rating)) # rating_scale: 평점의 최소값과 최대값을 지정
data_2 = Dataset.load_from_df(pd.DataFrame(data_2, columns=["UserID", "ExerciseName", "Score"]), reader)

trainset = data.build_full_trainset()

# SVD 알고리즘 사용
algo = SVD()

# 모델 학습
algo.fit(trainset)

def make_recommendation_after(user_id):

    # userID가 6인 사용자에 대한 예측 결과
    testset = trainset.build_testset()
    testset_user6 = [x for x in testset if x[0] == user_id]
    predictions = model.test(testset_user6)
    
    # user_item_matrix에서 rating 점수가 각각 1-2인 운동, 3인 운동, 4인 운동 중에서 랜덤으로 선택
    ratings = [1, 2, 3, 4]
    preper_from_each_rating = []
   
    
    for rating in ratings:
         exercises_with_this_ratings=user_item_matrix_df.columns[user_item_matrix_df.loc[user_id]==rating].tolist()
         
         if len(exercises_with_this_ratings)>0 :
             selected_exercise=np.random.choice(exercises_with_this_ratings)
             preper_from_each_rating.append(selected_exercise)

    # 사후확률 top10%에 해당하는 운동 중에서 랜덤으로 1개 선택
    top_10_percent_threshold = np.percentile([pred.est for pred in predictions], 90)

    top_preds = [pred for pred in predictions if pred.est >= top_10_percent_threshold]

    random.shuffle(top_preds)

    recommendation_top1= [top_preds[0].iid]

    # preper_from_each_rating과 recommendation_top1을 합쳐서 최종 추천 목록 생성
    recommendations = preper_from_each_rating + recommendation_top1

    return recommendations

print(make_recommendation_after(6))

['Running, 10.9 mph (5.5 min mile)', 'Mild stretching', 'Soccer, competitive', 'Crew, sculling, rowing, competition']


In [122]:
top_preds[0].iid

'Martial arts, kick boxing'