In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("유저_관심순위.csv")
df = df.drop('Unnamed: 0',axis=1)

In [3]:
df.head()

Unnamed: 0,userId,관심레저활동1순위값,관심레저활동2순위값,관심레저활동3순위값,관심레저활동4순위값,관심레저활동5순위값
0,53322222,골프 직접 하기,국내 프로스포츠 영상 시청,산책-걷기,종교활동,국내 프로스포츠 현장 관람
1,53322236,영상 컨텐츠 시청,수다-잡담하기,쇼핑,독서,없음
2,53322268,쇼핑,영상 컨텐츠 시청,국내 여행,계모임-동창회-사교모임,산책-걷기
3,53322911,보드게임-퍼즐-바둑-장기-체스 등,가족-친지 만남,산책-걷기,맛집-카페 탐방,게임
4,53322912,쇼핑,수다-잡담하기,온천-사우나-찜질방,산책-걷기,계모임-동창회-사교모임


In [4]:
print(df['userId'].value_counts())

userId
53325421    7
53360515    7
53363542    7
53345899    7
53349503    6
           ..
53351231    1
53351207    1
53350745    1
53349791    1
53375645    1
Name: count, Length: 16199, dtype: int64


In [5]:
df[df['userId']==53349503]

Unnamed: 0,userId,관심레저활동1순위값,관심레저활동2순위값,관심레저활동3순위값,관심레저활동4순위값,관심레저활동5순위값
271,53349503,만화책 보기,없음,없음,없음,없음
6607,53349503,만화책 보기,낮잠자기,맛집-카페 탐방,없음,없음
7134,53349503,SNS -인터넷 커뮤니티 활동,없음,없음,없음,없음
14516,53349503,낮잠자기,친구-이성친구 만남,영화관 관람,어학-기술-자격증 취득,없음
21886,53349503,만화책 보기,기타 자기계발-자기관리,영화관 관람,미술관-전시관 관람,없음
28732,53349503,낮잠자기,만화책 보기,게임,산책-걷기,쇼핑


In [6]:
# 가중치 정의
weights = {
    '관심레저활동1순위값': 5,
    '관심레저활동2순위값': 4,
    '관심레저활동3순위값': 3,
    '관심레저활동4순위값': 2,
    '관심레저활동5순위값': 1
}

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31590 entries, 0 to 31589
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   userId      31590 non-null  int64 
 1   관심레저활동1순위값  31590 non-null  object
 2   관심레저활동2순위값  31590 non-null  object
 3   관심레저활동3순위값  31590 non-null  object
 4   관심레저활동4순위값  31590 non-null  object
 5   관심레저활동5순위값  31590 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.4+ MB


In [8]:
# 모든 유니크한 활동 목록 생성
all_activities = set()
for col in df.columns:
    if '관심레저활동' in col:
        all_activities.update(df[col].unique())
all_activities.discard('없음')
all_activities = list(all_activities)

# 사용자별 활동 점수 계산
user_activity_scores = {}

for idx, row in df.iterrows():
    user_id = row['userId']
    if user_id not in user_activity_scores:
        user_activity_scores[user_id] = {activity: {'total_score': 0, 'count': 0} for activity in all_activities}
    
    for col, weight in weights.items():
        activity = row[col]
        if activity != '없음':
            user_activity_scores[user_id][activity]['total_score'] += weight
            user_activity_scores[user_id][activity]['count'] += 1

# 평균 점수 계산 및 데이터프레임 생성
user_activity_df = pd.DataFrame(index=user_activity_scores.keys(), columns=all_activities)

for user_id, activities in user_activity_scores.items():
    for activity, scores in activities.items():
        if scores['count'] > 0:
            user_activity_df.at[user_id, activity] = scores['total_score'] / scores['count']
        else:
            user_activity_df.at[user_id, activity] = 0

# NaN 값을 0으로 변경
user_activity_df = user_activity_df.fillna(0)

print("사용자별 활동 평균 점수:")
print(user_activity_df)

# 전체 활동 평균 점수 계산
activity_avg_scores = user_activity_df.mean().sort_values(ascending=False)

print("\n전체 활동 평균 점수:")
print(activity_avg_scores)

사용자별 활동 평균 점수:
          수영 직접 하기  문학 직접 활동 및 수강하기  영상 컨텐츠 시청  음악 공연 관람  반려동물 돌보기  \
53322222       0.0              0.0        0.0       0.0       0.0   
53322236       0.0              0.0        5.0       0.0       0.0   
53322268       0.0              0.0        4.0       3.0       0.0   
53322911       0.0              0.0        0.0       0.0       0.0   
53322912       0.0              0.0        2.0       0.0       0.0   
...            ...              ...        ...       ...       ...   
53373763       0.0              0.0        0.0       0.0       0.0   
53374109       0.0              0.0        0.0       0.0       0.0   
53375079       0.0              0.0        0.0       0.0       0.0   
53375415       0.0              0.0        0.0       0.0       0.0   
53375645       0.0              0.0        0.0       0.0       0.0   

          국내 프로스포츠 영상 시청  축제 참가  연극-뮤지컬 관람  전통예술 직접 활동 및 수강하기  종교활동  ...  \
53322222             4.0    0.0        0.0                0.0   1.5 

In [9]:
len(user_activity_df.index.unique())

16199

In [11]:
user_activity_df

Unnamed: 0,수영 직접 하기,문학 직접 활동 및 수강하기,영상 컨텐츠 시청,음악 공연 관람,반려동물 돌보기,국내 프로스포츠 영상 시청,축제 참가,연극-뮤지컬 관람,전통예술 직접 활동 및 수강하기,종교활동,...,익스트림스포츠 직접하기,기타 관광-여행,SNS -인터넷 커뮤니티 활동,영상 컨텐츠 제작,문학 행사 관람,겨울레저스포츠 직접 하기,골프 영상 시청,요리-베이킹,산책-걷기,기타 문화예술관람
53322222,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,1.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.00,0.0
53322236,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0
53322268,0.0,0.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.00,0.0
53322911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.00,0.0
53322912,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.75,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53373763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.00,0.0
53374109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0
53375079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0
53375415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0


In [12]:
user_activity_df= user_activity_df.astype(float)

In [13]:
user_activity_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16199 entries, 53322222 to 53375645
Data columns (total 85 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   수영 직접 하기              16199 non-null  float64
 1   문학 직접 활동 및 수강하기       16199 non-null  float64
 2   영상 컨텐츠 시청             16199 non-null  float64
 3   음악 공연 관람              16199 non-null  float64
 4   반려동물 돌보기              16199 non-null  float64
 5   국내 프로스포츠 영상 시청        16199 non-null  float64
 6   축제 참가                 16199 non-null  float64
 7   연극-뮤지컬 관람             16199 non-null  float64
 8   전통예술 직접 활동 및 수강하기     16199 non-null  float64
 9   종교활동                  16199 non-null  float64
 10  쇼핑                    16199 non-null  float64
 11  만화책 보기                16199 non-null  float64
 12  박물관 관람                16199 non-null  float64
 13  해외 프로스포츠 영상 시청        16199 non-null  float64
 14  자전거-싸이클-산악자전거 직접 타기   16199 non-null  float64
 15  음악 직접 활동 및 수강하

In [14]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

def sgd_plus_plus(R, K, steps=100, gamma=0.007, lambda_user=0.005, lambda_item=0.005):
    num_users, num_items = R.shape
    
    # Initialize user and item latent feature matrices
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))
    
    # Initialize biases
    bu = np.zeros(num_users)
    bi = np.zeros(num_items)
    
    # Create a list of training samples
    samples = [(i, j, R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] > 0]
    
    # Stochastic gradient descent
    for step in range(steps):
        np.random.shuffle(samples)
        
        for i, j, r in samples:
            # Compute prediction and error
            prediction = np.dot(P[i], Q[j]) + bu[i] + bi[j]
            e = r - prediction
            
            # Update biases
            bu[i] += gamma * (e - lambda_user * bu[i])
            bi[j] += gamma * (e - lambda_item * bi[j])
            
            # Update latent features
            P[i] += gamma * (e * Q[j] - lambda_user * P[i])
            Q[j] += gamma * (e * P[i] - lambda_item * Q[j])
        
        # Compute total error
        total_error = 0
        for i, j, r in samples:
            prediction = np.dot(P[i], Q[j]) + bu[i] + bi[j]
            total_error += (r - prediction) ** 2
        total_error += lambda_user * (np.sum(bu**2) + np.sum(np.sum(P**2)))
        total_error += lambda_item * (np.sum(bi**2) + np.sum(np.sum(Q**2)))
        
        print(f"Step {step+1}/{steps} - Total error: {total_error:.4f}")
        
        # Reduce learning rate
        gamma *= 0.95
        
    return P, Q, bu, bi

# 사용자 활동 데이터프레임 준비 (이전 코드에서 생성한 user_activity_df 사용)
R = user_activity_df.values

# NaN 값을 0으로 대체
R = np.nan_to_num(R)

# 행렬 분해 실행
K = 50  # 잠재 요인의 수
P, Q, bu, bi = sgd_plus_plus(R, K)

# 예측 행렬 생성
pred_matrix = np.dot(P, Q.T) + bu[:, np.newaxis] + bi

Step 1/100 - Total error: 160817.2887
Step 2/100 - Total error: 154746.6870
Step 3/100 - Total error: 151757.8608
Step 4/100 - Total error: 148928.8129
Step 5/100 - Total error: 144834.6340
Step 6/100 - Total error: 137411.5683
Step 7/100 - Total error: 126025.9173
Step 8/100 - Total error: 112117.7489
Step 9/100 - Total error: 98805.1570
Step 10/100 - Total error: 87151.2917
Step 11/100 - Total error: 77263.3446
Step 12/100 - Total error: 68801.9808
Step 13/100 - Total error: 61668.3985
Step 14/100 - Total error: 55817.0023
Step 15/100 - Total error: 50860.1616
Step 16/100 - Total error: 46696.8712
Step 17/100 - Total error: 43190.2946
Step 18/100 - Total error: 40174.4270
Step 19/100 - Total error: 37570.3281
Step 20/100 - Total error: 35311.9225
Step 21/100 - Total error: 33308.1981
Step 22/100 - Total error: 31545.6010
Step 23/100 - Total error: 29965.7900
Step 24/100 - Total error: 28550.0820
Step 25/100 - Total error: 27283.7350
Step 26/100 - Total error: 26124.3044
Step 27/100 -

In [15]:
pred_matrix

array([[3.90112507, 3.11004216, 3.97005277, ..., 3.50951859, 2.96873772,
        3.08714641],
       [3.75213245, 3.38018593, 4.86404584, ..., 3.19252179, 3.35509758,
        3.41251213],
       [2.963598  , 3.40938198, 3.7867695 , ..., 2.90627493, 1.04602682,
        3.32699935],
       ...,
       [3.94057029, 3.43564551, 3.61607195, ..., 3.21947628, 2.86910055,
        3.47186048],
       [3.13822895, 3.0798349 , 3.12795039, ..., 1.81086152, 3.11530541,
        3.28200236],
       [3.94861212, 3.5817692 , 3.31100701, ..., 3.59486672, 3.36160684,
        3.60719212]])

In [16]:
print("\n예측 행렬:")
print(pred_matrix)

# RMSE 계산
mask = R > 0
mse = mean_squared_error(R[mask], pred_matrix[mask])
rmse = np.sqrt(mse)
print(f"\nRMSE: {rmse:.4f}")


예측 행렬:
[[3.90112507 3.11004216 3.97005277 ... 3.50951859 2.96873772 3.08714641]
 [3.75213245 3.38018593 4.86404584 ... 3.19252179 3.35509758 3.41251213]
 [2.963598   3.40938198 3.7867695  ... 2.90627493 1.04602682 3.32699935]
 ...
 [3.94057029 3.43564551 3.61607195 ... 3.21947628 2.86910055 3.47186048]
 [3.13822895 3.0798349  3.12795039 ... 1.81086152 3.11530541 3.28200236]
 [3.94861212 3.5817692  3.31100701 ... 3.59486672 3.36160684 3.60719212]]

RMSE: 0.3552


In [17]:
pred_matrix_df = pd.DataFrame(pred_matrix, columns=user_activity_df.columns, index=user_activity_df.index)
pred_matrix_df.head()

Unnamed: 0,수영 직접 하기,문학 직접 활동 및 수강하기,영상 컨텐츠 시청,음악 공연 관람,반려동물 돌보기,국내 프로스포츠 영상 시청,축제 참가,연극-뮤지컬 관람,전통예술 직접 활동 및 수강하기,종교활동,...,익스트림스포츠 직접하기,기타 관광-여행,SNS -인터넷 커뮤니티 활동,영상 컨텐츠 제작,문학 행사 관람,겨울레저스포츠 직접 하기,골프 영상 시청,요리-베이킹,산책-걷기,기타 문화예술관람
53322222,3.901125,3.110042,3.970053,3.428037,2.460823,3.978343,2.21853,2.702163,2.960116,1.475879,...,3.74219,2.706662,3.170689,3.174083,3.106403,3.301727,2.714337,3.509519,2.968738,3.087146
53322236,3.752132,3.380186,4.864046,3.35776,2.966372,3.392649,3.15576,2.920281,3.344588,3.188529,...,4.040004,3.184226,3.177659,3.377873,3.515396,3.636813,3.160295,3.192522,3.355098,3.412512
53322268,2.963598,3.409382,3.786769,3.019266,3.536517,4.005479,3.051423,3.172852,3.171638,3.444474,...,3.908929,2.235304,2.684443,3.58288,3.136575,3.623751,3.037541,2.906275,1.046027,3.326999
53322911,4.191307,3.441244,3.786112,3.324956,3.950341,2.963904,3.292538,3.312461,3.247224,3.311733,...,4.071964,2.916988,3.780583,3.471895,3.353087,3.650606,3.065188,3.305563,3.035759,3.442237
53322912,3.381006,3.135195,2.11815,3.456791,3.048804,2.587473,2.347299,2.465954,3.105513,2.403204,...,3.870915,2.683991,2.34553,3.362446,3.075495,3.366414,3.173215,2.662225,3.709687,3.167153


In [18]:
def get_unseen_movies(ratings_matrix, userId):
    # userId로 입력받은 사용자의 모든 영화정보 추출하여 Series로 반환함. 
    # 반환된 user_rating 은 영화명(title)을 index로 가지는 Series 객체임. 
    user_rating = ratings_matrix.loc[userId,:]
    
    # user_rating이 0보다 크면 기존에 관람한 영화임. 대상 index를 추출하여 list 객체로 만듬
    already_seen = user_rating[ user_rating > 0].index.tolist()
    
    # 모든 영화명을 list 객체로 만듬. 
    movies_list = ratings_matrix.columns.tolist()
    
    # list comprehension으로 already_seen에 해당하는 movie는 movies_list에서 제외함. 
    unseen_list = [ movie for movie in movies_list if movie not in already_seen]
    
    return unseen_list

In [19]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    # 예측 평점 DataFrame에서 사용자id index와 unseen_list로 들어온 영화명 컬럼을 추출하여
    # 가장 예측 평점이 높은 순으로 정렬함. 
    recomm_movies = pred_df.loc[userId, unseen_list].sort_values(ascending=False)[:top_n]
    return recomm_movies

In [25]:
# 사용자가 관람하지 않는 영화명 추출   
unseen_list = get_unseen_movies(user_activity_df, 53349503)
# print(unseen_list)

# 아이템 기반의 인접 이웃 협업 필터링으로 영화 추천 
recomm_movies = recomm_movie_by_userid(pred_matrix_df, 53349503, unseen_list, top_n=10)
# print(recomm_movies)

# # 평점 데이타를 DataFrame으로 생성. 
recomm_movies = pd.DataFrame(data=recomm_movies.values, index=recomm_movies.index, columns=['pred_score'])
recomm_movies

Unnamed: 0,pred_score
골프 직접 하기,4.314515
수영 직접 하기,4.117108
구기스포츠 직접 하기,4.029661
익스트림스포츠 직접하기,4.027955
수상레저스포츠 직접 하기,3.919339
연극-뮤지컬 관람,3.797816
요가-필라테스 직접 하기,3.788838
댄스스포츠 직접 하기,3.785587
영상 컨텐츠 시청,3.715632
무도-격투기 직접 하기,3.709906
