In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [None]:
pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163752 sha256=aaaa13e1abf7546feccb521dad4ee4f392bfb9bf66b96d5d86355cd5850cdff4
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [None]:
from surprise import Reader, Dataset, SVD, KNNWithMeans
from surprise.model_selection import train_test_split
from surprise import accuracy

In [None]:
df = pd.read_csv('/content/sampled_result_encoded_500000.csv')
df.head()

Unnamed: 0,UserID,StreamID,StreamerName,TimeStart,TimeStop,WatchTime,encoded_StreamerName,StartTime,StopTime
0,49907,33938860608,alcasthq,1105,1106,10,2331,16:10,16:20
1,59803,33915106176,stray228,847,848,10,73836,21:10,21:20
2,13819,33904450096,senseifaruk,741,743,20,68363,03:30,03:50
3,32503,34402554000,xgladd,6055,6058,30,85835,01:10,01:40
4,38317,34125070304,dafran,3102,3103,10,15825,13:00,13:10


In [None]:
# 이상치 제거 함수
# Function to remove outliers from specified columns in a DataFrame
def get_outlier(df, column, weight=1.5):
    for c in column:
        quantile_25 = np.percentile(df[c].values, 25)
        quantile_75 = np.percentile(df[c].values, 75)

        iqr = quantile_75 - quantile_25
        iqr_weight = iqr * weight

        lowest = quantile_25 - iqr_weight
        highest = quantile_75 + iqr_weight

        outlier_idx = df[c][(df[c] < lowest) | (df[c] > highest)].index
        df = df.drop(outlier_idx)
    return df

# 데이터 전처리 함수
# Data preprocessing function to clean and filter the dataset
# Removing users with a low number of interactions
# Removing streams with low viewership
# Removing outliers based on watch time
def preprocess(df, weight=1.5, percent=0.1):
    # Removing users with a low number of interactions
    user_counts = df['UserID'].value_counts().reset_index()
    user_counts.columns = ['UserID', 'count']
    top_percent_value = user_counts['count'].quantile(1 - percent)
    filtered_users = user_counts[user_counts['count'] < top_percent_value]
    df = df[~df['UserID'].isin(filtered_users['UserID'])]

    # Removing streams with low viewership
    stream_counts = df['StreamID'].value_counts().reset_index()
    stream_counts.columns = ['StreamID', 'count']
    top_percent_value = stream_counts['count'].quantile(1 - percent)
    filtered_streams = stream_counts[stream_counts['count'] < top_percent_value]
    df = df[~df['StreamID'].isin(filtered_streams['StreamID'])]

    # Removing outliers based on watch time
    df['WatchTime'] = (df['TimeStop'] - df['TimeStart']) * 10
    df = get_outlier(df, ['WatchTime'], weight)
    df = df.reset_index(drop=True)

    return df

# 데이터 샘플링 및 전처리
df_cleaned = preprocess(df, 1.5, 0.1)
# Applying preprocessing to the entire dataset and a sampled subset
sampled_df = df.sample(n=150000, random_state=42)
sampled_df_cleaned = preprocess(sampled_df, 1.5, 0.1)

df_cleaned, sampled_df_cleaned


(       UserID     StreamID  StreamerName  TimeStart  TimeStop  WatchTime  \
 0       38317  34125070304        dafran       3102      3103         10   
 1       36286  34392526368       solaaaa       5872      5873         10   
 2       28019  33915571520      rainbow6        855       856         10   
 3        6603  34320625760  timthetatman       5153      5154         10   
 4       97815  33894454656   moonmoon_ow        623       624         10   
 ...       ...          ...           ...        ...       ...        ...   
 42378   73123  34410333264    nanajam777       6102      6104         20   
 42379   15112  33995804480     innocents       1699      1700         10   
 42380   37801  34040314016          chap       2178      2179         10   
 42381   74334  34152472176    methodjosh       3434      3435         10   
 42382   17790  33844677744      dasmehdi        155       157         20   
 
        encoded_StreamerName StartTime StopTime  
 0                     1

In [None]:
# Converting 'StartTime' and 'StopTime' to datetime objects and then formatting as strings to display only hours and minutes
sampled_df_cleaned['StartTime'] = pd.to_datetime(sampled_df_cleaned['StartTime'], format='%H:%M').dt.strftime('%H:%M')
sampled_df_cleaned['StopTime'] = pd.to_datetime(sampled_df_cleaned['StopTime'], format='%H:%M').dt.strftime('%H:%M')

# Extracting time of day features
sampled_df_cleaned['TimeOfDay'] = pd.to_datetime(sampled_df_cleaned['StartTime'], format='%H:%M').dt.hour.apply(
    lambda x: 'Morning' if 5 <= x < 12 else ('Afternoon' if 12 <= x < 17 else ('Evening' if 17 <= x < 21 else 'Night'))
)

# Display the first few rows to verify the changes
sampled_df_cleaned.head()


Unnamed: 0,UserID,StreamID,StreamerName,TimeStart,TimeStop,WatchTime,encoded_StreamerName,StartTime,StopTime,TimeOfDay
0,78354,33868545328,elmiillor,412,413,10,21504,20:40,20:50,Evening
1,2974,34308736544,asmongold,5006,5007,10,5111,18:20,18:30,Evening
2,93089,33877459952,playhearthstone,506,508,20,60525,12:20,12:40,Afternoon
3,50653,34106277712,iddqd,2902,2903,10,32669,03:40,03:50,Night
4,32981,34334225232,timthetatman,5301,5304,30,78888,19:30,20:00,Evening


In [None]:
# 시간대별 시청 패턴 매트릭스 생성
# Creating a viewing pattern matrix by time zone
time_of_day_watching = sampled_df_cleaned.groupby(['UserID', 'TimeOfDay'])['WatchTime'].sum().unstack(fill_value=0)
time_of_day_watching = time_of_day_watching.div(time_of_day_watching.sum(axis=1), axis=0)

In [None]:
from scipy.sparse import csr_matrix

# User ID와 encoded_StreamerName에 대한 숫자 인덱스 할당
user_ids = sampled_df_cleaned['UserID'].astype('category').cat.codes
streamer_names = sampled_df_cleaned['StreamerName'].astype('category').cat.codes

# CSR 행렬 생성
watch_time = sampled_df_cleaned['WatchTime'].astype(float)
user_item_matrix = csr_matrix((watch_time, (user_ids, streamer_names)),
                              shape=(user_ids.max() + 1, streamer_names.max() + 1))

#스트리머 이름과 인덱스 매핑
streamer_index_to_name_mapping = dict(enumerate(sampled_df_cleaned['StreamerName'].astype('category').cat.categories))

# CSR 행렬 확인
print(user_item_matrix)


  (0, 1209)	10.0
  (0, 1211)	10.0
  (1, 245)	10.0
  (1, 1598)	10.0
  (2, 72)	10.0
  (2, 646)	20.0
  (2, 1634)	10.0
  (3, 381)	10.0
  (3, 1256)	20.0
  (3, 1522)	10.0
  (4, 111)	10.0
  (4, 376)	10.0
  (4, 1136)	20.0
  (4, 1711)	10.0
  (5, 566)	10.0
  (6, 372)	30.0
  (7, 108)	10.0
  (7, 413)	20.0
  (7, 933)	10.0
  (7, 1366)	10.0
  (8, 327)	20.0
  (8, 463)	20.0
  (8, 662)	10.0
  (8, 1635)	10.0
  (9, 1634)	20.0
  :	:
  (7152, 1804)	10.0
  (7153, 600)	10.0
  (7154, 312)	10.0
  (7155, 1670)	30.0
  (7156, 364)	20.0
  (7156, 642)	10.0
  (7156, 1151)	20.0
  (7157, 1299)	10.0
  (7157, 1474)	10.0
  (7158, 1317)	20.0
  (7158, 1735)	20.0
  (7159, 413)	10.0
  (7159, 1518)	10.0
  (7159, 1580)	30.0
  (7159, 1635)	10.0
  (7160, 737)	20.0
  (7160, 1048)	30.0
  (7160, 1444)	20.0
  (7160, 1634)	10.0
  (7161, 27)	30.0
  (7161, 608)	20.0
  (7161, 1145)	10.0
  (7161, 1521)	30.0
  (7162, 986)	20.0
  (7162, 1137)	10.0


In [None]:
from surprise import Reader, Dataset, SVD, KNNWithMeans
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy
# Surprise에 사용하기 위한 데이터셋/ CSR 행렬에서 데이터프레임으로 변환
data_df = pd.DataFrame({
    'uid': user_ids,
    'iname': streamer_names,
    'rating': watch_time
})

# Surprise의 Reader 객체 생성/Rating scale은 최소 및 최대 시청 시간
reader = Reader(rating_scale=(data_df['rating'].min(), data_df['rating'].max()))

data = Dataset.load_from_df(data_df[['uid', 'iname', 'rating']], reader)

trainset, testset = train_test_split(data, test_size=0.25)


In [None]:
# SVD 모델 훈련(Gradient Descent, Regularization 최적화를 수행)
# 행렬 분해 기반 협업 필터링
# 사용자와 아이템 간의 상호작용(시청 시간)을 기반으로 행렬을 분해하고, 사용자가 아직 평가하지 않은 아이템에 대한 예측 평가를 생성.
svd = SVD()
svd.fit(trainset)

# KNNWithMeans 모델 훈련
# 이웃 기반의 협업 필터링
# 유사한 시청기록, 유사도를 바탕으로 사용자가 평가하지 않은 아이템에 대한 평가를 예측
knn = KNNWithMeans()
knn.fit(trainset)

# 훈련 모델 평가
svd_predictions = svd.test(testset)
knn_predictions = knn.test(testset)

# RMSE 모델 평가
svd_rmse = accuracy.rmse(svd_predictions)
knn_rmse = accuracy.rmse(knn_predictions)

print(f'SVD Model RMSE: {svd_rmse}')
print(f'KNNWithMeans Model RMSE: {knn_rmse}')

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 7.2200
RMSE: 8.3684
SVD Model RMSE: 7.2200031992314395
KNNWithMeans Model RMSE: 8.368444813795128


In [None]:
# 모델의 예측을 조합하고 시간대 정보를 추가
def combine_predictions_with_time(svd_model, knn_model, time_pref_matrix, testset, current_time_of_day):
    combined_predictions = defaultdict(list)
    for svd_pred, knn_pred in zip(svd_model.test(testset), knn_model.test(testset)):
        uid = svd_pred.uid
        iid = svd_pred.iid
        avg_score = (svd_pred.est + knn_pred.est) / 2
        # 시간대 가중치를 적용
        time_weighted_score = avg_score * time_pref_matrix.loc[uid, current_time_of_day] if uid in time_pref_matrix.index else avg_score
        combined_predictions[uid].append((iid, time_weighted_score))
    return combined_predictions

# 현재 시간대 설정
current_time_of_day = 'Evening'

# 최종 결합된 예측 점수를 계산
final_predictions = combine_predictions_with_time(svd, knn, time_of_day_watching, testset, current_time_of_day)

In [None]:
# 최종 추천 목록을 생성하고, 스트리머 이름으로 출력하는 함수
# Function to generate top N recommendations for each user
def get_top_n_recommendations(predictions, n=3):
    top_n_recommendations = defaultdict(list)
    for uid, user_predictions in predictions.items():
        user_predictions.sort(key=lambda x: x[1], reverse=True)
        top_n_recommendations[uid] = user_predictions[:n]
    return top_n_recommendations

top_n_recommendations = get_top_n_recommendations(final_predictions, n=3)

# Printing the top recommendations for each user
for uid, user_recs in top_n_recommendations.items():
    print(f"User {uid}:")
    for iid, score in user_recs:
        streamer_name = streamer_index_to_name_mapping[iid]
        print(f"  Recommended Streamer: {streamer_name}, Score: {score:.2f}")


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
  Recommended Streamer: fortnite, Score: 19.94
  Recommended Streamer: highdistortion, Score: 19.23
User 6718:
  Recommended Streamer: tfblade, Score: 14.39
User 4768:
  Recommended Streamer: luladopub, Score: 7.22
User 1258:
  Recommended Streamer: playhearthstone, Score: 17.45
  Recommended Streamer: gamesdonequick, Score: 14.43
User 5859:
  Recommended Streamer: zerator, Score: 12.47
User 1834:
  Recommended Streamer: jahrein, Score: 11.85
User 488:
  Recommended Streamer: loeya, Score: 14.19
User 918:
  Recommended Streamer: c9sneaky, Score: 14.65
User 294:
  Recommended Streamer: dakotaz, Score: 14.60
  Recommended Streamer: dakotaz, Score: 14.60
User 5252:
  Recommended Streamer: shroud, Score: 22.29
  Recommended Streamer: dogdog, Score: 20.18
User 6294:
  Recommended Streamer: solaryfortnite, Score: 5.19
  Recommended Streamer: fortnite, Score: 4.93
  Recommended Streamer: gotaga, Score: 3.80
User 1059:
  Recommended Streamer: d

In [None]:
# Precision: 추천된 항목 중 실제로 사용자가 선호하는 항목의 비율
# Recall: 사용자가 선호하는 모든 항목 중 추천된 항목의 비율

def calculate_precision_recall(testset, predictions_dict, threshold, k):
    user_actual = defaultdict(set)
    user_predicted = defaultdict(list)

    # 테스트 세트에서 각 사용자별 실제 선호 항목(30분이상시청) 추출
    for uid, iid, true_r in testset:
        if true_r >= threshold:
            user_actual[uid].add(iid)

    # 추천된 항목을 사용자별로 저장
    for uid, user_ratings in predictions_dict.items():
        for iid, est in user_ratings:
            user_predicted[uid].append((iid, est))

    precisions = dict()
    recalls = dict()

    # 각 사용자에 대해 Precision과 Recall을 계산
    for uid in user_actual:
        # 상위 k개의 추천 항목
        user_pred_k = sorted(user_predicted[uid], key=lambda x: x[1], reverse=True)[:k]
        user_pred_k = set([iid for iid, _ in user_pred_k])

        # 실제 선호 항목과 추천 항목의 교집합
        tp = len(user_actual[uid].intersection(user_pred_k))

        # Precision과 Recall 계산
        precisions[uid] = tp / len(user_pred_k) if user_pred_k else 0
        recalls[uid] = tp / len(user_actual[uid]) if user_actual[uid] else 0

    # 전체 사용자에 대한 평균 Precision과 Recall
    avg_precision = sum(prec for prec in precisions.values()) / len(precisions)
    avg_recall = sum(rec for rec in recalls.values()) / len(recalls)

    return avg_precision, avg_recall

# 시청시간 threshold 30분으로 지정

# Setting the threshold 30 minutes and calculating average precision and recall
avg_precision, avg_recall = calculate_precision_recall(testset, final_predictions, threshold=30, k=3)

print(f"Average Precision: {avg_precision}")
print(f"Average Recall: {avg_recall}")


Average Precision: 0.7719454329774607
Average Recall: 0.9919928825622776
