In [None]:
import time
# Wall-clock start of the run; the final timing cell subtracts this from the
# current time to report the notebook's total duration.
start = time.time()

In [None]:
import pandas as pd
import numpy as np
import knn_improvements as knn_imp  # Import the improved helper functions
# Review table; later cells read its user_id, app_id and is_recommended columns.
reviews = pd.read_csv("final_reviews.csv")

In [None]:
# Per-game table; later cells look games up in it by their app_id column.
games_details = pd.read_csv("final_games.csv")
print(games_details)

In [None]:
# Quick look at the raw review table loaded above.
print(reviews)

In [None]:
# The current user's own ratings: one row per game with gameID and review columns.
your_games = pd.read_csv("your_games.csv")
print(your_games)

In [None]:
# Ids of the games whose review value is exactly 0.5.
# NOTE(review): 0.5 presumably marks "interested / wishlist" games - confirm.
interested_games_id = your_games.loc[your_games['review'].eq(0.5), 'gameID'].tolist()

print(interested_games_id)

In [None]:
# Ids of the games the user reviewed with -1, kept as a set for fast lookups.
bad_games_id = set(your_games.loc[your_games['review'].eq(-1), 'gameID'])

print(bad_games_id)

In [None]:
# Favourite games; only the gameID column is read later (to build fav_games_set).
fav_games = pd.read_csv("fav_games.csv")
print(fav_games)

In [None]:
# Every distinct game id found in your_games.csv, in ascending order.
my_games_id = sorted(your_games['gameID'].unique())
print(my_games_id)

In [None]:
# Games from your_games.csv that are NOT in interested_games_id (review != 0.5).
# getRecommendation() removes these ids from the final recommendation list.
# NOTE(review): despite the name, these look like the games the user already
# rated rather than unplayed ones - confirm the intended meaning.
# A set makes each membership test O(1) instead of scanning the list every time.
_interested_lookup = set(interested_games_id)
not_played_games_id = [game_id for game_id in my_games_id if game_id not in _interested_lookup]
print(not_played_games_id)

In [None]:
import math

# Keep only the reviews written by users who overlap enough with my games.
# 'reviews' is the DataFrame loaded from final_reviews.csv.

# Step 1: Number of games in my own list
num_games = len(my_games_id)

# IMPROVEMENT: use the data-driven threshold instead of a fixed formula
threshold = knn_imp.calculate_optimal_threshold(my_games_id, reviews, percentile=25)
print("Optimal threshold:", threshold)

# Step 2: Count, per user, how many reviews they wrote for games in my list
user_review_counts = (
    reviews[reviews['app_id'].isin(set(my_games_id))]
    .groupby('user_id')['app_id']
    .size()
)

# Step 3: Users who reviewed at least `threshold` of my games
relevant_users = user_review_counts[user_review_counts >= threshold].index

# Step 4: All reviews of those users. The explicit .copy() gives an independent
# frame, so the recoding below neither touches `reviews` nor raises
# SettingWithCopyWarning.
filtered_reviews = reviews[reviews['user_id'].isin(relevant_users)].copy()
# Recode True/False as +1/-1. Plain column assignment replaces the column, so
# the result gets its natural numeric dtype instead of being forced into the
# original boolean column (which is what `.loc[:, col] = ...` attempts).
filtered_reviews['is_recommended'] = filtered_reviews['is_recommended'].map({True: 1, False: -1})
# Display the filtered reviews
print(filtered_reviews)


In [None]:
# Count the number of unique users and games in filtered reviews
# Dimensions of the user/game matrix built in the next cell:
# number of distinct users (rows) and distinct games (columns).
num_unique_users = filtered_reviews['user_id'].nunique()
num_unique_games = filtered_reviews['app_id'].nunique()

# Display the counts
print(f"Number of unique users in filtered reviews: {num_unique_users}")
print(f"Number of unique games in filtered reviews: {num_unique_games}")


In [None]:
from scipy.sparse import csr_matrix

# Build the sparse user x game matrix of +1/-1 recommendations.

# Step 2: np.unique returns the sorted unique ids and, via return_inverse, the
# position of every review's id inside that sorted array. Those positions are
# exactly the row/column indices, computed without a Python-level loop.
_user_ids, row_indices = np.unique(filtered_reviews['user_id'].to_numpy(), return_inverse=True)
_app_ids, col_indices = np.unique(filtered_reviews['app_id'].to_numpy(), return_inverse=True)
user_id_list = list(_user_ids)
games_id_reviews = list(_app_ids)

# Step 3: Id -> index dictionaries, kept for code that needs single lookups
user_to_index = {user_id: idx for idx, user_id in enumerate(user_id_list)}
app_to_index = {app_id: idx for idx, app_id in enumerate(games_id_reviews)}

# Step 4: One matrix entry per review (row = user, column = game)
data = np.array(filtered_reviews['is_recommended'])

user_vector_sparse = csr_matrix((data, (row_indices, col_indices)), shape=(num_unique_users, num_unique_games))

print(user_vector_sparse)


In [None]:
# NOTE(review): this overwrites user_id_list (previously the sorted real user
# ids) with plain row positions 0..num_unique_users-1; after this cell the list
# can no longer map a matrix row back to a user id - confirm this is intended.
user_id_list = list(range(num_unique_users))
print(user_id_list)

In [None]:
# Build my own 1 x n_games rating vector in the same column space as
# user_vector_sparse.

# Column of every game id in the matrix. A dict lookup replaces the repeated
# `in` / `.index()` scans over the games_id_reviews list.
_column_of = {game_id: idx for idx, game_id in enumerate(games_id_reviews)}

# First review value recorded for each of my games (the first row wins when a
# game id appears more than once in your_games).
_first_review = {}
for _game_id, _review in zip(your_games['gameID'], your_games['review']):
    _first_review.setdefault(_game_id, _review)

_my_columns = []
_my_values = []
for game_id in my_games_id:
    # Games nobody among the relevant users reviewed have no column: skip them.
    if game_id not in _column_of:
        continue
    _my_columns.append(_column_of[game_id])
    _my_values.append(_first_review[game_id])

# Single-row sparse vector: every entry sits in row 0
my_vector = csr_matrix((_my_values, ([0] * len(_my_columns), _my_columns)), shape=(1, len(games_id_reviews)))


print(my_vector)


In [None]:
# Convert fav_games['gameID'] to a set for faster membership testing
fav_games_set = set(fav_games['gameID'])

# IMPROVEMENT: use the improved weight function (bug fix and simplification)
# Lower multipliers to keep the weights from becoming too large/small
weights = knn_imp.calculate_weights_improved(
    user_vector_sparse, 
    games_id_reviews, 
    fav_games_set, 
    bad_games_id,
    fav_weight_multiplier=1.5,  # Lowered from 2.0 to 1.5 to avoid overly large weights
    bad_weight_multiplier=0.7   # Raised from 0.5 to 0.7 to soften the penalty
)

# Summary statistics of the weights (later indexed by matrix row in getKnnVector)
print("Weights calculated with improvements:")
print(f"Min: {weights.min():.4f}, Max: {weights.max():.4f}, Mean: {weights.mean():.4f}")
print(f"Total users: {len(weights)}")
print(weights[:20])  # Only print the first 20 to keep the output readable


In [None]:

# Wrap the weights in a Series to use value_counts()
series = pd.Series(weights)

# Count how many times each distinct weight value occurs
counts = series.value_counts()

# Print counts
print(counts)

In [None]:
from sklearn.metrics.pairwise import cosine_distances

def getDistanceList(my_vector):
    """Rank every row of user_vector_sparse by cosine distance to my_vector.

    Args:
        my_vector: query vector in the same column space as the global
            user_vector_sparse matrix.

    Returns:
        A list of (row_index, distance, row_vector) tuples, one per row of
        user_vector_sparse, ordered from the smallest distance to the largest.
        Rows with equal distance keep their original relative order.
    """
    # One distance per matrix row, flattened from the (n_rows, 1) result
    distances = cosine_distances(user_vector_sparse, my_vector).flatten()

    neighbours = [
        (row, distance, user_vector_sparse[row])
        for row, distance in enumerate(distances)
    ]

    # list.sort is stable, so ties stay in row order
    neighbours.sort(key=lambda entry: entry[1])
    return neighbours


In [None]:
def getKnnVector(my_vector, k=None, use_adaptive_k=False):
    """Aggregate the ratings of the k nearest users into one score per game.

    Each neighbour's row of user_vector_sparse is scaled by
    ``weights[row] / distance`` and the scaled rows are summed, so close and
    highly weighted users contribute most.

    Args:
        my_vector: query vector in the column space of user_vector_sparse.
        k: number of neighbours. When None, it is chosen adaptively if
            use_adaptive_k is set, otherwise it defaults to 30 (capped at the
            number of users).
        use_adaptive_k: only consulted when k is None; asks
            knn_imp.calculate_optimal_k for a value between 5 and 50.

    Returns:
        1-D numpy array with one score per game column. All zeros when no
        neighbour can be used (no users, or k <= 0).
    """
    distance_list = getDistanceList(my_vector)

    # Adaptive K selection (off by default)
    if use_adaptive_k and k is None:
        k = knn_imp.calculate_optimal_k(distance_list, min_k=5, max_k=50)
        print(f"Using adaptive K: {k}")
    elif k is None:
        k = min(30, len(distance_list))

    # Never ask for more neighbours than there are users
    actual_k = min(k, len(distance_list))

    # `<= 0` also covers a negative k, which would otherwise produce an empty
    # float index array and fail when indexing the sparse matrix.
    if actual_k <= 0:
        return np.zeros(my_vector.shape[1])

    nearest = distance_list[:actual_k]
    indices = np.array([entry[0] for entry in nearest])
    distances = np.array([entry[1] for entry in nearest])

    user_vectors = user_vector_sparse[indices]

    # Inverse-distance weighting; the epsilon avoids dividing by zero when a
    # neighbour's vector points in exactly the same direction as my_vector.
    safe_distances = distances + 1e-9
    weights_factors = weights[indices] / safe_distances

    # Scale each neighbour's row by its factor (broadcast over the columns)
    weighted_vectors = user_vectors.multiply(weights_factors[:, None])

    # Column-wise sum, flattened to a plain 1-D array
    vector = np.asarray(weighted_vectors.sum(axis=0)).ravel()

    return vector

In [None]:
def getMesure(rcm,vector):
    """Score a recommendation list against held-out ratings.

    A game counts as "predicted positive" when it appears in rcm. Only games
    with a non-zero entry in vector are evaluated: a positive entry is a liked
    game, a negative entry a disliked one.

    Args:
        rcm: iterable of (game_id, relevance) tuples; only the id is used.
            Ids that are not in games_id_reviews are ignored because they have
            no position in the vector.
        vector: sparse 1 x n_games vector of held-out ratings, aligned with the
            global games_id_reviews list.

    Returns:
        numpy array [accuracy, precision, recall], all in percent. Precision
        and recall fall back to the accuracy value when their denominator is
        zero; all three are 0 when the vector has no rated game at all.
    """
    # Position of each game id (first occurrence wins, like list.index)
    position_of = {}
    for position, game_id in enumerate(games_id_reviews):
        position_of.setdefault(game_id, position)
    # Set membership keeps the loop below linear in the number of games
    recommended_positions = {
        position_of[game[0]] for game in rcm if game[0] in position_of
    }

    true_positive = 0
    false_positive = 0
    true_negative = 0
    false_negative = 0
    scores = vector.toarray().flatten()
    for position in range(len(games_id_reviews)):
        score = scores[position]
        recommended = position in recommended_positions
        if score > 0:
            if recommended:
                true_positive += 1
            else:
                false_negative += 1
        elif score < 0:
            if recommended:
                false_positive += 1
            else:
                true_negative += 1
    print("True positive:",true_positive)
    print("True negative:",true_negative)
    print("False negative:",false_negative)
    print("False positive",false_positive)
    all_predictions = true_positive + false_positive + true_negative + false_negative
    if all_predictions == 0:
        # Nothing to evaluate (empty held-out vector): report zeros instead of
        # dividing by zero.
        accuracy = 0.0
    else:
        accuracy = (true_positive + true_negative) * 100 / all_predictions
    if true_positive + false_positive == 0:
        precision = accuracy
    else:
        precision = true_positive * 100 / (true_positive + false_positive)
    if (true_positive + false_negative) == 0:
        recall = accuracy
    else:
        recall = true_positive * 100 / (true_positive + false_negative)
    return np.array([accuracy,precision,recall])

In [None]:
import numpy as np
from scipy.sparse import csr_matrix

def split_vector(vector, train_ratio=0.8, seed=None):
    """Randomly split a sparse vector's entries into two complementary parts.

    Positive and negative entries are split separately, so both parts keep
    roughly the same like/dislike balance. Every non-zero entry ends up in
    exactly one part; explicitly stored zeros are dropped.

    Args:
        vector: csr_matrix, or anything csr_matrix() accepts. It is not
            modified.
        train_ratio: fraction of the positive entries, and of the negative
            entries, assigned to the first part (rounded down). Defaults to
            0.8.
        seed: optional seed for a reproducible split. When None the global
            numpy random state is used.

    Returns:
        (first_subvector, second_subvector): two csr_matrix objects with the
        same shape as the input.

    Raises:
        ValueError: if train_ratio is outside [0, 1].
    """
    if not isinstance(vector, csr_matrix):
        vector = csr_matrix(vector)
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError("train_ratio must be between 0 and 1")

    data = vector.data
    # The np.random module itself exposes shuffle(), so it doubles as the
    # "global state" generator when no seed is requested.
    rng = np.random if seed is None else np.random.RandomState(seed)

    first_mask = np.zeros(len(data), dtype=bool)
    second_mask = np.zeros(len(data), dtype=bool)
    # Stratified split: positives first, then negatives
    for same_sign in (data > 0, data < 0):
        positions = np.flatnonzero(same_sign).tolist()
        rng.shuffle(positions)
        cut = int(train_ratio * len(positions))
        first_mask[np.asarray(positions[:cut], dtype=np.intp)] = True
        second_mask[np.asarray(positions[cut:], dtype=np.intp)] = True

    def _build_part(mask):
        # Same sparsity pattern as the input, with unselected values zeroed
        values = np.zeros_like(data)
        values[mask] = data[mask]
        # indices/indptr are copied because eliminate_zeros() edits them in
        # place and the input vector must stay intact.
        part = csr_matrix(
            (values, vector.indices.copy(), vector.indptr.copy()),
            shape=vector.shape,
        )
        part.eliminate_zeros()
        return part

    return _build_part(first_mask), _build_part(second_mask)


In [None]:
def getRecommendedGameId(k=None, test=False, use_adaptive_k=True, apply_popularity_penalty=False, popularity_penalty_factor=0.05):
    """Produce the ranked list of recommended games for my_vector.

    Args:
        k: number of neighbours passed to getKnnVector; None lets
            getKnnVector choose (adaptively if use_adaptive_k, otherwise its
            built-in default).
        test: when True, my_vector is split with split_vector, the first part
            is used to recommend and the second part to evaluate.
        use_adaptive_k: forwarded to getKnnVector.
        apply_popularity_penalty: when True, the scores are passed through
            knn_imp.apply_popularity_penalty together with each game's
            'user_reviews' value from games_details. Off by default.
        popularity_penalty_factor: penalty_factor forwarded to that helper.

    Returns:
        (rcm_game_id, measure): rcm_game_id is a list of (game_id, score)
        tuples with positive score, highest first; measure is the array from
        getMesure when test is True, otherwise None.
    """
    # Report which neighbour count is really used: an explicit k, the adaptive
    # choice, or getKnnVector's fixed default.
    if k is not None:
        k_label = k
    elif use_adaptive_k:
        k_label = "adaptive"
    else:
        k_label = "default"
    print("k =", k_label)

    train_vector = my_vector.copy()
    if test:
        train_vector, test_vector = split_vector(train_vector)
    else:
        test_vector = train_vector

    vector = getKnnVector(train_vector, k=k, use_adaptive_k=use_adaptive_k)

    # Keep only the games with a positive aggregated score
    rcm_game_id = [
        (games_id_reviews[position], score)
        for position, score in enumerate(vector)
        if score > 0
    ]

    if apply_popularity_penalty:
        # Per-game review counts, read column-wise instead of row by row;
        # 0 is used for every game when the column is missing.
        if 'user_reviews' in games_details.columns:
            review_counts = games_details['user_reviews']
        else:
            review_counts = [0] * len(games_details)
        games_metadata_dict = {
            app_id: {'user_reviews': n_reviews}
            for app_id, n_reviews in zip(games_details['app_id'], review_counts)
        }

        rcm_game_id = knn_imp.apply_popularity_penalty(
            rcm_game_id,
            games_metadata_dict,
            penalty_factor=popularity_penalty_factor
        )
        print(f"Applied popularity penalty (factor={popularity_penalty_factor})")

    # Highest score first; equal scores keep their game-id order
    rcm_game_id = sorted(rcm_game_id, key=lambda item: -item[1])

    if test:
        measure = getMesure(rcm_game_id, test_vector)
        return rcm_game_id, measure

    return rcm_game_id, None

In [None]:
def getRecommendation(rcm, test = False):
    """Turn (game_id, relevance) pairs into displayable recommendation tables.

    Args:
        rcm: list of (app_id, relevance) tuples.
        test: when False, games listed in the global not_played_games_id are
            removed from the result; when True, every game in rcm is kept.

    Returns:
        (all_recommendations, wish_recommendations): two independent
        DataFrames with the columns sort_rank, title, date_release, relevance,
        positive_ratio and user_reviews, sorted by relevance descending. The
        second one only contains games from the global interested_games_id.
    """
    if test:
        recommended_game_ids = [game[0] for game in rcm]
    else:
        # Set lookup instead of scanning the exclusion list for every game
        excluded_ids = set(not_played_games_id)
        recommended_game_ids = [game[0] for game in rcm if game[0] not in excluded_ids]
    print(recommended_game_ids)

    games_details = pd.read_csv("final_games.csv")
    recommended_game_details = games_details[(games_details['app_id'].isin(recommended_game_ids))].copy()

    # Attach the relevance score; the inner merge keeps only recommended games
    relevance_df = pd.DataFrame(rcm, columns=['app_id', 'relevance'])
    recommended_game_details_with_relevance = pd.merge(recommended_game_details, relevance_df, on='app_id')
    recommended_game_details_sorted = recommended_game_details_with_relevance.sort_values(by='relevance', ascending=False)
    recommended_game_details_wish = recommended_game_details_sorted[recommended_game_details_sorted['app_id'].isin(interested_games_id)]

    output_columns = ['sort_rank', 'title', 'date_release', 'relevance', 'positive_ratio', 'user_reviews']
    # Return real copies: callers add columns to these frames, which would
    # raise SettingWithCopyWarning on a column slice of another frame.
    return (
        recommended_game_details_sorted[output_columns].copy(),
        recommended_game_details_wish[output_columns].copy(),
    )


In [None]:
# IMPROVEMENT: run with conservative settings (close to the original code)
# Adaptive K and the popularity penalty are OFF so the relevance scores stay comparable
rcm, measure = getRecommendedGameId(
    k=30,  # Fixed K as in the original code (adaptive K can be tried later)
    test=False,
    use_adaptive_k=False,  # Adaptive K OFF to match the original code
    apply_popularity_penalty=False,  # Popularity penalty OFF - relevance scores left untouched
    popularity_penalty_factor=0.05
)
print(f"\nTotal recommendations: {len(rcm)}")
print("\nTop 10 recommendations:")
for i, (game_id, relevance) in enumerate(rcm[:10]):
    print(f"{i+1}. Game ID: {game_id}, Relevance: {relevance:.4f}")
# measure is only set when getRecommendedGameId ran with test=True
if measure is not None:
    print(f"\nMetrics: Accuracy={measure[0]:.2f}%, Precision={measure[1]:.2f}%, Recall={measure[2]:.2f}%")

In [None]:
# IMPROVEMENT: diversity and coverage metrics to judge the recommendation quality
if len(rcm) > 0:
    # Metadata dict for the diversity calculation: app_id -> {'genres': ...}
    # ('' when games_details has no genres column)
    games_metadata_dict = {}
    for _, row in games_details.iterrows():
        games_metadata_dict[row['app_id']] = {
            'genres': row.get('genres', '')
        }

    # Diversity is best effort: a failure must not stop the notebook, but the
    # real reason is reported instead of being silently swallowed. Catching
    # Exception (not a bare except) lets KeyboardInterrupt through.
    try:
        diversity = knn_imp.calculate_diversity(rcm, games_metadata_dict, top_k=10)
        print(f"\nDiversity (top 10): {diversity:.3f} (càng cao càng đa dạng)")
    except Exception as exc:
        print(f"\nDiversity calculation skipped ({type(exc).__name__}: {exc})")

    # Coverage of the top 10 relative to all games in the matrix
    coverage = knn_imp.calculate_coverage(rcm, games_id_reviews, top_k=10)
    print(f"Coverage (top 10): {coverage:.6f} (tỷ lệ games được recommend)")
else:
    print("No recommendations to analyze")


In [None]:
# Full recommendation table plus the subset restricted to interested_games_id.
recommendation, recommendation_wish = getRecommendation(rcm, test=False)
print(recommendation)

In [None]:
# Recommendations restricted to the games listed in interested_games_id.
print(recommendation_wish)

In [None]:
# Persist both tables as CSV files (without the DataFrame index column).
recommendation.to_csv('rcm_games.csv', index=False)
recommendation_wish.to_csv("rcm_wish.csv",index=False)

In [None]:
# Seconds elapsed since `start` was recorded in the first cell; DataFrameViewer
# reads this global to show the run time in its window title.
tim = time.time()- start
print("Time:",tim)

In [None]:
import tkinter as tk
from tkinter import scrolledtext

class DataFrameViewer:
    """Tk window that shows a DataFrame as read-only, fixed-width text."""

    def __init__(self, root, dataframe):
        """Fill `root` with a scrollable, read-only text view of `dataframe`.

        Args:
            root: the Tk root (or toplevel) window to populate.
            dataframe: DataFrame to display. Objects without the DataFrame
                interface are reported as "Invalid DataFrame" in the window.
        """
        self.root = root
        # The elapsed time lives in the notebook global `tim`; show it when the
        # timing cell has been run, otherwise fall back to a plain title
        # instead of failing with NameError.
        elapsed = globals().get('tim')
        if elapsed is None:
            self.root.title('Recommended Games')
        else:
            self.root.title('Recommended Games (' + str(round(elapsed,1)) + 's)')

        # Create scrolled text widget with monospaced font so columns line up
        self.txt = scrolledtext.ScrolledText(self.root, width=100, height=20, wrap=tk.NONE, font=("Courier", 10))
        self.txt.pack(expand=True, fill=tk.BOTH)

        # Format DataFrame to string with proper alignment and fixed column widths
        try:
            df_str = self.format_dataframe(dataframe)
            self.txt.insert(tk.END, df_str)
        except AttributeError:
            self.txt.insert(tk.END, "Invalid DataFrame")

        # Disable editing
        self.txt.configure(state='disabled')

    def format_dataframe(self, dataframe):
        """Render `dataframe` as left-aligned, fixed-width text columns.

        Returns:
            A string with one header line followed by one line per row.
            Values longer than their column width are not truncated.
        """
        # Widths (in characters) of the known columns
        col_widths = {
            'Rank':10,
            'sort_rank': 10,
            'title': 60,
            'date_release': 20,
            'relevance' : 20,
            'positive_ratio': 20,
            'user_reviews':10
        }
        # Width used for any column missing from the table above, so an
        # unexpected column is displayed instead of raising KeyError
        default_width = 20

        def width_of(col):
            return col_widths.get(col, default_width)

        # Create formatted string
        header = "".join([f"{str(col):{width_of(col)}}" for col in dataframe.columns]) + "\n"
        rows = "\n".join(
            "".join([f"{str(value):{width_of(col)}}" for col, value in row.items()])
            for _, row in dataframe.iterrows()
        )

        return header + rows

def main(recommendation):
    """Open a window listing `recommendation` with a leading 1-based Rank column.

    The ranking is added to a copy, so the DataFrame passed in is left
    untouched and the function can be called repeatedly on the same frame.
    The call blocks until the window is closed.

    Args:
        recommendation: DataFrame of recommended games, already in display order.
    """
    ranked = recommendation.copy()
    # Drop a Rank column left over from an earlier run before re-numbering
    if 'Rank' in ranked.columns:
        ranked = ranked.drop(columns='Rank')

    # Insert the 'Rank' column at the first position
    ranked.insert(0, 'Rank', range(1, len(ranked) + 1))

    root = tk.Tk()
    # Keep a reference to the viewer for the lifetime of the event loop
    app = DataFrameViewer(root, ranked)
    root.mainloop()


In [None]:
# Show all recommendations in a Tk window (runs the Tk main loop).
main(recommendation)

In [None]:
# Show only the recommendations that are in interested_games_id.
main(recommendation_wish)