In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from math import radians, sin, cos, sqrt, atan2
from datetime import datetime

In [2]:
# --- START OF HELPER FUNCTIONS ---

def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of Earth in kilometers

    lat1_rad = radians(lat1)
    lon1_rad = radians(lon1)
    lat2_rad = radians(lat2)
    lon2_rad = radians(lon2)

    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad

    a = sin(dlat / 2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    return distance

def calculate_age(born_str):
    born = datetime.strptime(born_str, '%Y-%m-%d')
    today = datetime.today()
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

def preprocess_features(df):
    # Calculate Age
    df['age'] = df['date_of_birth'].apply(calculate_age)

    # Numerical features to scale
    numerical_features = ['height', 'age']
    scaler = StandardScaler() # or MinMaxScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])

    # Categorical features for one-hot encoding
    # 'sex' and 'orientation' are handled separately for compatibility matrix
    # but can also be included here for general similarity if desired.
    categorical_features = ['body_type', 'job', 'drink', 'smoke', 'education_level']
    df_categorical = pd.get_dummies(df[categorical_features], prefix=categorical_features, dummy_na=False) # Handle NaNs if any

    # Binary features
    binary_features = ['interested_in_new_language', 'dropped_out_school']
    df_binary = df[binary_features].astype(int)

    # Multi-hot encode list-like features
    # Pets
    df['pets_list'] = df['pets'].apply(lambda x: [p.strip() for p in x.split(' - ')] if pd.notna(x) and x else [])
    mlb_pets = MultiLabelBinarizer()
    df_pets = pd.DataFrame(mlb_pets.fit_transform(df['pets_list']), columns=['pet_' + c for c in mlb_pets.classes_], index=df.index)

    # Interests
    df['interests_list'] = df['interests'].apply(lambda x: [i.strip() for i in x.split(' - ')] if pd.notna(x) and x else [])
    mlb_interests = MultiLabelBinarizer()
    df_interests = pd.DataFrame(mlb_interests.fit_transform(df['interests_list']), columns=['interest_' + c for c in mlb_interests.classes_], index=df.index)

    # Languages
    df['languages_list'] = df['languages'].apply(lambda x: [l.strip() for l in x.split(' - ')] if pd.notna(x) and x else [])
    # Using CountVectorizer for languages as it's more common for language proficiency features
    # However, MultiLabelBinarizer is also fine for just knowing the language.
    # Let's stick to MultiLabelBinarizer for consistency here.
    mlb_languages = MultiLabelBinarizer()
    df_languages = pd.DataFrame(mlb_languages.fit_transform(df['languages_list']), columns=['lang_' + c for c in mlb_languages.classes_], index=df.index)

    # Combine all processed features
    # We are not including lat/lon directly in cosine similarity for Matrix 1
    # as distance will be part of Matrix 2 (compatibility filter).
    # If you wanted to include location as a feature for similarity, you might
    # consider transforming lat/lon (e.g. using sin/cos transforms) or
    # using distance to a reference point, but it's often complex.
    feature_df = pd.concat([df[numerical_features], df_categorical, df_binary, df_pets, df_interests, df_languages], axis=1)

    return feature_df

# --- END OF HELPER FUNCTIONS ---

In [3]:
# Load the dataset
csv_file_path = '../data/match_profiles.csv'
df_users = pd.read_csv(csv_file_path)
df_users.set_index('id', inplace=True) # Set 'id' as index

In [4]:
# --- Matrix 1: User-User Similarity Matrix ---
print("Preprocessing features for Matrix 1...")
feature_matrix = preprocess_features(df_users.copy()) # Use .copy() to avoid SettingWithCopyWarning on original df_users
print(f"Feature matrix shape: {feature_matrix.shape}")

print("Calculating cosine similarity (Matrix 1)...")
# Fill NaN values that might have occurred if a category was all NaN before one-hot encoding (unlikely with this data)
# Or if scaler produced NaNs (also unlikely with StandardScaler on non-NaN data)
feature_matrix_filled = feature_matrix.fillna(0)
similarity_matrix_S = cosine_similarity(feature_matrix_filled)

# Convert to DataFrame for easier handling with user IDs
similarity_df = pd.DataFrame(similarity_matrix_S, index=df_users.index, columns=df_users.index)

# Set diagonal elements to 0 (user cannot match with themselves)
np.fill_diagonal(similarity_df.values, 0)
print("Matrix 1 (Similarity Matrix) created.")

Preprocessing features for Matrix 1...
Feature matrix shape: (2001, 81)
Calculating cosine similarity (Matrix 1)...
Matrix 1 (Similarity Matrix) created.


In [5]:
# --- Matrix 2: Binary User-User Compatibility Matrix ---
print("Creating Compatibility Matrix (Matrix 2)...")
num_users = len(df_users)
compatibility_matrix = pd.DataFrame(np.zeros((num_users, num_users)), index=df_users.index, columns=df_users.index)

# Pre-calculate all pairwise distances
print("Calculating pairwise distances...")
distances = np.zeros((num_users, num_users))
user_ids_list = df_users.index.tolist()
lat_lon_map = df_users[['latitude', 'longitude']].to_dict('index')

for i in range(num_users):
    for j in range(i + 1, num_users):
        user1_id = user_ids_list[i]
        user2_id = user_ids_list[j]

        lat1, lon1 = lat_lon_map[user1_id]['latitude'], lat_lon_map[user1_id]['longitude']
        lat2, lon2 = lat_lon_map[user2_id]['latitude'], lat_lon_map[user2_id]['longitude']

        dist = haversine(lat1, lon1, lat2, lon2)
        distances[i, j] = dist
        distances[j, i] = dist

distance_df = pd.DataFrame(distances, index=df_users.index, columns=df_users.index)
print("Pairwise distances calculated.")

for i_idx, user1_id in enumerate(df_users.index):
    for j_idx, user2_id in enumerate(df_users.index):
        if user1_id == user2_id:
            compatibility_matrix.loc[user1_id, user2_id] = 0
            continue

        user1 = df_users.loc[user1_id]
        user2 = df_users.loc[user2_id]

        # 1. Orientation Compatibility
        compatible_orientation = False
        s1, o1 = user1['sex'], user1['orientation']
        s2, o2 = user2['sex'], user2['orientation']

        # Define who user1 is interested in based on their orientation
        user1_interested_in_user2 = False
        if o1 == 'straight':
            if (s1 == 'male' and s2 == 'female') or \
               (s1 == 'female' and s2 == 'male'):
                user1_interested_in_user2 = True
            # non-binary straight could be interested in other non-binary, or specific gender
            elif s1 == 'non-binary' and s2 != 'non-binary': # Simplified: straight non-binary interested in male/female
                 user1_interested_in_user2 = True
        elif o1 == 'homosexual':
            if s1 == s2 : # Covers male-male, female-female, non-binary-non-binary
                user1_interested_in_user2 = True
        elif o1 == 'bisexual':
            user1_interested_in_user2 = True # Assumes bisexual can be interested in any listed sex

        # Define who user2 is interested in based on their orientation
        user2_interested_in_user1 = False
        if o2 == 'straight':
            if (s2 == 'male' and s1 == 'female') or \
               (s2 == 'female' and s1 == 'male'):
                user2_interested_in_user1 = True
            elif s2 == 'non-binary' and s1 != 'non-binary':
                 user2_interested_in_user1 = True
        elif o2 == 'homosexual':
            if s2 == s1:
                user2_interested_in_user1 = True
        elif o2 == 'bisexual':
            user2_interested_in_user1 = True

        if user1['sex'] == 'prefer not to say' or user2['sex'] == 'prefer not to say' or \
           user1['orientation'] == 'prefer not to say' or user2['orientation'] == 'prefer not to say':
            # If either has 'prefer not to say' for sex or orientation,
            # they are only compatible if both are bisexual or both 'prefer not to say' orientation
            # This is a simplification; complex rules could be added
            if (user1['orientation'] == 'bisexual' or user1['orientation'] == 'prefer not to say') and \
               (user2['orientation'] == 'bisexual' or user2['orientation'] == 'prefer not to say'):
                 compatible_orientation = True # Let similarity decide more
            else:
                compatible_orientation = False
        elif user1_interested_in_user2 and user2_interested_in_user1:
            compatible_orientation = True

        if not compatible_orientation:
            compatibility_matrix.loc[user1_id, user2_id] = 0
            continue

        # 2. Location Compatibility
        dist = distance_df.loc[user1_id, user2_id]

        pref1 = user1['location_preference']
        pref2 = user2['location_preference']

        compatible_location1 = (pref1 == -1) or (dist <= pref1)
        compatible_location2 = (pref2 == -1) or (dist <= pref2)

        if compatible_location1 and compatible_location2:
            compatibility_matrix.loc[user1_id, user2_id] = 1
        else:
            compatibility_matrix.loc[user1_id, user2_id] = 0

print("Matrix 2 (Compatibility Matrix) created.")

Creating Compatibility Matrix (Matrix 2)...
Calculating pairwise distances...
Pairwise distances calculated.
Matrix 2 (Compatibility Matrix) created.


In [6]:
# --- Final Match Percentage Matrix ---
print("Calculating final match percentage matrix...")
# Cosine similarity outputs values between -1 and 1.
# For non-negative feature vectors (after one-hot encoding etc.), it's usually 0 to 1.
# We can scale it to 0-100 if desired, or keep as 0-1.
# Let's ensure similarity_df is non-negative for match percentage.
# Since we scaled features with StandardScaler, some could be negative.
# MinMaxScaler on feature_matrix would ensure 0-1 range for cosine similarity.
# Alternatively, we can shift and scale the similarity_df: (similarity_df + 1) / 2 to map [-1,1] to [0,1]
# Or simply clip negative values if they are very small and due to numerical precision
# For this problem, given the diverse features, some negative cosine similarities are possible.
# Let's rescale similarity_df to be [0, 1] for interpretability as a "score" before combining.
min_sim = similarity_df.min().min()
max_sim = similarity_df.max().max()
if min_sim < 0 or max_sim > 1 : # Rescale if not already in approx [0,1] (after diagonal is zeroed)
    # If min_sim is very close to 0 (e.g. -1e-9), clipping might be enough
    # For a general case, rescaling:
    similarity_df_scaled = (similarity_df - min_sim) / (max_sim - min_sim)
    # Ensure diagonal is still 0 after scaling
    np.fill_diagonal(similarity_df_scaled.values, 0)
else:
    similarity_df_scaled = similarity_df.clip(lower=0) # Clip any small negative values

final_match_matrix = similarity_df_scaled * compatibility_matrix
# To represent as percentage:
final_match_matrix_percentage = final_match_matrix * 100

print("Final Match Percentage Matrix calculated.")
print("\nSample of the Final Match Percentage Matrix (Top 5x5):")
print(final_match_matrix_percentage.iloc[:5, :5])

print("\nExample: Match percentages for user 1 with users 2 to 5:")
print(final_match_matrix_percentage.loc[1, 2:6])

# You can save this matrix to a CSV if needed
# final_match_matrix_percentage.to_csv('final_user_match_percentages.csv')
print("\nDone.")# --- Final Match Percentage Matrix ---
print("Calculating final match percentage matrix...")
# Cosine similarity outputs values between -1 and 1.
# For non-negative feature vectors (after one-hot encoding etc.), it's usually 0 to 1.
# We can scale it to 0-100 if desired, or keep as 0-1.
# Let's ensure similarity_df is non-negative for match percentage.
# Since we scaled features with StandardScaler, some could be negative.
# MinMaxScaler on feature_matrix would ensure 0-1 range for cosine similarity.
# Alternatively, we can shift and scale the similarity_df: (similarity_df + 1) / 2 to map [-1,1] to [0,1]
# Or simply clip negative values if they are very small and due to numerical precision
# For this problem, given the diverse features, some negative cosine similarities are possible.
# Let's rescale similarity_df to be [0, 1] for interpretability as a "score" before combining.
min_sim = similarity_df.min().min()
max_sim = similarity_df.max().max()
if min_sim < 0 or max_sim > 1 : # Rescale if not already in approx [0,1] (after diagonal is zeroed)
    # If min_sim is very close to 0 (e.g. -1e-9), clipping might be enough
    # For a general case, rescaling:
    similarity_df_scaled = (similarity_df - min_sim) / (max_sim - min_sim)
    # Ensure diagonal is still 0 after scaling
    np.fill_diagonal(similarity_df_scaled.values, 0)
else:
    similarity_df_scaled = similarity_df.clip(lower=0) # Clip any small negative values

final_match_matrix = similarity_df_scaled * compatibility_matrix
# To represent as percentage:
final_match_matrix_percentage = final_match_matrix * 100

print("Final Match Percentage Matrix calculated.")
print("\nSample of the Final Match Percentage Matrix (Top 5x5):")
print(final_match_matrix_percentage.iloc[:5, :5])

print("\nExample: Match percentages for user 1 with users 2 to 5:")
print(final_match_matrix_percentage.loc[1, 2:6])

# You can save this matrix to a CSV if needed
# final_match_matrix_percentage.to_csv('../data/match_percentages.csv')
print("\nDone.")

Calculating final match percentage matrix...
Final Match Percentage Matrix calculated.

Sample of the Final Match Percentage Matrix (Top 5x5):
id          1    2          3          4          5
id                                                 
1    0.000000  0.0  71.647858   0.000000  55.956462
2    0.000000  0.0   0.000000   0.000000   0.000000
3   71.647858  0.0   0.000000   0.000000  61.902894
4    0.000000  0.0   0.000000   0.000000  58.004206
5   55.956462  0.0  61.902894  58.004206   0.000000

Example: Match percentages for user 1 with users 2 to 5:
id
2     0.000000
3    71.647858
4     0.000000
5    55.956462
6     0.000000
Name: 1, dtype: float64

Done.
Calculating final match percentage matrix...
Final Match Percentage Matrix calculated.

Sample of the Final Match Percentage Matrix (Top 5x5):
id          1    2          3          4          5
id                                                 
1    0.000000  0.0  71.647858   0.000000  55.956462
2    0.000000  0.0   0.0000

In [7]:
final_match_matrix_percentage

id,1,2,3,4,5,6,7,8,9,10,...,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.0,71.647858,0.000000,55.956462,0.000000,47.555614,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,70.936857,0.000000,0.000000,0.000000
2,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,71.647858,0.0,0.000000,0.000000,61.902894,0.000000,0.000000,0.000000,0.000000,49.758115,...,0.000000,0.000000,0.000000,0.000000,0.000000,70.979704,0.000000,66.733999,0.000000,0.000000
4,0.000000,0.0,0.000000,0.000000,58.004206,0.000000,0.000000,0.000000,61.799168,0.000000,...,62.870167,0.000000,64.854645,56.149929,0.000000,0.000000,0.000000,0.000000,0.000000,67.130088
5,55.956462,0.0,61.902894,58.004206,0.000000,62.232434,53.272082,41.910694,0.000000,58.136731,...,0.000000,43.138222,0.000000,0.000000,46.489551,63.036185,59.838410,58.750353,49.471048,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1997,0.000000,0.0,70.979704,0.000000,63.036185,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1998,70.936857,0.0,0.000000,0.000000,59.838410,0.000000,42.855875,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1999,0.000000,0.0,66.733999,0.000000,58.750353,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2000,0.000000,0.0,0.000000,0.000000,49.471048,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,64.333244


In [8]:
import pandas as pd
import numpy as np
import random
from collections import defaultdict

# GIẢ SỬ BẠN ĐÃ CÓ final_match_matrix_percentage TỪ BƯỚC TRƯỚC
# Ví dụ tạo một ma trận giả lập nếu bạn chưa chạy bước trước:
# Đây là ma trận TỶ LỆ % (0-100)
# user_ids = list(range(1, 101)) # Ví dụ với 100 users
# num_users = len(user_ids)
# # Tạo ma trận ngẫu nhiên, sau đó làm đối xứng và đặt đường chéo bằng 0
# final_match_matrix_percentage_data = np.random.rand(num_users, num_users) * 100
# final_match_matrix_percentage_data = (final_match_matrix_percentage_data + final_match_matrix_percentage_data.T) / 2
# np.fill_diagonal(final_match_matrix_percentage_data, 0)
# # Giả sử chỉ một phần nhỏ có match > 0 để mô phỏng thực tế
# for i in range(num_users):
#     for j in range(i + 1, num_users):
#         if random.random() > 0.8: # ~20% có thể có match
#             pass
#         else:
#             final_match_matrix_percentage_data[i, j] = 0
#             final_match_matrix_percentage_data[j, i] = 0
# final_match_matrix_percentage = pd.DataFrame(final_match_matrix_percentage_data, index=user_ids, columns=user_ids)
# print("Ma trận ví dụ final_match_matrix_percentage (5x5 đầu):")
# print(final_match_matrix_percentage.iloc[:5,:5])
# print("-" * 30)


# --- BẮT ĐẦU TẠO CẶP MATCH NGẪU NHIÊN ---



all_user_ids = final_match_matrix_percentage.index.tolist()
min_matches_per_user = 1
max_matches_per_user = 25

# Lưu trữ các cặp đã chọn để tránh trùng lặp (user_id_1, user_id_2) với user_id_1 < user_id_2
selected_pairs_set = set()
# Lưu trữ kết quả cuối cùng
final_selected_matches = []
# Theo dõi số lượng match hiện tại của mỗi user
user_actual_match_counts = defaultdict(int)

print("Bắt đầu quá trình chọn cặp match...")

# Giai đoạn 1: Ưu tiên chọn cho mỗi user để đạt số lượng match mong muốn (random trong khoảng min-max)
# Xáo trộn danh sách user để thứ tự xử lý không ảnh hưởng quá nhiều
random.shuffle(all_user_ids)

for user1_id in all_user_ids:
    if user_actual_match_counts[user1_id] >= max_matches_per_user:
        continue

    potential_partners = []
    for user2_id in all_user_ids:
        if user1_id == user2_id:
            continue

        match_score = final_match_matrix_percentage.loc[user1_id, user2_id]
        if match_score > 0:
            # Chỉ xem xét user2 nếu họ chưa đạt max_matches
            if user_actual_match_counts[user2_id] < max_matches_per_user:
                # Đảm bảo cặp chưa được thêm vào
                pair_key = tuple(sorted((user1_id, user2_id)))
                if pair_key not in selected_pairs_set:
                    potential_partners.append({'partner_id': user2_id, 'score': match_score})

    random.shuffle(potential_partners) # Xáo trộn các đối tác tiềm năng

    # Xác định số lượng match mục tiêu cho user1_id này
    # User này cần ít nhất là min_matches_per_user, hoặc nhiều hơn nếu có thể, tối đa là max_matches_per_user

    # Số lượng match mà user1_id còn thiếu để đạt min
    needed_for_min = max(0, min_matches_per_user - user_actual_match_counts[user1_id])
    # Số lượng match user1_id có thể thêm mà không vượt max
    can_add_up_to_max = max_matches_per_user - user_actual_match_counts[user1_id]

    # Nếu user1_id chưa đủ min_matches, cố gắng đạt min_matches
    # Nếu đã đủ, thì chọn thêm ngẫu nhiên để đạt một con số trong khoảng [current_count, max_matches_per_user]
    # hoặc đơn giản là chọn một target_n_matches ngẫu nhiên trong [min_overall, max_overall]

    # Số lượng match mong muốn cho user này, nằm trong khoảng [min_matches_per_user, max_matches_per_user]
    # nhưng cũng bị giới hạn bởi số potential_partners thực tế và số slot còn trống của user1_id

    # Số match mục tiêu tổng cộng cho user1
    target_n_matches_for_user1 = random.randint(
        min(min_matches_per_user, len(potential_partners) + user_actual_match_counts[user1_id]), # đảm bảo nếu ít potential thì ko cố random quá cao
        min(max_matches_per_user, len(potential_partners) + user_actual_match_counts[user1_id])
    )

    # Số match cần chọn thêm cho user1
    num_to_select_now = max(0, target_n_matches_for_user1 - user_actual_match_counts[user1_id])


    added_this_iteration = 0
    for partner_info in potential_partners:
        if added_this_iteration >= num_to_select_now:
            break
        if user_actual_match_counts[user1_id] >= max_matches_per_user: # Re-check user1
            break

        user2_id = partner_info['partner_id']
        score = partner_info['score']

        # Kiểm tra lại user2_id vì count của nó có thể đã thay đổi do các user khác chọn
        if user_actual_match_counts[user2_id] < max_matches_per_user:
            pair_key = tuple(sorted((user1_id, user2_id)))
            if pair_key not in selected_pairs_set:
                final_selected_matches.append({
                    'user_id_1': pair_key[0],
                    'user_id_2': pair_key[1],
                    'match_percent': score
                })
                selected_pairs_set.add(pair_key)
                user_actual_match_counts[user1_id] += 1
                user_actual_match_counts[user2_id] += 1
                added_this_iteration += 1

# Giai đoạn 2: Kiểm tra và bổ sung cho những user chưa đủ min_matches_per_user
print("Giai đoạn 2: Đảm bảo số match tối thiểu...")
for user1_id in all_user_ids:
    if user_actual_match_counts[user1_id] < min_matches_per_user and \
       user_actual_match_counts[user1_id] < max_matches_per_user: # user này chưa đủ min và chưa đạt max

        needed_more = min_matches_per_user - user_actual_match_counts[user1_id]

        potential_partners_fill = []
        for user2_id in all_user_ids:
            if user1_id == user2_id:
                continue
            match_score = final_match_matrix_percentage.loc[user1_id, user2_id]
            if match_score > 0:
                pair_key = tuple(sorted((user1_id, user2_id)))
                if pair_key not in selected_pairs_set and \
                   user_actual_match_counts[user2_id] < max_matches_per_user: # user2 cũng chưa đạt max
                    potential_partners_fill.append({'partner_id': user2_id, 'score': match_score})

        random.shuffle(potential_partners_fill)

        added_in_fill_phase = 0
        for partner_info in potential_partners_fill:
            if added_in_fill_phase >= needed_more:
                break
            if user_actual_match_counts[user1_id] >= max_matches_per_user: # Re-check user1
                 break

            user2_id = partner_info['partner_id']
            score = partner_info['score']

            if user_actual_match_counts[user2_id] < max_matches_per_user: # Re-check user2
                pair_key = tuple(sorted((user1_id, user2_id)))
                # Mặc dù đã check selected_pairs_set ở trên, check lại cho chắc
                if pair_key not in selected_pairs_set:
                    final_selected_matches.append({
                        'user_id_1': pair_key[0],
                        'user_id_2': pair_key[1],
                        'match_percent': score
                    })
                    selected_pairs_set.add(pair_key)
                    user_actual_match_counts[user1_id] += 1
                    user_actual_match_counts[user2_id] += 1
                    added_in_fill_phase += 1


# Chuyển đổi danh sách kết quả thành DataFrame
output_df = pd.DataFrame(final_selected_matches)

# Sắp xếp lại để user_id_1 luôn nhỏ hơn user_id_2 (mặc dù đã cố gắng làm ở trên)
# Điều này không cần thiết nếu pair_key đã được dùng đúng cách
#mask = output_df['user_id_1'] > output_df['user_id_2']
#output_df.loc[mask, ['user_id_1', 'user_id_2']] = output_df.loc[mask, ['user_id_2', 'user_id_1']].values
#output_df = output_df.drop_duplicates(subset=['user_id_1', 'user_id_2'])


print(f"\nTổng số cặp match được tạo: {len(output_df)}")

# Kiểm tra số lượng match cho mỗi user
print("\nKiểm tra số lượng match cho mỗi user (Top 10 users):")
final_counts_check = defaultdict(int)
for _, row in output_df.iterrows():
    final_counts_check[row['user_id_1']] += 1
    final_counts_check[row['user_id_2']] += 1

# for i, user_id in enumerate(all_user_ids):
#     if i < 10:
#         print(f"User {user_id}: {final_counts_check[user_id]} matches")
#     if final_counts_check[user_id] < min_matches_per_user:
#         print(f"!!! CẢNH BÁO: User {user_id} có {final_counts_check[user_id]} matches (ít hơn {min_matches_per_user})")
#     if final_counts_check[user_id] > max_matches_per_user:
#         print(f"!!! CẢNH BÁO: User {user_id} có {final_counts_check[user_id]} matches (nhiều hơn {max_matches_per_user})")


# Phân tích số lượng match
counts_series = pd.Series(final_counts_check)
print("\nThống kê số lượng match mỗi user:")
print(counts_series.describe())
print(f"Số user có ít hơn {min_matches_per_user} match: {(counts_series < min_matches_per_user).sum()} (trong số {len(all_user_ids)} users)")
print(f"Số user có nhiều hơn {max_matches_per_user} match: {(counts_series > max_matches_per_user).sum()}")
users_less_than_min = counts_series[counts_series < min_matches_per_user].index.tolist()
if users_less_than_min:
    print(f"Các user có ít hơn {min_matches_per_user} match: {users_less_than_min[:20]}...") # In ra 20 user đầu tiên


# Hiển thị một vài dòng của kết quả
print("\nMột vài cặp match ngẫu nhiên đã tạo:")
print(output_df.head(10))

Bắt đầu quá trình chọn cặp match...
Giai đoạn 2: Đảm bảo số match tối thiểu...

Tổng số cặp match được tạo: 16586

Kiểm tra số lượng match cho mỗi user (Top 10 users):

Thống kê số lượng match mỗi user:
count    1962.000000
mean       16.907238
std         6.889198
min         1.000000
25%        11.000000
50%        18.000000
75%        24.000000
max        25.000000
dtype: float64
Số user có ít hơn 1 match: 0 (trong số 2001 users)
Số user có nhiều hơn 25 match: 0

Một vài cặp match ngẫu nhiên đã tạo:
   user_id_1  user_id_2  match_percent
0         69       1860      46.451159
1       1251       1860      52.599438
2         97       1860      50.887971
3        509       1860      50.037166
4       1280       1860      48.700275
5       1591       1860      52.168964
6        324       1860      52.607573
7       1505       1860      52.081170
8        203       1860      62.879201
9       1860       1919      56.115403

Đã lưu kết quả vào file: ../data/matched_pairs.csv


In [9]:
# Lưu vào file CSV
output_df.drop(columns=["match_percent"], inplace=True)
output_filename = '../data/matched_pairs.csv'
output_df.to_csv(output_filename, index=False)
print(f"\nĐã lưu kết quả vào file: {output_filename}")


Đã lưu kết quả vào file: ../data/matched_pairs.csv
