In [52]:
import random
import pandas as pd

# Define constants for service categories and Algerian locations
SERVICE_CATEGORIES = [
    "Plumbing", "Electrical Work", "Painting and Decoration", "Masonry",
    "Residential Cleaning", "Commercial Cleaning", "Post-Construction Cleaning",
    "Taxi Services", "Furniture Moving", "Local Delivery Services",
    "Car Repair and Maintenance", "Tire Services", "Car Wash",
    "Traditional Algerian Catering", "Event Catering",
    "Babysitting", "Elderly Companionship Services",
    "Language Lessons", "Quranic Studies", "Academic Tutoring",
    "Hairdressing", "Makeup Services", "Hammam and Spa Services"
]

LOCATIONS = [
     # Algiers (Capital) - More detailed locations
    (36.7538, 3.0588),  # Algiers city center (Place des Martyrs)
    (36.758, 3.063),    # Belouizdad
    (36.762, 3.075),    # El Madania
    (36.730, 3.058),    # Kouba
    (36.765, 3.070),    # Hydra
    (36.745, 3.043),    # El Harrach
    (36.738, 3.071),    # Bab El Oued
    (36.743, 3.090),    # Birkhadem
    (36.715, 3.127),    # Douera
    (36.776, 3.080),    # Ben Aknoun
    (36.786, 3.059),    # El Mohammadia
    (36.710, 3.075),    # Chéraga
    (36.745, 3.112),    # Bir Mourad Raïs
    (36.792, 3.085),    # El Achour
    (36.773, 3.097),    # Kouba
    (36.783, 3.068),    # Ain Naadja
    (36.720, 3.041),    # Saoula
    (36.734, 3.078),    # Bab Ezzouar
    (36.741, 3.089),    # El Harrach

    # Oran
    (35.698, -0.633),   # Oran city center
    (35.695, -0.608),   # Es Sénia
    (35.700, -0.673),   # Bir El Djir
    (35.720, -0.630),   # Ain El Turk
    (35.688, -0.605),   # Mers El Kébir

    # Tizi Ouzou
    (36.734, 4.047),    # Tizi Ouzou city center
    (36.745, 4.057),    # Draa Ben Khedda
    (36.728, 4.037),    # Azeffoun
    (36.746, 4.072),    # Bouzeguene

    # Biskra
    (34.866, 5.733),    # Biskra city center
    (34.850, 5.750),    # Sidi Okba
    (34.880, 5.730),    # Branis
    (34.870, 5.800),    # El Ouaret

    # Béchar
    (31.610, -2.226),   # Béchar city center
    (31.635, -2.217),   # Kenadsa
    (31.600, -2.250),   # Taghit
    (31.615, -2.270),   # El Golea

    # Bejaia
    (36.759, 5.084),    # Bejaia city center
    (36.750, 5.100),    # Tichy
    (36.780, 5.070),    # Aokas
    (36.760, 5.110),    # El Kseur
]

# Helper functions to generate data
def generate_location():
    return random.choice(LOCATIONS)

def generate_user_data(user_id):
    location = generate_location()
    categories_of_interest = random.sample(SERVICE_CATEGORIES, random.randint(2, 4))
    reviewed_service_ids = random.sample(range(1, 501), random.randint(0, 3))
    click_count_per_service = {sid: random.randint(1, 10) for sid in reviewed_service_ids}
    total_service_views = sum(click_count_per_service.values()) + random.randint(0, 10)
    
    return {
        "user_id": user_id,
        "location_x": location[0],
        "location_y": location[1],
        "age": random.randint(18, 65),
        "gender": random.choice(["Male", "Female"]),
        "service_categories_interest": categories_of_interest,
        "reviewed_service_ids": reviewed_service_ids,
        "click_count_per_service": click_count_per_service,
        "total_service_views": total_service_views
    }

def generate_service_data(service_id):
    location = generate_location()
    category = random.choice(SERVICE_CATEGORIES)
    click_count = random.randint(0, 100)
    
    return {
        "service_id": service_id,
        "provider_location_x": location[0],
        "provider_location_y": location[1],
        "service_category": category,
        "review_avg": round(random.uniform(3.0, 5.0), 1),
        "review_count": random.randint(1, 50),
        "click_count": click_count,
        "provider_age": random.randint(25, 60),
        "provider_experience": random.randint(1, 20)
    }

# Generate datasets
user_data = [generate_user_data(user_id) for user_id in range(1, 2001)]
service_data = [generate_service_data(service_id) for service_id in range(1, 501)]

# Convert to DataFrames
user_df = pd.DataFrame(user_data)
service_df = pd.DataFrame(service_data)

# Display the first few rows of each dataset
user_df.head(), service_df.head()


(   user_id  location_x  location_y  age  gender  \
 0        1      36.780       5.070   48  Female   
 1        2      31.615      -2.270   64    Male   
 2        3      36.780       5.070   34  Female   
 3        4      36.720       3.041   54    Male   
 4        5      36.738       3.071   42    Male   
 
                          service_categories_interest reviewed_service_ids  \
 0  [Language Lessons, Commercial Cleaning, Car Re...                [310]   
 1  [Language Lessons, Babysitting, Hammam and Spa...                   []   
 2        [Local Delivery Services, Language Lessons]           [340, 436]   
 3           [Makeup Services, Hairdressing, Masonry]           [177, 260]   
 4  [Local Delivery Services, Electrical Work, Eld...                   []   
 
   click_count_per_service  total_service_views  
 0                {310: 6}                    8  
 1                      {}                    6  
 2        {340: 7, 436: 5}                   16  
 3       {177: 6

In [53]:
user_df

Unnamed: 0,user_id,location_x,location_y,age,gender,service_categories_interest,reviewed_service_ids,click_count_per_service,total_service_views
0,1,36.780,5.070,48,Female,"[Language Lessons, Commercial Cleaning, Car Re...",[310],{310: 6},8
1,2,31.615,-2.270,64,Male,"[Language Lessons, Babysitting, Hammam and Spa...",[],{},6
2,3,36.780,5.070,34,Female,"[Local Delivery Services, Language Lessons]","[340, 436]","{340: 7, 436: 5}",16
3,4,36.720,3.041,54,Male,"[Makeup Services, Hairdressing, Masonry]","[177, 260]","{177: 6, 260: 10}",16
4,5,36.738,3.071,42,Male,"[Local Delivery Services, Electrical Work, Eld...",[],{},1
...,...,...,...,...,...,...,...,...,...
1995,1996,34.870,5.800,37,Male,"[Painting and Decoration, Hammam and Spa Servi...","[160, 441]","{160: 10, 441: 9}",28
1996,1997,31.610,-2.226,53,Male,"[Event Catering, Language Lessons]","[181, 161, 312]","{181: 2, 161: 9, 312: 2}",15
1997,1998,36.734,3.078,23,Female,"[Hairdressing, Language Lessons, Hammam and Sp...",[336],{336: 6},12
1998,1999,36.710,3.075,63,Female,"[Tire Services, Masonry, Elderly Companionship...",[240],{240: 5},11


In [36]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from geopy.distance import geodesic

# Helper functions to calculate Haversine distance and encode features
def haversine(lat1, lon1, lat2, lon2):
    coords_1 = (lat1, lon1)
    coords_2 = (lat2, lon2)
    return geodesic(coords_1, coords_2).km

def encode_service_interests(df):
    encoder = OneHotEncoder(sparse_output=False)  # Use sparse_output=False in newer versions
    service_interests = df['service_categories_interest'].apply(lambda x: [1 if cat in x else 0 for cat in SERVICE_CATEGORIES])
    service_interests = np.array(service_interests.tolist())
    return service_interests

def encode_reviewed_services(df, service_ids_range):
    # Create a binary vector for each reviewed service ID (1 if reviewed, 0 otherwise)
    reviewed_services = df['reviewed_service_ids'].apply(lambda x: [1 if i in x else 0 for i in range(1, service_ids_range+1)])
    return np.array(reviewed_services.tolist())  # Ensure it's a 2D array

# Encode gender as binary (Male=0, Female=1)
def encode_gender(df):
    return df['gender'].apply(lambda x: 0 if x == "Male" else 1).values.reshape(-1, 1)  # Reshape to 2D

# Feature extraction from the dataset
def generate_features(user_df, service_ids_range):
    # Encode all relevant features
    service_interests = encode_service_interests(user_df)  # 2D array
    reviewed_services = encode_reviewed_services(user_df, service_ids_range)  # 2D array
    gender = encode_gender(user_df)  # 2D array
    
    # Concatenate all features: location, age, gender, service interests, reviewed services
    features = np.concatenate([
        user_df[['location_x', 'location_y']].values,  # Location (2D array)
        user_df[['age']].values,  # Age (2D array)
        gender,  # Gender (reshaped to 2D array)
        service_interests,  # Service categories (2D array)
        reviewed_services  # Reviewed services (2D array)
    ], axis=1)
    
    return features

# Perform clustering with KMeans
def cluster_users(user_df, service_ids_range, n_clusters=10):
    # Prepare features for clustering
    user_features = generate_features(user_df, service_ids_range)
    
    # Perform KMeans clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    user_df['cluster'] = kmeans.fit_predict(user_features)
    
    return user_df, kmeans

# Example usage
user_df_filtered, kmeans_model = cluster_users(user_df, service_ids_range=500, n_clusters=150)

# Display the first few rows of the clustered user data
user_df_filtered[['user_id', 'cluster', 'location_x', 'location_y', 'age', 'gender']].head()


Unnamed: 0,user_id,cluster,location_x,location_y,age,gender
0,1,104,34.85,5.75,23,Female
1,2,66,36.738,3.071,19,Male
2,3,114,31.61,-2.226,31,Female
3,4,128,36.746,4.072,29,Female
4,5,37,36.776,3.08,39,Female


In [51]:
user_df_filtered

Unnamed: 0,user_id,location_x,location_y,age,gender,service_categories_interest,reviewed_service_ids,click_count_per_service,total_service_views,cluster
0,1,34.850,5.750,23,Female,"[Hairdressing, Car Repair and Maintenance, Tax...","[479, 204, 428]","{479: 10, 204: 5, 428: 1}",22,1
1,2,36.738,3.071,19,Male,"[Makeup Services, Furniture Moving, Plumbing, ...","[183, 469, 134]","{183: 4, 469: 3, 134: 5}",15,1
2,3,31.610,-2.226,31,Female,"[Plumbing, Masonry]","[286, 187, 53]","{286: 3, 187: 5, 53: 9}",20,1
3,4,36.746,4.072,29,Female,"[Local Delivery Services, Masonry, Taxi Servic...","[398, 326]","{398: 6, 326: 9}",17,1
4,5,36.776,3.080,39,Female,"[Electrical Work, Hammam and Spa Services]","[17, 299]","{17: 3, 299: 9}",17,1
...,...,...,...,...,...,...,...,...,...,...
1995,1996,36.759,5.084,25,Male,"[Post-Construction Cleaning, Tire Services, Qu...","[121, 317, 277]","{121: 4, 317: 10, 277: 10}",32,1
1996,1997,31.615,-2.270,53,Male,"[Language Lessons, Hairdressing]",[73],{73: 3},5,1
1997,1998,36.745,3.043,40,Female,"[Car Wash, Hammam and Spa Services, Tire Servi...",[149],{149: 5},8,1
1998,1999,36.734,4.047,58,Male,"[Plumbing, Elderly Companionship Services, Pai...",[],{},1,1


In [40]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic

# Function to calculate Haversine distance
def haversine(lat1, lon1, lat2, lon2):
    coords_1 = (lat1, lon1)
    coords_2 = (lat2, lon2)
    return geodesic(coords_1, coords_2).km

# Function to calculate distance for all users within the same cluster
def calculate_distances_within_cluster(user_df):
    distance_matrix = {}  # To store distances within each cluster
    
    # Loop through each cluster
    for cluster_id in user_df['cluster'].unique():
        cluster_users = user_df[user_df['cluster'] == cluster_id]
        distances = []  # List to store pairwise distances for the current cluster
        
        # Compare each pair of users within the same cluster
        for i, user1 in cluster_users.iterrows():
            for j, user2 in cluster_users.iterrows():
                if i != j:  # Avoid calculating distance to self
                    distance = haversine(user1['location_x'], user1['location_y'], user2['location_x'], user2['location_y'])
                    distances.append({
                        'cluster': cluster_id,
                        'user1_id': user1['user_id'],
                        'user2_id': user2['user_id'],
                        'distance_km': distance
                    })
        
        # Save distances for the current cluster
        distance_matrix[cluster_id] = pd.DataFrame(distances)
    
    return distance_matrix

# Calculate pairwise distances within each cluster
distance_matrix = calculate_distances_within_cluster(user_df)

# Display the distances for the first cluster
distance_matrix[0].head()  # Adjust the cluster index to display other clusters


In [47]:
distance_matrix[10].head()[]

Unnamed: 0,cluster,user1_id,user2_id,distance_km
0,10,52,189,3.660422
1,10,52,382,4.678991
2,10,52,480,3.234012
3,10,52,564,9.143497
4,10,52,610,8.125106


In [46]:
l = []
for i in range(150):
    print(distance_matrix[i].max())

    

Series([], dtype: float64)
cluster           1.000000
user1_id       1947.000000
user2_id       1947.000000
distance_km      95.894033
dtype: float64
cluster           2.000000
user1_id       1930.000000
user2_id       1930.000000
distance_km       6.297096
dtype: float64
cluster           3.000000
user1_id       1989.000000
user2_id       1989.000000
distance_km      95.894033
dtype: float64
cluster           4.000000
user1_id       1012.000000
user2_id       1012.000000
distance_km       3.342356
dtype: float64
cluster           5.00000
user1_id       1966.00000
user2_id       1966.00000
distance_km     181.31313
dtype: float64
cluster           6.000000
user1_id       1963.000000
user2_id       1963.000000
distance_km       5.495699
dtype: float64
cluster           7.000000
user1_id       1986.000000
user2_id       1986.000000
distance_km       5.495699
dtype: float64
cluster           8.000000
user1_id       1938.000000
user2_id       1938.000000
distance_km       9.947629
dtype: f

In [None]:
##second version
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from geopy.distance import geodesic

# Helper function to calculate Haversine distance
def haversine(lat1, lon1, lat2, lon2):
    coords_1 = (lat1, lon1)
    coords_2 = (lat2, lon2)
    return geodesic(coords_1, coords_2).km

# Encode service interests as one-hot
def encode_service_interests(df):
    service_interests = df['service_categories_interest'].apply(lambda x: [1 if cat in x else 0 for cat in SERVICE_CATEGORIES])
    return np.array(service_interests.tolist())  # Ensure 2D array

# Encode reviewed services as a binary vector
def encode_reviewed_services(df, service_ids_range):
    reviewed_services = df['reviewed_service_ids'].apply(lambda x: [1 if i in x else 0 for i in range(1, service_ids_range+1)])
    return np.array(reviewed_services.tolist())  # Ensure 2D array

# Encode gender as binary (Male=0, Female=1)
def encode_gender(df):
    return df['gender'].apply(lambda x: 0 if x == "Male" else 1).values.reshape(-1, 1)  # Reshape to 2D

# Generate weighted features for clustering
def generate_weighted_features(user_df, service_ids_range):
    service_interests = encode_service_interests(user_df)  # 2D array
    reviewed_services = encode_reviewed_services(user_df, service_ids_range)  # 2D array
    gender = encode_gender(user_df)  # 2D array
    
    # Concatenate features with weights
    features = np.concatenate([
        user_df[['location_x', 'location_y']].values * 10,  # Scale location (priority for clustering)
        user_df[['age']].values,  # Age
        gender,  # Gender
        service_interests,  # Service categories
        reviewed_services  # Reviewed services
    ], axis=1)
    
    return StandardScaler().fit_transform(features)  # Normalize all features

# Perform KMeans clustering
def cluster_users(user_df, service_ids_range, n_clusters=150):
    user_features = generate_weighted_features(user_df, service_ids_range)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    user_df['cluster'] = kmeans.fit_predict(user_features)
    return user_df, kmeans

# Calculate pairwise distances within clusters
def calculate_distances_within_cluster(user_df):
    distance_matrix = []  # To store distances within each cluster
    
    for cluster_id in user_df['cluster'].unique():
        cluster_users = user_df[user_df['cluster'] == cluster_id].reset_index(drop=True)
        
        # Compare each pair of users within the same cluster
        for i, user1 in cluster_users.iterrows():
            for j, user2 in cluster_users.iterrows():
                if i < j:  # Avoid self-comparison and duplicate pairs
                    distance = haversine(user1['location_x'], user1['location_y'], user2['location_x'], user2['location_y'])
                    distance_matrix.append({
                        'cluster': cluster_id,
                        'user1_id': user1['user_id'],
                        'user2_id': user2['user_id'],
                        'distance_km': distance
                    })
    
    return pd.DataFrame(distance_matrix)

# Main code example
# Assuming user_df is already loaded
SERVICE_CATEGORIES = [  # Replace with your service categories
    "Plumbing", "Electrical Work", "Painting and Decoration", "Masonry",
    "Residential Cleaning", "Commercial Cleaning", "Post-Construction Cleaning",
    "Taxi Services", "Furniture Moving", "Local Delivery Services",
    "Car Repair and Maintenance", "Tire Services", "Car Wash",
    "Traditional Algerian Catering", "Event Catering",
    "Babysitting", "Elderly Companionship Services",
    "Language Lessons", "Quranic Studies", "Academic Tutoring",
    "Hairdressing", "Makeup Services", "Hammam and Spa Services"
]

user_df_filtered, kmeans_model = cluster_users(user_df, service_ids_range=500, n_clusters=150)
distance_matrix = calculate_distances_within_cluster(user_df_filtered)

# Display distances for the first few clusters
print(distance_matrix.head())
