In [93]:
import random
import pandas as pd

# Define constants for service categories and Algerian locations
SERVICE_CATEGORIES = [
    "Plumbing", "Electrical Work", "Painting and Decoration", "Masonry",
    "Residential Cleaning", "Commercial Cleaning", "Post-Construction Cleaning",
    "Taxi Services", "Furniture Moving", "Local Delivery Services",
    "Car Repair and Maintenance", "Tire Services", "Car Wash",
    "Traditional Algerian Catering", "Event Catering",
    "Babysitting", "Elderly Companionship Services",
    "Language Lessons", "Quranic Studies", "Academic Tutoring",
    "Hairdressing", "Makeup Services", "Hammam and Spa Services"
]

LOCATIONS = {
    "Algiers": [
        (36.7538, 3.0588), (36.758, 3.063), (36.762, 3.075), (36.730, 3.058),
        (36.765, 3.070), (36.745, 3.043), (36.738, 3.071), (36.743, 3.090),
        (36.715, 3.127), (36.776, 3.080), (36.786, 3.059), (36.710, 3.075),
        (36.745, 3.112), (36.792, 3.085), (36.773, 3.097), (36.783, 3.068),
        (36.720, 3.041), (36.734, 3.078), (36.741, 3.089)
    ],
    "Oran": [
        (35.698, -0.633), (35.695, -0.608), (35.700, -0.673), (35.720, -0.630), (35.688, -0.605)
    ],
    "Tizi Ouzou": [
        (36.734, 4.047), (36.745, 4.057), (36.728, 4.037), (36.746, 4.072)
    ],
    "Biskra": [
        (34.866, 5.733), (34.850, 5.750), (34.880, 5.730), (34.870, 5.800)
    ],
    "Béchar": [
        (31.610, -2.226), (31.635, -2.217), (31.600, -2.250), (31.615, -2.270)
    ],
    "Bejaia": [
        (36.759, 5.084), (36.750, 5.100), (36.780, 5.070), (36.760, 5.110)
    ]
}

# Helper function to generate location with city and avoid duplicates
def generate_location(assigned_locations):
    city = random.choice(list(LOCATIONS.keys()))
    available_locations = [loc for loc in LOCATIONS[city] if loc not in assigned_locations]
    if not available_locations:
        return None, None  # No available locations left
    location = random.choice(available_locations)
    assigned_locations.add(location)
    return city, location

# Generate user data with city assignment, avoiding duplicate locations
def generate_user_data(user_id, assigned_locations):
    city, location = generate_location(assigned_locations)
    if city is None:
        return None  # No available location left
    categories_of_interest = random.sample(SERVICE_CATEGORIES, random.randint(2, 4))
    reviewed_service_ids = random.sample(range(1, 501), random.randint(0, 3))
    click_count_per_service = {sid: random.randint(1, 10) for sid in reviewed_service_ids}
    total_service_views = sum(click_count_per_service.values()) + random.randint(0, 10)
    
    return {
        "user_id": user_id,
        "city": city,
        "location_x": location[0],
        "location_y": location[1],
        "age": random.randint(18, 65),
        "gender": random.choice(["Male", "Female"]),
        "service_categories_interest": categories_of_interest,
        "reviewed_service_ids": reviewed_service_ids,
        "click_count_per_service": click_count_per_service,
        "total_service_views": total_service_views
    }

# Generate service data with city assignment, avoiding duplicate locations
def generate_service_data(service_id, assigned_locations):
    city, location = generate_location(assigned_locations)
    if city is None:
        return None  # No available location left
    category = random.choice(SERVICE_CATEGORIES)
    click_count = random.randint(0, 100)
    
    return {
        "service_id": service_id,
        "city": city,
        "provider_location_x": location[0],
        "provider_location_y": location[1],
        "service_category": category,
        "review_avg": round(random.uniform(3.0, 5.0), 1),
        "review_count": random.randint(1, 50),
        "click_count": click_count,
        "provider_age": random.randint(25, 60),
        "provider_experience": random.randint(1, 20)
    }

# Initialize set to track assigned locations
assigned_locations = set()

# Generate datasets
user_data = []
for user_id in range(1, 2001):
    user = generate_user_data(user_id, assigned_locations)
    if user:
        user_data.append(user)

service_data = []
assigned_locations.clear()  # Reset for services to avoid overlap with users' locations
for service_id in range(1, 501):
    service = generate_service_data(service_id, assigned_locations)
    if service:
        service_data.append(service)

# Convert to DataFrames
user_df = pd.DataFrame(user_data)
service_df = pd.DataFrame(service_data)

# Display the first few rows of each dataset
user_df.head(), service_df.head()


(   user_id        city  location_x  location_y  age  gender  \
 0        1      Béchar      31.600      -2.250   48    Male   
 1        2      Biskra      34.850       5.750   35  Female   
 2        3      Béchar      31.610      -2.226   58  Female   
 3        4  Tizi Ouzou      36.734       4.047   63  Female   
 4        5      Biskra      34.870       5.800   64    Male   
 
                          service_categories_interest reviewed_service_ids  \
 0             [Commercial Cleaning, Quranic Studies]           [438, 231]   
 1                         [Masonry, Makeup Services]            [58, 128]   
 2  [Post-Construction Cleaning, Quranic Studies, ...                   []   
 3  [Local Delivery Services, Commercial Cleaning,...                   []   
 4                       [Plumbing, Furniture Moving]                [240]   
 
   click_count_per_service  total_service_views  
 0        {438: 1, 231: 1}                    9  
 1         {58: 9, 128: 3}                  

In [74]:
print(clustered_user_df.groupby(['city']).size())


city
Algiers       324
Bejaia        336
Biskra        338
Béchar        317
Oran          351
Tizi Ouzou    334
dtype: int64


In [72]:
 clustered_user_df[clustered_user_df['cluster'] == 0].head(3)

Unnamed: 0,user_id,city,location_x,location_y,age,gender,service_categories_interest,reviewed_service_ids,click_count_per_service,total_service_views,cluster
0,1,Biskra,34.85,5.75,61,Female,"[Painting and Decoration, Residential Cleaning...","[427, 346, 87]","{427: 3, 346: 6, 87: 2}",16,0
1,4,Biskra,34.85,5.75,49,Female,"[Plumbing, Makeup Services, Hammam and Spa Ser...","[432, 364]","{432: 10, 364: 5}",21,0
3,12,Biskra,34.85,5.75,42,Female,"[Hairdressing, Commercial Cleaning]",[50],{50: 2},6,0


In [73]:
 clustered_user_df[clustered_user_df['cluster'] == 1].head(3)

Unnamed: 0,user_id,city,location_x,location_y,age,gender,service_categories_interest,reviewed_service_ids,click_count_per_service,total_service_views,cluster
4,22,Biskra,34.866,5.733,39,Male,"[Electrical Work, Car Wash, Traditional Algeri...","[124, 226, 21]","{124: 7, 226: 5, 21: 3}",25,1
8,34,Biskra,34.866,5.733,53,Male,"[Elderly Companionship Services, Language Less...",[],{},3,1
9,38,Biskra,34.866,5.733,46,Male,"[Tire Services, Hammam and Spa Services, Babys...",[86],{86: 9},11,1


In [67]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define function to extract features for clustering
def generate_city_features(city_users):
    # Combine features for clustering
    features = pd.concat([
        city_users[['location_x', 'location_y']],  # Geographic location
        city_users[['age']],  # Age
        pd.get_dummies(city_users['gender'], prefix="gender"),  # Gender (one-hot)
    ], axis=1)
    
    # Normalize features
    scaler = StandardScaler()
    return scaler.fit_transform(features)

# Train K-Means clustering for each city with unique cluster IDs
def train_city_clusters_with_offset(user_df, n_clusters_per_city=5):
    city_clusters = {}
    cluster_offset = 0  # Tracks cumulative offset for unique cluster IDs
    
    for city in user_df['city'].unique():
        print(f"Clustering for city: {city}")
        
        # Filter users by city
        city_users = user_df[user_df['city'] == city].reset_index(drop=True)
        
        # Generate features for clustering
        city_features = generate_city_features(city_users)
        
        # Apply K-Means
        kmeans = KMeans(n_clusters=n_clusters_per_city, random_state=42)
        city_users['cluster'] = kmeans.fit_predict(city_features) + cluster_offset
        
        # Update cluster offset
        cluster_offset += n_clusters_per_city
        
        # Store clustered data
        city_clusters[city] = city_users
    
    return city_clusters

# Train clustering for each city with unique cluster IDs
n_clusters_per_city = 5  # Adjust based on city size
city_cluster_results = train_city_clusters_with_offset(user_df, n_clusters_per_city=n_clusters_per_city)

# Combine results into a single DataFrame
clustered_user_df = pd.concat(city_cluster_results.values(), ignore_index=True)

# Display the first few rows of the clustered data
print(clustered_user_df[['user_id', 'city', 'cluster', 'location_x', 'location_y']].head())

# Check the unique clusters
print("Unique clusters:", clustered_user_df['cluster'].unique())


Clustering for city: Biskra
Clustering for city: Algiers
Clustering for city: Béchar
Clustering for city: Oran
Clustering for city: Bejaia
Clustering for city: Tizi Ouzou
   user_id    city  cluster  location_x  location_y
0        1  Biskra        0      34.850       5.750
1        4  Biskra        0      34.850       5.750
2        7  Biskra        3      34.866       5.733
3       12  Biskra        0      34.850       5.750
4       22  Biskra        1      34.866       5.733
Unique clusters: [ 0  3  1  2  4  7  5  9  6  8 11 10 13 12 14 17 18 16 15 19 21 20 23 22
 24 27 25 29 28 26]


In [75]:
import joblib

# Save the trained KMeans model and scaler
def save_model(kmeans, scaler, filename="clustering_model.pkl"):
    joblib.dump((kmeans, scaler), filename)
    print(f"Model saved to {filename}")

# Example usage after training the model:
kmeans, scaler = train_city_clusters_with_offset(user_df, n_clusters_per_city=5)
save_model(kmeans, scaler)


Clustering for city: Biskra
Clustering for city: Algiers
Clustering for city: Béchar
Clustering for city: Oran
Clustering for city: Bejaia
Clustering for city: Tizi Ouzou


ValueError: too many values to unpack (expected 2)

In [76]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define function to extract features for clustering
def generate_city_features(city_users):
    # Combine features for clustering
    features = pd.concat([
        city_users[['location_x', 'location_y']],  # Geographic location
        city_users[['age']],  # Age
        pd.get_dummies(city_users['gender'], prefix="gender"),  # Gender (one-hot)
    ], axis=1)
    
    # Normalize features
    scaler = StandardScaler()
    return scaler.fit_transform(features), scaler

# Train K-Means clustering for each city with unique cluster IDs
def train_city_clusters_with_offset(user_df, n_clusters_per_city=5):
    city_clusters = {}
    cluster_offset = 0  # Tracks cumulative offset for unique cluster IDs
    all_city_scalers = {}
    
    for city in user_df['city'].unique():
        print(f"Clustering for city: {city}")
        
        # Filter users by city
        city_users = user_df[user_df['city'] == city].reset_index(drop=True)
        
        # Generate features for clustering and scaler
        city_features, scaler = generate_city_features(city_users)
        
        # Apply K-Means
        kmeans = KMeans(n_clusters=n_clusters_per_city, random_state=42)
        city_users['cluster'] = kmeans.fit_predict(city_features) + cluster_offset
        
        # Update cluster offset
        cluster_offset += n_clusters_per_city
        
        # Store clustered data and the scaler
        city_clusters[city] = city_users
        all_city_scalers[city] = scaler
    
    return city_clusters, all_city_scalers

# Train clustering for each city with unique cluster IDs
n_clusters_per_city = 5  # Adjust based on city size
city_cluster_results, all_city_scalers = train_city_clusters_with_offset(user_df, n_clusters_per_city=n_clusters_per_city)

# Combine results into a single DataFrame
clustered_user_df = pd.concat(city_cluster_results.values(), ignore_index=True)

# Display the first few rows of the clustered data
print(clustered_user_df[['user_id', 'city', 'cluster', 'location_x', 'location_y']].head())

# Save model and scalers
def save_model(kmeans_dict, scalers_dict, filename="clustering_model.pkl"):
    # Save models and scalers
    joblib.dump((kmeans_dict, scalers_dict), filename)
    print(f"Model and scalers saved to {filename}")

# Save the model and scalers
save_model(city_cluster_results, all_city_scalers)


Clustering for city: Biskra
Clustering for city: Algiers
Clustering for city: Béchar
Clustering for city: Oran
Clustering for city: Bejaia
Clustering for city: Tizi Ouzou
   user_id    city  cluster  location_x  location_y
0        1  Biskra        0      34.850       5.750
1        4  Biskra        0      34.850       5.750
2        7  Biskra        3      34.866       5.733
3       12  Biskra        0      34.850       5.750
4       22  Biskra        1      34.866       5.733
Model and scalers saved to clustering_model.pkl


In [77]:
import joblib

# Load the trained KMeans models and scalers
def load_model(filename="clustering_model.pkl"):
    kmeans_dict, scalers_dict = joblib.load(filename)
    print(f"Model and scalers loaded from {filename}")
    return kmeans_dict, scalers_dict

# Load the model and scalers
kmeans_dict, scalers_dict = load_model()

# Example: Predicting cluster for new user in Algiers
new_user_data = pd.DataFrame({
    'location_x': [36.7538],  # Example location
    'location_y': [3.0588],   # Example location
    'age': [30],               # Example age
    'gender': ['Male']         # Example gender
})

# Preprocess new user data
new_user_features, scaler = scalers_dict['Algiers'].transform(new_user_data[['location_x', 'location_y', 'age']])  # Transform using the scaler for Algiers
new_user_cluster = kmeans_dict['Algiers'].predict(new_user_features)  # Predict the cluster

print("Predicted cluster for new user:", new_user_cluster)


Model and scalers loaded from clustering_model.pkl


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- gender_Female
- gender_Male


In [83]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd

# Helper function to perform one-hot encoding consistently for service categories interest
def encode_service_categories(city_users):
    # Apply MultiLabelBinarizer to transform the service categories into binary columns
    mlb = MultiLabelBinarizer(classes=SERVICE_CATEGORIES)  # Ensure it uses the full list of categories
    return mlb.fit_transform(city_users['service_categories_interest'])

# Feature extraction for city data with one-hot encoding for gender and service categories interest
def generate_city_features(city_users):
    # One-hot encode gender
    gender_encoded = encode_gender(city_users['gender'])
    
    # One-hot encode service categories interest
    service_interest_encoded = encode_service_categories(city_users)
    
    # Combine features for clustering
    features = pd.concat([
        city_users[['location_x', 'location_y']],  # Geographic location
        city_users[['age']],  # Age
        pd.DataFrame(gender_encoded, columns=["gender_Male"]),  # One-hot encoded gender
        pd.DataFrame(service_interest_encoded, columns=SERVICE_CATEGORIES),  # One-hot encoded service categories
    ], axis=1)
    
    # Normalize features
    scaler = StandardScaler()
    return scaler.fit_transform(features), scaler

# Train K-Means clustering for each city with unique cluster IDs
def train_city_clusters_with_offset(user_df, n_clusters_per_city=5):
    city_clusters = {}
    cluster_offset = 0  # Tracks cumulative offset for unique cluster IDs
    all_city_scalers = {}
    all_city_encoders = {}
    
    for city in user_df['city'].unique():
        print(f"Clustering for city: {city}")
        
        # Filter users by city
        city_users = user_df[user_df['city'] == city].reset_index(drop=True)
        
        # Generate features for clustering and scaler
        city_features, scaler = generate_city_features(city_users)
        
        # Apply K-Means
        kmeans = KMeans(n_clusters=n_clusters_per_city, random_state=42)
        city_users['cluster'] = kmeans.fit_predict(city_features) + cluster_offset
        
        # Update cluster offset
        cluster_offset += n_clusters_per_city
        
        # Store clustered data, scaler, and encoder
        city_clusters[city] = city_users
        all_city_scalers[city] = scaler
        all_city_encoders[city] = MultiLabelBinarizer(classes=SERVICE_CATEGORIES).fit(city_users['service_categories_interest'])
    
    return city_clusters, all_city_scalers, all_city_encoders

# Train clustering for each city with unique cluster IDs
n_clusters_per_city = 5  # Adjust based on city size
city_cluster_results, all_city_scalers, all_city_encoders = train_city_clusters_with_offset(user_df, n_clusters_per_city=n_clusters_per_city)

# Save the model, scalers, and encoders
def save_model(kmeans_dict, scalers_dict, encoders_dict, filename="clustering_model.pkl"):
    joblib.dump((kmeans_dict, scalers_dict, encoders_dict), filename)
    print(f"Model and scalers saved to {filename}")

# Save the model and scalers
save_model(city_cluster_results, all_city_scalers, all_city_encoders)


Clustering for city: Béchar
Clustering for city: Algiers
Clustering for city: Biskra
Clustering for city: Oran
Clustering for city: Tizi Ouzou
Clustering for city: Bejaia
Model and scalers saved to clustering_model.pkl


In [99]:
# Ensure clusters are assigned to the user_df dataframe
def assign_clusters_to_user_df(city_cluster_results):
    # Iterate through the city clusters and assign the cluster labels to user_df
    user_df_list = []
    for city, city_users in city_cluster_results.items():
        user_df_list.append(city_users[['user_id', 'city', 'location_x', 'location_y', 'age', 'gender', 'service_categories_interest', 'cluster']])
    
    # Concatenate all city clusters back into a single dataframe
    final_user_df = pd.concat(user_df_list, ignore_index=True)
    return final_user_df

# Ensure user_df has the correct cluster assignments
user_df = assign_clusters_to_user_df(city_cluster_results)


In [95]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.cluster import KMeans
import pandas as pd
import joblib

# Define constants for service categories
SERVICE_CATEGORIES = [
    "Plumbing", "Electrical Work", "Painting and Decoration", "Masonry",
    "Residential Cleaning", "Commercial Cleaning", "Post-Construction Cleaning",
    "Taxi Services", "Furniture Moving", "Local Delivery Services",
    "Car Repair and Maintenance", "Tire Services", "Car Wash",
    "Traditional Algerian Catering", "Event Catering",
    "Babysitting", "Elderly Companionship Services",
    "Language Lessons", "Quranic Studies", "Academic Tutoring",
    "Hairdressing", "Makeup Services", "Hammam and Spa Services"
]

# Helper function to one-hot encode service categories interest
def encode_service_categories(city_users):
    # Use MultiLabelBinarizer for multi-hot encoding service interests
    mlb = MultiLabelBinarizer(classes=SERVICE_CATEGORIES)
    return mlb.fit_transform(city_users['service_categories_interest'])

# Feature extraction for city data with one-hot encoding for gender and service categories interest
def generate_city_features(city_users):
    # One-hot encode gender
    gender_encoded = encode_gender(city_users['gender'])
    
    # One-hot encode service categories interest
    service_interest_encoded = encode_service_categories(city_users)
    
    # Combine features for clustering
    features = pd.concat([
        city_users[['location_x', 'location_y']],  # Geographic location
        city_users[['age']],  # Age
        pd.DataFrame(gender_encoded, columns=["gender_Male"]),  # One-hot encoded gender
        pd.DataFrame(service_interest_encoded, columns=SERVICE_CATEGORIES),  # One-hot encoded service categories
    ], axis=1)
    
    # Normalize features
    scaler = StandardScaler()
    return scaler.fit_transform(features), scaler
from sklearn.cluster import KMeans
import numpy as np

# Train K-Means clustering for each city with unique cluster IDs, adjusting for small sample sizes
def train_city_clusters_with_offset(user_df, n_clusters_per_city=5):
    city_clusters = {}
    cluster_offset = 0  # Tracks cumulative offset for unique cluster IDs
    all_city_scalers = {}
    all_city_encoders = {}
    
    for city in user_df['city'].unique():
        print(f"Clustering for city: {city}")
        
        # Filter users by city
        city_users = user_df[user_df['city'] == city].reset_index(drop=True)
        
        # Check if the number of users is smaller than n_clusters_per_city
        num_users = len(city_users)
        clusters_to_use = min(num_users, n_clusters_per_city)  # Use the smaller of num_users or n_clusters_per_city
        
        # Generate features for clustering and scaler
        city_features, scaler = generate_city_features(city_users)
        
        # Apply K-Means with the adjusted number of clusters
        kmeans = KMeans(n_clusters=clusters_to_use, random_state=42)
        city_users['cluster'] = kmeans.fit_predict(city_features) + cluster_offset
        
        # Update cluster offset
        cluster_offset += clusters_to_use
        
        # Store clustered data, scaler, and encoder
        city_clusters[city] = city_users
        all_city_scalers[city] = scaler
        all_city_encoders[city] = MultiLabelBinarizer(classes=SERVICE_CATEGORIES).fit(city_users['service_categories_interest'])
    
    return city_clusters, all_city_scalers, all_city_encoders

# Train clustering for each city with unique cluster IDs
n_clusters_per_city = 5  # Adjust based on city size
city_cluster_results, all_city_scalers, all_city_encoders = train_city_clusters_with_offset(user_df, n_clusters_per_city=n_clusters_per_city)

# Save the model, scalers, and encoders
def save_model(kmeans_dict, scalers_dict, encoders_dict, filename="clustering_model.pkl"):
    joblib.dump((kmeans_dict, scalers_dict, encoders_dict), filename)
    print(f"Model and scalers saved to {filename}")

# Save the model and scalers
save_model(city_cluster_results, all_city_scalers, all_city_encoders)


Clustering for city: Béchar
Clustering for city: Biskra
Clustering for city: Tizi Ouzou
Clustering for city: Bejaia
Clustering for city: Algiers
Clustering for city: Oran
Model and scalers saved to clustering_model.pkl


In [107]:
user_df[user_df['cluster'] == 17]

Unnamed: 0,user_id,city,location_x,location_y,age,gender,service_categories_interest,cluster
21,26,Algiers,36.783,3.068,46,Female,"[Furniture Moving, Traditional Algerian Cateri...",17
24,56,Algiers,36.734,3.078,50,Female,"[Quranic Studies, Local Delivery Services, Tir...",17
27,64,Algiers,36.792,3.085,45,Female,"[Painting and Decoration, Local Delivery Servi...",17
28,68,Algiers,36.765,3.07,22,Female,"[Event Catering, Car Repair and Maintenance]",17
32,104,Algiers,36.7538,3.0588,45,Female,"[Traditional Algerian Catering, Taxi Services]",17
34,136,Algiers,36.786,3.059,23,Female,"[Traditional Algerian Catering, Local Delivery...",17


In [108]:
# Display the first few rows of the user dataset with cluster labels
print(user_df[['user_id', 'city', 'location_x', 'location_y', 'age', 'gender', 'service_categories_interest', 'cluster']].head())


   user_id    city  location_x  location_y  age  gender  \
0        1  Béchar      31.600      -2.250   48    Male   
1        3  Béchar      31.610      -2.226   58  Female   
2       10  Béchar      31.635      -2.217   44  Female   
3       11  Béchar      31.615      -2.270   51    Male   
4        2  Biskra      34.850       5.750   35  Female   

                         service_categories_interest  cluster  
0             [Commercial Cleaning, Quranic Studies]        2  
1  [Post-Construction Cleaning, Quranic Studies, ...        0  
2  [Quranic Studies, Tire Services, Event Caterin...        3  
3  [Furniture Moving, Elderly Companionship Servi...        1  
4                         [Masonry, Makeup Services]        6  


In [109]:
user_df.to_excel('clustered_user_data.xlsx', index=False)
print("Dataset saved as 'clustered_user_data.xlsx'")

Dataset saved as 'clustered_user_data.xlsx'


In [None]:
service_df.to_excel('service_data.xlsx', index=False)