In [4]:
import random
import pandas as pd
import numpy as np
from geopy.distance import geodesic

# Define the service categories
SERVICE_CATEGORIES = [
    "houseCleaning", "electricity", "plumbing", "gardening", "painting", "carpentry", 
    "pestControl", "acRepair", "vehicleRepair", "applianceInstallation", "itSupport", 
    "homeSecurity", "interiorDesign", "windowCleaning", "furnitureAssembly"
]

# Define the Algerian cities with geographic ranges
algerian_cities = {
    "Algiers": {"lat_min": 36.6, "lat_max": 36.9, "lon_min": 2.9, "lon_max": 3.2},
    "Oran": {"lat_min": 35.6, "lat_max": 35.8, "lon_min": -0.8, "lon_max": -0.5},
    "Constantine": {"lat_min": 36.2, "lat_max": 36.4, "lon_min": 6.5, "lon_max": 6.7},
    "Annaba": {"lat_min": 36.8, "lat_max": 37.1, "lon_min": 7.6, "lon_max": 7.8},
    "Blida": {"lat_min": 36.4, "lat_max": 36.7, "lon_min": 2.5, "lon_max": 3.2},
    "Sétif": {"lat_min": 35.5, "lat_max": 36.6, "lon_min": 5.3, "lon_max": 6.5},
    "Tébessa": {"lat_min": 34.5, "lat_max": 35.8, "lon_min": 7.5, "lon_max": 8.7}
}

# Define the ranges for service and user IDs by city
city_service_id_ranges = {
    "Algiers": (1, 200),
    "Oran": (201, 350),
    "Constantine": (351, 500),
    "Annaba": (501, 650),
    "Blida": (651, 800),
    "Sétif": (801, 950),
    "Tébessa": (951, 1100)
}

city_user_id_ranges = {
    "Algiers": (1, 400),
    "Oran": (401, 700),
    "Constantine": (701, 1000),
    "Annaba": (1001, 1300),
    "Blida": (1301, 1600),
    "Sétif": (1601, 1800),
    "Tébessa": (1801, 2000)
}

# Add a random offset to the location
def add_random_offset(location):
    lat, lon = location
    lat += random.gauss(0, 0.01)
    lon += random.gauss(0, 0.01)
    return (round(lat, 6), round(lon, 6))

# Generate a random location within a city's geographic range
def generate_location(assigned_locations, city):
    lat_min, lat_max = algerian_cities[city]["lat_min"], algerian_cities[city]["lat_max"]
    lon_min, lon_max = algerian_cities[city]["lon_min"], algerian_cities[city]["lon_max"]
    
    lat = random.uniform(lat_min, lat_max)
    lon = random.uniform(lon_min, lon_max)
    location = (round(lat, 6), round(lon, 6))
    location = add_random_offset(location)
    
    while location in assigned_locations:
        lat = random.uniform(lat_min, lat_max)
        lon = random.uniform(lon_min, lon_max)
        location = (round(lat, 6), round(lon, 6))
        location = add_random_offset(location)
    
    assigned_locations.add(location)
    return location

# Calculate distance between user and service provider locations
def calculate_distance(user_location, provider_location):
    return geodesic(user_location, provider_location).km

# Generate service data
def generate_service_data():
    assigned_locations = set()
    service_data = []
    
    for city, (start_id, end_id) in city_service_id_ranges.items():
        for service_id in range(start_id, end_id + 1):
            location = generate_location(assigned_locations, city)
            service_data.append({
                "service_id": service_id,
                "city": city,
                "provider_location_x": location[0],
                "provider_location_y": location[1],
                "service_category": random.choice(SERVICE_CATEGORIES),
                "review_avg": round(random.uniform(3.0, 5.0), 1),
                "review_count": random.randint(1, 50),
                "click_count": random.randint(0, 100),
                "provider_age": random.randint(25, 60),
                "provider_experience": random.randint(1, 20)
            })
    
    return pd.DataFrame(service_data)

# Generate user data
def generate_user_data(service_df):
    assigned_locations = set()
    user_data = []
    
    for city, (start_id, end_id) in city_user_id_ranges.items():
        # Get available service IDs for this city and match them to the service categories of the user
        city_services = service_df[service_df['city'] == city]
        
        for user_id in range(start_id, end_id + 1):
            location = generate_location(assigned_locations, city)
            
            # Get the user's service categories of interest
            service_categories_interest = random.sample(SERVICE_CATEGORIES, random.randint(2, 4))
            # Filter the services available in the city to match the user's interest
            available_services = city_services[city_services['service_category'].isin(service_categories_interest)]
            
            # Generate reviews only from the services the user is interested in
            num_reviews = random.randint(0, min(3, len(available_services)))
            reviewed_service_ids = random.sample(available_services['service_id'].tolist(), num_reviews)
            
            click_count_per_service = {
                sid: random.randint(1, 10) 
                for sid in reviewed_service_ids
            }
            
            # Create the binary "clicked" label based on if the user interacted with any service
            clicked_label = 1 if sum(click_count_per_service.values()) > 0 else 0
            
            user_data.append({
                "user_id": user_id,
                "city": city,
                "location_x": location[0],
                "location_y": location[1],
                "age": random.randint(18, 65),
                "gender": random.choice(["Male", "Female"]),
                "service_categories_interest": service_categories_interest,
                "reviewed_service_ids": reviewed_service_ids,
                "click_count_per_service": click_count_per_service,
                "total_service_views": sum(click_count_per_service.values()) + random.randint(0, 10),
                "clicked": clicked_label  # This is the label for the recommendation system
            })
    
    return pd.DataFrame(user_data)

# Generate datasets
service_df = generate_service_data()
user_df = generate_user_data(service_df)



NameError: name 'generate_location' is not defined

In [6]:
user_df

Unnamed: 0,user_id,city,location_x,location_y,age,gender,service_categories_interest,reviewed_service_ids,click_count_per_service,total_service_views,clicked
0,1,Algiers,36.863548,3.168535,34,Male,"[pestControl, houseCleaning]","[6, 86, 154]","{6: 7, 86: 1, 154: 8}",17,1
1,2,Algiers,36.869598,3.013093,55,Male,"[electricity, houseCleaning]","[181, 48, 50]","{181: 6, 48: 1, 50: 4}",15,1
2,3,Algiers,36.777095,2.937226,39,Male,"[carpentry, houseCleaning]",[],{},6,0
3,4,Algiers,36.811242,2.944797,56,Female,"[applianceInstallation, houseCleaning]",[181],{181: 3},3,1
4,5,Algiers,36.823199,2.924206,60,Female,"[plumbing, interiorDesign, houseCleaning]","[123, 34, 118]","{123: 2, 34: 8, 118: 7}",27,1
...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,Tébessa,35.695434,8.188362,25,Male,"[applianceInstallation, furnitureAssembly]",[],{},4,0
1996,1997,Tébessa,34.577042,8.241045,50,Male,"[applianceInstallation, itSupport, furnitureAs...","[1051, 1086, 1020]","{1051: 3, 1086: 5, 1020: 4}",16,1
1997,1998,Tébessa,34.549716,8.372643,62,Male,"[interiorDesign, furnitureAssembly]",[],{},6,0
1998,1999,Tébessa,35.297053,8.437502,25,Female,"[plumbing, furnitureAssembly]",[],{},8,0


In [16]:
user_df[user_df.user_id==20]

Unnamed: 0,user_id,city,location_x,location_y,age,gender,service_categories_interest,reviewed_service_ids,click_count_per_service,total_service_views,clicked
19,20,Algiers,36.721456,3.096135,42,Male,"[carpentry, houseCleaning]","[107, 182, 94]","{107: 5, 182: 6, 94: 9}",25,1


In [12]:

# Save to CSV
service_df.to_csv('service_data2.csv', index=False)
user_df.to_csv('user_data2.csv', index=False)

print("Datasets generated and saved as 'service_data.csv' and 'user_data.csv'.")

Datasets generated and saved as 'service_data.csv' and 'user_data.csv'.


In [2]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Load the datasets (assuming they were saved as 'service_data.csv' and 'user_data.csv')
service_df = pd.read_csv('service_data.csv')
user_df = pd.read_csv('user_data.csv')

# Prepare the data for collaborative filtering
# We need to create an interaction matrix where rows are users, and columns are services.
# We will use `clicked` as the interaction value (1 for clicked, 0 for not clicked)

# Create a user-item interaction matrix
interaction_matrix = pd.pivot_table(user_df, index='user_id', columns='reviewed_service_ids', 
                                   values='clicked', aggfunc='max', fill_value=0)

# Check if the interaction matrix is sparse and needs dimensionality reduction
print(interaction_matrix.shape)

# Apply SVD (Singular Value Decomposition) for matrix factorization
svd = TruncatedSVD(n_components=50, random_state=42)  # You can adjust `n_components`
interaction_matrix_svd = svd.fit_transform(interaction_matrix)

# Reconstruct the matrix to get the predicted interaction values
interaction_matrix_reconstructed = svd.inverse_transform(interaction_matrix_svd)

# Convert the reconstructed matrix back into a DataFrame for easier analysis
interaction_matrix_pred = pd.DataFrame(interaction_matrix_reconstructed, 
                                       columns=interaction_matrix.columns, 
                                       index=interaction_matrix.index)

# Generate recommendations for a specific user (e.g., user_id=10)
user_id = 10  # Change this to any user_id from your dataset
user_row = interaction_matrix_pred.loc[user_id]

# Sort services based on predicted interaction values
recommended_services = user_row.sort_values(ascending=False).head(10)  # Top 10 recommendations

# Display the recommended services for the user
recommended_service_ids = recommended_services.index
recommended_services_info = service_df[service_df['service_id'].isin(recommended_service_ids)]

# Display the top recommended services with their details
print("Top 10 Recommended Services for User {}:".format(user_id))
print(recommended_services_info[['service_id', 'service_category', 'review_avg', 'provider_age']])

# You can adjust the number of recommendations, include additional features, etc.


(2000, 1387)
Top 10 Recommended Services for User 10:
Empty DataFrame
Columns: [service_id, service_category, review_avg, provider_age]
Index: []


In [3]:
# Check how many services User 10 has reviewed
user_10_data = user_df[user_df['user_id'] == 10]
print("User 10's reviewed services:")
print(user_10_data[['reviewed_service_ids', 'clicked']])

# Check the interaction matrix for User 10
user_10_interactions = interaction_matrix.loc[10]
print("\nUser 10's interactions in the matrix:")
print(user_10_interactions[user_10_interactions > 0])


User 10's reviewed services:
  reviewed_service_ids  clicked
9                [161]        1

User 10's interactions in the matrix:
reviewed_service_ids
[161]    1
Name: 10, dtype: int64


In [50]:
def cold_start_recommendations(user, service_df, top_n=10):
    # Filter services by user's city
    city_services = service_df[service_df['city'] == user['city']]
    
    # Score services: Give priority to user's interest categories
    city_services['interest_score'] = city_services['service_category'].apply(
        lambda x: 1 if x in user['service_categories_interest'] else 0
    )
    
    # Score based on review_avg and location proximity
    city_services['review_score'] = city_services['review_avg']
    city_services['distance_score'] = 1 / (
        1 + np.sqrt((city_services['provider_location_x'] - user['location_x'])**2 +
                    (city_services['provider_location_y'] - user['location_y'])**2)
    )
    
    # Weighted scoring (customizable weights for flexibility)
    city_services['final_score'] = (
        0.2 * city_services['interest_score'] +
        0.3 * city_services['review_score'] +
        0.2 * city_services['distance_score']
    )
    
    # Rank services by final score
    ranked_services = city_services.sort_values(by='final_score', ascending=False)
    return ranked_services.head(top_n)

# Example usage for a single user
user = user_df.iloc[0]
recommendations = cold_start_recommendations(user, service_df)
print(recommendations[['service_id','city', 'service_category', 'final_score']])


     service_id     city service_category  final_score
130         131  Algiers      pestControl     1.852640
129         130  Algiers    houseCleaning     1.833936
186         187  Algiers      pestControl     1.823690
1             2  Algiers      pestControl     1.816068
71           72  Algiers    houseCleaning     1.803586
29           30  Algiers    houseCleaning     1.740522
50           51  Algiers      pestControl     1.710605
115         116  Algiers      pestControl     1.693828
121         122  Algiers      pestControl     1.672472
136         137  Algiers        gardening     1.668231


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  city_services['interest_score'] = city_services['service_category'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  city_services['review_score'] = city_services['review_avg']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  city_services['distance_score'] = 1 / (
A value is trying to be set on

In [32]:
def generate_interaction_matrix(user_df):
    # Flatten click_count_per_service into rows
    interactions = []
    for _, row in user_df.iterrows():
        for service_id, click_count in row['click_count_per_service'].items():
            interactions.append((row['user_id'], service_id, click_count))
    
    # Create DataFrame for interactions
    interactions_df = pd.DataFrame(interactions, columns=['user_id', 'service_id', 'click_count'])
    
    # Pivot to create interaction matrix
    interaction_matrix = interactions_df.pivot_table(
        index='user_id', columns='service_id', values='click_count', fill_value=0
    )
    
    return interaction_matrix

# Generate interaction matrix
interaction_matrix = generate_interaction_matrix(user_df)
print(interaction_matrix)


service_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                                 ...   
1            0.0   0.0   0.0   0.0   0.0   7.0   0.0   0.0   0.0   0.0  ...   
2            0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
4            0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
5            0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
7            0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
...          ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
1993         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
1994         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
1995         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
1997         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
2000         0.0   0.0   0.0   0.0   0.0   0.0   0.0

In [33]:
from sklearn.metrics.pairwise import cosine_similarity

def generate_user_similarity(interaction_matrix):
    # Compute cosine similarity
    similarity_matrix = cosine_similarity(interaction_matrix)
    
    # Convert to DataFrame for easy lookup
    user_similarity_df = pd.DataFrame(
        similarity_matrix,
        index=interaction_matrix.index,
        columns=interaction_matrix.index
    )
    
    return user_similarity_df

# Generate user similarity matrix
user_similarity_df = generate_user_similarity(interaction_matrix)
print(user_similarity_df.head())


user_id  1         2         4     5     7         8         9         10    \
user_id                                                                       
1         1.0  0.000000  0.000000   0.0   0.0  0.523391  0.000000  0.000000   
2         0.0  1.000000  0.824163   0.0   0.0  0.074629  0.000000  0.000000   
4         0.0  0.824163  1.000000   0.0   0.0  0.000000  0.000000  0.000000   
5         0.0  0.000000  0.000000   1.0   0.0  0.000000  0.000000  0.000000   
7         0.0  0.000000  0.000000   0.0   1.0  0.000000  0.455842  0.614658   

user_id  11        12    ...  1987  1988  1989  1990  1992  1993  1994  1995  \
user_id                  ...                                                   
1         0.0  0.000000  ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
2         0.0  0.000000  ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
4         0.0  0.000000  ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
5         0.0  0.000000  ...   0.0   0.0   0.0

In [47]:
def collaborative_filtering_recommendations(user_id, interaction_matrix, user_similarity_df, service_df, top_n=10):
    # Find similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:]  # Exclude self
    
    # Weighted aggregation of similar users' interactions
    similar_users_interactions = interaction_matrix.loc[similar_users.index].mean()
    
    # Rank services by interaction scores
    recommended_services = similar_users_interactions.sort_values(ascending=False).head(top_n)
    
    # Fetch service details for recommended services
    service_recommendations = service_df[service_df['service_id'].isin(recommended_services.index)]
    
    # Get the city of the current user from the user_df
    user_city = user_df[user_df['user_id'] == user_id]['city'].iloc[0]  # Get user's city
    
    # Filter services to include only those from the same city as the user
    service_recommendations_same_city = service_recommendations[service_recommendations['city'] == user_city]
    
    # Return ranked services from the same city
    return service_recommendations_same_city.sort_values(by='click_count', ascending=False)

# Example usage
user_id = user_df.iloc[0]['user_id']
cf_recommendations = collaborative_filtering_recommendations(user_id, interaction_matrix, user_similarity_df, service_df)
print(cf_recommendations[['service_id', 'city', 'service_category', 'click_count']])


     service_id     city service_category  click_count
73           74  Algiers   windowCleaning           79
181         182  Algiers        carpentry           10
62           63  Algiers   windowCleaning            9


In [35]:
def hybrid_recommendations(user, service_df, interaction_matrix, user_similarity_df, weights=(0.4, 0.3, 0.3), top_n=10):
    # Cold-Start Recommendations
    cold_start_recs = cold_start_recommendations(user, service_df, top_n)
    cold_start_scores = pd.Series(
        [5 - i for i in range(len(cold_start_recs))], index=cold_start_recs['service_id']
    )
    
    # Collaborative Filtering Recommendations
    if user['user_id'] in interaction_matrix.index:
        cf_recs = collaborative_filtering_recommendations(
            user['user_id'], interaction_matrix, user_similarity_df, service_df, top_n
        )
        cf_scores = pd.Series(
            [5 - i for i in range(len(cf_recs))], index=cf_recs['service_id']
        )
    else:
        cf_scores = pd.Series(dtype=float)
    
    # Combine scores with weights
    combined_scores = (
        cold_start_scores.mul(weights[0], fill_value=0) +
        cf_scores.mul(weights[1], fill_value=0)
    )
    
    # Rank combined recommendations
    ranked_service_ids = combined_scores.sort_values(ascending=False).head(top_n).index
    return service_df[service_df['service_id'].isin(ranked_service_ids)]

# Example usage
hybrid_recs = hybrid_recommendations(user, service_df, interaction_matrix, user_similarity_df)
print(hybrid_recs[['service_id', 'service_category', 'review_avg', 'click_count']])


     service_id service_category  review_avg  click_count
1             2      pestControl         4.9           76
11           12      pestControl         4.3           36
29           30    houseCleaning         4.6           27
50           51      pestControl         4.5           86
62           63   windowCleaning         3.4            9
71           72    houseCleaning         4.8           97
73           74   windowCleaning         3.4           79
115         116      pestControl         4.4           44
121         122      pestControl         4.4           97
129         130    houseCleaning         4.9           96


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  city_services['interest_score'] = city_services['service_category'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  city_services['review_score'] = city_services['review_avg']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  city_services['distance_score'] = 1 / (
A value is trying to be set on

In [51]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Function to calculate the distance between two locations (using Euclidean distance)
def calculate_distance(x1, y1, x2, y2):
    return np.sqrt((x1 - x2)**2 + (y1 - y2)**2)

# Function to generate recommendations for a user
def recommend_services(user_id, user_df, service_df):
    # Step 1: Get the user's data
    user_data = user_df[user_df['user_id'] == user_id].iloc[0]
    user_city = user_data['city']
    user_location = (user_data['location_x'], user_data['location_y'])
    user_interests = user_data['service_categories_interest']
    
    # Step 2: Filter services from the same city
    services_in_city = service_df[service_df['city'] == user_city]
    
    # Step 3: Compute the distance of each service from the user
    services_in_city['distance'] = services_in_city.apply(
        lambda row: calculate_distance(user_location[0], user_location[1], row['provider_location_x'], row['provider_location_y']), axis=1
    )
    
    # Step 4: Collaborative filtering - User similarity based on behavior (click_count and review_avg)
    interaction_matrix = user_df.pivot_table(index='user_id', columns='service_id', values='click_count', fill_value=0)
    
    # Calculate cosine similarity between the user and all other users based on their interactions
    user_vector = interaction_matrix.loc[user_id].values.reshape(1, -1)
    similarity_scores = cosine_similarity(user_vector, interaction_matrix.values)
    similarity_df = pd.DataFrame(similarity_scores.T, index=interaction_matrix.index, columns=['similarity'])
    
    # Get the most similar users (top 10 most similar users)
    similar_users = similarity_df.sort_values(by='similarity', ascending=False).iloc[1:11]
    
    # Step 5: Calculate the weighted interactions of similar users
    similar_users_interactions = interaction_matrix.loc[similar_users.index].mean()
    
    # Step 6: Content-based filtering - Services matching the user's interest categories
    # We filter services that match the user's service categories interest
    matching_services = services_in_city[services_in_city['service_category'].isin(user_interests)]
    
    # Step 7: Rank services based on collaborative filtering (click_count, reviews) and content-based (user interest)
    # We'll combine scores from both approaches
    services_in_city['cf_score'] = services_in_city['service_id'].map(similar_users_interactions)
    services_in_city['cb_score'] = services_in_city['service_category'].apply(
        lambda category: 1 if category in user_interests else 0
    )
    
    # Final ranking score based on collaborative filtering, content-based filtering, and proximity
    services_in_city['final_score'] = (
        services_in_city['cf_score'] * 0.4 + 
        services_in_city['cb_score'] * 0.3 + 
        (1 / (1 + services_in_city['distance'])) * 0.3  # The closer the service, the higher the score
    )
    
    # Step 8: Sort services by final score
    recommended_services = services_in_city.sort_values(by='final_score', ascending=False)
    
    # Return the top recommended services
    return recommended_services[['service_id', 'service_category', 'review_avg', 'click_count', 'provider_experience', 'distance', 'final_score']]

# Example usage
user_id = 19  # Replace with an actual user ID from your user_df
recommended_services = recommend_services(user_id, user_df, service_df)

# Display the top recommended services
print(recommended_services)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  services_in_city['distance'] = services_in_city.apply(


KeyError: 'click_count'

In [60]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Function to calculate the distance between two locations (using Euclidean distance)
def calculate_distance(x1, y1, x2, y2):
    return np.sqrt((x1 - x2)**2 + (y1 - y2)**2)

# Function to generate recommendations for a user
def recommend_services(user_id, user_df, service_df):
    # Step 1: Get the user's data
    user_data = user_df[user_df['user_id'] == user_id].iloc[0]
    user_city = user_data['city']
    user_location = (user_data['location_x'], user_data['location_y'])
    user_interests = user_data['service_categories_interest']
    
    # Step 2: Filter services from the same city
    services_in_city = service_df[service_df['city'] == user_city]
    
    # Step 3: Compute the distance of each service from the user
    services_in_city['distance'] = services_in_city.apply(
        lambda row: calculate_distance(user_location[0], user_location[1], row['provider_location_x'], row['provider_location_y']), axis=1
    )
    
    # Step 4: Collaborative filtering - User similarity based on behavior (click_count and review_avg)
    # We need to reshape user_df to have a user-service interaction matrix
    interaction_matrix = user_df[['user_id', 'reviewed_service_ids', 'click_count_per_service']].explode(['reviewed_service_ids', 'click_count_per_service'])
    
    # Create interaction matrix: One user can have many services they interacted with (click or review)
    interaction_matrix['service_id'] = interaction_matrix['reviewed_service_ids']
    interaction_matrix['click_count'] = interaction_matrix['click_count_per_service']
    
    # Now let's pivot the data to create a user-service matrix where interactions are stored
    user_service_matrix = interaction_matrix.pivot_table(index='user_id', columns='service_id', values='click_count', aggfunc='sum', fill_value=0)
    
    # Calculate cosine similarity between the target user and all other users
    user_vector = user_service_matrix.loc[user_id].values.reshape(1, -1)
    similarity_scores = cosine_similarity(user_vector, user_service_matrix.values)
    similarity_df = pd.DataFrame(similarity_scores.T, index=user_service_matrix.index, columns=['similarity'])
    
    # Get the most similar users (top 10 most similar users)
    similar_users = similarity_df.sort_values(by='similarity', ascending=False).iloc[1:11]
    
    # Step 5: Calculate the weighted interactions of similar users
    similar_users_interactions = user_service_matrix.loc[similar_users.index].mean()
    
    # Step 6: Content-based filtering - Services matching the user's interest categories
    # We filter services that match the user's service categories interest
    matching_services = services_in_city[services_in_city['service_category'].isin(user_interests)]
    
    # Step 7: Rank services based on collaborative filtering (click_count, reviews) and content-based (user interest)
    # We'll combine scores from both approaches
    services_in_city['cf_score'] = services_in_city['service_id'].map(similar_users_interactions)
    services_in_city['cb_score'] = services_in_city['service_category'].apply(
        lambda category: 1 if category in user_interests else 0
    )
    
    # Final ranking score based on collaborative filtering, content-based filtering, and proximity
    services_in_city['final_score'] = (
        services_in_city['cf_score'] * 0.4 + 
        services_in_city['cb_score'] * 0.3 + 
        (1 / (1 + services_in_city['distance'])) * 0.3  # The closer the service, the higher the score
    )
    
    # Step 8: Sort services by final score
    recommended_services = services_in_city.sort_values(by='final_score', ascending=False)
    
    # Return the top recommended services
    return recommended_services[['service_id','city', 'service_category', 'review_avg', 'click_count', 'provider_experience', 'distance', 'final_score']]

# Example usage
user_id = 1  # Replace with an actual user ID from your user_df
recommended_services = recommend_services(user_id, user_df, service_df)

# Display the top recommended services
recommended_services[['service_id','city', 'service_category', 'review_avg', 'click_count', 'provider_experience', 'final_score']].head(10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  services_in_city['distance'] = services_in_city.apply(
  user_service_matrix = interaction_matrix.pivot_table(index='user_id', columns='service_id', values='click_count', aggfunc='sum', fill_value=0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  services_in_city['cf_score'] = services_in_city['service_id'].map(similar_users_interactions)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas

Unnamed: 0,service_id,city,service_category,review_avg,click_count,provider_experience,final_score
153,154,Algiers,pestControl,4.2,51,3,19.05382
186,187,Algiers,pestControl,4.8,0,11,8.055535
85,86,Algiers,pestControl,4.0,26,8,7.441083
151,152,Algiers,painting,3.2,88,20,6.377797
130,131,Algiers,pestControl,5.0,47,7,5.76896
91,92,Algiers,homeSecurity,3.4,60,19,3.92276
68,69,Algiers,applianceInstallation,4.1,6,4,3.013838
66,67,Algiers,homeSecurity,4.0,91,5,2.960642
47,48,Algiers,houseCleaning,4.1,58,15,2.482283
32,33,Algiers,pestControl,3.3,14,15,1.873038


In [58]:
user

Unnamed: 0,user_id,city,location_x,location_y,age,gender,service_categories_interest,reviewed_service_ids,click_count_per_service,total_service_views,clicked
0,1,Algiers,36.863548,3.168535,34,Male,"[pestControl, houseCleaning]","[6, 86, 154]","{6: 7, 86: 1, 154: 8}",17,1
1,2,Algiers,36.869598,3.013093,55,Male,"[electricity, houseCleaning]","[181, 48, 50]","{181: 6, 48: 1, 50: 4}",15,1
2,3,Algiers,36.777095,2.937226,39,Male,"[carpentry, houseCleaning]",[],{},6,0
3,4,Algiers,36.811242,2.944797,56,Female,"[applianceInstallation, houseCleaning]",[181],{181: 3},3,1
4,5,Algiers,36.823199,2.924206,60,Female,"[plumbing, interiorDesign, houseCleaning]","[123, 34, 118]","{123: 2, 34: 8, 118: 7}",27,1
...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,Tébessa,35.695434,8.188362,25,Male,"[applianceInstallation, furnitureAssembly]",[],{},4,0
1996,1997,Tébessa,34.577042,8.241045,50,Male,"[applianceInstallation, itSupport, furnitureAs...","[1051, 1086, 1020]","{1051: 3, 1086: 5, 1020: 4}",16,1
1997,1998,Tébessa,34.549716,8.372643,62,Male,"[interiorDesign, furnitureAssembly]",[],{},6,0
1998,1999,Tébessa,35.297053,8.437502,25,Female,"[plumbing, furnitureAssembly]",[],{},8,0


In [6]:
import random
import pandas as pd
import json
from geopy.distance import geodesic

# Define the service categories
SERVICE_CATEGORIES = [
    "houseCleaning", "electricity", "plumbing", "gardening", "painting", "carpentry", 
    "pestControl", "acRepair", "vehicleRepair", "applianceInstallation", "itSupport", 
    "homeSecurity", "interiorDesign", "windowCleaning", "furnitureAssembly"
]

# Define the Algerian cities with geographic ranges
algerian_cities = {
    "Algiers": {"lat_min": 36.6, "lat_max": 36.9, "lon_min": 2.9, "lon_max": 3.2},
    "Oran": {"lat_min": 35.6, "lat_max": 35.8, "lon_min": -0.8, "lon_max": -0.5},
    "Constantine": {"lat_min": 36.2, "lat_max": 36.4, "lon_min": 6.5, "lon_max": 6.7},
    "Annaba": {"lat_min": 36.8, "lat_max": 37.1, "lon_min": 7.6, "lon_max": 7.8},
    "Blida": {"lat_min": 36.4, "lat_max": 36.7, "lon_min": 2.5, "lon_max": 3.2},
    "Sétif": {"lat_min": 35.5, "lat_max": 36.6, "lon_min": 5.3, "lon_max": 6.5},
    "Tébessa": {"lat_min": 34.5, "lat_max": 35.8, "lon_min": 7.5, "lon_max": 8.7}
}

# Define the ranges for service and user IDs by city
city_service_id_ranges = {
    "Algiers": (1, 200),
    "Oran": (201, 350),
    "Constantine": (351, 500),
    "Annaba": (501, 650),
    "Blida": (651, 800),
    "Sétif": (801, 950),
    "Tébessa": (951, 1100)
}

city_user_id_ranges = {
    "Algiers": (1, 400),
    "Oran": (401, 700),
    "Constantine": (701, 1000),
    "Annaba": (1001, 1300),
    "Blida": (1301, 1600),
    "Sétif": (1601, 1800),
    "Tébessa": (1801, 2000)
}

# Add a random offset to the location
def add_random_offset(location):
    lat, lon = location
    lat += random.gauss(0, 0.01)
    lon += random.gauss(0, 0.01)
    return (round(lat, 6), round(lon, 6))

# Generate a random location within a city's geographic range
def generate_location(assigned_locations, city):
    lat_min, lat_max = algerian_cities[city]["lat_min"], algerian_cities[city]["lat_max"]
    lon_min, lon_max = algerian_cities[city]["lon_min"], algerian_cities[city]["lon_max"]
    
    lat = random.uniform(lat_min, lat_max)
    lon = random.uniform(lon_min, lon_max)
    location = (round(lat, 6), round(lon, 6))
    location = add_random_offset(location)
    
    while location in assigned_locations:
        lat = random.uniform(lat_min, lat_max)
        lon = random.uniform(lon_min, lon_max)
        location = (round(lat, 6), round(lon, 6))
        location = add_random_offset(location)
    
    assigned_locations.add(location)
    return location

# Calculate distance between user and service provider locations
def calculate_distance(user_location, provider_location):
    return geodesic(user_location, provider_location).km

# Generate service data
def generate_service_data():
    assigned_locations = set()
    service_data = []
    
    for city, (start_id, end_id) in city_service_id_ranges.items():
        for service_id in range(start_id, end_id + 1):
            location = generate_location(assigned_locations, city)
            service_data.append({
                "service_id": service_id,
                "city": city,
                "provider_location_x": location[0],
                "provider_location_y": location[1],
                "service_category": random.choice(SERVICE_CATEGORIES),
                "review_avg": round(random.uniform(3.0, 5.0), 1),
                "review_count": random.randint(1, 50),
                "click_count": random.randint(0, 100),
                "provider_age": random.randint(25, 60),
                "provider_experience": random.randint(1, 20)
            })
    
    return pd.DataFrame(service_data)

# Enhanced user data generation with logical interests
def generate_user_data(service_df):
    assigned_locations = set()
    user_data = []
    total_categories = len(SERVICE_CATEGORIES)

    # Segment city_user_id_ranges by categories
    segmented_id_ranges = {
        city: [
            (start_id + i * ((end_id - start_id) // total_categories), 
             start_id + (i + 1) * ((end_id - start_id) // total_categories))
            for i in range(total_categories)
        ]
        for city, (start_id, end_id) in city_user_id_ranges.items()
    }

    for city, category_ranges in segmented_id_ranges.items():
        city_services = service_df[service_df['city'] == city]
        
        for category_index, (start_id, end_id) in enumerate(category_ranges):
            category = SERVICE_CATEGORIES[category_index]

            for user_id in range(start_id, end_id + 1):
                location = generate_location(assigned_locations, city)

                # Filter services by the user's primary category
                primary_category_services = city_services[
                    city_services['service_category'] == category
                ]

                # Additional interest categories
                additional_categories = random.sample(
                    SERVICE_CATEGORIES, random.randint(1, 3)
                )

                # User's full interest categories (primary + additional)
                service_categories_interest = list(set([category] + additional_categories))

                # Filter the services to match the user's interests
                available_services = city_services[
                    city_services['service_category'].isin(service_categories_interest)
                ]

                # Assign interactions only within the available services
                num_reviews = random.randint(0, min(3, len(available_services)))
                reviewed_service_ids = random.sample(
                    available_services['service_id'].tolist(), num_reviews
                )

                click_count_per_service = {
                    sid: random.randint(1, 10)
                    for sid in reviewed_service_ids
                }

                clicked_label = 1 if sum(click_count_per_service.values()) > 0 else 0

                user_data.append({
                    "user_id": user_id,
                    "city": city,
                    "location_x": location[0],
                    "location_y": location[1],
                    "age": random.randint(18, 65),
                    "gender": random.choice(["Male", "Female"]),
                    "service_categories_interest": service_categories_interest,
                    "reviewed_service_ids": reviewed_service_ids,
                    "click_count_per_service": click_count_per_service,
                    "total_service_views": sum(click_count_per_service.values()) + random.randint(0, 10),
                    "clicked": clicked_label
                })

    return pd.DataFrame(user_data)

# Generate datasets
service_df = generate_service_data()
user_df = generate_user_data(service_df)

# Save datasets to JSON files
service_df.to_json("service_data.json", orient="records", lines=True)
user_df.to_json("user_data.json", orient="records", lines=True)

print("Service and user data have been saved to JSON files.")


Service and user data have been saved to JSON files.


In [7]:
import random
import json
from geopy.distance import geodesic

# Define the service categories
SERVICE_CATEGORIES = [
    "houseCleaning", "electricity", "plumbing", "gardening", "painting", "carpentry", 
    "pestControl", "acRepair", "vehicleRepair", "applianceInstallation", "itSupport", 
    "homeSecurity", "interiorDesign", "windowCleaning", "furnitureAssembly"
]

# Define the Algerian cities with geographic ranges
algerian_cities = {
    "Algiers": {"lat_min": 36.6, "lat_max": 36.9, "lon_min": 2.9, "lon_max": 3.2},
    "Oran": {"lat_min": 35.6, "lat_max": 35.8, "lon_min": -0.8, "lon_max": -0.5},
    "Constantine": {"lat_min": 36.2, "lat_max": 36.4, "lon_min": 6.5, "lon_max": 6.7},
    "Annaba": {"lat_min": 36.8, "lat_max": 37.1, "lon_min": 7.6, "lon_max": 7.8},
    "Blida": {"lat_min": 36.4, "lat_max": 36.7, "lon_min": 2.5, "lon_max": 3.2},
    "Sétif": {"lat_min": 35.5, "lat_max": 36.6, "lon_min": 5.3, "lon_max": 6.5},
    "Tébessa": {"lat_min": 34.5, "lat_max": 35.8, "lon_min": 7.5, "lon_max": 8.7}
}

# Define the ranges for service and user IDs by city
city_service_id_ranges = {
    "Algiers": (1, 200),
    "Oran": (201, 350),
    "Constantine": (351, 500),
    "Annaba": (501, 650),
    "Blida": (651, 800),
    "Sétif": (801, 950),
    "Tébessa": (951, 1100)
}

city_user_id_ranges = {
    "Algiers": (1, 400),
    "Oran": (401, 700),
    "Constantine": (701, 1000),
    "Annaba": (1001, 1300),
    "Blida": (1301, 1600),
    "Sétif": (1601, 1800),
    "Tébessa": (1801, 2000)
}

# Add a random offset to the location
def add_random_offset(location):
    lat, lon = location
    lat += random.gauss(0, 0.01)
    lon += random.gauss(0, 0.01)
    return (round(lat, 6), round(lon, 6))

# Generate a random location within a city's geographic range
def generate_location(assigned_locations, city):
    lat_min, lat_max = algerian_cities[city]["lat_min"], algerian_cities[city]["lat_max"]
    lon_min, lon_max = algerian_cities[city]["lon_min"], algerian_cities[city]["lon_max"]
    
    lat = random.uniform(lat_min, lat_max)
    lon = random.uniform(lon_min, lon_max)
    location = (round(lat, 6), round(lon, 6))
    location = add_random_offset(location)
    
    while location in assigned_locations:
        lat = random.uniform(lat_min, lat_max)
        lon = random.uniform(lon_min, lon_max)
        location = (round(lat, 6), round(lon, 6))
        location = add_random_offset(location)
    
    assigned_locations.add(location)
    return location

# Generate service data
def generate_service_data():
    assigned_locations = set()
    service_data = []
    
    for city, (start_id, end_id) in city_service_id_ranges.items():
        for service_id in range(start_id, end_id + 1):
            location = generate_location(assigned_locations, city)
            service_data.append({
                "service_id": service_id,
                "city": city,
                "provider_location_x": location[0],
                "provider_location_y": location[1],
                "service_category": random.choice(SERVICE_CATEGORIES),
                "review_avg": round(random.uniform(3.0, 5.0), 1),
                "review_count": random.randint(1, 50),
                "click_count": random.randint(0, 100),
                "provider_age": random.randint(25, 60),
                "provider_experience": random.randint(1, 20)
            })
    
    return service_data

# Generate user data
def generate_user_data(service_data):
    assigned_locations = set()
    user_data = []
    total_categories = len(SERVICE_CATEGORIES)

    # Segment city_user_id_ranges by categories
    segmented_id_ranges = {
        city: [
            (start_id + i * ((end_id - start_id) // total_categories), 
             start_id + (i + 1) * ((end_id - start_id) // total_categories))
            for i in range(total_categories)
        ]
        for city, (start_id, end_id) in city_user_id_ranges.items()
    }

    for city, category_ranges in segmented_id_ranges.items():
        city_services = [service for service in service_data if service['city'] == city]
        
        for category_index, (start_id, end_id) in enumerate(category_ranges):
            category = SERVICE_CATEGORIES[category_index]

            for user_id in range(start_id, end_id + 1):
                location = generate_location(assigned_locations, city)

                # Filter services by the user's primary category
                primary_category_services = [service for service in city_services if service['service_category'] == category]

                # Additional interest categories
                additional_categories = random.sample(SERVICE_CATEGORIES, random.randint(1, 3))

                # User's full interest categories (primary + additional)
                service_categories_interest = list(set([category] + additional_categories))

                # Filter the services to match the user's interests
                available_services = [service for service in city_services if service['service_category'] in service_categories_interest]

                # Assign interactions only within the available services
                num_reviews = random.randint(0, min(3, len(available_services)))
                reviewed_service_ids = random.sample([service['service_id'] for service in available_services], num_reviews)

                click_count_per_service = {sid: random.randint(1, 10) for sid in reviewed_service_ids}
                clicked_label = 1 if sum(click_count_per_service.values()) > 0 else 0

                user_data.append({
                    "user_id": user_id,
                    "city": city,
                    "location_x": location[0],
                    "location_y": location[1],
                    "age": random.randint(18, 65),
                    "gender": random.choice(["Male", "Female"]),
                    "service_categories_interest": service_categories_interest,
                    "reviewed_service_ids": reviewed_service_ids,
                    "click_count_per_service": click_count_per_service,
                    "total_service_views": sum(click_count_per_service.values()) + random.randint(0, 10),
                    "clicked": clicked_label
                })

    return user_data

# Generate both service and user data
service_data = generate_service_data()
user_data = generate_user_data(service_data)

# Save the service data into a separate JSON file
with open('service_data.json', 'w') as service_file:
    json.dump(service_data, service_file, indent=4)

# Save the user data into a separate JSON file
with open('user_data.json', 'w') as user_file:
    json.dump(user_data, user_file, indent=4)

print("Service and user data have been saved to 'service_data.json' and 'user_data.json'.")


Service and user data have been saved to 'service_data.json' and 'user_data.json'.


In [9]:
import random
import json
from geopy.distance import geodesic

# Define the service categories
SERVICE_CATEGORIES = [
    "houseCleaning", "electricity", "plumbing", "gardening", "painting", "carpentry", 
    "pestControl", "acRepair", "vehicleRepair", "applianceInstallation", "itSupport", 
    "homeSecurity", "interiorDesign", "windowCleaning", "furnitureAssembly"
]

# Define the Algerian cities with geographic ranges
algerian_cities = {
    "Algiers": {"lat_min": 36.6, "lat_max": 36.9, "lon_min": 2.9, "lon_max": 3.2},
    "Oran": {"lat_min": 35.6, "lat_max": 35.8, "lon_min": -0.8, "lon_max": -0.5},
    "Constantine": {"lat_min": 36.2, "lat_max": 36.4, "lon_min": 6.5, "lon_max": 6.7},
    "Annaba": {"lat_min": 36.8, "lat_max": 37.1, "lon_min": 7.6, "lon_max": 7.8},
    "Blida": {"lat_min": 36.4, "lat_max": 36.7, "lon_min": 2.5, "lon_max": 3.2},
    "Sétif": {"lat_min": 35.5, "lat_max": 36.6, "lon_min": 5.3, "lon_max": 6.5},
    "Tébessa": {"lat_min": 34.5, "lat_max": 35.8, "lon_min": 7.5, "lon_max": 8.7}
}

# Define the ranges for service and user IDs by city
city_service_id_ranges = {
    "Algiers": (1, 200),
    "Oran": (201, 350),
    "Constantine": (351, 500),
    "Annaba": (501, 650),
    "Blida": (651, 800),
    "Sétif": (801, 950),
    "Tébessa": (951, 1100)
}

city_user_id_ranges = {
    "Algiers": (1, 400),
    "Oran": (401, 700),
    "Constantine": (701, 1000),
    "Annaba": (1001, 1300),
    "Blida": (1301, 1600),
    "Sétif": (1601, 1800),
    "Tébessa": (1801, 2000)
}

# Add a random offset to the location
def add_random_offset(location):
    lat, lon = location
    lat += random.gauss(0, 0.01)
    lon += random.gauss(0, 0.01)
    return (round(lat, 6), round(lon, 6))

# Generate a random location within a city's geographic range
def generate_location(assigned_locations, city):
    lat_min, lat_max = algerian_cities[city]["lat_min"], algerian_cities[city]["lat_max"]
    lon_min, lon_max = algerian_cities[city]["lon_min"], algerian_cities[city]["lon_max"]
    
    lat = random.uniform(lat_min, lat_max)
    lon = random.uniform(lon_min, lon_max)
    location = (round(lat, 6), round(lon, 6))
    location = add_random_offset(location)
    
    while location in assigned_locations:
        lat = random.uniform(lat_min, lat_max)
        lon = random.uniform(lon_min, lon_max)
        location = (round(lat, 6), round(lon, 6))
        location = add_random_offset(location)
    
    assigned_locations.add(location)
    return location

# Generate service data
def generate_service_data():
    assigned_locations = set()
    service_data = {}
    
    for city, (start_id, end_id) in city_service_id_ranges.items():
        for service_id in range(start_id, end_id + 1):
            location = generate_location(assigned_locations, city)
            service_data[service_id] = {
                "city": city,
                "provider_location_x": location[0],
                "provider_location_y": location[1],
                "service_category": random.choice(SERVICE_CATEGORIES),
                "review_avg": round(random.uniform(3.0, 5.0), 1),
                "review_count": random.randint(1, 50),
                "click_count": random.randint(0, 100),
                "provider_age": random.randint(25, 60),
                "provider_experience": random.randint(1, 20)
            }
    
    return service_data

# Generate user data
def generate_user_data(service_data):
    assigned_locations = set()
    user_data = {}
    total_categories = len(SERVICE_CATEGORIES)

    # Segment city_user_id_ranges by categories
    segmented_id_ranges = {
        city: [
            (start_id + i * ((end_id - start_id) // total_categories), 
             start_id + (i + 1) * ((end_id - start_id) // total_categories))
            for i in range(total_categories)
        ]
        for city, (start_id, end_id) in city_user_id_ranges.items()
    }

    for city, category_ranges in segmented_id_ranges.items():
        city_services = [service for service in service_data.values() if service['city'] == city]
        
        for category_index, (start_id, end_id) in enumerate(category_ranges):
            category = SERVICE_CATEGORIES[category_index]

            for user_id in range(start_id, end_id + 1):
                location = generate_location(assigned_locations, city)

                # Filter services by the user's primary category
                primary_category_services = [service for service in city_services if service['service_category'] == category]

                # Additional interest categories
                additional_categories = random.sample(SERVICE_CATEGORIES, random.randint(1, 3))

                # User's full interest categories (primary + additional)
                service_categories_interest = list(set([category] + additional_categories))

                # Filter the services to match the user's interests
                available_services = [service for service in city_services if service['service_category'] in service_categories_interest]

                # Assign interactions only within the available services
                num_reviews = random.randint(0, min(3, len(available_services)))
                reviewed_service_ids = random.sample([service_id for service_id in available_services], num_reviews)

                click_count_per_service = {sid: random.randint(1, 10) for sid in reviewed_service_ids}
                clicked_label = 1 if sum(click_count_per_service.values()) > 0 else 0

                user_data[user_id] = {
                    "city": city,
                    "location_x": location[0],
                    "location_y": location[1],
                    "age": random.randint(18, 65),
                    "gender": random.choice(["Male", "Female"]),
                    "service_categories_interest": service_categories_interest,
                    "reviewed_service_ids": reviewed_service_ids,
                    "click_count_per_service": click_count_per_service,
                    "total_service_views": sum(click_count_per_service.values()) + random.randint(0, 10),
                    "clicked": clicked_label
                }

    return user_data

# Generate both service and user data
service_data = generate_service_data()
user_data = generate_user_data(service_data)

# Save the service data into a separate JSON file with service_id as the key
with open('service_data.json', 'w') as service_file:
    json.dump(service_data, service_file, indent=4)

# Save the user data into a separate JSON file with user_id as the key
with open('user_data.json', 'w') as user_file:
    json.dump(user_data, user_file, indent=4)

print("Service and user data have been saved to 'service_data.json' and 'user_data.json'.")


TypeError: unhashable type: 'dict'

In [10]:
import random
import json
from geopy.distance import geodesic

# Define the service categories
SERVICE_CATEGORIES = [
    "houseCleaning", "electricity", "plumbing", "gardening", "painting", "carpentry", 
    "pestControl", "acRepair", "vehicleRepair", "applianceInstallation", "itSupport", 
    "homeSecurity", "interiorDesign", "windowCleaning", "furnitureAssembly"
]

# Define the Algerian cities with geographic ranges
algerian_cities = {
    "Algiers": {"lat_min": 36.6, "lat_max": 36.9, "lon_min": 2.9, "lon_max": 3.2},
    "Oran": {"lat_min": 35.6, "lat_max": 35.8, "lon_min": -0.8, "lon_max": -0.5},
    "Constantine": {"lat_min": 36.2, "lat_max": 36.4, "lon_min": 6.5, "lon_max": 6.7},
    "Annaba": {"lat_min": 36.8, "lat_max": 37.1, "lon_min": 7.6, "lon_max": 7.8},
    "Blida": {"lat_min": 36.4, "lat_max": 36.7, "lon_min": 2.5, "lon_max": 3.2},
    "Sétif": {"lat_min": 35.5, "lat_max": 36.6, "lon_min": 5.3, "lon_max": 6.5},
    "Tébessa": {"lat_min": 34.5, "lat_max": 35.8, "lon_min": 7.5, "lon_max": 8.7}
}

# Define the ranges for service and user IDs by city
city_service_id_ranges = {
    "Algiers": (1, 200),
    "Oran": (201, 350),
    "Constantine": (351, 500),
    "Annaba": (501, 650),
    "Blida": (651, 800),
    "Sétif": (801, 950),
    "Tébessa": (951, 1100)
}

city_user_id_ranges = {
    "Algiers": (1, 400),
    "Oran": (401, 700),
    "Constantine": (701, 1000),
    "Annaba": (1001, 1300),
    "Blida": (1301, 1600),
    "Sétif": (1601, 1800),
    "Tébessa": (1801, 2000)
}

# Add a random offset to the location
def add_random_offset(location):
    lat, lon = location
    lat += random.gauss(0, 0.01)
    lon += random.gauss(0, 0.01)
    return (round(lat, 6), round(lon, 6))

# Generate a random location within a city's geographic range
def generate_location(assigned_locations, city):
    lat_min, lat_max = algerian_cities[city]["lat_min"], algerian_cities[city]["lat_max"]
    lon_min, lon_max = algerian_cities[city]["lon_min"], algerian_cities[city]["lon_max"]
    
    lat = random.uniform(lat_min, lat_max)
    lon = random.uniform(lon_min, lon_max)
    location = (round(lat, 6), round(lon, 6))
    location = add_random_offset(location)
    
    # Use a tuple of (lat, lon) for assigned_locations to ensure it's hashable
    while location in assigned_locations:
        lat = random.uniform(lat_min, lat_max)
        lon = random.uniform(lon_min, lon_max)
        location = (round(lat, 6), round(lon, 6))
        location = add_random_offset(location)
    
    assigned_locations.add(location)
    return location

# Generate service data
def generate_service_data():
    assigned_locations = set()
    service_data = {}
    
    for city, (start_id, end_id) in city_service_id_ranges.items():
        for service_id in range(start_id, end_id + 1):
            location = generate_location(assigned_locations, city)
            service_data[service_id] = {
                "city": city,
                "provider_location_x": location[0],
                "provider_location_y": location[1],
                "service_category": random.choice(SERVICE_CATEGORIES),
                "review_avg": round(random.uniform(3.0, 5.0), 1),
                "review_count": random.randint(1, 50),
                "click_count": random.randint(0, 100),
                "provider_age": random.randint(25, 60),
                "provider_experience": random.randint(1, 20)
            }
    
    return service_data

# Generate user data
def generate_user_data(service_data):
    assigned_locations = set()
    user_data = {}
    total_categories = len(SERVICE_CATEGORIES)

    # Segment city_user_id_ranges by categories
    segmented_id_ranges = {
        city: [
            (start_id + i * ((end_id - start_id) // total_categories), 
             start_id + (i + 1) * ((end_id - start_id) // total_categories))
            for i in range(total_categories)
        ]
        for city, (start_id, end_id) in city_user_id_ranges.items()
    }

    for city, category_ranges in segmented_id_ranges.items():
        city_services = [service for service in service_data.values() if service['city'] == city]
        
        for category_index, (start_id, end_id) in enumerate(category_ranges):
            category = SERVICE_CATEGORIES[category_index]

            for user_id in range(start_id, end_id + 1):
                location = generate_location(assigned_locations, city)

                # Filter services by the user's primary category
                primary_category_services = [service for service in city_services if service['service_category'] == category]

                # Additional interest categories
                additional_categories = random.sample(SERVICE_CATEGORIES, random.randint(1, 3))

                # User's full interest categories (primary + additional)
                service_categories_interest = list(set([category] + additional_categories))

                # Filter the services to match the user's interests
                available_services = [service for service in city_services if service['service_category'] in service_categories_interest]

                # Assign interactions only within the available services
                num_reviews = random.randint(0, min(3, len(available_services)))
                reviewed_service_ids = random.sample([service_id for service_id in available_services], num_reviews)

                click_count_per_service = {sid: random.randint(1, 10) for sid in reviewed_service_ids}
                clicked_label = 1 if sum(click_count_per_service.values()) > 0 else 0

                user_data[user_id] = {
                    "city": city,
                    "location_x": location[0],
                    "location_y": location[1],
                    "age": random.randint(18, 65),
                    "gender": random.choice(["Male", "Female"]),
                    "service_categories_interest": service_categories_interest,
                    "reviewed_service_ids": reviewed_service_ids,
                    "click_count_per_service": click_count_per_service,
                    "total_service_views": sum(click_count_per_service.values()) + random.randint(0, 10),
                    "clicked": clicked_label
                }

    return user_data

# Generate both service and user data
service_data = generate_service_data()
user_data = generate_user_data(service_data)

# Save the service data into a separate JSON file with service_id as the key
with open('service_data.json', 'w') as service_file:
    json.dump(service_data, service_file, indent=4)

# Save the user data into a separate JSON file with user_id as the key
with open('user_data.json', 'w') as user_file:
    json.dump(user_data, user_file, indent=4)

print("Service and user data have been saved to 'service_data.json' and 'user_data.json'.")


TypeError: unhashable type: 'dict'

In [13]:
import random
import json
from geopy.distance import geodesic

# Define the service categories
SERVICE_CATEGORIES = [
    "houseCleaning", "electricity", "plumbing", "gardening", "painting", "carpentry", 
    "pestControl", "acRepair", "vehicleRepair", "applianceInstallation", "itSupport", 
    "homeSecurity", "interiorDesign", "windowCleaning", "furnitureAssembly"
]

# Define the Algerian cities with geographic ranges
algerian_cities = {
    "Algiers": {"lat_min": 36.6, "lat_max": 36.9, "lon_min": 2.9, "lon_max": 3.2},
    "Oran": {"lat_min": 35.6, "lat_max": 35.8, "lon_min": -0.8, "lon_max": -0.5},
    "Constantine": {"lat_min": 36.2, "lat_max": 36.4, "lon_min": 6.5, "lon_max": 6.7},
    "Annaba": {"lat_min": 36.8, "lat_max": 37.1, "lon_min": 7.6, "lon_max": 7.8},
    "Blida": {"lat_min": 36.4, "lat_max": 36.7, "lon_min": 2.5, "lon_max": 3.2},
    "Sétif": {"lat_min": 35.5, "lat_max": 36.6, "lon_min": 5.3, "lon_max": 6.5},
    "Tébessa": {"lat_min": 34.5, "lat_max": 35.8, "lon_min": 7.5, "lon_max": 8.7}
}

# Define the ranges for service and user IDs by city
city_service_id_ranges = {
    "Algiers": (1, 200),
    "Oran": (201, 350),
    "Constantine": (351, 500),
    "Annaba": (501, 650),
    "Blida": (651, 800),
    "Sétif": (801, 950),
    "Tébessa": (951, 1100)
}

city_user_id_ranges = {
    "Algiers": (1, 400),
    "Oran": (401, 700),
    "Constantine": (701, 1000),
    "Annaba": (1001, 1300),
    "Blida": (1301, 1600),
    "Sétif": (1601, 1800),
    "Tébessa": (1801, 2000)
}

# Add a random offset to the location
def add_random_offset(location):
    lat, lon = location
    lat += random.gauss(0, 0.01)
    lon += random.gauss(0, 0.01)
    return (round(lat, 6), round(lon, 6))

# Generate a random location within a city's geographic range
def generate_location(assigned_locations, city):
    lat_min, lat_max = algerian_cities[city]["lat_min"], algerian_cities[city]["lat_max"]
    lon_min, lon_max = algerian_cities[city]["lon_min"], algerian_cities[city]["lon_max"]
    
    lat = random.uniform(lat_min, lat_max)
    lon = random.uniform(lon_min, lon_max)
    location = (round(lat, 6), round(lon, 6))
    location = add_random_offset(location)
    
    while location in assigned_locations:
        lat = random.uniform(lat_min, lat_max)
        lon = random.uniform(lon_min, lon_max)
        location = (round(lat, 6), round(lon, 6))
        location = add_random_offset(location)
    
    assigned_locations.add(location)
    return location

# Generate service data
def generate_service_data():
    assigned_locations = set()
    service_data = []
    
    for city, (start_id, end_id) in city_service_id_ranges.items():
        for service_id in range(start_id, end_id + 1):
            location = generate_location(assigned_locations, city)
            service_data.append({
                "service_id": service_id,
                "city": city,
                "provider_location_x": location[0],
                "provider_location_y": location[1],
                "service_category": random.choice(SERVICE_CATEGORIES),
                "review_avg": round(random.uniform(3.0, 5.0), 1),
                "review_count": random.randint(1, 50),
                "click_count": random.randint(0, 100),
                "provider_age": random.randint(25, 60),
                "provider_experience": random.randint(1, 20)
            })
    
    return service_data

# Generate user data
def generate_user_data(service_data):
    assigned_locations = set()
    user_data = []
    total_categories = len(SERVICE_CATEGORIES)

    # Segment city_user_id_ranges by categories
    segmented_id_ranges = {
        city: [
            (start_id + i * ((end_id - start_id) // total_categories), 
             start_id + (i + 1) * ((end_id - start_id) // total_categories))
            for i in range(total_categories)
        ]
        for city, (start_id, end_id) in city_user_id_ranges.items()
    }

    for city, category_ranges in segmented_id_ranges.items():
        city_services = [service for service in service_data if service['city'] == city]
        
        for category_index, (start_id, end_id) in enumerate(category_ranges):
            category = SERVICE_CATEGORIES[category_index]

            for user_id in range(start_id, end_id + 1):
                location = generate_location(assigned_locations, city)

                # Filter services by the user's primary category
                primary_category_services = [service for service in city_services if service['service_category'] == category]

                # Additional interest categories
                additional_categories = random.sample(SERVICE_CATEGORIES, random.randint(1, 3))

                # User's full interest categories (primary + additional)
                service_categories_interest = list(set([category] + additional_categories))

                # Filter the services to match the user's interests
                available_services = [service for service in city_services if service['service_category'] in service_categories_interest]

                # Assign interactions only within the available services
                num_reviews = random.randint(0, min(3, len(available_services)))
                reviewed_service_ids = random.sample([service['service_id'] for service in available_services], num_reviews)

                click_count_per_service = {sid: random.randint(1, 10) for sid in reviewed_service_ids}
                clicked_label = 1 if sum(click_count_per_service.values()) > 0 else 0

                user_data.append({
                    "user_id": user_id,
                    "city": city,
                    "location_x": location[0],
                    "location_y": location[1],
                    "age": random.randint(18, 65),
                    "gender": random.choice(["Male", "Female"]),
                    "service_categories_interest": service_categories_interest,
                    "reviewed_service_ids": reviewed_service_ids,
                    "click_count_per_service": click_count_per_service,
                    "total_service_views": sum(click_count_per_service.values()) + random.randint(0, 10),
                    "clicked": clicked_label
                })

    return user_data

# Generate both service and user data
service_data = generate_service_data()
user_data = generate_user_data(service_data)

# Save the service data into a separate JSON file
with open('service_data.json', 'w') as service_file:
    json.dump(service_data, service_file, indent=4)

# Save the user data into a separate JSON file
with open('user_data.json', 'w') as user_file:
    json.dump(user_data, user_file, indent=4)

print("Service and user data have been saved to 'service_data.json' and 'user_data.json'.")


Service and user data have been saved to 'service_data.json' and 'user_data.json'.


In [14]:
import random
import json

# Define the service categories
SERVICE_CATEGORIES = [
    "houseCleaning", "electricity", "plumbing", "gardening", "painting", "carpentry", 
    "pestControl", "acRepair", "vehicleRepair", "applianceInstallation", "itSupport", 
    "homeSecurity", "interiorDesign", "windowCleaning", "furnitureAssembly"
]

# Define the Algerian cities with geographic ranges
algerian_cities = {
    "Algiers": {"lat_min": 36.6, "lat_max": 36.9, "lon_min": 2.9, "lon_max": 3.2},
    "Oran": {"lat_min": 35.6, "lat_max": 35.8, "lon_min": -0.8, "lon_max": -0.5},
    "Constantine": {"lat_min": 36.2, "lat_max": 36.4, "lon_min": 6.5, "lon_max": 6.7},
    "Annaba": {"lat_min": 36.8, "lat_max": 37.1, "lon_min": 7.6, "lon_max": 7.8},
    "Blida": {"lat_min": 36.4, "lat_max": 36.7, "lon_min": 2.5, "lon_max": 3.2},
    "Sétif": {"lat_min": 35.5, "lat_max": 36.6, "lon_min": 5.3, "lon_max": 6.5},
    "Tébessa": {"lat_min": 34.5, "lat_max": 35.8, "lon_min": 7.5, "lon_max": 8.7}
}

# Define the ranges for service and user IDs by city
city_service_id_ranges = {
    "Algiers": (1, 200),
    "Oran": (201, 350),
    "Constantine": (351, 500),
    "Annaba": (501, 650),
    "Blida": (651, 800),
    "Sétif": (801, 950),
    "Tébessa": (951, 1100)
}

city_user_id_ranges = {
    "Algiers": (1, 400),
    "Oran": (401, 700),
    "Constantine": (701, 1000),
    "Annaba": (1001, 1300),
    "Blida": (1301, 1600),
    "Sétif": (1601, 1800),
    "Tébessa": (1801, 2000)
}

# Add a random offset to the location
def add_random_offset(location):
    lat, lon = location
    lat += random.gauss(0, 0.01)
    lon += random.gauss(0, 0.01)
    return (round(lat, 6), round(lon, 6))

# Generate a random location within a city's geographic range
def generate_location(assigned_locations, city):
    lat_min, lat_max = algerian_cities[city]["lat_min"], algerian_cities[city]["lat_max"]
    lon_min, lon_max = algerian_cities[city]["lon_min"], algerian_cities[city]["lon_max"]
    
    lat = random.uniform(lat_min, lat_max)
    lon = random.uniform(lon_min, lon_max)
    location = (round(lat, 6), round(lon, 6))
    location = add_random_offset(location)
    
    # Ensure the location tuple is not repeated
    while location in assigned_locations:
        lat = random.uniform(lat_min, lat_max)
        lon = random.uniform(lon_min, lon_max)
        location = (round(lat, 6), round(lon, 6))
        location = add_random_offset(location)
    
    assigned_locations.add(location)
    return location

# Generate service data
def generate_service_data():
    assigned_locations = set()  # Set of tuples (lat, lon)
    service_data = {}
    
    for city, (start_id, end_id) in city_service_id_ranges.items():
        for service_id in range(start_id, end_id + 1):
            location = generate_location(assigned_locations, city)
            service_data[service_id] = {
                "city": city,
                "provider_location_x": location[0],
                "provider_location_y": location[1],
                "service_category": random.choice(SERVICE_CATEGORIES),
                "review_avg": round(random.uniform(3.0, 5.0), 1),
                "review_count": random.randint(1, 50),
                "click_count": random.randint(0, 100),
                "provider_age": random.randint(25, 60),
                "provider_experience": random.randint(1, 20)
            }
    
    return service_data

# Generate user data
def generate_user_data(service_data):
    assigned_locations = set()  # Set of tuples (lat, lon)
    user_data = {}
    total_categories = len(SERVICE_CATEGORIES)

    # Segment city_user_id_ranges by categories
    segmented_id_ranges = {
        city: [
            (start_id + i * ((end_id - start_id) // total_categories), 
             start_id + (i + 1) * ((end_id - start_id) // total_categories))
            for i in range(total_categories)
        ]
        for city, (start_id, end_id) in city_user_id_ranges.items()
    }

    for city, category_ranges in segmented_id_ranges.items():
        # Get service IDs for services in this city
        city_services = {
            service_id: service 
            for service_id, service in service_data.items() 
            if service['city'] == city
        }
        
        for category_index, (start_id, end_id) in enumerate(category_ranges):
            category = SERVICE_CATEGORIES[category_index]

            for user_id in range(start_id, end_id + 1):
                location = generate_location(assigned_locations, city)

                # Filter services by the user's primary category
                primary_category_services = {
                    service_id: service 
                    for service_id, service in city_services.items() 
                    if service['service_category'] == category
                }

                # Additional interest categories
                additional_categories = random.sample(SERVICE_CATEGORIES, random.randint(1, 3))

                # User's full interest categories (primary + additional)
                service_categories_interest = list(set([category] + additional_categories))

                # Filter the services to match the user's interests
                available_services = {
                    service_id: service 
                    for service_id, service in city_services.items() 
                    if service['service_category'] in service_categories_interest
                }

                # Assign interactions only within the available services
                num_reviews = random.randint(0, min(3, len(available_services)))
                reviewed_service_ids = random.sample(list(available_services.keys()), num_reviews)

                click_count_per_service = {str(sid): random.randint(1, 10) for sid in reviewed_service_ids}
                clicked_label = 1 if sum(click_count_per_service.values()) > 0 else 0

                user_data[user_id] = {
                    "city": city,
                    "location_x": location[0],
                    "location_y": location[1],
                    "age": random.randint(18, 65),
                    "gender": random.choice(["Male", "Female"]),
                    "service_categories_interest": service_categories_interest,
                    "reviewed_service_ids": reviewed_service_ids,
                    "click_count_per_service": click_count_per_service,
                    "total_service_views": sum(click_count_per_service.values()) + random.randint(0, 10),
                    "clicked": clicked_label
                }

    return user_data

# Generate both service and user data
service_data = generate_service_data()
user_data = generate_user_data(service_data)

# Save the service data into a separate JSON file with service_id as the key
with open('service_data.json', 'w') as service_file:
    json.dump(service_data, service_file, indent=4)

# Save the user data into a separate JSON file with user_id as the key
with open('user_data.json', 'w') as user_file:
    json.dump(user_data, user_file, indent=4)

print("Service and user data have been saved to 'service_data.json' and 'user_data.json'.")

Service and user data have been saved to 'service_data.json' and 'user_data.json'.
