In [1]:
import random
import pandas as pd
import numpy as np
from geopy.distance import geodesic

# Define the service categories
SERVICE_CATEGORIES = [
    "houseCleaning", "electricity", "plumbing", "gardening", "painting", "carpentry", 
    "pestControl", "acRepair", "vehicleRepair", "applianceInstallation", "itSupport", 
    "homeSecurity", "interiorDesign", "windowCleaning", "furnitureAssembly"
]

# Define the Algerian cities with geographic ranges
algerian_cities = {
    "Algiers": {"lat_min": 36.6, "lat_max": 36.9, "lon_min": 2.9, "lon_max": 3.2},
    "Oran": {"lat_min": 35.6, "lat_max": 35.8, "lon_min": -0.8, "lon_max": -0.5},
    "Constantine": {"lat_min": 36.2, "lat_max": 36.4, "lon_min": 6.5, "lon_max": 6.7},
    "Annaba": {"lat_min": 36.8, "lat_max": 37.1, "lon_min": 7.6, "lon_max": 7.8},
    "Blida": {"lat_min": 36.4, "lat_max": 36.7, "lon_min": 2.5, "lon_max": 3.2},
    "Sétif": {"lat_min": 35.5, "lat_max": 36.6, "lon_min": 5.3, "lon_max": 6.5},
    "Tébessa": {"lat_min": 34.5, "lat_max": 35.8, "lon_min": 7.5, "lon_max": 8.7}
}

# Define the ranges for service and user IDs by city
city_service_id_ranges = {
    "Algiers": (1, 200),
    "Oran": (201, 350),
    "Constantine": (351, 500),
    "Annaba": (501, 650),
    "Blida": (651, 800),
    "Sétif": (801, 950),
    "Tébessa": (951, 1100)
}

city_user_id_ranges = {
    "Algiers": (1, 400),
    "Oran": (401, 700),
    "Constantine": (701, 1000),
    "Annaba": (1001, 1300),
    "Blida": (1301, 1600),
    "Sétif": (1601, 1800),
    "Tébessa": (1801, 2000)
}

# Add a random offset to the location
def add_random_offset(location):
    lat, lon = location
    lat += random.gauss(0, 0.01)
    lon += random.gauss(0, 0.01)
    return (round(lat, 6), round(lon, 6))

# Generate a random location within a city's geographic range
def generate_location(assigned_locations, city):
    lat_min, lat_max = algerian_cities[city]["lat_min"], algerian_cities[city]["lat_max"]
    lon_min, lon_max = algerian_cities[city]["lon_min"], algerian_cities[city]["lon_max"]
    
    lat = random.uniform(lat_min, lat_max)
    lon = random.uniform(lon_min, lon_max)
    location = (round(lat, 6), round(lon, 6))
    location = add_random_offset(location)
    
    while location in assigned_locations:
        lat = random.uniform(lat_min, lat_max)
        lon = random.uniform(lon_min, lon_max)
        location = (round(lat, 6), round(lon, 6))
        location = add_random_offset(location)
    
    assigned_locations.add(location)
    return location

# Calculate distance between user and service provider locations
def calculate_distance(user_location, provider_location):
    return geodesic(user_location, provider_location).km

# Generate service data
def generate_service_data():
    assigned_locations = set()
    service_data = []
    
    for city, (start_id, end_id) in city_service_id_ranges.items():
        for service_id in range(start_id, end_id + 1):
            location = generate_location(assigned_locations, city)
            service_data.append({
                "service_id": service_id,
                "city": city,
                "provider_location_x": location[0],
                "provider_location_y": location[1],
                "service_category": random.choice(SERVICE_CATEGORIES),
                "review_avg": round(random.uniform(3.0, 5.0), 1),
                "review_count": random.randint(1, 50),
                "click_count": random.randint(0, 100),
                "provider_age": random.randint(25, 60),
                "provider_experience": random.randint(1, 20)
            })
    
    return pd.DataFrame(service_data)

# Generate user data
def generate_user_data(service_df):
    assigned_locations = set()
    user_data = []
    
    for city, (start_id, end_id) in city_user_id_ranges.items():
        # Get available service IDs for this city and match them to the service categories of the user
        city_services = service_df[service_df['city'] == city]
        
        for user_id in range(start_id, end_id + 1):
            location = generate_location(assigned_locations, city)
            
            # Get the user's service categories of interest
            service_categories_interest = random.sample(SERVICE_CATEGORIES, random.randint(2, 4))
            # Filter the services available in the city to match the user's interest
            available_services = city_services[city_services['service_category'].isin(service_categories_interest)]
            
            # Generate reviews only from the services the user is interested in
            num_reviews = random.randint(0, min(3, len(available_services)))
            reviewed_service_ids = random.sample(available_services['service_id'].tolist(), num_reviews)
            
            click_count_per_service = {
                sid: random.randint(1, 10) 
                for sid in reviewed_service_ids
            }
            
            # Create the binary "clicked" label based on if the user interacted with any service
            clicked_label = 1 if sum(click_count_per_service.values()) > 0 else 0
            
            user_data.append({
                "user_id": user_id,
                "city": city,
                "location_x": location[0],
                "location_y": location[1],
                "age": random.randint(18, 65),
                "gender": random.choice(["Male", "Female"]),
                "service_categories_interest": service_categories_interest,
                "reviewed_service_ids": reviewed_service_ids,
                "click_count_per_service": click_count_per_service,
                "total_service_views": sum(click_count_per_service.values()) + random.randint(0, 10),
                "clicked": clicked_label  # This is the label for the recommendation system
            })
    
    return pd.DataFrame(user_data)

# Generate datasets
service_df = generate_service_data()
user_df = generate_user_data(service_df)

# Save to CSV
service_df.to_csv('service_data.csv', index=False)
user_df.to_csv('user_data.csv', index=False)

print("Datasets generated and saved as 'service_data.csv' and 'user_data.csv'.")


Datasets generated and saved as 'service_data.csv' and 'user_data.csv'.


In [2]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Load the datasets (assuming they were saved as 'service_data.csv' and 'user_data.csv')
service_df = pd.read_csv('service_data.csv')
user_df = pd.read_csv('user_data.csv')

# Prepare the data for collaborative filtering
# We need to create an interaction matrix where rows are users, and columns are services.
# We will use `clicked` as the interaction value (1 for clicked, 0 for not clicked)

# Create a user-item interaction matrix
interaction_matrix = pd.pivot_table(user_df, index='user_id', columns='reviewed_service_ids', 
                                   values='clicked', aggfunc='max', fill_value=0)

# Check if the interaction matrix is sparse and needs dimensionality reduction
print(interaction_matrix.shape)

# Apply SVD (Singular Value Decomposition) for matrix factorization
svd = TruncatedSVD(n_components=50, random_state=42)  # You can adjust `n_components`
interaction_matrix_svd = svd.fit_transform(interaction_matrix)

# Reconstruct the matrix to get the predicted interaction values
interaction_matrix_reconstructed = svd.inverse_transform(interaction_matrix_svd)

# Convert the reconstructed matrix back into a DataFrame for easier analysis
interaction_matrix_pred = pd.DataFrame(interaction_matrix_reconstructed, 
                                       columns=interaction_matrix.columns, 
                                       index=interaction_matrix.index)

# Generate recommendations for a specific user (e.g., user_id=10)
user_id = 10  # Change this to any user_id from your dataset
user_row = interaction_matrix_pred.loc[user_id]

# Sort services based on predicted interaction values
recommended_services = user_row.sort_values(ascending=False).head(10)  # Top 10 recommendations

# Display the recommended services for the user
recommended_service_ids = recommended_services.index
recommended_services_info = service_df[service_df['service_id'].isin(recommended_service_ids)]

# Display the top recommended services with their details
print("Top 10 Recommended Services for User {}:".format(user_id))
print(recommended_services_info[['service_id', 'service_category', 'review_avg', 'provider_age']])

# You can adjust the number of recommendations, include additional features, etc.


(2000, 1387)
Top 10 Recommended Services for User 10:
Empty DataFrame
Columns: [service_id, service_category, review_avg, provider_age]
Index: []


In [3]:
# Check how many services User 10 has reviewed
user_10_data = user_df[user_df['user_id'] == 10]
print("User 10's reviewed services:")
print(user_10_data[['reviewed_service_ids', 'clicked']])

# Check the interaction matrix for User 10
user_10_interactions = interaction_matrix.loc[10]
print("\nUser 10's interactions in the matrix:")
print(user_10_interactions[user_10_interactions > 0])


User 10's reviewed services:
  reviewed_service_ids  clicked
9                [161]        1

User 10's interactions in the matrix:
reviewed_service_ids
[161]    1
Name: 10, dtype: int64
