In [1]:
import pandas as pd
import numpy as np
import random

# Number of users
num_users = 1000

# Generate synthetic user data
user_ids = np.arange(1, num_users + 1)
user_ages = np.random.randint(18, 70, size=num_users)  # Ages between 18 and 70
user_genders = np.random.choice(['Male', 'Female', 'Other'], size=num_users, p=[0.45, 0.45, 0.10])
user_occupation = np.random.choice(['Student', 'Professional', 'Retired', 'Self-employed'], size=num_users, p=[0.3, 0.4, 0.2, 0.1])

# Create DataFrame
users = pd.DataFrame({
    'user_id': user_ids,
    'age': user_ages,
    'gender': user_genders,
    'occupation': user_occupation
})

print(users.head())

# Number of products
num_products = 500

# Generate synthetic product data
product_ids = np.arange(1, num_products + 1)
product_categories = np.random.choice(['Electronics', 'Books', 'Clothing', 'Home', 'Beauty', 'Sports'], size=num_products)
product_prices = np.round(np.random.uniform(5.0, 500.0, size=num_products), 2)  # Prices between $5 and $500
product_ratings = np.round(np.random.uniform(1.0, 5.0, size=num_products), 1)  # Ratings between 1.0 and 5.0

# Keywords and templates for descriptions
description_keywords = {
    'Electronics': ['latest technology', 'high-performance', 'durable', 'compact', 'user-friendly'],
    'Books': ['bestselling', 'engaging', 'classic', 'informative', 'must-read'],
    'Clothing': ['stylish', 'comfortable', 'trendy', 'breathable', 'versatile'],
    'Home': ['modern', 'elegant', 'cozy', 'durable', 'functional'],
    'Beauty': ['premium quality', 'natural ingredients', 'long-lasting', 'hypoallergenic', 'luxurious'],
    'Sports': ['high-performance', 'durable', 'lightweight', 'comfortable', 'professional-grade']
}

description_templates = {
    'Electronics': [
        "This {category} item features {adjective1} design and {adjective2} functionality. It's {adjective3} and {adjective4}, perfect for everyday use.",
        "Experience the {adjective1} of our {category} with this {adjective2} product. It's {adjective3} and {adjective4}, ensuring high performance."
    ],
    'Books': [
        "Dive into this {category} with our {adjective1} and {adjective2} read. It's {adjective3} and {adjective4}, a {adjective5} addition to any collection.",
        "Our {category} offers a {adjective1} experience with {adjective2} insights. It's {adjective3} and {adjective4}, ideal for any reader."
    ],
    'Clothing': [
        "Our {category} is {adjective1} and {adjective2}, perfect for {adjective3} wear. It's {adjective4} and {adjective5}, making it a staple in any wardrobe.",
        "This {category} combines {adjective1} style with {adjective2} comfort. It's {adjective3} and {adjective4}, suitable for all occasions."
    ],
    'Home': [
        "Enhance your living space with our {adjective1} and {adjective2} {category}. It's {adjective3} and {adjective4}, perfect for any home.",
        "Our {category} is {adjective1} and {adjective2}, designed for {adjective3} use. It's {adjective4} and {adjective5}, adding a touch of elegance to your home."
    ],
    'Beauty': [
        "Experience {adjective1} and {adjective2} care with our {category}. It's {adjective3} and {adjective4}, perfect for a {adjective5} routine.",
        "Our {category} features {adjective1} ingredients and {adjective2} results. It's {adjective3} and {adjective4}, ensuring a {adjective5} glow."
    ],
    'Sports': [
        "Achieve your best with our {adjective1} and {adjective2} {category}. It's {adjective3} and {adjective4}, perfect for {adjective5} performance.",
        "This {category} is {adjective1} and {adjective2}, designed for {adjective3} activities. It's {adjective4} and {adjective5}, ideal for any sports enthusiast."
    ]
}

# Function to generate a random description based on category
def generate_description(category):
    adjectives = random.sample(description_keywords[category], 5)
    template = random.choice(description_templates[category])
    return template.format(category=category, adjective1=adjectives[0], adjective2=adjectives[1], adjective3=adjectives[2], adjective4=adjectives[3], adjective5=adjectives[4])

# Generate product descriptions
product_descriptions = [generate_description(cat) for cat in product_categories]

# Create DataFrame with descriptions
products = pd.DataFrame({
    'product_id': product_ids,
    'category': product_categories,
    'price': product_prices,
    'rating': product_ratings,
    'description': product_descriptions
})

print(products.head())

# Number of interactions
num_interactions = 10000

# Generate synthetic interaction data
interaction_user_ids = np.random.choice(user_ids, size=num_interactions)
interaction_product_ids = np.random.choice(product_ids, size=num_interactions)
interaction_ratings = np.random.randint(1, 6, size=num_interactions)  # Ratings between 1 and 5
interaction_timestamps = pd.date_range(start='2023-01-01', periods=num_interactions, freq='T')  # Random timestamps

# Create DataFrame
interactions = pd.DataFrame({
    'user_id': interaction_user_ids,
    'product_id': interaction_product_ids,
    'interaction': interaction_ratings,
    'timestamp': interaction_timestamps
})

print(interactions.head())


   user_id  age  gender occupation
0        1   28  Female    Retired
1        2   50   Other    Student
2        3   19  Female    Retired
3        4   67    Male    Retired
4        5   41    Male    Student
   product_id     category   price  rating  \
0           1     Clothing  173.49     1.1   
1           2     Clothing  157.47     3.2   
2           3       Sports   67.25     1.4   
3           4     Clothing  139.35     3.5   
4           5  Electronics  229.67     2.2   

                                         description  
0  Our Clothing is versatile and trendy, perfect ...  
1  Our Clothing is trendy and stylish, perfect fo...  
2  Achieve your best with our durable and high-pe...  
3  This Clothing combines comfortable style with ...  
4  Experience the high-performance of our Electro...  
   user_id  product_id  interaction           timestamp
0      886         412            5 2023-01-01 00:00:00
1      574          45            4 2023-01-01 00:01:00
2      549     

  interaction_timestamps = pd.date_range(start='2023-01-01', periods=num_interactions, freq='T')  # Random timestamps


In [4]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix

# Normalize numerical features
scaler = MinMaxScaler()
products[['price', 'rating']] = scaler.fit_transform(products[['price', 'rating']])

# Vectorize product descriptions using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
product_descriptions_matrix = vectorizer.fit_transform(products['description'])

# Combine product metadata into a single feature set
product_features = np.hstack((products[['price', 'rating']], product_descriptions_matrix.toarray()))

# Aggregate the interactions by taking the average rating for each user-product pair
aggregated_interactions = interactions.groupby(['user_id', 'product_id']).mean().reset_index()

# Create user-item interaction matrix
user_item_matrix = aggregated_interactions.pivot(index='user_id', columns='product_id', values='interaction').fillna(0)

# Convert to sparse matrix format
user_item_sparse = csr_matrix(user_item_matrix.values)

print(user_item_matrix.head())



product_id  1    2    3    4    5    6    7    8    9    10   ...  491  492  \
user_id                                                       ...             
1           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
2           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
3           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
4           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
5           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   

product_id  493  494  495  496  497  498  499  500  
user_id                                             
1           0.0  0.0  0.0  0.0  0.0  0.0  2.0  0.0  
2           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
3           0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  
4           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
5           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 500 columns]


In [5]:
from sklearn.decomposition import TruncatedSVD

# Apply Truncated SVD for matrix factorization
svd = TruncatedSVD(n_components=50, random_state=42)
user_factors = svd.fit_transform(user_item_sparse)
item_factors = svd.components_.T


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between product features
cosine_similarities = cosine_similarity(product_features)

# Function to recommend similar products
def recommend_products(product_id, top_n=10):
    product_idx = products.index[products['product_id'] == product_id][0]
    similar_indices = cosine_similarities[product_idx].argsort()[-top_n:][::-1]
    similar_products = products.iloc[similar_indices]
    return similar_products

# Example usage
recommended_products = recommend_products(product_id=1)
print(recommended_products)


     product_id  category     price  rating  \
0             1  Clothing  0.339617   0.025   
295         296  Clothing  0.460820   0.075   
247         248  Clothing  0.234941   0.100   
422         423  Clothing  0.488025   0.050   
208         209  Clothing  0.365245   0.175   
391         392  Clothing  0.197540   0.075   
94           95  Clothing  0.501092   0.125   
85           86  Clothing  0.311178   0.250   
143         144  Clothing  0.557870   0.250   
428         429  Clothing  0.363142   0.325   

                                           description  
0    Our Clothing is versatile and trendy, perfect ...  
295  Our Clothing is trendy and versatile, perfect ...  
247  Our Clothing is stylish and comfortable, perfe...  
422  Our Clothing is trendy and versatile, perfect ...  
208  Our Clothing is trendy and comfortable, perfec...  
391  Our Clothing is versatile and stylish, perfect...  
94   Our Clothing is trendy and versatile, perfect ...  
85   Our Clothing is versa

In [7]:
# Function to get hybrid recommendations
def hybrid_recommendations(user_id, top_n=10):
    user_idx = users.index[users['user_id'] == user_id][0]
    user_interactions = user_item_matrix.iloc[user_idx].values
    
    # Weighted average of collaborative filtering and content-based recommendations
    cf_recommendations = user_factors[user_idx].dot(item_factors.T)
    cb_recommendations = cosine_similarities.dot(user_interactions)
    
    combined_scores = cf_recommendations + cb_recommendations
    top_indices = combined_scores.argsort()[-top_n:][::-1]
    
    return products.iloc[top_indices]

# Example usage
user_recommendations = hybrid_recommendations(user_id=1)
print(user_recommendations)


     product_id  category     price  rating  \
130         131      Home  0.997006   0.800   
371         372      Home  0.830373   0.950   
56           57      Home  0.946135   0.925   
463         464      Home  0.845261   0.900   
307         308      Home  0.817691   0.975   
187         188      Home  0.999130   0.750   
95           96      Home  0.694284   0.825   
459         460  Clothing  0.991767   1.000   
131         132  Clothing  0.584550   0.800   
345         346      Home  0.971399   0.675   

                                           description  
130  Enhance your living space with our durable and...  
371  Our Home is functional and cozy, designed for ...  
56   Our Home is durable and elegant, designed for ...  
463  Enhance your living space with our cozy and mo...  
307  Enhance your living space with our elegant and...  
187  Our Home is elegant and cozy, designed for fun...  
95   Enhance your living space with our durable and...  
459  Our Clothing is comfo

In [9]:
# Apply time decay factor to interactions
interactions['timestamp'] = pd.to_datetime(interactions['timestamp'])
max_time = interactions['timestamp'].max()

# Define a time decay function
def time_decay(t, max_time, decay_rate=0.1):
    return np.exp(-decay_rate * (max_time - t).days)

# Apply the time decay factor
interactions['decay_factor'] = interactions['timestamp'].apply(lambda t: time_decay(t, max_time))
interactions['interaction'] = interactions['interaction'] * interactions['decay_factor']

# Option 1: Aggregating duplicate entries
# aggregated_interactions = interactions.groupby(['user_id', 'product_id']).sum().reset_index()

# Option 2: Keeping only the latest interaction
latest_interactions = interactions.sort_values(by='timestamp').drop_duplicates(subset=['user_id', 'product_id'], keep='last')

# Create user-item interaction matrix
# For Option 1:
# user_item_matrix = aggregated_interactions.pivot(index='user_id', columns='product_id', values='interaction').fillna(0)

# For Option 2:
user_item_matrix = latest_interactions.pivot(index='user_id', columns='product_id', values='interaction').fillna(0)

# Convert to sparse matrix format
user_item_sparse = csr_matrix(user_item_matrix.values)

print(user_item_matrix.head())


product_id  1    2    3    4    5    6    7    8    9    10   ...  491  492  \
user_id                                                       ...             
1           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
2           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
3           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
4           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
5           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   

product_id  493  494  495  496  497  498       499       500  
user_id                                                       
1           0.0  0.0  0.0  0.0  0.0  0.0  0.898658  0.000000  
2           0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  
3           0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.449329  
4           0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  
5           0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  

[5 r

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split data into training and test sets
train_data, test_data = train_test_split(interactions, test_size=0.2, random_state=42)

# Function to evaluate the model
def evaluate_model(user_factors, item_factors, test_data):
    test_preds = []
    for _, row in test_data.iterrows():
        user_idx = users.index[users['user_id'] == row['user_id']][0]
        product_idx = products.index[products['product_id'] == row['product_id']][0]
        pred_rating = user_factors[user_idx].dot(item_factors[product_idx])
        test_preds.append(pred_rating)
    
    test_rmse = np.sqrt(mean_squared_error(test_data['interaction'], test_preds))
    return test_rmse

# Example usage
test_rmse = evaluate_model(user_factors, item_factors, test_data)
print(f"Test RMSE: {test_rmse}")


Test RMSE: 1.4600050108597842


In [12]:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/recommend', methods=['GET'])
def recommend():
    user_id = int(request.args.get('user_id'))
    top_n = int(request.args.get('top_n', 10))
    
    recommendations = hybrid_recommendations(user_id, top_n)
    return jsonify(recommendations.to_dict(orient='records'))

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
