In [40]:
import pandas as pd
import numpy as np
import random

# Number of users
num_users = 1000

# Generate synthetic user data
user_ids = np.arange(1, num_users + 1)
user_ages = np.random.randint(18, 70, size=num_users)  # Ages between 18 and 70
user_genders = np.random.choice(['Male', 'Female', 'Other'], size=num_users, p=[0.45, 0.45, 0.10])
user_occupation = np.random.choice(['Student', 'Professional', 'Retired', 'Self-employed'], size=num_users, p=[0.3, 0.4, 0.2, 0.1])

# Create DataFrame
users = pd.DataFrame({
    'user_id': user_ids,
    'age': user_ages,
    'gender': user_genders,
    'occupation': user_occupation
})

print("Users DataFrame:")
print(users.head())

# Number of products
num_products = 500

# Generate synthetic product data
product_ids = np.arange(1, num_products + 1)
product_categories = np.random.choice(['Electronics', 'Books', 'Clothing', 'Home', 'Beauty', 'Sports'], size=num_products)
product_prices = np.round(np.random.uniform(5.0, 500.0, size=num_products), 2)  # Prices between $5 and $500
product_ratings = np.round(np.random.uniform(1.0, 5.0, size=num_products), 1)  # Ratings between 1.0 and 5.0

# Generate unique video IDs for products
video_ids = np.arange(1001, 1001 + num_products)  # Unique video IDs starting from 1001

# Keywords and templates for descriptions
description_keywords = {
    'Electronics': ['latest technology', 'high-performance', 'durable', 'compact', 'user-friendly'],
    'Books': ['bestselling', 'engaging', 'classic', 'informative', 'must-read'],
    'Clothing': ['stylish', 'comfortable', 'trendy', 'breathable', 'versatile'],
    'Home': ['modern', 'elegant', 'cozy', 'durable', 'functional'],
    'Beauty': ['premium quality', 'natural ingredients', 'long-lasting', 'hypoallergenic', 'luxurious'],
    'Sports': ['high-performance', 'durable', 'lightweight', 'comfortable', 'professional-grade']
}

description_templates = {
    'Electronics': [
        "This {category} item features {adjective1} design and {adjective2} functionality. It's {adjective3} and {adjective4}, perfect for everyday use.",
        "Experience the {adjective1} of our {category} with this {adjective2} product. It's {adjective3} and {adjective4}, ensuring high performance."
    ],
    'Books': [
        "Dive into this {category} with our {adjective1} and {adjective2} read. It's {adjective3} and {adjective4}, a {adjective5} addition to any collection.",
        "Our {category} offers a {adjective1} experience with {adjective2} insights. It's {adjective3} and {adjective4}, ideal for any reader."
    ],
    'Clothing': [
        "Our {category} is {adjective1} and {adjective2}, perfect for {adjective3} wear. It's {adjective4} and {adjective5}, making it a staple in any wardrobe.",
        "This {category} combines {adjective1} style with {adjective2} comfort. It's {adjective3} and {adjective4}, suitable for all occasions."
    ],
    'Home': [
        "Enhance your living space with our {adjective1} and {adjective2} {category}. It's {adjective3} and {adjective4}, perfect for any home.",
        "Our {category} is {adjective1} and {adjective2}, designed for {adjective3} use. It's {adjective4} and {adjective5}, adding a touch of elegance to your home."
    ],
    'Beauty': [
        "Experience {adjective1} and {adjective2} care with our {category}. It's {adjective3} and {adjective4}, perfect for a {adjective5} routine.",
        "Our {category} features {adjective1} ingredients and {adjective2} results. It's {adjective3} and {adjective4}, ensuring a {adjective5} glow."
    ],
    'Sports': [
        "Achieve your best with our {adjective1} and {adjective2} {category}. It's {adjective3} and {adjective4}, perfect for {adjective5} performance.",
        "This {category} is {adjective1} and {adjective2}, designed for {adjective3} activities. It's {adjective4} and {adjective5}, ideal for any sports enthusiast."
    ]
}

# Function to generate a random description based on category
def generate_description(category):
    adjectives = random.sample(description_keywords[category], 5)
    template = random.choice(description_templates[category])
    return template.format(category=category, adjective1=adjectives[0], adjective2=adjectives[1], adjective3=adjectives[2], adjective4=adjectives[3], adjective5=adjectives[4])

# Generate product descriptions
product_descriptions = [generate_description(cat) for cat in product_categories]

# Create DataFrame with descriptions
products = pd.DataFrame({
    'product_id': product_ids,
    'category': product_categories,
    'price': product_prices,
    'rating': product_ratings,
    'description': product_descriptions,
    'video_id': video_ids  # Include the video IDs
})

print("Products DataFrame:")
print(products.head())

# Number of interactions
num_interactions = 10000

# Generate synthetic interaction data
interaction_user_ids = np.random.choice(user_ids, size=num_interactions)
interaction_product_ids = np.random.choice(product_ids, size=num_interactions)
interaction_types = np.random.choice(['view', 'click', 'like', 'comment', 'share', 'purchase'], size=num_interactions, p=[0.4, 0.3, 0.1, 0.1, 0.05, 0.05])
interaction_timestamps = pd.date_range(start='2023-01-01', periods=num_interactions, freq='T')  # Random timestamps

# Create DataFrame
interactions = pd.DataFrame({
    'user_id': interaction_user_ids,
    'product_id': interaction_product_ids,
    'interaction_type': interaction_types,
    'timestamp': interaction_timestamps
})

print("Interactions DataFrame:")
print(interactions.head())

# Save DataFrames to CSV
users.to_csv('synthetic_users.csv', index=False)
products.to_csv('synthetic_products.csv', index=False)
interactions.to_csv('synthetic_interactions.csv', index=False)


Users DataFrame:
   user_id  age  gender     occupation
0        1   43  Female   Professional
1        2   54    Male        Student
2        3   20    Male  Self-employed
3        4   34   Other   Professional
4        5   69  Female   Professional
Products DataFrame:
   product_id  category   price  rating  \
0           1    Sports  340.62     3.0   
1           2     Books  125.90     3.8   
2           3    Sports  137.85     4.4   
3           4  Clothing  113.46     1.6   
4           5     Books  330.76     2.6   

                                         description  video_id  
0  This Sports is comfortable and lightweight, de...      1001  
1  Our Books offers a must-read experience with b...      1002  
2  Achieve your best with our professional-grade ...      1003  
3  This Clothing combines trendy style with comfo...      1004  
4  Our Books offers a classic experience with inf...      1005  
Interactions DataFrame:
   user_id  product_id interaction_type           timest

  interaction_timestamps = pd.date_range(start='2023-01-01', periods=num_interactions, freq='T')  # Random timestamps


In [33]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix

# Normalize numerical features
scaler = MinMaxScaler()
products[['price', 'rating']] = scaler.fit_transform(products[['price', 'rating']])

# Vectorize product descriptions using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
product_descriptions_matrix = vectorizer.fit_transform(products['description'])

# Combine product metadata into a single feature set
product_features = np.hstack((products[['price', 'rating']], product_descriptions_matrix.toarray()))

# Aggregate the interactions by taking the average rating for each user-product pair
aggregated_interactions = interactions.groupby(['user_id', 'product_id']).mean().reset_index()
print (interactions.head())
print (aggregated_interactions)

# Create user-item interaction matrix
user_item_matrix = aggregated_interactions.pivot(index='user_id', columns='product_id', values='interaction').fillna(0)

# Convert to sparse matrix format
user_item_sparse = csr_matrix(user_item_matrix.values)

print(user_item_matrix.head())



   user_id  product_id  interaction           timestamp
0      143         117            3 2023-01-01 00:00:00
1       20         500            4 2023-01-01 00:01:00
2      192         212            4 2023-01-01 00:02:00
3      849         426            5 2023-01-01 00:03:00
4      362         142            1 2023-01-01 00:04:00
      user_id  product_id  interaction           timestamp
0           1          12          1.0 2023-01-06 08:39:00
1           1          25          5.0 2023-01-02 12:57:00
2           1          68          1.0 2023-01-03 13:42:00
3           1          82          4.0 2023-01-07 17:09:00
4           1         108          1.0 2023-01-03 09:18:00
...       ...         ...          ...                 ...
9886     1000         162          3.5 2023-01-06 04:12:00
9887     1000         256          2.0 2023-01-03 07:31:00
9888     1000         290          5.0 2023-01-01 11:22:00
9889     1000         381          2.0 2023-01-07 06:48:00
9890     1000  

In [34]:
from sklearn.decomposition import TruncatedSVD

# Apply Truncated SVD for matrix factorization
svd = TruncatedSVD(n_components=50, random_state=42)
user_factors = svd.fit_transform(user_item_sparse)
item_factors = svd.components_.T


In [35]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between product features
cosine_similarities = cosine_similarity(product_features)

# Function to recommend similar products
def recommend_products(product_id, top_n=10):
    product_idx = products.index[products['product_id'] == product_id][0]
    similar_indices = cosine_similarities[product_idx].argsort()[-top_n:][::-1]
    similar_products = products.iloc[similar_indices]
    return similar_products

# Example usage
recommended_products = recommend_products(product_id=1)
print(recommended_products)


     product_id  video_id category     price  rating  \
0             1         1   Beauty  0.628723   0.450   
370         371       371   Beauty  0.669694   0.450   
376         377       377   Beauty  0.692623   0.500   
56           57        57   Beauty  0.594279   0.575   
215         216       216   Beauty  0.804752   0.475   
27           28        28   Beauty  0.627993   0.300   
184         185       185   Beauty  0.803677   0.425   
28           29        29   Beauty  0.673201   0.300   
394         395       395   Beauty  0.730857   0.650   
51           52        52   Beauty  0.559024   0.275   

                                           description  
0    Experience long-lasting and natural ingredient...  
370  Experience premium quality and luxurious care ...  
376  Experience hypoallergenic and natural ingredie...  
56   Experience luxurious and natural ingredients c...  
215  Experience long-lasting and natural ingredient...  
27   Experience natural ingredients and p

In [36]:
# Function to get hybrid recommendations
def hybrid_recommendations(user_id, top_n=10):
    user_idx = users.index[users['user_id'] == user_id][0]
    user_interactions = user_item_matrix.iloc[user_idx].values
    
    # Weighted average of collaborative filtering and content-based recommendations
    cf_recommendations = user_factors[user_idx].dot(item_factors.T)
    cb_recommendations = cosine_similarities.dot(user_interactions)
    
    combined_scores = cf_recommendations + cb_recommendations
    top_indices = combined_scores.argsort()[-top_n:][::-1]
    
    return products.iloc[top_indices]

# Example usage
user_recommendations = hybrid_recommendations(user_id=1)
print(user_recommendations)


     product_id  video_id  category     price  rating  \
24           25        25  Clothing  0.899447   0.275   
235         236       236  Clothing  0.981754   0.900   
476         477       477  Clothing  0.938938   0.525   
197         198       198  Clothing  0.820930   0.550   
34           35        35  Clothing  0.913394   0.375   
231         232       232  Clothing  0.876599   0.675   
252         253       253  Clothing  0.829100   0.550   
314         315       315  Clothing  0.790480   0.475   
76           77        77  Clothing  0.787540   0.575   
162         163       163  Clothing  0.949176   0.950   

                                           description  
24   This Clothing combines breathable style with c...  
235  Our Clothing is breathable and comfortable, pe...  
476  This Clothing combines breathable style with t...  
197  Our Clothing is stylish and comfortable, perfe...  
34   Our Clothing is trendy and stylish, perfect fo...  
231  Our Clothing is breathabl

In [37]:
# Apply time decay factor to interactions
interactions['timestamp'] = pd.to_datetime(interactions['timestamp'])
max_time = interactions['timestamp'].max()

# Define a time decay function
def time_decay(t, max_time, decay_rate=0.1):
    return np.exp(-decay_rate * (max_time - t).days)

# Apply the time decay factor
interactions['decay_factor'] = interactions['timestamp'].apply(lambda t: time_decay(t, max_time))
interactions['interaction'] = interactions['interaction'] * interactions['decay_factor']

# Option 1: Aggregating duplicate entries
# aggregated_interactions = interactions.groupby(['user_id', 'product_id']).sum().reset_index()

# Option 2: Keeping only the latest interaction
latest_interactions = interactions.sort_values(by='timestamp').drop_duplicates(subset=['user_id', 'product_id'], keep='last')

# Create user-item interaction matrix
# For Option 1:
# user_item_matrix = aggregated_interactions.pivot(index='user_id', columns='product_id', values='interaction').fillna(0)

# For Option 2:
user_item_matrix = latest_interactions.pivot(index='user_id', columns='product_id', values='interaction').fillna(0)

# Convert to sparse matrix format
user_item_sparse = csr_matrix(user_item_matrix.values)

print(user_item_matrix.head())


product_id  1    2    3    4    5    6    7    8    9    10   ...  491  492  \
user_id                                                       ...             
1           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
2           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
3           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
4           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
5           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   

product_id      493  494  495  496       497  498  499  500  
user_id                                                      
1           2.01096  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  
2           0.00000  0.0  0.0  0.0  3.032653  0.0  0.0  0.0  
3           0.00000  0.0  5.0  0.0  0.000000  0.0  0.0  0.0  
4           0.00000  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  
5           0.00000  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  

[5 rows x 5

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split data into training and test sets
train_data, test_data = train_test_split(interactions, test_size=0.2, random_state=42)

# Function to evaluate the model
def evaluate_model(user_factors, item_factors, test_data):
    test_preds = []
    for _, row in test_data.iterrows():
        user_idx = users.index[users['user_id'] == row['user_id']][0]
        product_idx = products.index[products['product_id'] == row['product_id']][0]
        pred_rating = user_factors[user_idx].dot(item_factors[product_idx])
        test_preds.append(pred_rating)
    
    test_rmse = np.sqrt(mean_squared_error(test_data['interaction'], test_preds))
    return test_rmse

# Example usage
test_rmse = evaluate_model(user_factors, item_factors, test_data)
print(f"Test RMSE: {test_rmse}")


Test RMSE: 1.7865934032340487


In [39]:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/recommend', methods=['GET'])
def recommend():
    user_id = int(request.args.get('user_id'))
    top_n = int(request.args.get('top_n', 10))
    
    recommendations = hybrid_recommendations(user_id, top_n)
    return jsonify(recommendations.to_dict(orient='records'))

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
