In [48]:
import pandas as pd
import numpy as np

# Number of users
num_users = 1000

# Generate synthetic user data
user_ids = np.arange(1, num_users + 1)
user_ages = np.random.randint(18, 70, size=num_users)  # Ages between 18 and 70
user_genders = np.random.choice(['Male', 'Female', 'Other'], size=num_users, p=[0.45, 0.45, 0.10])
user_occupation = np.random.choice(['Student', 'Professional', 'Retired', 'Self-employed'], size=num_users, p=[0.3, 0.4, 0.2, 0.1])

# Create DataFrame
users = pd.DataFrame({
    'user_id': user_ids,
    'age': user_ages,
    'gender': user_genders,
    'occupation': user_occupation
})

print(users.head())


   user_id  age  gender    occupation
0        1   51  Female       Student
1        2   67  Female       Student
2        3   49  Female  Professional
3        4   37  Female       Student
4        5   59  Female  Professional


In [49]:
import random
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Assume that the following synthetic data and generate_description function are defined as previously discussed

# Number of products
num_products = 500

# Generate synthetic product data
product_ids = np.arange(1, num_products + 1)
product_categories = np.random.choice(['Electronics', 'Books', 'Clothing', 'Home', 'Beauty', 'Sports'], size=num_products)
product_prices = np.round(np.random.uniform(5.0, 500.0, size=num_products), 2)  # Prices between $5 and $500
product_ratings = np.round(np.random.uniform(1.0, 5.0, size=num_products), 1)  # Ratings between 1.0 and 5.0

# Keywords and templates for descriptions (assume already defined in previous examples)
description_keywords = {
    'Electronics': ['latest technology', 'high-performance', 'durable', 'compact', 'user-friendly'],
    'Books': ['bestselling', 'engaging', 'classic', 'informative', 'must-read'],
    'Clothing': ['stylish', 'comfortable', 'trendy', 'breathable', 'versatile'],
    'Home': ['modern', 'elegant', 'cozy', 'durable', 'functional'],
    'Beauty': ['premium quality', 'natural ingredients', 'long-lasting', 'hypoallergenic', 'luxurious'],
    'Sports': ['high-performance', 'durable', 'lightweight', 'comfortable', 'professional-grade']
}
description_templates = {
    'Electronics': [
        "This {category} item features {adjective1} design and {adjective2} functionality. It's {adjective3} and {adjective4}, perfect for everyday use.",
        "Experience the {adjective1} of our {category} with this {adjective2} product. It's {adjective3} and {adjective4}, ensuring high performance."
    ],
    'Books': [
        "Dive into this {category} with our {adjective1} and {adjective2} read. It's {adjective3} and {adjective4}, a {adjective5} addition to any collection.",
        "Our {category} offers a {adjective1} experience with {adjective2} insights. It's {adjective3} and {adjective4}, ideal for any reader."
    ],
    'Clothing': [
        "Our {category} is {adjective1} and {adjective2}, perfect for {adjective3} wear. It's {adjective4} and {adjective5}, making it a staple in any wardrobe.",
        "This {category} combines {adjective1} style with {adjective2} comfort. It's {adjective3} and {adjective4}, suitable for all occasions."
    ],
    'Home': [
        "Enhance your living space with our {adjective1} and {adjective2} {category}. It's {adjective3} and {adjective4}, perfect for any home.",
        "Our {category} is {adjective1} and {adjective2}, designed for {adjective3} use. It's {adjective4} and {adjective5}, adding a touch of elegance to your home."
    ],
    'Beauty': [
        "Experience {adjective1} and {adjective2} care with our {category}. It's {adjective3} and {adjective4}, perfect for a {adjective5} routine.",
        "Our {category} features {adjective1} ingredients and {adjective2} results. It's {adjective3} and {adjective4}, ensuring a {adjective5} glow."
    ],
    'Sports': [
        "Achieve your best with our {adjective1} and {adjective2} {category}. It's {adjective3} and {adjective4}, perfect for {adjective5} performance.",
        "This {category} is {adjective1} and {adjective2}, designed for {adjective3} activities. It's {adjective4} and {adjective5}, ideal for any sports enthusiast."
    ]
}

# Function to generate a random description based on category
def generate_description(category):
    adjectives = random.sample(description_keywords[category], 5)
    template = random.choice(description_templates[category])
    return template.format(category=category, adjective1=adjectives[0], adjective2=adjectives[1], adjective3=adjectives[2], adjective4=adjectives[3], adjective5=adjectives[4])

# Generate product descriptions
product_descriptions = [generate_description(cat) for cat in product_categories]

# Create DataFrame with descriptions
products = pd.DataFrame({
    'product_id': product_ids,
    'category': product_categories,
    'price': product_prices,
    'rating': product_ratings,
    'description': product_descriptions
})

# Normalize numerical features
scaler = MinMaxScaler()
products[['price', 'rating']] = scaler.fit_transform(products[['price', 'rating']])

# Vectorize product descriptions using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
product_descriptions_tfidf = vectorizer.fit_transform(products['description'])


# Convert the sparse matrix to a dense array
product_descriptions_dense = product_descriptions_tfidf.toarray()

# Combine product metadata into a single feature set
product_features = np.hstack((products[['price', 'rating']].values, product_descriptions_dense))

# Example: Show combined feature set for first 5 products
print(product_features[:5])


[[0.78194453 0.075      0.36034412 0.         0.         0.
  0.         0.36034412 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.2319279
  0.         0.         0.         0.         0.         0.18852711
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.30076895 0.22272866 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.30076895 0.         0.         0.         0.
  0.         0.         0.         0.         0.20268963 0.44545732
  0.         0.         0.30076895 0.         0.         0.
  0.         0.         0.         0.30076895 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.92532264 0.925      0.         0.         0.         0.
  0.24571604 0.         0.         0.         0.         0.
  0.         0.         0.    

In [50]:
# Number of interactions
num_interactions = 10000

# Generate synthetic interaction data
interaction_user_ids = np.random.choice(user_ids, size=num_interactions)
interaction_product_ids = np.random.choice(product_ids, size=num_interactions)
interaction_types = np.random.choice(['view', 'click', 'purchase', 'like'], size=num_interactions, p=[0.6, 0.1, 0.01,0.29])
interaction_timestamps = pd.date_range(start='2023-01-01', end='2024-06-13', periods=num_interactions)

# Create DataFrame
interactions = pd.DataFrame({
    'user_id': interaction_user_ids,
    'product_id': interaction_product_ids,
    'interaction_type': interaction_types,
    'timestamp': interaction_timestamps
})

print(interactions.head())


   user_id  product_id interaction_type                     timestamp
0      136         156             view 2023-01-01 00:00:00.000000000
1      296         232             like 2023-01-01 01:16:11.017101710
2      412         380            click 2023-01-01 02:32:22.034203420
3      709         407            click 2023-01-01 03:48:33.051305130
4      534         358             like 2023-01-01 05:04:44.068406840


In [51]:
# Save to CSV files
users.to_csv('synthetic_users.csv', index=False)
products.to_csv('synthetic_products.csv', index=False)
interactions.to_csv('synthetic_interactions.csv', index=False)


In [52]:

# Load datasets
users = pd.read_csv('synthetic_users.csv')  # Example file for user data
products = pd.read_csv('synthetic_products.csv')  # Example file for product metadata
interactions = pd.read_csv('synthetic_interactions.csv')  # Example file for user-product interactions

# Explore datasets
print(users.head())
print(products.head())
print(interactions.head())


   user_id  age  gender    occupation
0        1   51  Female       Student
1        2   67  Female       Student
2        3   49  Female  Professional
3        4   37  Female       Student
4        5   59  Female  Professional
   product_id category     price  rating  \
0           1   Sports  0.781945   0.075   
1           2   Beauty  0.925323   0.925   
2           3    Books  0.948480   0.700   
3           4   Sports  0.505926   0.950   
4           5    Books  0.150023   0.275   

                                         description  
0  Achieve your best with our high-performance an...  
1  Our Beauty features premium quality ingredient...  
2  Our Books offers a must-read experience with e...  
3  This Sports is professional-grade and durable,...  
4  Our Books offers a must-read experience with i...  
   user_id  product_id interaction_type                      timestamp
0      136         156             view  2023-01-01 00:00:00.000000000
1      296         232             

In [53]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Normalize numerical features
scaler = MinMaxScaler()
products[['price', 'rating']] = scaler.fit_transform(products[['price', 'rating']])

# Vectorize product descriptions using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
# product_descriptions = vectorizer.fit_transform(products['description'])

# Combine product metadata into a single feature set
import numpy as np

product_features = np.hstack((products[['price', 'rating']], product_descriptions_tfidf.toarray()))


In [54]:
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

interactions_dedup = interactions.drop_duplicates(subset=['user_id', 'product_id'])

# Create user-item interaction matrix
interactions_dedup['interaction_type_binary'] = interactions_dedup['interaction_type'].apply(lambda x: 1 if x in ['view', 'click', 'purchase', 'like'] else 0)
user_item_matrix = interactions_dedup.pivot(index='user_id', columns='product_id', values='interaction_type_binary').fillna(0)

# Create sparse matrix
user_item_sparse = csr_matrix(user_item_matrix.values)

# Apply Truncated SVD for matrix factorization
svd = TruncatedSVD(n_components=50, random_state=42)
user_factors = svd.fit_transform(user_item_sparse)
item_factors = svd.components_.T


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions_dedup['interaction_type_binary'] = interactions_dedup['interaction_type'].apply(lambda x: 1 if x in ['view', 'click', 'purchase', 'like'] else 0)


In [55]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between product features
cosine_similarities = cosine_similarity(product_features)

# Function to recommend similar products
def recommend_products(product_id, top_n=10):
    product_idx = products.index[products['product_id'] == product_id][0]
    similar_indices = cosine_similarities[product_idx].argsort()[-top_n:][::-1]
    similar_products = products.iloc[similar_indices]
    return similar_products

# Example usage
recommended_products = recommend_products(product_id=1)
print(recommended_products)


     product_id category     price  rating  \
0             1   Sports  0.781945   0.075   
398         399   Sports  0.824753   0.075   
242         243   Sports  0.739784   0.075   
301         302   Sports  0.748881   0.025   
183         184   Sports  0.743593   0.125   
388         389   Sports  0.839340   0.125   
456         457   Sports  0.810916   0.175   
28           29   Sports  0.659657   0.025   
256         257   Sports  0.957576   0.200   
461         462   Sports  0.584473   0.000   

                                           description  
0    Achieve your best with our high-performance an...  
398  Achieve your best with our high-performance an...  
242  Achieve your best with our high-performance an...  
301  Achieve your best with our durable and profess...  
183  Achieve your best with our comfortable and lig...  
388  Achieve your best with our high-performance an...  
456  Achieve your best with our professional-grade ...  
28   Achieve your best with our durab

In [56]:
# Function to get hybrid recommendations
def hybrid_recommendations(user_id, top_n=10):
    user_idx = users.index[users['user_id'] == user_id][0]
    user_interactions = user_item_matrix.iloc[user_idx].values
    
    # Weighted average of collaborative filtering and content-based recommendations
    cf_recommendations = user_factors[user_idx].dot(item_factors.T)
    cb_recommendations = cosine_similarities.dot(user_interactions)
    
    combined_scores = cf_recommendations + cb_recommendations
    top_indices = combined_scores.argsort()[-top_n:][::-1]
    
    return products.iloc[top_indices]

# Example usage
user_recommendations = hybrid_recommendations(user_id=1)
print(user_recommendations)


     product_id  category     price  rating  \
230         231    Sports  0.974777   0.725   
246         247    Beauty  0.988634   0.775   
88           89    Beauty  0.946980   0.775   
345         346  Clothing  0.961142   0.700   
1             2    Beauty  0.925323   0.925   
487         488    Beauty  0.820094   1.000   
300         301    Beauty  0.933082   0.500   
154         155    Beauty  0.981300   0.425   
37           38    Beauty  0.855771   0.550   
347         348    Beauty  0.952025   0.725   

                                           description  
230  Achieve your best with our comfortable and hig...  
246  Experience natural ingredients and hypoallerge...  
88   Experience hypoallergenic and luxurious care w...  
345  Our Clothing is comfortable and stylish, perfe...  
1    Our Beauty features premium quality ingredient...  
487  Experience luxurious and hypoallergenic care w...  
300  Experience luxurious and long-lasting care wit...  
154  Experience luxurious 

In [57]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

# Sample interactions DataFrame for testing
# data = {
#     'user_id': [544, 719, 312, 367, 478],
#     'product_id': [21, 487, 248, 161, 492],
#     'interaction_type': ['view', 'like', 'view', 'click', 'purchase'],
#     'timestamp': ['2023-01-01', '2023-01-01', '2023-01-01', '2023-01-01', '2023-01-01']
# }
print(interactions.head())

# interactions = pd.DataFrame(data)

# Convert timestamp to datetime
interactions['timestamp'] = pd.to_datetime(interactions['timestamp'])

# Print unique values in 'interaction_type' before mapping
print("Unique values in 'interaction_type' before mapping:")
print(interactions['interaction_type'].unique())

# Define a mapping for interaction types to numeric values
interaction_type_mapping = {
    'view': 1,
    'like': 2,
    'click': 3,
    'purchase': 4
}

# Map interaction_type to numeric values
interactions['interaction_type'] = interactions['interaction_type'].map(interaction_type_mapping)

# Check for unmapped values and handle them
if interactions['interaction_type'].isnull().any():
    print("Found unmapped values in 'interaction_type'. Handling them...")
    # Option 1: Drop rows with NaN interaction_type after mapping
    interactions.dropna(subset=['interaction_type'], inplace=True)

    # Option 2: Fill NaN with a default value (e.g., 0 for no interaction)
    # interactions['interaction_type'].fillna(0, inplace=True)

# Apply time decay factor
max_time = interactions['timestamp'].max()

def time_decay(t, max_time, decay_rate=0.1):
    days_diff = (max_time - t).days
    return np.exp(-decay_rate * days_diff)

interactions['decay_factor'] = interactions['timestamp'].apply(lambda t: time_decay(t, max_time))

# Multiply interaction_type with decay_factor to get interaction_value
interactions['interaction_value'] = interactions['interaction_type'] * interactions['decay_factor']

# Handle duplicate interactions if necessary
interactions_dedup = interactions.drop_duplicates(subset=['user_id', 'product_id'])

# Create user-item interaction matrix
user_item_matrix = interactions_dedup.pivot(index='user_id', columns='product_id', values='interaction_value').fillna(0)

# Convert DataFrame to sparse matrix
user_item_sparse = csr_matrix(user_item_matrix.values)

# Display the corrected DataFrame head
print(interactions.head())


   user_id  product_id interaction_type                      timestamp
0      136         156             view  2023-01-01 00:00:00.000000000
1      296         232             like  2023-01-01 01:16:11.017101710
2      412         380            click  2023-01-01 02:32:22.034203420
3      709         407            click  2023-01-01 03:48:33.051305130
4      534         358             like  2023-01-01 05:04:44.068406840
Unique values in 'interaction_type' before mapping:
['view' 'like' 'click' 'purchase']
   user_id  product_id  interaction_type                     timestamp  \
0      136         156                 1 2023-01-01 00:00:00.000000000   
1      296         232                 2 2023-01-01 01:16:11.017101710   
2      412         380                 3 2023-01-01 02:32:22.034203420   
3      709         407                 3 2023-01-01 03:48:33.051305130   
4      534         358                 2 2023-01-01 05:04:44.068406840   

   decay_factor  interaction_value  
0  1.

In [58]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Split data into training and test sets
train_data, test_data = train_test_split(interactions, test_size=0.2, random_state=42)

# interactions_dedup_clean = interactions_dedup.dropna(subset=['interaction_type'])

# Train a LinearRegression model on the user-item interaction matrix
model = LinearRegression()
model.fit(user_item_sparse, interactions_dedup['interaction_type'])

# Evaluate the model
def evaluate_model(model, train_data, test_data):
    train_preds = model.predict(user_item_sparse)
    test_preds = model.predict(test_data.pivot(index='user_id', columns='product_id').values)

    train_rmse = np.sqrt(mean_squared_error(train_data['interaction'], train_preds))
    test_rmse = np.sqrt(mean_squared_error(test_data['interaction'], test_preds))

    return train_rmse, test_rmse

# Example usage
train_rmse, test_rmse = evaluate_model(model, train_data, test_data)
print(f"Train RMSE: {train_rmse}, Test RMSE: {test_rmse}")

ValueError: Found input variables with inconsistent numbers of samples: [1000, 9901]

In [None]:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/recommend', methods=['GET'])
def recommend():
    user_id = request.args.get('user_id')
    top_n = int(request.args.get('top_n', 10))
    
    recommendations = hybrid_recommendations(user_id, top_n)
    return jsonify(recommendations.to_dict(orient='records'))

if __name__ == '__main__':
    app.run(debug=True)
