In [7]:
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
customerdf = pd.read_csv("CustomerInfo.csv")

In [9]:
customerdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CustomerID        10000 non-null  int64  
 1   Age               10000 non-null  int64  
 2   Gender            10000 non-null  int64  
 3   Income            10000 non-null  float64
 4   Location          10000 non-null  int64  
 5   HighestEducation  10000 non-null  int64  
 6   CreditScore       10000 non-null  int64  
 7   Profession        10000 non-null  int64  
 8   DisposableIncome  10000 non-null  float64
dtypes: float64(2), int64(7)
memory usage: 703.3 KB


In [10]:
productdf = pd.read_csv('ProductInfo.csv')

In [11]:
productdf.head()

Unnamed: 0,CustomerID,PersonalLoan_Owned,PersonalLoan_Amount,HomeLoan_Owned,HomeLoan_Amount,VehicleLoan_Owned,VehicleLoan_Amount,EducationLoan_Owned,EducationLoan_Amount,Insurance_Owned,Insurance_Amount,VehicleInsurance_Owned,VehicleInsurance_Amount,MutualFund_Owned,MutualFund_Amount,FixedDeposit_Owned,FixedDeposit_Amount,LastProductPurchased
0,1,0,0.0,0,0.0,1,129300.0,0,0.0,0,0.0,0,0.0,0,0.0,1,149200.0,VehicleLoan
1,2,0,0.0,0,0.0,0,0.0,1,189000.0,0,0.0,0,0.0,0,0.0,1,230800.0,EducationLoan
2,3,0,0.0,0,0.0,1,1676000.0,0,0.0,0,0.0,0,0.0,0,0.0,1,182100.0,VehicleLoan
3,4,0,0.0,0,0.0,1,332000.0,0,0.0,0,0.0,1,35500.0,0,0.0,0,0.0,VehicleInsurance
4,5,0,0.0,0,0.0,0,0.0,0,0.0,1,23400.0,0,0.0,0,0.0,0,0.0,Insurance


In [12]:
productdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   CustomerID               10000 non-null  int64  
 1   PersonalLoan_Owned       10000 non-null  int64  
 2   PersonalLoan_Amount      10000 non-null  float64
 3   HomeLoan_Owned           10000 non-null  int64  
 4   HomeLoan_Amount          10000 non-null  float64
 5   VehicleLoan_Owned        10000 non-null  int64  
 6   VehicleLoan_Amount       10000 non-null  float64
 7   EducationLoan_Owned      10000 non-null  int64  
 8   EducationLoan_Amount     10000 non-null  float64
 9   Insurance_Owned          10000 non-null  int64  
 10  Insurance_Amount         10000 non-null  float64
 11  VehicleInsurance_Owned   10000 non-null  int64  
 12  VehicleInsurance_Amount  10000 non-null  float64
 13  MutualFund_Owned         10000 non-null  int64  
 14  MutualFund_Amount      

# Modelling

In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [25]:
customer_info = pd.read_csv('CustomerInfo.csv')
product_info = pd.read_csv('ProductInfo.csv')

In [26]:
df = pd.merge(customer_info, product_info, on='CustomerID')

In [31]:
# Encode categorical variables
categorical_cols = ['Gender', 'Location', 'HighestEducation', 'Profession', 'LastProductPurchased']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cats = encoder.fit_transform(df[categorical_cols])

In [32]:
# get_feature_names_out() 
encoded_feature_names = encoder.get_feature_names_out(categorical_cols)
encoded_df = pd.DataFrame(encoded_cats, columns=encoded_feature_names)

In [33]:
# Combine encoded categories with numerical features
numerical_cols = ['Age', 'Income', 'CreditScore', 'DisposableIncome']
features_df = pd.concat([df[numerical_cols], encoded_df], axis=1)


In [34]:
# Prepare the target variable (product ownership)
product_cols = [col for col in df.columns if col.endswith('_Owned')]
target_df = df[product_cols]


In [35]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(features_df, target_df, test_size=0.2, random_state=42)

In [37]:
import numpy as np
from scipy.sparse.linalg import svds

def collaborative_filtering(user_item_matrix):
    # Convert the matrix to float type
    user_item_matrix = user_item_matrix.astype(float)
    
    # Center the matrix (subtract mean of each user's ratings)
    user_ratings_mean = np.mean(user_item_matrix, axis=1)
    user_item_matrix_centered = user_item_matrix - user_ratings_mean.reshape(-1, 1)
    
    # Perform SVD
    U, sigma, Vt = svds(user_item_matrix_centered, k=min(user_item_matrix.shape[1]-1, 10))
    
    # Reconstruct the matrix
    sigma_diag = np.diag(sigma)
    predicted_ratings = np.dot(np.dot(U, sigma_diag), Vt) + user_ratings_mean.reshape(-1, 1)
    
    return predicted_ratings

# Create user-item matrix
user_item_matrix = y_train.values

# Apply collaborative filtering
cf_predictions = collaborative_filtering(user_item_matrix)

In [38]:
from sklearn.metrics.pairwise import cosine_similarity

def content_based_filtering(user_features):
    # Compute cosine similarity between users
    sim_matrix = cosine_similarity(user_features)
    
    return sim_matrix

# Apply content-based filtering
cb_similarities = content_based_filtering(X_train)

In [39]:
def hybrid_recommendations(cf_pred, cb_sim, user_idx, user_item_matrix, n_recommendations=5):
    # Get collaborative filtering predictions for the user
    cf_user_pred = cf_pred[user_idx]
    
    # Get content-based similarities for the user
    cb_user_sim = cb_sim[user_idx]
    
    # Combine predictions (you can adjust the weights)
    hybrid_pred = 0.7 * cf_user_pred + 0.3 * np.dot(cb_user_sim, user_item_matrix)
    
    # Get products the user doesn't already own
    unowned_products = np.where(user_item_matrix[user_idx] == 0)[0]
    
    # Get top N recommendations from unowned products
    top_n = unowned_products[np.argsort(hybrid_pred[unowned_products])[::-1][:n_recommendations]]
    
    return top_n

# Example usage
user_idx = 0  # For the first user
recommendations = hybrid_recommendations(cf_predictions, cb_similarities, user_idx, user_item_matrix)

In [40]:
from sklearn.metrics import precision_score, recall_score

def evaluate_model(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    
    return precision, recall

# Make predictions for all users in the test set
all_predictions = []
for user_idx in range(len(X_test)):
    recommendations = hybrid_recommendations(cf_predictions, cb_similarities, user_idx, user_item_matrix, n_recommendations=len(product_cols))
    user_pred = np.zeros(y_test.shape[1])
    user_pred[recommendations] = 1
    all_predictions.append(user_pred)

all_predictions = np.array(all_predictions)

# Evaluate the model
precision, recall = evaluate_model(y_test, all_predictions)
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Precision: 0.2433
Recall: 0.7605


In [41]:
def make_recommendations(user_id, n_recommendations=5):
    user_idx = customer_info[customer_info['CustomerID'] == user_id].index[0]
    recommendations = hybrid_recommendations(cf_predictions, cb_similarities, user_idx, user_item_matrix, n_recommendations)
    
    recommended_products = [product_cols[i] for i in recommendations]
    return recommended_products

# Example usage
user_id = 1001  # Replace with an actual user ID
recommended_products = make_recommendations(user_id)
print(f"Recommended products for user {user_id}: {recommended_products}")

Recommended products for user 1001: ['FixedDeposit_Owned', 'PersonalLoan_Owned', 'MutualFund_Owned', 'VehicleLoan_Owned', 'EducationLoan_Owned']
