In [5]:
pip install pandas numpy scikit-learn




In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Read the customer and product tables from the working directory
customers, products = (
    pd.read_csv(csv_path) for csv_path in ('Customers.csv', 'Products.csv')
)

# Placeholder transactions: this cell always builds a five-row, hand-written
# sample; Transactions.csv is not read here.
sample_rows = [
    ('C0001', 'P001', 50, 'Electronics'),
    ('C0001', 'P002', 20, 'Books'),
    ('C0002', 'P003', 100, 'Clothing'),
    ('C0003', 'P001', 70, 'Electronics'),
    ('C0003', 'P003', 80, 'Clothing'),
]
transactions = pd.DataFrame(
    sample_rows,
    columns=['CustomerID', 'ProductID', 'PurchaseAmount', 'Category'],
)


In [7]:
# Parse the signup dates once, store them back, and derive the account age in
# whole days. The age is measured against the time this cell runs.
signup_dates = pd.to_datetime(customers['SignupDate'])
customers['SignupDate'] = signup_dates
customers['DaysSinceSignup'] = (pd.Timestamp.now() - signup_dates).dt.days


In [8]:
# Per-customer spending profile: total spend, mean spend and modal category
purchases_by_customer = transactions.groupby('CustomerID')
transaction_summary = purchases_by_customer.agg(
    TotalSpend=pd.NamedAgg(column='PurchaseAmount', aggfunc='sum'),
    AvgSpend=pd.NamedAgg(column='PurchaseAmount', aggfunc='mean'),
    # mode() can return several values on a tie; [0] keeps the first of them
    MostFreqCategory=pd.NamedAgg(
        column='Category', aggfunc=lambda categories: categories.mode()[0]
    ),
).reset_index()

# Left join keeps every customer; those without transactions get NaN summaries
data = customers.merge(transaction_summary, on='CustomerID', how='left')


In [9]:
# Build the customer feature matrix: Region is one-hot encoded and the numeric
# columns are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('region', OneHotEncoder(), ['Region']),
        ('signup_days', 'passthrough', ['DaysSinceSignup']),
        ('transaction_features', 'passthrough', ['TotalSpend', 'AvgSpend'])
    ],
    # OneHotEncoder emits a sparse matrix, and ColumnTransformer keeps the
    # stacked result sparse once its density drops below the default threshold
    # (0.3), e.g. with many distinct regions. The StandardScaler below centers
    # the data and rejects sparse input, so always request a dense array.
    sparse_threshold=0,
)

# Transform features
customer_features = preprocessor.fit_transform(data)

# Normalize for similarity calculations.
# NOTE: TotalSpend/AvgSpend are NaN for customers without transactions (left
# merge) and nothing in this cell imputes them; StandardScaler leaves NaNs in
# place, so they are still present in customer_features afterwards.
scaler = StandardScaler()
customer_features = scaler.fit_transform(customer_features)


In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Continues from the earlier cells: expects `data` (customers left-joined with
# transaction_summary, plus DaysSinceSignup) to already exist in the kernel.

# Impute missing values (replace NaNs) before feature scaling.
# Customers without transactions have NaN spend after the left join; they
# receive the column mean here.
imputer = SimpleImputer(strategy='mean') # or 'median', 'most_frequent', 'constant'
data[['TotalSpend', 'AvgSpend']] = imputer.fit_transform(data[['TotalSpend', 'AvgSpend']])

preprocessor = ColumnTransformer(
    transformers=[
        ('region', OneHotEncoder(), ['Region']),
        ('signup_days', 'passthrough', ['DaysSinceSignup']),
        ('transaction_features', 'passthrough', ['TotalSpend', 'AvgSpend'])
    ],
    # OneHotEncoder emits a sparse matrix, and ColumnTransformer keeps the
    # stacked result sparse once its density drops below the default threshold
    # (0.3), e.g. with many distinct regions. The StandardScaler below centers
    # the data and rejects sparse input, so always request a dense array.
    sparse_threshold=0,
)

# Transform features
customer_features = preprocessor.fit_transform(data)

# Normalize for similarity calculations
scaler = StandardScaler()
customer_features = scaler.fit_transform(customer_features)

# Compute similarity matrix (pairwise cosine similarity between customer rows)
similarity_matrix = cosine_similarity(customer_features)

In [12]:
# Generate recommendations: for each of the first 20 customers, the 3 most
# similar *other* customers as (CustomerID, similarity score) pairs.
customer_ids = data['CustomerID']
recommendations = {}
for i in range(min(20, len(customer_ids))):  # do not index past the last customer
    # Rank all customers by descending similarity. A stable sort keeps tied
    # scores in row order, so repeated runs give the same ranking.
    ranked = np.argsort(-similarity_matrix[i], kind='stable')
    # Remove the customer's own row by index instead of assuming it is ranked
    # first: with tied scores or floating-point rounding another customer can
    # sort ahead of it, which would leak the customer into its own list.
    similar_indices = ranked[ranked != i][:3]
    recommendations[customer_ids.iloc[i]] = [
        (customer_ids.iloc[j], similarity_matrix[i][j])
        for j in similar_indices
    ]


In [31]:
# Save recommendations to Lookalike.csv.
# Each customer's list of (CustomerID, score) tuples is flattened to a
# 'ID: score; ID: score' string. Writing the raw list would store its Python
# repr in the CSV, and the final cell of this notebook writes this same file in
# the flattened format.
output_df = pd.DataFrame({
    # Materialise the keys as a list rather than passing a dict view
    'CustomerID': list(recommendations),
    'Recommendations': [
        '; '.join(f'{cust}: {score:.4f}' for cust, score in similar_customers)
        for similar_customers in recommendations.values()
    ],
})
output_df.to_csv('Lookalike.csv', index=False)


In [30]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Read the three input tables from the working directory
customers, transactions, products = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

# Step 1: Attach each product's Category and Price to its transactions.
# Left join: transactions whose ProductID is absent from products are kept,
# with NaN in the added columns.
product_info = products[['ProductID', 'Category', 'Price']]
transactions = transactions.merge(product_info, on='ProductID', how='left')

# Step 2: Aggregate Transaction Data by CustomerID
def _most_frequent(values):
    """Return the most frequent non-null value of a Series, or NaN if it has none.

    The left merge with the product table leaves Category as NaN for any
    ProductID missing from it. `mode()` skips NaN, so a customer whose
    categories are all missing yields an empty result, and indexing that
    result with `[0]` would raise KeyError.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


transaction_summary = transactions.groupby('CustomerID').agg(
    TotalSpend=('TotalValue', 'sum'),
    AvgSpend=('TotalValue', 'mean'),
    TransactionCount=('TransactionID', 'count'),
    MostFreqCategory=('Category', _most_frequent)  # Most frequent product category
).reset_index()

# Step 3: Left-join the per-customer summary onto the customer table, so
# customers without transactions are kept with NaN summary values
data = customers.merge(transaction_summary, on='CustomerID', how='left')

# Feature: whole days elapsed between signup and the time this cell runs
signup_dates = pd.to_datetime(data['SignupDate'])
data['SignupDate'] = signup_dates
data['DaysSinceSignup'] = (pd.Timestamp.now() - signup_dates).dt.days

# Step 4: Data Preprocessing (Handling missing data and scaling)
# Customers without transactions have NaN aggregates after the left join;
# replace them with the column mean before building the feature matrix.
# NOTE(review): this gives such customers an average TransactionCount and
# spend — confirm that is intended rather than filling with 0.
transaction_cols = ['TotalSpend', 'AvgSpend', 'TransactionCount']
imputer = SimpleImputer(strategy='mean')
data[transaction_cols] = imputer.fit_transform(data[transaction_cols])

# Region is one-hot encoded; the numeric columns are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('region', OneHotEncoder(), ['Region']),
        ('signup_days', 'passthrough', ['DaysSinceSignup']),
        ('transaction_features', 'passthrough', transaction_cols)
    ],
    # OneHotEncoder emits a sparse matrix, and ColumnTransformer keeps the
    # stacked result sparse once its density drops below the default threshold
    # (0.3), e.g. with many distinct regions. The StandardScaler below centers
    # the data and rejects sparse input, so always request a dense array.
    sparse_threshold=0,
)

# Transform the features
customer_features = preprocessor.fit_transform(data)

# Normalize for similarity calculations
scaler = StandardScaler()
customer_features = scaler.fit_transform(customer_features)

# Step 5: Compute Similarity Matrix (pairwise cosine similarity between customer rows)
similarity_matrix = cosine_similarity(customer_features)

# Step 6: Generate Recommendations for the first 20 customers: the 3 most
# similar *other* customers each, as (CustomerID, similarity score) pairs.
customer_ids = data['CustomerID']
recommendations = {}
for i in range(min(20, len(customer_ids))):  # do not index past the last customer
    # Rank all customers by descending similarity. A stable sort keeps tied
    # scores in row order, so repeated runs give the same ranking.
    ranked = np.argsort(-similarity_matrix[i], kind='stable')
    # Remove the customer's own row by index instead of assuming it is ranked
    # first: with tied scores or floating-point rounding another customer can
    # sort ahead of it, which would leak the customer into its own list.
    similar_indices = ranked[ranked != i][:3]
    recommendations[customer_ids.iloc[i]] = [
        (customer_ids.iloc[j], similarity_matrix[i][j])
        for j in similar_indices
    ]

# Step 7: Flatten each customer's neighbours into a single
# 'ID: score; ID: score' string, one output row per customer
output_data = [
    {
        'CustomerID': cust_id,
        'Recommendations': '; '.join(
            f'{cust}: {score:.4f}' for cust, score in similar_customers
        ),
    }
    for cust_id, similar_customers in recommendations.items()
]

# Write the rows to Lookalike.csv without the DataFrame index
output_df = pd.DataFrame(output_data)
output_df.to_csv('Lookalike.csv', index=False)

print("Lookalike recommendations saved to 'Lookalike.csv'.")


Lookalike recommendations saved to 'Lookalike.csv'.
