In [1]:
#!pip install surprise
#!pip install xgboost
#!pip install --upgrade numpy scikit-learn
#!conda update --all
#!export OPENBLAS_NUM_THREADS=1
#!pip install node2vec
#!pip install tensorflow

In [2]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_lookalike(seed_users, candidate_users, top_n=100):
    similarities = cosine_similarity(seed_users, candidate_users)
    avg_similarities = np.mean(similarities, axis=0)
    top_indices = np.argsort(avg_similarities)[-top_n:]
    return top_indices[::-1]

# Example usage
seed_users = np.array([[1, 0, 1, 1], [1, 1, 0, 1]])
candidate_users = np.array([[1, 0, 1, 0], [0, 1, 1, 1], [1, 1, 1, 0]])
lookalike_indices = cosine_similarity_lookalike(seed_users, candidate_users, top_n=1)
print("Lookalike user indices:", lookalike_indices)

Lookalike user indices: [2]


In [3]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
import pandas as pd
import numpy as np



def generate_sample_data(n_users=1000, n_items=100, n_ratings=10000):
    np.random.seed(42)
    user_ids = np.random.randint(1, n_users + 1, n_ratings)
    item_ids = np.random.randint(1, n_items + 1, n_ratings)
    ratings = np.random.randint(1, 6, n_ratings)
    
    df = pd.DataFrame({
        'user_id': user_ids,
        'item_id': item_ids,
        'rating': ratings
    })
    return df

def collaborative_filtering_lookalike(ratings, seed_users, n_neighbors=20, top_n=100):
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(ratings[['user_id', 'item_id', 'rating']], reader)
    trainset = data.build_full_trainset()
    
    sim_options = {'name': 'cosine', 'user_based': True}
    model = KNNBasic(k=n_neighbors, sim_options=sim_options)
    model.fit(trainset)
    
    lookalike_users = []
    for seed_user in seed_users:
        try:
            neighbors = model.get_neighbors(trainset.to_inner_uid(seed_user), k=top_n)
            lookalike_users.extend([trainset.to_raw_uid(neighbor) for neighbor in neighbors])
        except ValueError:
            print(f"Seed user {seed_user} not found in the dataset.")
    
    return list(set(lookalike_users))

# Generate sample dataset
ratings = generate_sample_data(n_users=1000, n_items=100, n_ratings=15000)
print("Dataset shape:", ratings.shape)
print("Sample of the dataset:")
print(ratings.head())

# Select seed users (randomly choosing 5 users from the dataset)
seed_users = ratings['user_id'].sample(5).tolist()
print("\nSeed users:", seed_users)

# Find lookalike users
lookalike_users = collaborative_filtering_lookalike(ratings, seed_users, n_neighbors=10, top_n=20)
print("\nNumber of lookalike users found:", len(lookalike_users))
print("Sample of lookalike users:", lookalike_users[:10])

Dataset shape: (15000, 3)
Sample of the dataset:
   user_id  item_id  rating
0      103        3       2
1      436       67       3
2      861       71       5
3      271      100       3
4      107       35       1

Seed users: [961, 583, 15, 650, 432]
Computing the cosine similarity matrix...
Done computing similarity matrix.

Number of lookalike users found: 57
Sample of lookalike users: [770, 386, 131, 898, 647, 777, 14, 271, 274, 277]


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

# Generate a larger sample dataset
def generate_sample_data(n_samples=10000, n_features=10, seed_ratio=0.1):
    np.random.seed(42)
    X = np.random.randn(n_samples, n_features)
    y = np.random.choice([0, 1], size=n_samples, p=[1-seed_ratio, seed_ratio])
    return X, y

# Lookalike modeling using Logistic Regression
def logistic_regression_lookalike(X, y, candidate_users, threshold=0.5):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    model = LogisticRegression(random_state=42)
    model.fit(X_train_scaled, y_train)
    
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)
    accuracy = accuracy_score(y_test, y_pred_proba >= threshold)
    
    candidate_users_scaled = scaler.transform(candidate_users)
    candidate_proba = model.predict_proba(candidate_users_scaled)[:, 1]
    lookalike_indices = np.where(candidate_proba >= threshold)[0]
    
    return lookalike_indices, auc, accuracy

# Lookalike modeling using Random Forest
def random_forest_lookalike(X, y, candidate_users, threshold=0.5):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)
    accuracy = accuracy_score(y_test, y_pred_proba >= threshold)
    
    candidate_proba = model.predict_proba(candidate_users)[:, 1]
    lookalike_indices = np.where(candidate_proba >= threshold)[0]
    
    return lookalike_indices, auc, accuracy

# Lookalike modeling using XGBoost
def xgboost_lookalike(X, y, candidate_users, threshold=0.5):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = XGBClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)
    accuracy = accuracy_score(y_test, y_pred_proba >= threshold)
    
    candidate_proba = model.predict_proba(candidate_users)[:, 1]
    lookalike_indices = np.where(candidate_proba >= threshold)[0]
    
    return lookalike_indices, auc, accuracy

# Generate sample data
X, y = generate_sample_data(n_samples=10000, n_features=10, seed_ratio=0.1)
candidate_users = np.random.randn(1000, 10)  # 1000 candidate users

# Run all three models
models = [
    ("Logistic Regression", logistic_regression_lookalike),
    ("Random Forest", random_forest_lookalike),
    ("XGBoost", xgboost_lookalike)
]

for model_name, model_func in models:
    lookalike_indices, auc, accuracy = model_func(X, y, candidate_users)
    
    print(f"\n{model_name} Results:")
    print(f"AUC: {auc:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Number of lookalike users found: {len(lookalike_indices)}")
    print(f"Sample lookalike user indices: {lookalike_indices[:10]}")

# Compare feature importance (for Random Forest and XGBoost)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X, y)

xgb_model = XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X, y)

feature_importance_rf = pd.Series(rf_model.feature_importances_, index=[f"Feature_{i}" for i in range(X.shape[1])])
feature_importance_xgb = pd.Series(xgb_model.feature_importances_, index=[f"Feature_{i}" for i in range(X.shape[1])])

print("\nRandom Forest Feature Importance:")
print(feature_importance_rf.sort_values(ascending=False))

print("\nXGBoost Feature Importance:")
print(feature_importance_xgb.sort_values(ascending=False))


Logistic Regression Results:
AUC: 0.5189
Accuracy: 0.8950
Number of lookalike users found: 0
Sample lookalike user indices: []

Random Forest Results:
AUC: 0.5148
Accuracy: 0.8950
Number of lookalike users found: 0
Sample lookalike user indices: []

XGBoost Results:
AUC: 0.4660
Accuracy: 0.8935
Number of lookalike users found: 3
Sample lookalike user indices: [ 39 306 500]

Random Forest Feature Importance:
Feature_5    0.103437
Feature_7    0.101610
Feature_2    0.101282
Feature_9    0.100863
Feature_6    0.100784
Feature_3    0.099907
Feature_8    0.099767
Feature_0    0.098878
Feature_4    0.097367
Feature_1    0.096107
dtype: float64

XGBoost Feature Importance:
Feature_5    0.106682
Feature_4    0.102541
Feature_3    0.101570
Feature_8    0.100941
Feature_9    0.100126
Feature_2    0.099114
Feature_1    0.098084
Feature_6    0.097890
Feature_7    0.097101
Feature_0    0.095953
dtype: float32


In [5]:
from sklearn.cluster import KMeans
import numpy as np
from sklearn.preprocessing import StandardScaler

def kmeans_lookalike(seed_users, candidate_users, n_clusters=5, top_n=100):
    all_users = np.vstack([seed_users, candidate_users])
    scaler = StandardScaler()
    all_users_scaled = scaler.fit_transform(all_users)
    
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(all_users_scaled)
    
    seed_clusters = cluster_labels[:len(seed_users)]
    candidate_clusters = cluster_labels[len(seed_users):]
    
    lookalike_indices = []
    for cluster in np.unique(seed_clusters):
        cluster_candidates = np.where(candidate_clusters == cluster)[0]
        lookalike_indices.extend(cluster_candidates[:top_n])
    
    return lookalike_indices

# Example usage
seed_users = np.array([[1, 0, 1], [1, 1, 1]])
candidate_users = np.array([[1, 1, 0], [0, 1, 1], [1, 0, 0], [0, 0, 1]])
lookalike_indices = kmeans_lookalike(seed_users, candidate_users, n_clusters=2, top_n=1)
print("Lookalike user indices:", lookalike_indices)

Lookalike user indices: [0, 1]


In [6]:
import networkx as nx
from node2vec import Node2Vec
from sklearn.metrics.pairwise import cosine_similarity

def graph_based_lookalike(graph, seed_nodes, dimensions=64, walk_length=30, num_walks=200, top_n=100):
    node2vec = Node2Vec(graph, dimensions=dimensions, walk_length=walk_length, num_walks=num_walks, workers=4)
    model = node2vec.fit(window=10, min_count=1)
    
    all_embeddings = np.array([model.wv[node] for node in graph.nodes()])
    seed_embeddings = np.array([model.wv[node] for node in seed_nodes])
    
    similarities = cosine_similarity(seed_embeddings, all_embeddings)
    avg_similarities = np.mean(similarities, axis=0)
    
    top_indices = np.argsort(avg_similarities)[-top_n:]
    return [list(graph.nodes())[i] for i in top_indices[::-1] if list(graph.nodes())[i] not in seed_nodes]

# Example usage
G = nx.fast_gnp_random_graph(100, 0.5)
seed_nodes = [0, 1, 2]
lookalike_nodes = graph_based_lookalike(G, seed_nodes, top_n=10)
print("Lookalike nodes:", lookalike_nodes)

  from .autonotebook import tqdm as notebook_tqdm
Computing transition probabilities: 100%|████| 100/100 [00:00<00:00, 389.67it/s]
Generating walks (CPU: 2): 100%|██████████| 50/50 [00:00<00:00, 106.10it/s]
Generating walks (CPU: 1): 100%|██████████| 50/50 [00:00<00:00, 106.04it/s]
Generating walks (CPU: 3): 100%|██████████| 50/50 [00:00<00:00, 106.79it/s]
Generating walks (CPU: 4): 100%|██████████| 50/50 [00:00<00:00, 106.96it/s]


Lookalike nodes: [18, 88, 21, 4, 36, 46, 45]


In [7]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def deep_learning_lookalike(X, y, candidate_users, threshold=0.7):
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X, y, epochs=50, batch_size=32, validation_split=0.2, verbose=0)
    
    candidate_proba = model.predict(candidate_users)
    lookalike_indices = np.where(candidate_proba >= threshold)[0]
    return lookalike_indices

# Example usage
X = np.array([[1, 0, 1], [1, 1, 1], [0, 1, 0], [0, 0, 1]])
y = np.array([1, 1, 0, 0])  # 1 for seed users, 0 for non-seed users
candidate_users = np.array([[1, 1, 0], [0, 1, 1], [1, 0, 0]])
lookalike_indices = deep_learning_lookalike(X, y, candidate_users)
print("Lookalike user indices:", lookalike_indices)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Lookalike user indices: [2]
