In [1]:
pip install --upgrade nudenet

Defaulting to user installation because normal site-packages is not writeable
Collecting nudenet
  Obtaining dependency information for nudenet from https://files.pythonhosted.org/packages/fa/7f/a72c8a36a338f7e30fd564d982ca8154859a69602ca1c0d5c500c4de2e16/nudenet-3.0.8-py3-none-any.whl.metadata
  Downloading nudenet-3.0.8-py3-none-any.whl.metadata (2.4 kB)
Collecting opencv-python-headless (from nudenet)
  Obtaining dependency information for opencv-python-headless from https://files.pythonhosted.org/packages/e3/10/31b27a7473043eb5317f698ede00e7e129b2de378903bfe0bb4d785a7baf/opencv_python_headless-4.8.1.78-cp37-abi3-win_amd64.whl.metadata
  Downloading opencv_python_headless-4.8.1.78-cp37-abi3-win_amd64.whl.metadata (20 kB)
Downloading nudenet-3.0.8-py3-none-any.whl (10.6 MB)
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
    --------------------------------------- 0.1/10.6 MB 4.3 MB/s eta 0:00:03
   - -------------------------------------- 0.5/10.6 MB 5.8 MB/s e



In [3]:
from nudenet import NudeDetector

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import torch


In [None]:
# Read the interaction log and the user profiles, then join them on the shared
# `user_id` column (inner join, pandas' default) so each row pairs a profile
# with one of that user's interactions.
user_interactions = pd.read_json('refined_interactions.json')
user_profiles = pd.read_json('refined_user_profiles.json')
data = user_profiles.merge(user_interactions, on="user_id")
data.head()

In [None]:
# Prefer the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained tokenizer and encoder, then move the encoder to the chosen device.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)

In [None]:
def create_embeddings(texts, batch_size=32):
    """Encode texts with the module-level BERT model and return one vector per text.

    The texts are tokenized and run through ``model`` in mini-batches so that
    peak memory depends on ``batch_size`` rather than on the total number of
    texts. For each text the hidden state at sequence position 0 (the ``[CLS]``
    token for BERT) of the last layer is used as its embedding.

    Args:
        texts: A single string or an iterable of strings.
        batch_size: Maximum number of texts per forward pass (must be >= 1).

    Returns:
        A NumPy array of shape ``(len(texts), hidden_size)``. An empty input
        yields an array with zero rows instead of failing inside the tokenizer.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A bare string is one text, not an iterable of characters.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if not texts:
        return torch.empty((0, model.config.hidden_size)).numpy()

    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            tokens = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=512).to(device)
            outputs = model(**tokens)
            # Move each batch off the accelerator right away so results do not pile up there.
            chunks.append(outputs.last_hidden_state[:, 0, :].cpu())
    return torch.cat(chunks, dim=0).numpy()

In [None]:
# Collapse the descriptive columns of each row into a single space-separated
# string: one for the user side and one for the content side.
data['user_combined_text'] = (
    data.loc[:, ['region', 'languages', 'preferences']]
    .astype(str)
    .agg(lambda parts: ' '.join(parts), axis=1)
)
data['content_combined_text'] = (
    data.loc[:, ['content_id', 'content_creator', 'content_language', 'content_type', 'interaction_types']]
    .astype(str)
    .agg(lambda parts: ' '.join(parts), axis=1)
)


In [None]:
# Preview the two derived text columns for the first few rows.
data.head()[['user_combined_text', 'content_combined_text']]

In [None]:
# Distinct, non-empty texts in order of first appearance; these lists define the
# row order of the embeddings computed from them.
unique_user_texts = list(map(str, filter(None, data['user_combined_text'].unique())))
unique_content_texts = list(map(str, filter(None, data['content_combined_text'].unique())))

In [None]:
# Embed the user texts first, then the content texts (one row per unique text).
user_embeddings, content_embeddings = map(
    create_embeddings, (unique_user_texts, unique_content_texts)
)

In [None]:
# Pairwise cosine similarity: rows follow user_embeddings, columns follow content_embeddings.
similarity_scores = cosine_similarity(X=user_embeddings, Y=content_embeddings)

In [None]:
# Display the user-by-content cosine similarity matrix computed above.
similarity_scores

In [None]:
# Map each distinct user_id to its position in order of first appearance in `data`.
user_id_to_index = {
    uid: position for position, uid in enumerate(pd.unique(data['user_id']))
}

In [None]:
# Map each distinct content_id to its position in order of first appearance in `data`.
content_id_to_index = {
    cid: position for position, cid in enumerate(pd.unique(data['content_id']))
}


In [None]:
def get_recommendations(user_id, similarity_scores, top_n=5):
    """Return the ``top_n`` best-scoring content entries for one user.

    ``similarity_scores`` is the matrix computed from the embeddings of
    ``unique_user_texts`` (rows) and ``unique_content_texts`` (columns). Both
    axes are therefore resolved through those lists: the user's row is the
    position of the user's combined text, and each selected column is mapped
    back to the first row of ``data`` that carries the same content text.
    Indexing by unique-user-id position or by raw ``data`` row position (as a
    naive lookup would) only lines up when every text happens to be distinct.

    Args:
        user_id: Identifier to look up in ``data['user_id']``.
        similarity_scores: 2-D array of user-text x content-text similarities.
        top_n: Maximum number of recommendations; ``0`` yields an empty list.

    Returns:
        A list of ``(content_id, content_creator, content_type, score)`` tuples
        ordered by descending score, or the string ``"User ID <id> not found."``
        when the user is unknown.

    Raises:
        ValueError: If the user's or a content's combined text is missing from
            the lists/frame the scores were built from (stale notebook state).
    """
    if user_id not in user_id_to_index:
        return f"User ID {user_id} not found."

    # Row lookup goes through the text, because rows follow unique_user_texts.
    user_text = str(data.loc[data['user_id'] == user_id, 'user_combined_text'].iloc[0])
    user_idx = unique_user_texts.index(user_text)
    scores = similarity_scores[user_idx]

    # Reverse first, then truncate: `[-top_n:]` would return everything for top_n == 0.
    top_indices = scores.argsort()[::-1][:top_n]

    content_texts = data['content_combined_text'].astype(str)
    recommendations = []
    for idx in top_indices:
        # Column idx refers to unique_content_texts[idx]; find a data row holding that text.
        matching_rows = (content_texts == unique_content_texts[idx]).to_numpy().nonzero()[0]
        if matching_rows.size == 0:
            raise ValueError(f"Content text at column {idx} is not present in data.")
        row = matching_rows[0]
        recommendations.append((
            data['content_id'].iloc[row],
            data['content_creator'].iloc[row],
            data['content_type'].iloc[row],
            scores[idx],
        ))
    return recommendations

In [None]:
# Spot-check the recommender on the first five distinct users only.
for user_id in data['user_id'].unique()[:5]:
    top_recommendations = get_recommendations(user_id, similarity_scores)
    print(f"User {user_id}:\n {top_recommendations}\n\n")