In [2]:
# Mount Google Drive inside the Colab VM at /content/drive.
# NOTE(review): the JSON loaded further down is read from /content, not from
# the mounted drive — confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
!pip install sentence-transformers faiss-cpu pandas numpy



In [4]:
import pandas as pd
import numpy as np
import json
from sentence_transformers import SentenceTransformer
import faiss


In [5]:
# If your JSON is in Google Drive, mount Drive first:
# from google.colab import drive
# drive.mount('/content/drive')

# Decode explicitly as UTF-8: the bios contain emoji and typographic quotes,
# and the platform default text encoding is not UTF-8 everywhere.
with open('/content/influencer.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# If data is nested under a specific key like 'influencers', use: data = data['influencers']

# Turn the parsed JSON into a DataFrame and preview the first rows as a
# quick sanity check of the columns that later cells rely on.
df = pd.DataFrame(data)
print(df.head())


          name          username  \
0  Simran Kaur    ravi_fitness_1   
1   Isha Patel      ananya_diy_2   
2    Zoya Khan     arjun_music_3   
3   Isha Patel   simran_styles_4   
4  Aarav Mehta  zoya_lifestyle_5   

                                                 bio  \
0  Exploring hidden travel gems 🧳 | Adventure addict   
1  Tech reviews + productivity hacks | Let’s make...   
2  Traveling the world one city at a time 🌍 | Cof...   
3  Traveling the world one city at a time 🌍 | Cof...   
4  Traveling the world one city at a time 🌍 | Cof...   

                                    location                  categories  \
0       {'country': 'India', 'city': 'Pune'}                   [Fitness]   
1    {'country': 'India', 'city': 'Lucknow'}                 [Lifestyle]   
2  {'country': 'India', 'city': 'Bengaluru'}  [Gadgets, Travel, Fitness]   
3  {'country': 'India', 'city': 'Hyderabad'}                     [Music]   
4     {'country': 'India', 'city': 'Jaipur'}      [Fitness, DIY, T

In [6]:
# Text to embed for each profile: bio followed by the stringified categories.
# Adjust the column names if your JSON uses different field names.
text_fields = (
    df['bio'].astype(str)
    .add(" ")
    .add(df['categories'].astype(str))
)


In [7]:
def _profile_text(bio, categories):
    """Return "<bio> <cat1> <cat2> ..." for one profile.

    List/tuple categories are space-joined; anything else (e.g. a missing
    value) is stringified so a single odd row cannot abort the whole build.
    """
    if isinstance(categories, (list, tuple)):
        tags = ' '.join(map(str, categories))
    else:
        tags = str(categories)
    return f"{bio} {tags}"


# When categories are real lists, rebuild the text with the category names
# space-joined instead of the "[...]" list repr produced by astype(str).
# .iloc[0] is positional, so this works for any index labels, and the length
# guard keeps an empty frame from raising.
if len(df) > 0 and isinstance(df['categories'].iloc[0], list):
    text_fields = pd.Series(
        [_profile_text(bio, cats) for bio, cats in zip(df['bio'], df['categories'])],
        index=df.index,
    )


In [9]:
# Load the pretrained sentence encoder, then embed every profile text in a
# single batched call (with a progress bar while encoding).
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(
    text_fields.tolist(),
    show_progress_bar=True,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [10]:
# Width of one embedding row; the flat L2 index is created with this size.
embedding_dim = embeddings.shape[1]

# Exact (brute-force) L2 index over all profile embeddings, stored as float32.
index = faiss.IndexFlatL2(embedding_dim)
index.add(np.asarray(embeddings, dtype='float32'))

# Positional lookup table for search hits: row i corresponds to vector i.
# reset_index() keeps the previous index as an extra 'index' column.
id_df = df.reset_index()


In [11]:
def search_influencers(query, top_k=10):
    """Return the influencer rows whose embeddings are closest to ``query``.

    Args:
        query: Free-text description of the influencer being looked for.
        top_k: Maximum number of rows to return.

    Returns:
        A slice of ``id_df`` ordered from nearest to farthest match. It has
        at most ``top_k`` rows, and fewer when the index holds fewer vectors.
    """
    # FAISS pads the result with -1 when asked for more neighbours than it
    # stores; iloc[-1] would silently return the LAST row as a bogus hit.
    top_k = min(int(top_k), index.ntotal)
    if top_k <= 0:
        return id_df.iloc[[]]

    query_embed = model.encode([query])
    _, neighbours = index.search(np.asarray(query_embed, dtype='float32'), top_k)
    hits = neighbours[0]
    return id_df.iloc[hits[hits >= 0]]

# Example use:
results = search_influencers("fitness influencer eco-friendly vibe", top_k=5)
print(results[['bio', 'categories']])


                                                   bio  \
406  Fashion & lifestyle creator | Collaborated wit...   
115  Fashion & lifestyle creator | Collaborated wit...   
153  Modern lifestyle & everyday hacks 🌟 | Life mad...   
88   Fashion & lifestyle creator | Collaborated wit...   
199  Fashion & lifestyle creator | Collaborated wit...   

                        categories  
406                 [Fitness, DIY]  
115  [Lifestyle, Fitness, Gadgets]  
153         [DIY, Fitness, Beauty]  
88           [Music, Fitness, DIY]  
199             [Fitness, Fashion]  


In [12]:
liked_indices = [2, 10, 17]  # Example list

def get_preference_vector(indices):
    """Average the embeddings of liked influencers into one vector.

    Args:
        indices: Row positions into the module-level ``embeddings`` array,
            given as a list, tuple or NumPy array. ``None`` or an empty
            sequence means there is no preference history.

    Returns:
        The element-wise mean of the selected embedding rows, or ``None``
        when there is nothing to average.
    """
    if indices is None:
        return None
    # Normalise to an array first: `not indices` raises for multi-element
    # NumPy arrays, and a tuple would be read as a multi-axis index.
    rows = np.asarray(indices)
    if rows.size == 0:
        return None
    return np.mean(embeddings[rows], axis=0)

preference_vector = get_preference_vector(liked_indices)

def personalized_search(query, top_k=10, alpha=0.6):
    """Search with the query blended towards the stored preference vector.

    Args:
        query: Free-text description of the influencer being looked for.
        top_k: Maximum number of rows to return.
        alpha: Weight of the query embedding in the blend; the module-level
            ``preference_vector`` gets ``1 - alpha``. Ignored when
            ``preference_vector`` is ``None``.

    Returns:
        A slice of ``id_df`` ordered from nearest to farthest match. It has
        at most ``top_k`` rows, and fewer when the index holds fewer vectors.
    """
    # FAISS pads the result with -1 when asked for more neighbours than it
    # stores; iloc[-1] would silently return the LAST row as a bogus hit.
    top_k = min(int(top_k), index.ntotal)
    if top_k <= 0:
        return id_df.iloc[[]]

    query_vec = model.encode([query])[0]
    if preference_vector is not None:
        combined_vec = alpha * query_vec + (1 - alpha) * preference_vector
    else:
        combined_vec = query_vec

    _, neighbours = index.search(np.asarray([combined_vec], dtype='float32'), top_k)
    hits = neighbours[0]
    return id_df.iloc[hits[hits >= 0]]

# Example:
p_results = personalized_search("eco-friendly", top_k=5)
print(p_results[['bio', 'categories']])


                                                   bio  \
484  Traveling the world one city at a time 🌍 | Cof...   
496  Traveling the world one city at a time 🌍 | Cof...   
2    Traveling the world one city at a time 🌍 | Cof...   
202  Traveling the world one city at a time 🌍 | Cof...   
190  Traveling the world one city at a time 🌍 | Cof...   

                        categories  
484                 [DIY, Gadgets]  
496      [Fitness, Gadgets, Books]  
2       [Gadgets, Travel, Fitness]  
202  [Lifestyle, Gadgets, Fashion]  
190   [Gadgets, Travel, Lifestyle]  


In [15]:
import random

# 1. Pick one influencer at random
rand_idx = random.randint(0, len(df) - 1)
sample_profile = df.iloc[rand_idx]

print("RANDOMLY CHOSEN INFLUENCER:")
for label, field in (
    ("Name:", 'name'),
    ("Bio:", 'bio'),
    ("Categories:", 'categories'),
    ("Website:", 'website'),
    ("Contact Email:", 'contactEmail'),
):
    print(label, sample_profile[field])
print('-' * 40)

# 2. Build the test query from the profile: bio plus its categories
#    (space-joined when they are a list, stringified otherwise)
query = sample_profile['bio'] + " " + (
    ' '.join(sample_profile['categories'])
    if isinstance(sample_profile['categories'], list)
    else str(sample_profile['categories'])
)

print("SEARCH QUERY:")
print(query)
print('-' * 40)

# 3. Columns to show, then the five nearest profiles for that query
display_columns = ['name', 'bio', 'categories', 'website', 'contactEmail']
results = search_influencers(query, top_k=5)

# 4. Print the selected columns of the matches
print("SEARCH RESULTS:")
print(results[display_columns])


RANDOMLY CHOSEN INFLUENCER:
Name: Isha Patel
Bio: Beauty tips & skincare routine 💄 | Makeup tutorials daily
Categories: ['DIY']
Website: https://www.zoya_lifestyle_342.com
Contact Email: zoya_lifestyle_342@influencers.com
----------------------------------------
SEARCH QUERY:
Beauty tips & skincare routine 💄 | Makeup tutorials daily DIY
----------------------------------------
SEARCH RESULTS:
            name                                                bio  \
59   Simran Kaur  Beauty tips & skincare routine 💄 | Makeup tuto...   
341   Isha Patel  Beauty tips & skincare routine 💄 | Makeup tuto...   
200   Ravi Verma  Beauty tips & skincare routine 💄 | Makeup tuto...   
368   Neha Gupta  Beauty tips & skincare routine 💄 | Makeup tuto...   
146   Ravi Verma  Beauty tips & skincare routine 💄 | Makeup tuto...   

              categories                             website  \
59                 [DIY]     https://www.karan_travel_60.com   
341                [DIY]  https://www.zoya_lifest

In [16]:
# Ask for a brand requirement and list the best-matching influencers.

# Option 1: interactive prompt (blocks until the user types a line)
brand_requirement = input(
    "Enter your brand requirement (e.g., 'eco-friendly wellness influencer Europe'):\n"
)

# Option 2: fixed value for scripts or testing
# brand_requirement = "eco-friendly wellness influencer Europe"

# Columns shown in the printed table
display_columns = ['name', 'bio', 'categories', 'website', 'contactEmail']

# Nearest profiles for the requirement (raise top_k for more results)
results = search_influencers(brand_requirement, top_k=5)

print("\nTop Matching Influencers for Your Requirement:")
print(results.loc[:, display_columns])


Enter your brand requirement (e.g., 'eco-friendly wellness influencer Europe'):
fitness

Top Matching Influencers for Your Requirement:
             name                                                bio  \
162  Rehan Sheikh  Fitness goals & healthy living 🏋️ | Join my jo...   
196     Zoya Khan  Fitness goals & healthy living 🏋️ | Join my jo...   
436    Neha Gupta  Fitness goals & healthy living 🏋️ | Join my jo...   
18     Neha Gupta  Fitness goals & healthy living 🏋️ | Join my jo...   
392    Ananya Roy  Fitness goals & healthy living 🏋️ | Join my jo...   

                   categories                             website  \
162  [Travel, Books, Fitness]    https://www.karan_travel_163.com   
196            [Fitness, DIY]      https://www.rehan_tech_197.com   
436  [Books, Fitness, Beauty]    https://www.karan_travel_437.com   
18                     [Tech]    https://www.simran_styles_19.com   
392                    [Tech]  https://www.zoya_lifestyle_393.com   

                

In [17]:
# Persist the three artefacts needed to serve searches later.

# 1. Sentence-transformers model, written as a directory
model.save("influencer_embedding_model")

# 2. FAISS index holding the profile embeddings
faiss.write_index(index, "faiss_index.index")

# 3. Influencer metadata, one CSV row per indexed vector.
#    NOTE: CSV stores list/dict cells (e.g. categories, location) as their
#    string representation, so they come back as plain strings on reload.
id_df.to_csv(path_or_buf="influencer_metadata.csv", index=False)


In [18]:
pip install fastapi uvicorn sentence-transformers faiss-cpu pandas numpy




In [19]:
from sentence_transformers import SentenceTransformer
import faiss
import pandas as pd
import numpy as np

import ast


def _parse_literal(cell):
    """Turn a CSV cell holding a Python literal back into the object.

    Non-string cells and strings that are not valid literals are returned
    unchanged.
    """
    if not isinstance(cell, str):
        return cell
    try:
        return ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        return cell


# Load your model and data
model = SentenceTransformer("influencer_embedding_model")
index = faiss.read_index("faiss_index.index")
id_df = pd.read_csv("influencer_metadata.csv")

# The CSV stored list/dict columns as their string form ("['Tech', 'Music']");
# restore them so results look the same as before the save/load round trip.
for column in ("categories", "location"):
    if column in id_df.columns:
        id_df[column] = id_df[column].map(_parse_literal)

# Search hits are mapped to metadata rows by position, so both artefacts
# must describe the same number of influencers.
if index.ntotal != len(id_df):
    raise ValueError(
        f"FAISS index holds {index.ntotal} vectors but metadata has {len(id_df)} rows"
    )


In [20]:
def search_influencers(brand_requirement, top_k=5):
    """Return display-ready rows for the influencers closest to a requirement.

    Args:
        brand_requirement: Free-text description of what the brand wants.
        top_k: Maximum number of rows to return.

    Returns:
        The name, bio, categories, website and contactEmail columns of
        ``id_df`` for the nearest matches, ordered from nearest to farthest.
        It has at most ``top_k`` rows, and fewer when the index holds fewer
        vectors.
    """
    display_columns = ['name', 'bio', 'categories', 'website', 'contactEmail']

    # FAISS pads the result with -1 when asked for more neighbours than it
    # stores; iloc[-1] would silently return the LAST row as a bogus hit.
    top_k = min(int(top_k), index.ntotal)
    if top_k <= 0:
        return id_df.iloc[[]][display_columns]

    query_embed = model.encode([brand_requirement])
    _, neighbours = index.search(np.asarray(query_embed, dtype='float32'), top_k)
    hits = neighbours[0]
    return id_df.iloc[hits[hits >= 0]][display_columns]


In [21]:
# Example: read a free-text brand requirement from the user
brand_requirement = input(
    "Enter your brand requirement (e.g., 'eco-friendly wellness influencer Europe'):\n"
)

# Look up the five closest influencer profiles
results = search_influencers(brand_requirement, top_k=5)

print("\nTop Matching Influencers:")
print(results)


Enter your brand requirement (e.g., 'eco-friendly wellness influencer Europe'):
gym

Top Matching Influencers:
           name                                                bio  \
440  Isha Patel  Fitness goals & healthy living 🏋️ | Join my jo...   
278  Ananya Roy  Fitness goals & healthy living 🏋️ | Join my jo...   
111   Zoya Khan  Fitness goals & healthy living 🏋️ | Join my jo...   
257  Isha Patel  Fitness goals & healthy living 🏋️ | Join my jo...   
18   Neha Gupta  Fitness goals & healthy living 🏋️ | Join my jo...   

               categories                           website  \
440     ['Tech', 'Music']    https://www.rehan_tech_441.com   
278  ['Music', 'Fitness']   https://www.arjun_music_279.com   
111      ['DIY', 'Music']   https://www.arjun_music_112.com   
257             ['Music']    https://www.rehan_tech_258.com   
18               ['Tech']  https://www.simran_styles_19.com   

                         contactEmail  
440    rehan_tech_441@influencers.com  
278   arj