In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
# NOTE(review): absolute path to the dataset CSV; adjust it when the file
# lives somewhere else in the current environment.
dataset_csv_path = '/content/Richer_Synthetic_People_Dataset__India__1000_Records_.csv'
df = pd.read_csv(dataset_csv_path)

In [3]:


def parse_city_timeline_str(timeline_str):
    """
    Parse a city-timeline string into (city, start_year, end_year) tuples.

    Every entry is expected to look like ``City Name (YYYY-YYYY)``. Text that
    does not match that shape is ignored.

    Parameters
    ----------
    timeline_str : str or missing value
        Raw timeline text, e.g. ``"Kadapa (2002-2019), Hyderabad (2019-2024)"``.
        Anything that is not a string (``None``, a float ``NaN`` from a blank
        cell, ...) is treated as "no timeline".

    Returns
    -------
    list of tuple (str, int, int)
        One ``(city, start, end)`` tuple per matched entry, in order of
        appearance. Empty when nothing matches or the input is not a string.
    """
    # re.findall raises TypeError on non-string input, so a single missing
    # cell would otherwise abort the whole column-wide .apply().
    if not isinstance(timeline_str, str):
        return []

    # The city part must start with a word character and consist only of
    # word characters / whitespace (at least two characters in total); both
    # years must be exactly four digits.
    pattern = r'(\w[\w\s]+) \((\d{4})-(\d{4})\)'
    matches = re.findall(pattern, timeline_str)
    return [(city.strip(), int(start), int(end)) for city, start, end in matches]

# Turn every raw timeline string into a list of (city, start, end) tuples.
city_timeline_raw = df["City_Timeline"]
df["Parsed_City_Timeline"] = city_timeline_raw.apply(parse_city_timeline_str)


In [4]:
# Columns that hold several comma-separated labels in a single cell.
multi_label_fields = [
    "Traits", "Character", "Hobbies", "Frequent_Places",
    "Social_Platforms", "Languages", "Digital_Habits"
]


def _split_labels(raw_value):
    """
    Split one comma-separated cell into a list of stripped, non-empty labels.

    Missing values (as judged by ``pd.notnull``) become an empty list.
    """
    if not pd.notnull(raw_value):
        return []
    stripped_parts = (part.strip() for part in raw_value.split(","))
    # str.split(",") yields empty strings for trailing or doubled commas;
    # keeping them would add a bogus "" label to the sets compared later.
    return [label for label in stripped_parts if label]


# Add a "<field>_List" column next to every multi-label column.
for field in multi_label_fields:
    df[field + "_List"] = df[field].apply(_split_labels)


In [5]:
# Missing years are replaced by 0 so the column can be cast to integers.
last_relationship_year = df["Last_Relationship_Year"].fillna(0)
df["Last_Relationship_Year"] = last_relationship_year.astype(int)


In [6]:
# Age is measured against the fixed year 2025, not the current date.
birth_years = df["Birth_Year"]
df["Age"] = 2025 - birth_years


**"DID THEY MEET?" ENGINE**

Step 1: City-Time Overlap Check

In [7]:
def city_time_overlap(person1, person2):
    """
    Tell whether two people have a stay in the same city with overlapping years.

    Both arguments must support ``["Parsed_City_Timeline"]`` and yield
    ``(city, start_year, end_year)`` tuples. City names are compared exactly.
    The overlap test is strict: stays that only touch at a boundary year
    (one ends in the year the other starts) do not count.

    Returns True on the first overlapping pair of stays, otherwise False.
    """
    return any(
        place_a == place_b and max(from_a, from_b) < min(until_a, until_b)
        for place_a, from_a, until_a in person1["Parsed_City_Timeline"]
        for place_b, from_b, until_b in person2["Parsed_City_Timeline"]
    )


Step 2: is_age_compatible(person1, person2, tolerance=5)

In [8]:
def is_age_compatible(person1, person2, tolerance=5):
    """
    Check whether two people are close enough in age.

    Reads ``["Age"]`` from both arguments and returns True when the absolute
    difference is at most ``tolerance`` (inclusive), otherwise False.
    """
    age_gap = abs(person1["Age"] - person2["Age"])
    return age_gap <= tolerance


Step 3: combined_trait_similarity(person1, person2)

   Step 3.1: First, we build a helper function: jaccard_similarity()

In [10]:
def jaccard_similarity(list1, list2):
    """
    Jaccard similarity of two collections, treated as sets.

    The result is |intersection| / |union|, a value between 0 and 1.
    Duplicates are ignored and elements are compared exactly. Two empty
    collections are defined to be a perfect match (1.0).
    """
    first, second = set(list1), set(list2)
    union = first | second
    if not union:
        # The union is empty only when both inputs are empty.
        return 1.0
    shared = first & second
    return len(shared) / len(union)


Step 3.2: Now build the full trait matcher:

In [11]:
def combined_trait_similarity(person1, person2):
    """
    Mean Jaccard similarity of two people over five list-valued fields.

    The fields compared are traits, hobbies, social platforms, frequent
    places and character; each contributes equally to the average.
    """
    compared_fields = (
        "Traits_List",
        "Hobbies_List",
        "Social_Platforms_List",
        "Frequent_Places_List",
        "Character_List",
    )
    # Explicit left-to-right accumulation keeps the floating-point result
    # identical to adding the five similarities in this order.
    total = 0.0
    for field_name in compared_fields:
        total += jaccard_similarity(person1[field_name], person2[field_name])
    return total / len(compared_fields)


Step 4: meeting_score(person1, person2)

In [12]:
def meeting_score(person1, person2):
    """
    Score how plausible it is that two people met, from 0 to 1.

    The score is a weighted sum of three signals:
      * 0.5 for having lived in the same city during overlapping years,
      * 0.2 for being age compatible (default tolerance),
      * 0.3 times the combined trait similarity.
    The result is rounded to three decimals.
    """
    shared_city = city_time_overlap(person1, person2)
    close_in_age = is_age_compatible(person1, person2)
    similarity_score = combined_trait_similarity(person1, person2)

    # The two yes/no signals are turned into 1.0 / 0.0 before weighting.
    if shared_city:
        location_score = 1.0
    else:
        location_score = 0.0

    if close_in_age:
        age_score = 1.0
    else:
        age_score = 0.0

    weighted_total = (0.5 * location_score) + (0.2 * age_score) + (0.3 * similarity_score)
    return round(weighted_total, 3)


**FINDING RESULTS**

In [15]:

# Select the reference rows: name and gender are matched case-insensitively,
# and the birth year must be exactly 2005.
is_adira = df["Name"].str.lower() == "adira"
is_female = df["Gender"].str.lower() == "female"
born_in_2005 = df["Birth_Year"] == 2005
ref_person = df[is_adira & is_female & born_in_2005]

def find_top_matches(reference_person, dataset, top_n=1):
    """
    Rank the people in ``dataset`` by meeting score against ``reference_person``.

    Parameters
    ----------
    reference_person : mapping-like (e.g. a DataFrame row or a dict)
        The person to compare everyone else with. Must provide "Name" plus
        whatever ``meeting_score`` reads.
    dataset : pandas.DataFrame
        Candidate people, one per row.
    top_n : int, default 1
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns "Name", "Score", "Current_City", "Age", sorted by "Score"
        in descending order and limited to ``top_n`` rows. Empty (but with
        those columns) when there is no candidate.
    """
    result_columns = ["Name", "Score", "Current_City", "Age"]
    results = []
    for _, other in dataset.iterrows():
        # NOTE: the reference person is excluded by name, so any other row
        # that carries the same name is skipped as well.
        if other["Name"] != reference_person["Name"]:
            score = meeting_score(reference_person, other)
            results.append({
                "Name": other["Name"],
                "Score": score,
                "Current_City": other["Current_City"],
                "Age": other["Age"]
            })
    # Naming the columns explicitly keeps "Score" present even when there are
    # no candidates; without it sort_values raises KeyError on an empty frame.
    result_df = pd.DataFrame(results, columns=result_columns)
    return result_df.sort_values(by="Score", ascending=False).head(top_n)

# Report the single best match for the first matching reference row, or say
# that the reference person is missing from the dataset.
if ref_person.empty:
    print("❌ Adira (Female, Born 2005) not found in the dataset.")
else:
    reference_person = ref_person.iloc[0]
    best_match_df = find_top_matches(reference_person, df, top_n=1)
    best_match = best_match_df.iloc[0]
    summary_line = (
        f"💘 Best Match for Adira is {best_match['Name']} from {best_match['Current_City']} "
        f"(Age {best_match['Age']}) with a match score of {best_match['Score']}"
    )
    print(summary_line)

💘 Best Match for Adira is Umang from Durg (Age 25) with a match score of 0.882


In [16]:
# Step 1: Keep only the rows whose Gender is "female" (case-insensitive)
is_female_row = df["Gender"].str.lower() == "female"
female_df = df[is_female_row]

# Step 2: Hand-written profile with the same keys the scoring functions read
# NOTE(review): list values are compared by exact string equality, so their
# casing/spelling must match the dataset's labels - confirm against the data.
custom_birth_year = 2002
custom_user = {
    "Name": "Yeswanthreddy Jampala",
    "Gender": "male",
    "Birth_Year": custom_birth_year,
    "Age": 2025 - custom_birth_year,
    "Parsed_City_Timeline": [
        ("Kadapa", 2002, 2019),
        ("Hyderabad", 2019, 2024),
    ],
    "Traits_List": ["silent", "hardworking", "introvert"],
    "Hobbies_List": ["cricket", "music", "travel"],
    "Character_List": ["single", "shy", "quiet"],
    "Frequent_Places_List": ["theatre", "temple", "streets"],
    "Social_Platforms_List": ["X", "instagram"],
}

# Step 3: Matching function for top 1 female
def find_best_female_match(user_profile, dataset):
    """
    Return the single highest-scoring row of ``dataset`` for ``user_profile``.

    The function itself does not filter by gender; the caller is expected to
    pass an already filtered ``dataset``.

    Parameters
    ----------
    user_profile : mapping-like (e.g. a dict or a DataFrame row)
        The person to match; must provide whatever ``meeting_score`` reads.
    dataset : pandas.DataFrame
        Candidate people, one per row.

    Returns
    -------
    pandas.DataFrame
        At most one row with columns "Name", "Score", "Current_City", "Age".
        Empty (but with those columns) when ``dataset`` has no rows.
    """
    result_columns = ["Name", "Score", "Current_City", "Age"]
    results = []
    for _, other in dataset.iterrows():
        score = meeting_score(user_profile, other)
        results.append({
            "Name": other["Name"],
            "Score": score,
            "Current_City": other["Current_City"],
            "Age": other["Age"]
        })
    # Naming the columns explicitly keeps "Score" present for an empty
    # dataset; without it sort_values raises KeyError on the empty frame.
    result_df = pd.DataFrame(results, columns=result_columns)
    return result_df.sort_values(by="Score", ascending=False).head(1)

# Step 4: Score the custom profile against the female-only subset and show
# the best-scoring row.
top_female_match = find_best_female_match(user_profile=custom_user, dataset=female_df)

print("💘 Best Female Match for Yeswanthreddy Jampala:")
print(top_female_match)


💘 Best Female Match for Yeswanthreddy Jampala:
      Name  Score Current_City  Age
339  Neysa  0.299     Gulbarga   22
