In [1]:
import numpy as np
import pandas as pd
from matchmaker import Matchmaker

import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

import networkx as nx
import random
import datetime

In [2]:
raw_data = pd.read_csv('swipes.csv')
df = raw_data.copy()

## A little bit of clean-up

In [3]:
# Remove rows where decider gender is the same as other gender
df = df.loc[df["decidergender"].ne(df["othergender"])]

In [4]:
# Convert date columns to datetime
df["timestamp"] = pd.to_datetime(df["timestamp"])
df["decidersignuptimestamp"] = pd.to_datetime(df["decidersignuptimestamp"])
df["othersignuptimestamp"] = pd.to_datetime(df["othersignuptimestamp"])

In [5]:
# Add age feature
df["deciderage"] = df["timestamp"].dt.year - df["deciderdobyear"]
df["otherage"] = df["timestamp"].dt.year - df["otherdobyear"]

In [6]:
# Add tenure feature
df["decidertenuredays"] = df["decidersignuptimestamp"].dt.year - df["deciderdobyear"]
df["othertenuredays"] = df["othersignuptimestamp"].dt.year - df["otherdobyear"]

In [7]:
# Find accounts that make fewer than 10 swipes
lower_bound = 10
swipe_counts = df.groupby('decidermemberid').size()
swipe_outliers = swipe_counts[(swipe_counts < lower_bound)].index

# Find accounts that make fewer than 5 likes
lower_bound = 5
like_counts = df[df['like'] == 1].groupby('decidermemberid').size()
like_outliers = like_counts[(like_counts < lower_bound)].index

# Drop rows where the decider is in outliers
df = df[~df['decidermemberid'].isin(swipe_outliers)]
df = df[~df['decidermemberid'].isin(like_outliers)]
df = df.reset_index(drop=True)

In [8]:
df_collapsed = df.drop_duplicates(subset='decidermemberid', keep = 'first')
df_collapsed = df_collapsed[['decidermemberid', 'decidergender', 'deciderage', 'decidertenuredays']]

## Graph analysis

In [9]:
G = nx.DiGraph()

In [10]:
# Add edges with attributes
for _, row in df.iterrows():
    G.add_edge(
        row["decidermemberid"],
        row["othermemberid"],
        like=row["like"],
        timestamp=row["timestamp"] if "timestamp" in df.columns else None
    )

# Add node attributes
for _, row in df.iterrows():
    G.nodes[row["decidermemberid"]]["gender"] = row["decidergender"]
    G.nodes[row["othermemberid"]]["gender"] = row["othergender"]
    G.nodes[row["decidermemberid"]]["age"] = row["deciderage"]
    G.nodes[row["othermemberid"]]["age"] = row["otherage"]
    G.nodes[row["decidermemberid"]]["tenure"] = row["decidertenuredays"]
    G.nodes[row["othermemberid"]]["tenure"] = row["othertenuredays"]


In [11]:
print(G.number_of_nodes(), "users")
print(G.number_of_edges(), "swipes")

168824 users
8403126 swipes


In [None]:
# # for visualisation
# edges_sample = random.sample(list(G.edges()), 500)  # 500 edges for viz
# H = G.edge_subgraph(edges_sample).copy()
# nx.write_gexf(G, "swipes.gexf")

In [12]:
# Initialize dicts
likes_in, likes_out, dislikes_in, dislikes_out = {}, {}, {}, {}

for u in G.nodes():
    likes_in[u] = likes_out[u] = dislikes_in[u] = dislikes_out[u] = 0

In [13]:
# Tally likes/dislikes
for u, v, d in G.edges(data=True):
    if d["like"] == 1:
        likes_out[u] += 1
        likes_in[v] += 1
    else:
        dislikes_out[u] += 1
        dislikes_in[v] += 1

In [14]:
# Ratios
like_ratio_in = {u: likes_in[u] / (likes_in[u] + dislikes_in[u])
                 if (likes_in[u] + dislikes_in[u]) > 0 else None
                 for u in G.nodes()}

like_ratio_out = {u: likes_out[u] / (likes_out[u] + dislikes_out[u])
                  if (likes_out[u] + dislikes_out[u]) > 0 else None
                  for u in G.nodes()}

In [15]:
popularity = pd.DataFrame(
   {'user_id': like_ratio_in.keys(),
   'like_ratio_in': like_ratio_in.values()
   }
)

swiping_strategy = pd.DataFrame(
   {'user_id': like_ratio_out.keys(),
   'like_ratio_out': like_ratio_out.values()
   }
)

In [16]:
temp = popularity.merge(swiping_strategy, on='user_id', how='inner')
df_res = temp.merge(df_collapsed, left_on='user_id', right_on='decidermemberid', how='inner')

In [17]:
df_res.drop('decidermemberid', axis=1, inplace=True)
df_res.rename({'decidergender': 'gender', 'deciderage': 'age', 'decidertenuredays': 'tenuredays'}, axis=1, inplace=True)

#### need to roughly summarise how popular/generous people are
normalise or weight like_ratio_in according to how may times they were swiped on

simiarly, normalise or weight like_ratio_out according to how many times they swiped on others

In [18]:
df_res[df_res.gender=='F']['like_ratio_out'].mean()

np.float64(0.08254506942546831)

In [19]:
px.histogram(df_res[df_res.gender=='F'].like_ratio_out, nbins=20)

In [20]:
px.histogram(df_res[df_res.gender=='M'].like_ratio_out, nbins=20)

In [21]:
px.histogram(df_res[df_res.gender=='F'].like_ratio_in, nbins=20)

In [22]:
px.histogram(df_res[df_res.gender=='M'].like_ratio_in, nbins=20)

In [23]:
# Likes-only graph
G_likes = nx.DiGraph([(u,v) for u,v,d in G.edges(data=True) if d["like"]==1])

# Global reciprocity (fraction of mutual likes)
global_recip = nx.reciprocity(G_likes)
print("Global match rate:", global_recip)

# Reciprocity per user
user_recip = nx.reciprocity(G_likes, nodes=G_likes.nodes())

Global match rate: 0.014610836223806541


In [24]:
mutual_matches = 0
total_likes = G_likes.number_of_edges()

for u, v in G_likes.edges():
    if G_likes.has_edge(v, u):   # check if reciprocal
        mutual_matches += 1

# Each mutual match is counted twice (u→v and v→u)
mutual_matches //= 2

print("Total likes:", total_likes)
print("Mutual matches:", mutual_matches)
print("Match rate (mutuals / total likes):", mutual_matches / total_likes)

Total likes: 3187771
Mutual matches: 23288
Match rate (mutuals / total likes): 0.007305418111903271


In [25]:
pagerank_likes = nx.pagerank(G_likes)   # prestige among likes
G_dislikes = nx.DiGraph([(u,v) for u,v,d in G.edges(data=True) if d["like"]==0])
pagerank_dislikes = nx.pagerank(G_dislikes)


In [26]:
def assign_balanced_leagues(pr_scores, genders, 
                            quantiles=[0,0.3,0.5,0.7,0.9,1.0], 
                            labels=["Bronze","Silver","Gold","Platinum","Diamond"]):
    """
    Assign leagues so that gender proportions are preserved within each tier.
    pr_scores: dict {user: pagerank}
    genders: dict {user: 'M' or 'F'}
    """
    import pandas as pd

    # Build dataframe
    df = pd.DataFrame({
        "user": list(pr_scores.keys()),
        "pagerank": list(pr_scores.values()),
        "gender": [genders.get(u) for u in pr_scores.keys()]
    })

    # Drop rows with no pagerank or no gender
    df = df.dropna(subset=["pagerank","gender"])

    league_parts = []
    for g in df["gender"].unique():
        sub = df[df["gender"]==g].copy()
        if sub.shape[0] < len(labels):  
            # Not enough users to fill all bins → put everyone in "Bronze"
            sub["league"] = "Bronze"
        else:
            sub["league"] = pd.qcut(
                sub["pagerank"],
                q=quantiles,
                labels=labels,
                duplicates="drop"
            )
        league_parts.append(sub)

    if len(league_parts) == 0:
        raise ValueError("No valid genders found in data")

    return pd.concat(league_parts)

In [29]:
balanced_df = assign_balanced_leagues(
    pagerank_likes, 
    nx.get_node_attributes(G, "gender")
)

print(balanced_df.drop_duplicates(subset=["user"], keep='first')["league"].value_counts())
print(balanced_df.groupby("league")["gender"].value_counts(normalize=True))


league
Bronze      25999
Platinum    17314
Gold        17308
Silver      17276
Diamond      8651
Name: count, dtype: int64
league    gender
Bronze    M         0.614370
          F         0.385630
Silver    M         0.613163
          F         0.386837
Gold      M         0.613878
          F         0.386122
Platinum  M         0.613954
          F         0.386046
Diamond   M         0.613686
          F         0.386314
Name: proportion, dtype: float64






In [34]:
res = balanced_df.merge(df_res, left_on="user", right_on="user_id").drop('gender_y', axis=1).rename({'gender_x':'gender'}, axis=1)

In [36]:
# custom colors for leagues
league_colors = {
    "Bronze": "#B08965",     # brownish bronze
    "Silver": "#AEADAD",     # silver grey
    "Gold": "#D6C76E",       # gold yellow
    "Platinum": "#AEF8F4",   # platinum blue-ish
    "Diamond": "#1D8EFE"     # diamond blue
}

In [51]:
fig = px.histogram(
    res[res['gender']=='M'], 
    x="like_ratio_in",
    nbins=20,
    color="league",
    barmode="group",  # side-by-side
    category_orders={"league": ["Bronze", "Silver", "Gold", "Platinum", "Diamond"]},
    color_discrete_map=league_colors
)

fig.update_layout(
    title="Like-ratio-in of Male Users by League",
    xaxis_title="Age",
    yaxis_title="Count"
)

fig.show()

In [49]:
fig = px.histogram(
    res[res['gender']=='F'], 
    x="like_ratio_in",
    nbins=20,
    color="league",
    barmode="group",  # side-by-side
    category_orders={"league": ["Bronze", "Silver", "Gold", "Platinum", "Diamond"]},
    color_discrete_map=league_colors
)

fig.update_layout(
    title="Like-ratio-in of Female Users by League",
    xaxis_title="Age",
    yaxis_title="Count",
    width=1500
)

fig.show()

In [46]:
# Build a dataframe with pagerank, in_degree, gender
pr_series = pd.Series(pagerank_likes, name="pagerank")
in_series = pd.Series(dict(G.in_degree()), name="in_degree")

df_corr = pd.concat([pr_series, in_series], axis=1).reset_index().rename(columns={"index":"user"})

# Add gender info from graph node attributes
df_corr["gender"] = df_corr["user"].map(lambda u: G.nodes[u].get("gender", None))

# Correlation by gender
for g in ["M","F"]:
    sub = df_corr[df_corr["gender"]==g]
    print(f"\nGender = {g}")
    print(sub[["pagerank","in_degree"]].corr())



Gender = M
           pagerank  in_degree
pagerank   1.000000   0.445355
in_degree  0.445355   1.000000

Gender = F
           pagerank  in_degree
pagerank   1.000000   0.898563
in_degree  0.898563   1.000000


## Popularity analysis

In [None]:
# Remove rows where decider gender is the same as other gender (2nd assumption)
df = df.loc[df["decidergender"].ne(df["othergender"])]

# Remove outliers
like_counts = df[df['like'] == 1].groupby('decidermemberid').size()

log_like_counts = np.log(like_counts)
q1 = np.percentile(log_like_counts, 25)
q3 = np.percentile(log_like_counts, 75)
iqr = q3 - q1

upper_log = q3 + 1.5 * iqr

upper_bound = int(np.exp(upper_log) - 1)
lower_bound = 5

outliers = like_counts[(like_counts < lower_bound)|(like_counts > upper_bound)].index

# drop rows where the decider is in outliers
df = df[~df['decidermemberid'].isin(outliers)]
df = df.reset_index(drop=True)

In [None]:
userids = df.decidermemberid.unique()

In [None]:
def likes_given(x):
    temp = df[df.decidermemberid==x]
    return temp[temp.like==1]['othermemberid'].unique().tolist()

likes_given_dict = {} 
for x in userids:
    likes_given_dict[int(x)] = likes_given(x)

In [None]:
def likes_received(x):
    temp = df[df.othermemberid==x]
    return temp[temp.like==1]['decidermemberid'].unique().tolist()

likes_received_dict = {} 
for x in userids:
    likes_received_dict[int(x)] = likes_received(x)

In [None]:
df_popularity = pd.DataFrame({
    'userid':userids,
    'num_likes_given':(likes_given_dict.values()),
    'num_likes_received':(likes_received_dict.values())
})

df_popularity['num_likes_given'] = df_popularity['num_likes_given'].apply(len)
df_popularity['num_likes_received'] = df_popularity['num_likes_received'].apply(len)

In [None]:
metadata = df[['decidermemberid', 'decidergender', 'deciderdobyear', 'decidersignuptimestamp']]
metadata = metadata.drop_duplicates()

In [None]:
df

In [None]:
df_popularity = df_popularity.merge(metadata, left_on='userid', right_on='decidermemberid', how='left')

In [None]:
px.histogram(df_popularity.num_likes_received, nbins=1000)

In [None]:
px.histogram(df_popularity.like_rate, nbins=50)

In [None]:
# Count how often each decidermemberid appears in othermemberid
counts = df["othermemberid"].value_counts()

# Map back to each decidermemberid
df_popularity["num_exposures"] = df_popularity["decidermemberid"].map(counts).fillna(0).astype(int)

In [None]:
df_popularity['like_rate'] = df_popularity['num_likes_received'] / df_popularity['num_exposures']

In [None]:
'''
df_popularity['like_rate']:

NaN: profile never shown to other users
0: profile shown but not liked
>0: proportion of likes to exposures
'''

In [None]:
len(df_popularity)

In [None]:
len(df_popularity[df_popularity['like_rate'] == 0])

In [None]:
len(df_popularity[(df_popularity['like_rate'] == 0)&(df_popularity['decidergender'] == 'M')])

In [None]:
len(df_popularity[(df_popularity['like_rate'] == 0)&(df_popularity['decidergender'] == 'F')])

In [None]:
len(df_popularity[df_popularity['like_rate'].isna()])