In [5]:
import pandas as pd
import janitor
import numpy as np

# Load the two CSV exports. `clean_names()` is the DataFrame method registered
# by `import janitor`; it normalises the column headers of each frame.
leads = (
    pd.read_csv('data/exported_data/deduped_leads_long.csv')
      .clean_names()
)
brizo = (
    pd.read_csv('data/exported_data/combined.csv')
      .clean_names()
)

In [6]:
### Derive the year-month for each campaign list source
# One row per distinct `list_source` (first-appearance order), plus the first
# 'YYYY-MM' pattern found inside the label; labels without one get NaN.
df = (
    brizo[['list_source']]
      .drop_duplicates()
      .reset_index(drop=True)
      .assign(
          year_month=lambda sources: sources['list_source'].str.extract(
              r'(\d{4}-\d{2})', expand=False
          )
      )
)

# Save the lookup table next to the other exports.
df.to_csv('data/exported_data/brizo_list_with_ym.csv', index=False)

# Quick look at the first rows.
print(df.head())


                                  list_source year_month
0              2024-11—2nd DM - English_Other    2024-11
1        2024-04—Group C: CW Gift_SMS (Brizo)    2024-04
2                       2025-07—C-LaborLowGMV    2025-07
3                         2025-01—D - English    2025-01
4  2024-06—Group A: SQL_Trial (Brizo) 1-28314    2024-06


In [7]:
# Normalise `phone_number` to a digit-only string.
phone_raw = brizo['phone_number']

# A numeric phone column that contains blanks is parsed by read_csv as float64,
# and str(7133644575.0) is '7133644575.0' -- stripping non-digits would then
# leave a spurious trailing '0'. When every non-missing value is a whole number,
# go through nullable Int64 first so the text form has no '.0' suffix.
if pd.api.types.is_float_dtype(phone_raw) and (phone_raw.dropna() % 1 == 0).all():
    phone_raw = phone_raw.astype('Int64')

brizo['phone_number'] = (
    phone_raw
      .astype('string')                      # missing values stay <NA>, not the text 'nan'
      .str.replace(r'\D+', '', regex=True)   # remove any non-digit (spaces, punctuation, letters, etc.)
      .fillna('')                            # missing phone -> '' (same result as before)
      .astype(str)                           # hand back plain Python strings
)

# Number of `brizo` rows for each (brizo_id, phone_number) pair.
# Note: groupby drops rows whose key is NaN by default, so rows without a
# brizo_id are not counted here.
brizo_counts = (
    brizo
      .groupby(['brizo_id', 'phone_number'])
      .size()
      .reset_index(name='count')
)
display(brizo_counts)

Unnamed: 0,brizo_id,phone_number,count
0,00007b5fcec398ab,7133644575,3
1,000144264e8456f6,7163504199,4
2,0001aa412e069ffa,9024220245,1
3,00022a7453654b49,8056437277,5
4,000243c264f11715,7864097151,6
...,...,...,...
106862,fffd16a3a13966b8,4106022208,11
106863,fffe113114e7c1e5,7188363083,21
106864,fffebad4ae63da74,4193558281,21
106865,fffedd63e21115a3,7573012384,1


In [8]:
# Columns of `leads` kept for the Brizo match.
lead_columns = [
    'lead_id',
    'mobile_primary',
    'business_phone_line_1',
    'brizo_id',
    'created_date',
    'first_mel_timestamp',
    'latest_mql_timestamp',
    'assigned_time_ae_',
    'opportunity_created_date',
    'closed_won_date',
    'stage',
    'unqualified_reason',
    'closed_lost_reason',
]

# Keep only those columns and collapse rows that are identical across all of them.
selected_leads = leads.loc[:, lead_columns].drop_duplicates()



In [9]:
# Number of `selected_leads` rows per brizo_id.
dup_counts = (
    selected_leads
      .groupby('brizo_id')
      .size()
      .rename('n_matches')
      .reset_index()
)

# Show any brizo_id that maps to more than one lead row
# (an empty frame means brizo_id is unique among the rows that have one).
print(dup_counts[dup_counts['n_matches'] > 1])

Empty DataFrame
Columns: [brizo_id, n_matches]
Index: []


In [10]:
# Inner-join the per-(brizo_id, phone_number) counts to the selected lead
# columns on brizo_id, keeping only ids present in both frames.
match_brizo = pd.merge(brizo_counts, selected_leads, how='inner', on='brizo_id')

# Record which key produced these matches.
match_brizo['match_key'] = 'brizo_id'


print(match_brizo.head())


           brizo_id phone_number  count          lead_id  mobile_primary  \
0  00147fccd962ca33   7327929653      4  00Q8b00001xKJGh             NaN   
1  00282d2fa2df2c18   6312183888     21  00Q8b000021p6WV             NaN   
2  00370ffaea501b02   4106509949     15  00QUo00000RWj47    6.462091e+09   
3  003bf5ec93f0b157   7326571616     21  00QUo000009gYJS    7.329305e+09   
4  005be3f96749e3e1   2057987288      4  00Q8b00001xKGX1             NaN   

   business_phone_line_1 created_date first_mel_timestamp  \
0           7.327930e+09   10/31/2022          2023-11-01   
1           6.312184e+09     4/4/2023          2023-12-28   
2           4.106510e+09    2/25/2025          2025-02-25   
3           7.326572e+09   10/31/2022          2024-04-23   
4           2.057987e+09   10/31/2022          2023-12-19   

  latest_mql_timestamp assigned_time_ae_ opportunity_created_date  \
0                  NaN               NaN                      NaN   
1                  NaN               N

In [11]:
# Write the brizo_id matches to disk without the positional index column.
match_brizo_path = 'data/exported_data/match_brizo.csv'
match_brizo.to_csv(match_brizo_path, index=False)