In [1]:
import pandas as pd
import janitor
import numpy as np

leads = pd.read_csv('data/deduped_leads_long.csv').clean_names()
brizo = pd.read_csv('data/combined.csv').clean_names()

In [2]:
### get the date for campaign list sources
brizo_list = brizo['list_source'].unique().tolist()

df = pd.DataFrame(brizo_list, columns=['list_source'])

df['year_month'] = df['list_source'].str.extract(r'(\d{4}-\d{2})')

df.to_csv('data/brizo_list_with_ym.csv', index=False)

# 4) Inspect
print(df.head())


                                  list_source year_month
0              2024-11—2nd DM - English_Other    2024-11
1        2024-04—Group C: CW Gift_SMS (Brizo)    2024-04
2                       2025-07—C-LaborLowGMV    2025-07
3                         2025-01—D - English    2025-01
4  2024-06—Group A: SQL_Trial (Brizo) 1-28314    2024-06


In [3]:
# ensure phone_number is string
brizo['phone_number'] = brizo['phone_number'].astype(str)
# remove any non-digit (spaces, punctuation, letters, etc.)
brizo['phone_number'] = brizo['phone_number'].str.replace(r'\D+', '', regex=True)

brizo_counts = (
    brizo
      .groupby(['brizo_id', 'phone_number'])
      .size()
      .reset_index(name='count')
)
display(brizo_counts)

Unnamed: 0,brizo_id,phone_number,count
0,00007b5fcec398ab,7133644575,3
1,000144264e8456f6,7163504199,4
2,0001aa412e069ffa,9024220245,1
3,00022a7453654b49,8056437277,5
4,000243c264f11715,7864097151,6
...,...,...,...
106862,fffd16a3a13966b8,4106022208,11
106863,fffe113114e7c1e5,7188363083,21
106864,fffebad4ae63da74,4193558281,21
106865,fffedd63e21115a3,7573012384,1


In [4]:
selected_leads = leads[['lead_id','mobile_primary',
                        'business_phone_line_1','brizo_id', 
                        'created_date',
                        'first_mel_timestamp','latest_mql_timestamp',
                        'assigned_time_ae_',
                        'opportunity_created_date',
                        'closed_won_date',
                        'stage','unqualified_reason', 'closed_lost_reason']].drop_duplicates()



In [7]:
# Count how many times each brizo_id appears
dup_counts = (
    selected_leads
      .groupby('brizo_id')
      .size()
      .reset_index(name='n_matches')
)

# See which IDs appear more than once
print( dup_counts.query('n_matches > 1') )

Empty DataFrame
Columns: [brizo_id, n_matches]
Index: []


In [27]:

# ————————————————
# Prep: convert dates & clean phones
# ————————————————
date_cols = [
    'closed_won_date',
    'opportunity_created_date',
    'assigned_time_ae_',
    'latest_mql_timestamp',
    'first_mel_timestamp',
    'created_date'
]

# 1) parse your date columns once
for col in date_cols:
    selected_leads[col] = pd.to_datetime(
        selected_leads[col],
        errors='coerce'
    )

# 2) normalize phone strings
for col in ['mobile_primary', 'business_phone_line_1']:
    selected_leads[col] = (
        selected_leads[col]
          .astype(str)
          .str.replace(r'\D+', '', regex=True)
    )

brizo_counts['phone_number'] = (
    brizo_counts['phone_number']
      .astype(str)
      .str.replace(r'\D+', '', regex=True)
)

# ————————————————
# Build brizo_counts if you haven’t yet
# ————————————————
# brizo_counts = (
#     brizo
#       .groupby(['brizo_id', 'phone_number'])
#       .size()
#       .reset_index(name='count')
# )

# ————————————————
# 1) Match by brizo_id
# ————————————————
match_brizo = selected_leads.merge(
    brizo_counts[['brizo_id','phone_number','count']],
    on='brizo_id',
    how='inner'
).assign(match_key='brizo_id')


# ————————————————
# 2) Match by business_phone_line_1
# ————————————————
match_business = selected_leads.merge(
    brizo_counts[['brizo_id','phone_number','count']],
    left_on='business_phone_line_1',
    right_on='phone_number',
    how='inner'
).assign(match_key='business_phone_line_1')


# ————————————————
# 3) Match by mobile_primary
# ————————————————
match_mobile = selected_leads.merge(
    brizo_counts[['brizo_id','phone_number','count']],
    left_on='mobile_primary',
    right_on='phone_number',
    how='inner'
).assign(match_key='mobile_primary')


# ————————————————
# 4) Combine all matches into one DataFrame
# ————————————————
matched = pd.concat(
    [match_brizo, match_business, match_mobile],
    ignore_index=True
)


# ————————————————
# 5) Dedupe by brizo_id, keeping the most recent by your date‐priority
# ————————————————
# sort descending so “most recent” appears first for each brizo_id
matched = matched.sort_values(
    by=date_cols,
    ascending=False,
    na_position='last'
)

final_matched = matched.drop_duplicates(
    subset='brizo_id',
    keep='first'
).reset_index(drop=True)


# ————————————————
# Inspect or save
# ————————————————
print(final_matched.head())


   lead_id mobile_primary business_phone_line_1          brizo_id  \
0      NaN    70495715470           70482288300  d615ce74fd940a54   
1      NaN    91721387220           35224105570  e3a294abf207a3bb   
2      NaN    91737419280           51884277770  59ff82b86d0e8938   
3      NaN    31553039930           31594696650  3ee9926a5fd802dc   
4      NaN    73473036080           73425406160  05c988dd31b8a82d   

  created_date first_mel_timestamp latest_mql_timestamp assigned_time_ae_  \
0   2022-10-31          2025-07-07           2025-07-07        2025-07-07   
1   2022-10-31                 NaT                  NaT        2025-06-23   
2   2022-10-31          2025-06-20           2025-06-20        2025-06-20   
3   2022-10-31          2025-06-18           2025-06-18        2025-06-18   
4   2022-10-31          2025-05-30           2025-06-02        2025-05-31   

  opportunity_created_date closed_won_date       stage unqualified_reason  \
0               2025-07-07      2025-07-07  C

In [28]:

final_matched = final_matched[['brizo_id', 'phone_number', 'count', 'lead_id', 
                               'created_date', 'first_mel_timestamp', 'latest_mql_timestamp', 
                               'assigned_time_ae_', 'opportunity_created_date', 'closed_won_date', 'stage', 
                               'unqualified_reason', 'closed_lost_reason', 'match_key']]
final_matched.to_csv('data/final_matched.csv', index=False)