In [2]:
import janitor
import pandas as pd
import re

# Load the raw lead export. `.clean_names()` is not a pandas method; it is
# made available on DataFrames by the `janitor` import above.
leads = pd.read_csv('data/all_time_leads.csv').clean_names()


# Keep only the columns used downstream and drop rows that are exact
# duplicates across that subset. The explicit `.copy()` gives
# `selected_leads` its own data, so the column assignments below never write
# into (or warn about) an intermediate slice of `leads`.
selected_leads = leads[['lead_id','mobile_primary',
                        'business_phone_line_1','brizo_id', 
                        'created_date',
                        'first_mel_timestamp','latest_mql_timestamp',
                        'assigned_time_ae_',
                        'opportunity_created_date',
                        'closed_won_date',
                        'stage','unqualified_reason', 'closed_lost_reason']].drop_duplicates().copy()


# Reduce both phone columns to digit-only strings.
for phone_col in ['mobile_primary', 'business_phone_line_1']:
    phone_values = selected_leads[phone_col]
    if pd.api.types.is_float_dtype(phone_values):
        # A purely numeric phone column containing blanks is parsed as
        # float64. A plain str() would then render 5551234567 as
        # '5551234567.0', and the trailing zero would survive the digit
        # filter as an extra digit. Format without decimals instead;
        # missing values become ''.
        phone_values = phone_values.map(
            lambda v: '' if pd.isna(v) else format(v, '.0f')
        )
    # Non-float columns follow the same path as before: stringify, then
    # remove every non-digit character (so 'nan' collapses to '').
    selected_leads[phone_col] = (
        phone_values.astype(str).str.replace(r'\D+', '', regex=True)
    )



def clean_to_10(s):
    """Normalise a phone value to a 10-digit string, or None if it can't be.

    Parameters
    ----------
    s : any
        Raw phone value. It is converted with ``str()`` before cleaning, so
        strings, ints, ``None`` and NaN are all accepted. Floats holding a
        whole number (e.g. ``5599745392.0``, as produced when a numeric column
        contains blanks) are converted to int first so the ``.0`` suffix does
        not contribute a spurious trailing digit.

    Returns
    -------
    str or None
        The 10 digits when, after removing non-digits and an optional leading
        ``1`` on longer numbers, exactly 10 digits remain; otherwise ``None``.
    """
    # Whole-number floats: drop the fractional part before stringifying.
    # NaN and infinities are not integers, so they fall through and end up
    # with no digits at all (-> None).
    if isinstance(s, float) and s.is_integer():
        s = int(s)
    # 1) remove any non-digits
    digits = re.sub(r'\D+', '', str(s))
    # 2) if >10 digits and starts with "1", drop the leading "1"
    if len(digits) > 10 and digits.startswith('1'):
        digits = digits[1:]
    # 3) only keep if exactly 10 digits
    return digits if len(digits) == 10 else None

# Run clean_to_10 over every value of each phone column and store the result
# back in place; values it rejects are returned as None.
phone_columns = ('mobile_primary', 'business_phone_line_1')
for col in phone_columns:
    cleaned = selected_leads[col].apply(clean_to_10)
    selected_leads[col] = cleaned


In [6]:
# Names of the date/timestamp columns. Later cells iterate over this list and
# pass it to `sort_values(by=...)`, so it must remain a list of labels.
date_cols = [
    'created_date',
    'first_mel_timestamp',
    'latest_mql_timestamp',
    'assigned_time_ae_',
    'opportunity_created_date',
    'closed_won_date'
]


# Export just the date columns to their own CSV. The slice is kept under a
# separate name: rebinding `date_cols` to this DataFrame would make a later
# `sort_values(by=date_cols)` fail, because `by` expects column labels.
date_snapshot = selected_leads[date_cols]
date_snapshot.to_csv('data/date_cols.csv', index=False)



In [None]:

import warnings

# Ignore pandas' UserWarning that mentions `infer_datetime_format`.
# The loop below no longer passes that deprecated argument, so it does not
# need this filter itself; it is retained in case another cell still does.
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    message=r".*infer_datetime_format.*"
)

# Parse each date column and store it back as a 'YYYY-MM-DD' string.
# errors='coerce' turns unparseable values into NaT, which come out of
# strftime as missing values rather than raising.
# `infer_datetime_format=True` is intentionally omitted: it is deprecated in
# pandas 2.x, where format inference is already the default behaviour.
for col in date_cols:
    selected_leads[col] = (
        pd.to_datetime(selected_leads[col], errors='coerce')
        .dt.strftime('%Y-%m-%d')
    )

In [3]:

# 3. Sort by recency (newest first)
# `by` must be a list of column labels. list(...) yields the column names
# whether `date_cols` is still the list of names or has been rebound to the
# DataFrame slice of those columns, so the sort works in either state.
df_sorted = selected_leads.sort_values(
    by=list(date_cols),
    ascending=False,
    na_position='last'   # rows with a missing date sort after dated rows
)


In [6]:


# Preview the first rows, then write the full frame to CSV without the index.
# NOTE(review): `result` is not assigned in any cell visible here — confirm
# the cell that builds it runs before this one on a fresh kernel.
preview = result.head()
print(preview)
result.to_csv('data/deduped_leads_long.csv', index=False)

            lead_id mobile_primary business_phone_line_1 brizo_id  \
1   00QUo00000SvBGL     5599745392                  None      NaN   
15  00QUo00000SqruX     3477022771            8563388897      NaN   
24  00QUo00000Spu6U     9177709615            6318424333      NaN   
53  00QUo00000Sg06l     4709815185            8437592868      NaN   
51  00QUo00000SgIQF     3474102634            5678680082      NaN   

   created_date first_mel_timestamp latest_mql_timestamp assigned_time_ae_  \
1    2025-07-08          2025-07-08           2025-07-08               NaN   
15   2025-07-07          2025-07-07           2025-07-08        2025-07-07   
24   2025-07-07          2025-07-07           2025-07-07        2025-07-07   
53   2025-07-04          2025-07-04           2025-07-04        2025-07-04   
51   2025-07-04          2025-07-04           2025-07-04               NaN   

   opportunity_created_date closed_won_date           stage  \
1                       NaN             NaN          