In [91]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json

Step 1 Load data and check

In [92]:
file_path = 'emails.txt'

In [93]:
with open(file_path, 'r') as file:
    content = file.readlines()
df = pd.DataFrame(content, columns=['email'])
df['email'] = df['email'].str.replace('\n', '')
df

Unnamed: 0,email
0,Elody_OConner51@gmail.com
1,lily.long85@yahoo.com
2,simon.ward@protonmailcom
3,benjamin.phillips@hotmail.com
4,robert.walker@aol.com
...,...
1068,grace.hughes@hotmail.com
1069,grace92_anderson@protonmail.com
1070,Marshall_Upton32@mail.com
1071,caleb1996_phillips@icloud.com


Step 2 Fix typo/email format

In [94]:
def extract_email_domain(email):
    split_list = email.split('@')
    domain = ''.join(split_list[-1])
    #rest are username
    username = ''.join(split_list[:-1])
    return username, domain

In [95]:
# check format of emails
def check_email_format(domain):
    # only 1 . after @, and not the last character
    if domain.count('.') != 1:
        return False
    elif domain[-1] == '.':
        return False
    else:
        return True

In [96]:
df['username'], df['domain'] = zip(*df['email'].apply(extract_email_domain))
df['valid_email'] = df['domain'].apply(check_email_format)
df['valid_email'].value_counts()

valid_email
True     999
False     74
Name: count, dtype: int64

In [97]:
df[df['valid_email'] == False]

Unnamed: 0,email,username,domain,valid_email
2,simon.ward@protonmailcom,simon.ward,protonmailcom,False
7,"matthew.lewis@gmail,",matthew.lewis,"gmail,",False
16,"peter.jackson@aol,",peter.jackson,"aol,",False
17,paul.baker@aolcom,paul.baker,aolcom,False
23,laura.foster@icloud..com,laura.foster,icloud..com,False
...,...,...,...,...
971,nancy.baker@outlook..com,nancy.baker,outlook..com,False
996,"nancy.morris@yahoo,",nancy.morris,"yahoo,",False
1022,laura.walker@protonmailcom,laura.walker,protonmailcom,False
1037,matthew.roberts@protonmail@com,matthew.robertsprotonmail,com,False


typos:
1. missing ./ double . before com
2. , instead of .
3. missing com
3. additional @ in the user name, but we could assume it is correct

Let's fix those, as we don't have too many data, so each data is valueable

In [98]:
df['valid_check'] = df['domain'].apply(check_email_format)
df['valid_check'].value_counts()

valid_check
True     999
False     74
Name: count, dtype: int64

In [99]:
def insert_dot(domain):
    #if no dot, insert dot before com
    if '.' not in domain:
        domain = domain.replace('com', '.com')
    return domain

In [100]:
# replace all , with .
df['domain'] = df['domain'].str.replace(',', '.')
# insert . before com if missing
df['domain'] = df['domain'].apply(insert_dot)
# add com if missing
df['domain'] = df['domain'].apply(lambda x: x if x[-1] != '.' else x+'com')
# replace .. with .
df['domain'] = df['domain'].str.replace('..', '.')
df['valid_check'] = df['domain'].apply(check_email_format)

In [101]:
df[df['valid_check'] == False]

Unnamed: 0,email,username,domain,valid_email,valid_check
88,sarah.carter@yahoo,sarah.carter,yahoo,False,False
150,emily_clark.91@mail.service.com,emily_clark.91,mail.service.com,False,False
157,john.smith@gmail,john.smith,gmail,False,False
167,daniel_clark@icloud,daniel_clark,icloud,False,False
176,christopher89_bell@sub.company.com,christopher89_bell,sub.company.com,False,False
191,alice_3890.brown@company.co.uk,alice_3890.brown,company.co.uk,False,False
248,john.doe1990@sub.domain.com,john.doe1990,sub.domain.com,False,False
490,james.lewis@protonmail,james.lewis,protonmail,False,False
581,rebecca.scott@outlook,rebecca.scott,outlook,False,False
669,julia.doe4321@sub.domain.com,julia.doe4321,sub.domain.com,False,False


In [107]:
df[df['domain']=='.com']

Unnamed: 0,email,username,domain,valid_email
155,thomas.hall@gmail@com,thomas.hallgmail,.com,False
1037,matthew.roberts@protonmail@com,matthew.robertsprotonmail,.com,False


In [108]:
# manually correct the rest
df.loc[df['domain']=='yahoo', 'domain'] = 'yahoo.com'
df.loc[df['domain']=='gmail', 'domain'] = 'gmail.com'
df.loc[df['domain']=='icloud', 'domain'] = 'icloud.com'
df.loc[df['domain']=='outlook', 'domain'] = 'outlook.com'
df.loc[df['domain']=='protonmail', 'domain'] = 'protonmail.com'
df.loc[df['username']=='thomas.hallgmail', ['username','domain']] = ['thomas.hall', 'gmail.com']
df.loc[df['username']=='matthew.robertsprotonmail', ['username','domain']] = ['matthew.roberts', 'protonmail.com']
df['valid_check'] = df['domain'].apply(check_email_format)
df[df['valid_check'] == False]

Unnamed: 0,email,username,domain,valid_email,valid_check
150,emily_clark.91@mail.service.com,emily_clark.91,mail.service.com,True,False
176,christopher89_bell@sub.company.com,christopher89_bell,sub.company.com,True,False
191,alice_3890.brown@company.co.uk,alice_3890.brown,company.co.uk,True,False
248,john.doe1990@sub.domain.com,john.doe1990,sub.domain.com,True,False
669,julia.doe4321@sub.domain.com,julia.doe4321,sub.domain.com,True,False
871,laura_1992.brown@company.co.uk,laura_1992.brown,company.co.uk,True,False
888,tom_clark.6543@mail.service.com,tom_clark.6543,mail.service.com,True,False


In [109]:
# rest should be valid
df.loc[df['valid_check'] == False, 'valid_email'] = True
df.drop(columns=['valid_check'], inplace=True)

Step 3 Feature engineering and EDA

In [115]:
def find_potential_birth_year(user_name):
    # Corrected regex to capture both 4-digit and 2-digit years
    birth_year = re.findall(r'(19\d{2}|200\d|2010|\d{2})', user_name)
    if birth_year:
        year = int(birth_year[0])
        if year < 100:  # Handle two-digit years
            year += 1900 if year >= 20 else 2000
        return year
    return None

In [116]:
df['birth_year'] = df['username'].apply(find_potential_birth_year)
df

Unnamed: 0,email,username,domain,valid_email,birth_year
0,Elody_OConner51@gmail.com,Elody_OConner51,gmail.com,True,1951.0
1,lily.long85@yahoo.com,lily.long85,yahoo.com,True,1985.0
2,simon.ward@protonmailcom,simon.ward,protonmail.com,False,
3,benjamin.phillips@hotmail.com,benjamin.phillips,hotmail.com,True,
4,robert.walker@aol.com,robert.walker,aol.com,True,
...,...,...,...,...,...
1068,grace.hughes@hotmail.com,grace.hughes,hotmail.com,True,
1069,grace92_anderson@protonmail.com,grace92_anderson,protonmail.com,True,1992.0
1070,Marshall_Upton32@mail.com,Marshall_Upton32,mail.com,True,1932.0
1071,caleb1996_phillips@icloud.com,caleb1996_phillips,icloud.com,True,1996.0
