In [1]:
import pandas as pd
import sys

sys.path.append("/home/lsys/pwned_pols/venv/lib/python3.10/site-packages")
from utilities import clean_email_column_no_dedupe, normalize_email
import warnings

warnings.filterwarnings("ignore")

import dns.resolver

# Placeholder for memoising per-domain results; nothing in the cells shown
# here reads or writes it yet.
domain_cache = {}

# Explicit nameservers used for lookups (overrides the system configuration
# that `Resolver()` picks up by default).
DNS_NAMESERVERS = ("1.1.1.1", "8.8.8.8", "9.9.9.9")
# Total time budget, in seconds, for resolving a single name.
DNS_LIFETIME_SECONDS = 100

resolver = dns.resolver.Resolver()
resolver.nameservers = list(DNS_NAMESERVERS)
resolver.lifetime = DNS_LIFETIME_SECONDS
# `timeout` is the wait for ONE server; `lifetime` is the budget for the whole
# lookup. When the two are equal, a single unresponsive server can consume the
# entire budget and the remaining nameservers are never tried, so split the
# budget evenly across the configured servers instead.
resolver.timeout = DNS_LIFETIME_SECONDS / len(DNS_NAMESERVERS)

In [2]:
# Collect the non-null e-mail addresses from both legislature datasets.
everypol_emails = pd.read_csv(
    "../data/everypol/everypol_combined_legislature_data.csv",
    low_memory=False,
    usecols=["email"],
).dropna()
scraped_emails = pd.read_csv(
    "../data/scraped_pol_combined_legislature_data.csv", usecols=["email"]
).dropna()
all_emails = pd.concat([everypol_emails, scraped_emails], ignore_index=True)

# Project helpers from `utilities`; presumably they tidy and canonicalise the
# addresses -- see that module for the exact rules.
emails_cleaned = clean_email_column_no_dedupe(all_emails)
emails_normalized = emails_cleaned.assign(
    email=emails_cleaned["email"].apply(normalize_email)
)

# The domain is whatever follows the first "@" (second piece of the split).
email_domains = emails_normalized["email"].str.split("@").str.get(1)
unique_domains = email_domains.unique().tolist()
print(len(unique_domains))

685


In [3]:
# Load previously saved validation results. On a fresh checkout the CSV does
# not exist yet, so fall back to an unvalidated table built from the domains
# collected above. `None` (rather than NaN) keeps both result columns as
# object dtype, so booleans and failure strings can be written into them later.
try:
    df_domains = pd.read_csv("../data/edomain_validation.csv")
except FileNotFoundError:
    df_domains = pd.DataFrame(
        {"domain": unique_domains, "valid_email_domain": None, "fail_reason": None}
    )
df_domains

Unnamed: 0,domain,valid_email_domain,fail_reason
0,sansad.nic.in,True,
1,yahoo.com,True,
2,gmail.com,True,
3,hotmail.com,True,
4,nic.in,True,
...,...,...,...
680,gmail.comshasankshekharverma,False,Timeout
681,chaitanyasharma.co.in,True,
682,67gmail.com,True,
683,somnathbharti.com,True,


In [4]:
def check_domain(domain, dns_resolver=None):
    """Check whether a domain publishes an MX record.

    Parameters
    ----------
    domain : str
        Domain name to query (the part of an e-mail address after "@").
    dns_resolver : object, optional
        Anything exposing ``resolve(name, rdtype)``. Defaults to the
        notebook-level ``resolver`` so that its nameserver and timeout
        settings are actually applied to the lookup.

    Returns
    -------
    tuple
        ``(True, None)`` when the MX query succeeds; otherwise
        ``(False, reason)`` where ``reason`` is "InvalidDomain", "NXDOMAIN",
        "NoAnswer", "NoNameservers", "Timeout", or the class name of any
        other dnspython exception raised by the lookup.
    """
    # Missing values read from a CSV arrive as float NaN; blank strings carry
    # no domain at all. Neither is worth a network round trip.
    if not isinstance(domain, str) or not domain.strip():
        return False, "InvalidDomain"

    # Use the configured resolver rather than the module-level
    # `dns.resolver.resolve`, which ignores the settings on `resolver`.
    active_resolver = resolver if dns_resolver is None else dns_resolver
    try:
        active_resolver.resolve(domain, "MX")
    except dns.resolver.NXDOMAIN:
        return False, "NXDOMAIN"  # Domain does not exist
    except dns.resolver.NoAnswer:
        return False, "NoAnswer"  # No MX record
    except dns.resolver.NoNameservers:
        return False, "NoNameservers"  # Every nameserver failed to answer
    except dns.resolver.LifetimeTimeout:
        return False, "Timeout"  # DNS timeout
    except dns.exception.DNSException as exc:
        # Any other dnspython failure (e.g. a malformed name): record it as
        # the reason instead of letting one domain abort a batch run.
        return False, type(exc).__name__
    return True, None  # Valid domain, no failure reason

In [9]:
# Both result columns receive a mix of booleans / strings / None below. Hold
# them as object dtype up front so the writes never depend on pandas silently
# upcasting a bool or float64 column (deprecated since pandas 2.1, and the
# warning would be hidden by the notebook-wide warnings filter).
df_domains = df_domains.astype(
    {"valid_email_domain": "object", "fail_reason": "object"}
)

for ix, row in df_domains.iterrows():
    domain = row["domain"]
    is_valid = row["valid_email_domain"]
    fail_reason = row["fail_reason"]

    # Query DNS only for domains that were never checked or whose previous
    # attempt timed out; any other stored result is kept as-is.
    if pd.isna(is_valid) or (is_valid is False and fail_reason == "Timeout"):
        is_valid, fail_reason = check_domain(domain)
        df_domains.loc[ix, "valid_email_domain"] = is_valid
        df_domains.loc[ix, "fail_reason"] = fail_reason

    # Log every row (1-based position), whether or not it was re-checked.
    if is_valid:
        print(f"[Valid] {1+ix}: {domain}")
    else:
        print(f"[ERROR] {1+ix}: {domain} - {fail_reason}")

[Valid] 1: sansad.nic.in
[Valid] 2: yahoo.com
[Valid] 3: gmail.com
[Valid] 4: hotmail.com
[Valid] 5: nic.in
[Valid] 6: ymail.com
[ERROR] 7: prabhatsinh.com - Timeout
[Valid] 8: yahoo.co.in
[Valid] 9: rediffmail.com
[ERROR] 10: nitingadkari.org - Timeout
[Valid] 11: rahulgandhi.in
[ERROR] 12: shyamscharangupta.co.in - Timeout
[Valid] 13: deepender.in
[Valid] 14: gov.in
[Valid] 15: kar.nic.in
[Valid] 16: bjpanda.org
[Valid] 17: plrprojects.com
[Valid] 18: tharoor.in
[ERROR] 19: informant.com - Timeout
[ERROR] 20: sarakimail.com - NoAnswer
[Valid] 21: icloud.com
[Valid] 22: yahoo.co.uk
[ERROR] 23: kurfi.com - Timeout
[ERROR] 24: lanretejuoso.com - Timeout
[Valid] 25: oluremitinubu.com
[Valid] 26: yayiadeola.ng
[Valid] 27: nass.gov.ng
[Valid] 28: nassnig.org
[ERROR] 29: abotoraby.ir - Timeout
[ERROR] 30: amirabadi.ir - Timeout
[Valid] 31: yazd.ac.ir
[ERROR] 32: amirkhojasteh.ir - Timeout
[ERROR] 33: behrooznemati.com - Timeout
[Valid] 34: chmail.ir
[Valid] 35: parliran.ir
[Valid] 36: ut.ac

In [10]:
# Rows that carry a failure reason (i.e. `fail_reason` is not null).
df_domains[df_domains["fail_reason"].notna()]

Unnamed: 0,domain,valid_email_domain,fail_reason
6,prabhatsinh.com,False,Timeout
9,nitingadkari.org,False,Timeout
11,shyamscharangupta.co.in,False,Timeout
18,informant.com,False,Timeout
19,sarakimail.com,False,NoAnswer
...,...,...,...
599,iutglobal.com,False,Timeout
608,bukitbatok.sg,False,Timeout
641,senatenass.gov.ng,False,Timeout
678,sanjeevagrwal.co.in,False,Timeout


In [11]:
# Failures with a recorded reason other than a timeout.
has_reason = df_domains["fail_reason"].notna()
not_timeout = df_domains["fail_reason"].ne("Timeout")
df_domains[has_reason & not_timeout]

Unnamed: 0,domain,valid_email_domain,fail_reason
19,sarakimail.com,False,NoAnswer
44,tajgardoun.com,False,NoAnswer
273,assnat.com,False,NoAnswer
593,keathong.com,False,NoAnswer


In [12]:
# Write the table back to the CSV it was loaded from, without the index column.
df_domains.to_csv(path_or_buf="../data/edomain_validation.csv", index=False)
df_domains

Unnamed: 0,domain,valid_email_domain,fail_reason
0,sansad.nic.in,True,
1,yahoo.com,True,
2,gmail.com,True,
3,hotmail.com,True,
4,nic.in,True,
...,...,...,...
680,gmail.comshasankshekharverma,False,Timeout
681,chaitanyasharma.co.in,True,
682,67gmail.com,True,
683,somnathbharti.com,True,
