## Load text

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
contacts_df = pd.read_excel('MyContacts.xlsx')
contacts_df.head(5)

Unnamed: 0,parsedTxt,fullname,company,job_title,address,phone,phone_2,email,email_2,website
0,Making Payment Simpter\nSambhav Pay\n+91-70654...,Mrs. Sapna Raghav,Sambhav Pay,BUSINESS HEAD,"Gurugram -122016, Haryana, B21, Phase-5, Udyog...",917065500000.0,,ops@sambhavpay.com,,www.sambhavpay.com
1,Making Payment Simpier\nSambhav Pay\n+91-88824...,,,,,,,,,
2,Vaibhavi Kamath\nExecutive Assistant to CEO\nK...,,,,,,,,,
3,aytring\nDebal Chakraborty\nCo-Founder\nOFfice...,,,,,,,,,
4,dheerajafinarkein.com\nG +91 83296 07320\nChie...,,,,,,,,,


In [3]:
text_data = contacts_df['parsedTxt'].astype(str).values

In [4]:
out_df = pd.DataFrame(columns = contacts_df.columns, index = range(contacts_df.shape[0]))
out_df['parsedTxt'] = contacts_df['parsedTxt']

## Email, Website and Phone number extraction

In [6]:
## Preprocessing
text_processed = []
for txt in text_data:
    txt_1 = re.sub(r'\n@(?<!\d)', '@', txt)
    txt_2 = re.sub('\n', ' ', txt_1)
    txt_3 = re.sub(r' @(?<!\d)','@', txt_2)
    text_processed.append(txt_3)
    
    

In [7]:
def calc_no_digit(s):
    cnt = 0
    string = ''
    for st in s:
        if st.isdigit():
            string += st
            cnt += 1
    return cnt, string

email_count = 0
website_count = 0
ph_no_count = 0

email_raw_string = r'(?i)\b[a-zA-Z0-9_\.-]+\@[a-zA-Z0-9_\-]+\.[a-zA-Z0-9_\.-]+'
email_raw_string_ref = r'(?i)\b([a-zA-Z0-9_\.-]+)\@([a-zA-Z0-9_\-]+)\.[a-zA-Z0-9_\.-]+'
website_raw_string = r'(?i)\b(?:https\:\/\/|www\.)([a-zA-Z0-9/-]+)\.([a-zA-Z0-9\.-]*)|(?<=\s)(?<!@)\b([a-zA-Z0-9]+)\.([a-zA-Z0-9]{2,})[\.]?(?![@])(?=\s)'
phone_number_string = r'[\(+-]*[0-9\(\)]{1,2}[\-\s]{,2}[0-9\(\)]{2,}[\-\s]{,2}[0-9]{3,}[\-\s]{,2}[0-9]{,2}'

for i in range(len(text_processed)):
    
    ## website extraction
    website = re.search(website_raw_string, text_processed[i])
    
    ## company name from website for reference
    if website is not None:
        out_df.loc[i, 'website'] = website.group(0)
        website_count += 1
        if website.group(1) is not None:
            out_df.loc[i, 'company'] = website.group(1)
        else:
            out_df.loc[i, 'company'] = website.group(3)
            
        
    ## phone number extraction
    ph_no = re.findall(phone_number_string, text_processed[i])
    for num in ph_no:
        num_conv = calc_no_digit(num)
        if num_conv[0] >= 10 and num_conv[0] <= 13:
            if type(out_df.loc[i, 'phone']) == float:
                out_df.loc[i, 'phone'] = num_conv[1]
                ph_no_count += 1
            elif type(out_df.loc[i, 'phone_2']) == float:
                out_df.loc[i, 'phone_2'] = num_conv[1]
            else:
                break
                
    ## email extraction
    email = re.findall(email_raw_string, text_processed[i])
    if len(email) >= 1:
        out_df.loc[i, 'email'] = email[0]
        email_count += 1
    if len(email) == 2:
        out_df.loc[i, 'email_2'] = email[1]

    ## company and fullname from email extraction for reference 
    
    ##(under the assumption that email is of the form - name@company.XXX)
    em_ref = re.search(email_raw_string_ref, text_processed[i])
    if em_ref is not None:
        out_df.loc[i, 'fullname'] = em_ref.group(1)
        if type(out_df.loc[i, 'company']) == float:
            out_df.loc[i, 'company'] = em_ref.group(2)
            
    
    ## removing email and website
    if len(email):
        for em in email:
            text_processed[i] = text_processed[i].replace(em, '')
    if website is not None:
        text_processed[i] = text_processed[i].replace(website.group(0), '')
        
print('Email extracted =', email_count)
print('Phone Number extracted =', ph_no_count)
print('Website extracted =', website_count)

Email extracted = 137
Phone Number extracted = 155
Website extracted = 117


## Name, Company Job Role Extraction using reference text

In [8]:
from nltk import word_tokenize
from nltk import sent_tokenize

def char_check(str_2):
    
    for s in str_2:
        if s.isalnum() == False:
            str_2 = str_2.replace(s, '\{}'.format(s))
    return str_2


def match_score(str_1, str_2):
   
    str_2 = char_check(str_2)
    match_ob = re.search(f'(?i)[{str_2}\s\n]+', str_1)
    if match_ob is None:
        return 0
    l, r = match_ob.span()
    return (r-l)

In [263]:
count = 0
number_of_data = contacts_df.shape[0]

for i in range(number_of_data):

    text = contacts_df['parsedTxt'][i]
    if type(text) == str:
        nltk_chunks = word_tokenize(text)
        sent_chunks = text.split('\n')
    
    company = []
    
    if type(out_df['company'][i]) == str:
        for line in sent_chunks:
            if line in text_processed[i]:
                company.append(match_score(line, out_df['company'][i]))
            else:
                company.append(0)

        max_comp = company.index(max(company))
        
        if max(company) <= 2:
            out_df['company'][i] = float('nan')
        elif max_comp is not None:
            out_df['company'][i] = sent_chunks[max_comp]
            count += 1
print('Company count Extracted =', count )

Company count Extracted = 134


In [264]:
def sum_caps_space(a):
    count = 0
    if a is None:
        return count
    for i in a:
        if i.isupper():
            count+=1
        if i == ' ':
            count+=1
    return count

In [270]:
job_regex = r'[A-Z][a-zA-Z]+[\s\-.&]{0,2}[A-Z]?[a-zA-Z\-\.\s\&]*(?![0-9@])'
regex_name = r'[a-zA-Z\s\.]+'

name_count = 0
job_count = 0

for i in range(number_of_data):

    text = contacts_df['parsedTxt'][i]
    if type(text) == str:
        nltk_chunks = word_tokenize(text)
        sent_chunks = text.split('\n')
    
    full_name = []
    
    if type(out_df['fullname'][i]) == str:
        for line in sent_chunks:
            if 'Mr.' in line or 'Mrs.' in line:
                full_name.append(float('inf'))
            if line in text_processed[i]:
                full_name.append(match_score(line, out_df['fullname'][i]))
            else:
                full_name.append(0)

        max_name = full_name.index(max(full_name))
         
        
        name_match = re.match(regex_name, sent_chunks[max_name])
        left, right = name_match.span()
        
        
        if len(sent_chunks[max_name]) == right-left:
            out_df['fullname'][i] = sent_chunks[max_name]
            
            
        
        
        #### job role
        
        up_job = None
        down_job = None
        
        if max_name+1 < len(sent_chunks) and re.match(job_regex, sent_chunks[max_name+1]):
            
            le,ri = re.match(job_regex, sent_chunks[max_name+1]).span()
            if ri-le == len(sent_chunks[max_name+1]):               
                down_job = sent_chunks[max_name+1]

        elif max_name-1 >= 0 and re.match(job_regex, sent_chunks[max_name-1]):
            
            le,ri = re.match(job_regex, sent_chunks[max_name-1]).span()
            if ri-le == len(sent_chunks[max_name-1]):
                up_job = sent_chunks[max_name-1]            

        if sum_caps_space(up_job) > sum_caps_space(down_job):
            out_df['job_title'][i] = up_job
            job_count += 1
        else:
            out_df['job_title'][i] = down_job
            job_count += 1

        name_count += 1
    
print('Name count extracted =', name_count)
print('Job role extracted =', job_count)

Name count extracted = 137
Job role extracted = 137


## Address Extraction

In [9]:
regex_address = r'(?i)[a-zA-Z0-9\.\-\s\&\/\,\(\)\#\:\"]+\,[a-zA-Z0-9\.\-\,\s\&\(\)\#\:\"]* | [a-zA-Z]*[0-9\s]{6,7}'
count_add = 174
number_of_data = contacts_df.shape[0]

for i in range(number_of_data):
    flag = 0
    text = contacts_df['parsedTxt'][i]
    if type(text) == str:
        nltk_chunks = word_tokenize(text)
        sent_chunks = text.split('\n')
    
    address = ''
    
    for line in sent_chunks:
        #print(line)
        if re.match(regex_address, line) is not None:
            flag = 1
            address += line
        
    if flag == 0:
        count_add -= 1
        

    if address == '':
        out_df['address'][i] = float('nan')
    else:
        out_df['address'][i] = address
    
print('Address count extracted =' ,count_add)

Address count extracted = 102


In [272]:
out_df.head(20)

Unnamed: 0,parsedTxt,fullname,company,job_title,address,phone,phone_2,email,email_2,website
0,Making Payment Simpter\nSambhav Pay\n+91-70654...,Mrs. Sapna Raghav,Sambhav Pay,BUSINESS HEAD,"Gurugram -122016, HaryanaB21, Phase-5, Udyog V...",917065483258.0,,ops@sambhavpay.com,,www.sambhavpay.com
1,Making Payment Simpier\nSambhav Pay\n+91-88824...,Mr. Jayant Mallick,Sambhav Pay,DIRECTOR,"Gurugram -122016, HaryanaB21, Phase-5, Udyog V...",918882484147.0,,jayant@sambhavpay.com,,www.sambhavpay.com
2,Vaibhavi Kamath\nExecutive Assistant to CEO\nK...,Vaibhavi Kamath,FINTECH,Executive Assistant to CEO,,919136706988.0,,vaibhavi@knightfintech.com,,www.knightfintech.com
3,aytring\nDebal Chakraborty\nCo-Founder\nOFfice...,,aytring,,"OFfice : Plot No. 8 &9, MM Towers debal@paytri...",919711192256.0,,,,www.paytring.com
4,dheerajafinarkein.com\nG +91 83296 07320\nChie...,,,,,918329607320.0,,,,
5,@_orbo\nWww.superscan.ai\nmitasha.paintal@orb0...,Mitasha Paintal,SUPER,Head- Enterprise,,919326045689.0,,mitasha.paintal@orb0.ai,,Www.superscan.ai
6,A Specified User Under RBI CICRA Act 2005\nDat...,Pankaj Chugh,Emaar Paim Square,Asst. Vice President- Sales,,919654574921.0,,pankaj@roopya.com,,https://roopya.money
7,www.backspace.tech\nsandhya@backspace-tech.com...,Sandhya Manikandan,,Head of Marketing & Content,,919176478565.0,,sandhya@backspace-tech.com,,www.backspace.tech
8,www.pay10.com\nPay\n+91 9354230590\nvikas.sahu...,,Pay,,,919354230590.0,,,,www.pay10.com
9,Purvashi Lakhupota\nManager - Partnerships\npu...,Purvashi Lakhupota,LXme,Manager - Partnerships,"11th Floor, Times Tower, Kamala City,Senapati ...",918879442966.0,,purvashilakhupota@kme.in,,www.Lxme.in
