# loading data and scanning for email via RE

In [1]:
import json
import re
import pandas as pd
import random

In [2]:
def load_json(file_path):
    return pd.read_json(file_path)

def extract_emails(df):
    email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+(?:\.[a-zA-Z]{2,})+')
    
    results = []
    for _, row in df.iterrows():
        sequence = row.get("sequence", "")
        emails = email_pattern.findall(sequence)
        if emails:
            results.append({"sequence": sequence, "emails": emails})
    
    return results

In [3]:
file_path = "Internship_task_data\Internship_task_data\data.json"  
df = load_json(file_path)

In [4]:
email_entries = extract_emails(df)

if email_entries:
    print("Email addresses found:")
    for entry in email_entries:
        print(entry)
else:
    print("No email addresses found.")

No email addresses found.


## No email found. So, named entities are extracted to create realistic emails to add

In [5]:
def extract_named_entities(df):
    named_entities = []
    for _, row in df.iterrows():
        tokens = row.get("tokens", [])
        ner_tags = row.get("ner_tags", [])
        
        entity = ""
        entity_type = None
        entities = []
        
        for token, tag in zip(tokens, ner_tags):
            if tag.startswith("B-"):
                if entity:
                    entities.append({"entity": entity.strip(), "type": entity_type})
                entity = token
                entity_type = tag[2:]
            elif tag.startswith("I-") and entity:
                entity += " " + token
            else:
                if entity:
                    entities.append({"entity": entity.strip(), "type": entity_type})
                    entity = ""
                    entity_type = None
        
        if entity:
            entities.append({"entity": entity.strip(), "type": entity_type})
        
        
        named_entities.append({"sequence": row.get("sequence", ""), "entities": entities, "tokens": tokens, "ner_tags": ner_tags})
    
    return pd.DataFrame(named_entities)

In [7]:
named_entities = extract_named_entities(df)

named_entities.head()

Unnamed: 0,sequence,entities,tokens,ner_tags
0,"Since then , only Terry Bradshaw in 147 games ...","[{'entity': 'Terry Bradshaw', 'type': 'PER'}, ...","[Since, then, ,, only, Terry, Bradshaw, in, 14...","[O, O, O, O, B-PER, I-PER, O, O, O, O, B-PER, ..."
1,He was portrayed by Anthony Perkins in the 196...,"[{'entity': 'Anthony Perkins', 'type': 'PER'},...","[He, was, portrayed, by, Anthony, Perkins, in,...","[O, O, O, O, B-PER, I-PER, O, O, O, O, O, O, O..."
2,"The egg eventually hatches , revealing a baby ...","[{'entity': 'Sharptooth', 'type': 'PER'}]","[The, egg, eventually, hatches, ,, revealing, ...","[O, O, O, O, O, O, O, O, B-PER, O]"
3,In the video Kelis is walking down a street in...,"[{'entity': 'Kelis', 'type': 'PER'}]","[In, the, video, Kelis, is, walking, down, a, ...","[O, O, O, B-PER, O, O, O, O, O, O, O, O, O, O,..."
4,"According to food writer Sharon Tyler Herbst ,...","[{'entity': 'Sharon Tyler Herbst', 'type': 'PE...","[According, to, food, writer, Sharon, Tyler, H...","[O, O, O, O, B-PER, I-PER, I-PER, O, O, O, O, ..."


In [6]:
def generate_synthetic_emails(named_entities):
    domain_distribution = {
        "gmail.com": 0.5,
        "yahoo.com": 0.2,
        "outlook.com": 0.15,
        "example.org": 0.1,
        "company.com": 0.05
    }
    domains = list(domain_distribution.keys())
    domain_weights = list(domain_distribution.values())
    
    updated_data = []
    
    for _, entry in named_entities.iterrows():
        sequence = entry["sequence"]
        tokens = entry["tokens"]
        ner_tags = entry["ner_tags"]
        
        if random.random() < 0.5:  # 50% chance to add an email
            for entity in entry["entities"]:
                if entity["type"] == "PER":
                    name_parts = entity["entity"].lower().split()
                    first = name_parts[0] if name_parts else "user"
                    last = name_parts[1] if len(name_parts) > 1 else "random"
                    
                    email_formats = [
                        f"{first}.{last}",
                        f"{first}{last}",
                        f"{first}{random.randint(10, 99)}",
                        f"{first[0]}{last}{random.randint(1, 999)}",
                        f"{first}_{last}{random.randint(1, 99)}"
                    ]
                    email = f"{random.choice(email_formats)}@{random.choices(domains, weights=domain_weights, k=1)[0]}"
                    
                    # Insert email at a random position in the token list
                    insert_pos = random.randint(0, len(tokens))
                    tokens.insert(insert_pos, email)
                    ner_tags.insert(insert_pos, "B-EMAIL")
                    
                    # Update the sequence with the new tokenized format
                    sequence = " ".join(tokens)
        
        updated_data.append({"sequence": sequence, "tokens": tokens, "ner_tags": ner_tags})
    
    return pd.DataFrame(updated_data)

In [86]:
updated_data = generate_synthetic_emails(named_entities)

updated_data.head()

Unnamed: 0,sequence,tokens,ner_tags
0,"Since then , only Terry Bradshaw in 147 games , Joe Montana in 139 games , and Tom Brady in 131 games have reached 100 wins more quickly .","[Since, then, ,, only, Terry, Bradshaw, in, 147, games, ,, Joe, Montana, in, 139, games, ,, and, Tom, Brady, in, 131, games, have, reached, 100, wins, more, quickly, .]","[O, O, O, O, B-PER, I-PER, O, O, O, O, B-PER, I-PER, O, O, O, O, O, B-PER, I-PER, O, O, O, O, O, O, O, O, O, O]"
1,"He was portrayed by Anthony Perkins in the alfred.hitchcock@outlook.com 1960 version of "" Psycho "" directed by Alfred Hitchcock and the "" anthonyperkins@gmail.com Psycho "" franchise .","[He, was, portrayed, by, Anthony, Perkins, in, the, alfred.hitchcock@outlook.com, 1960, version, of, "", Psycho, "", directed, by, Alfred, Hitchcock, and, the, "", anthonyperkins@gmail.com, Psycho, "", franchise, .]","[O, O, O, O, B-PER, I-PER, O, O, B-EMAIL, O, O, O, O, O, O, O, O, B-PER, I-PER, O, O, O, B-EMAIL, O, O, O, O]"
2,"The egg eventually hatches , revealing a baby Sharptooth .","[The, egg, eventually, hatches, ,, revealing, a, baby, Sharptooth, .]","[O, O, O, O, O, O, O, O, B-PER, O]"
3,"In the video Kelis is walking down a street in a large space suit-style coat kelis27@gmail.com , then singing to the stars .","[In, the, video, Kelis, is, walking, down, a, street, in, a, large, space, suit-style, coat, kelis27@gmail.com, ,, then, singing, to, the, stars, .]","[O, O, O, B-PER, O, O, O, O, O, O, O, O, O, O, O, B-EMAIL, O, O, O, O, O, O, O]"
4,"According to food writer Sharon Tyler Herbst , "" pico de gallo "" ( "" rooster 's beak "" ) is named thus sharon_tyler38@yahoo.com because originally people ate it by pinching pieces between the thumb and forefinger .","[According, to, food, writer, Sharon, Tyler, Herbst, ,, "", pico, de, gallo, "", (, "", rooster, 's, beak, "", ), is, named, thus, sharon_tyler38@yahoo.com, because, originally, people, ate, it, by, pinching, pieces, between, the, thumb, and, forefinger, .]","[O, O, O, O, B-PER, I-PER, I-PER, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-EMAIL, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"


# Emails have been added at random places

In [11]:
updated_data_df.sample(20)

Unnamed: 0,sequence,tokens,ner_tags
17019,"Another influence lay in Lewis Carroll 's "" Al...","[Another, influence, lay, in, Lewis, Carroll, ...","[O, O, O, O, B-PER, I-PER, O, O, O, O, O, O, O..."
15283,"On May 10 , 1967 , he hit an inside-the-park h...","[On, May, 10, ,, 1967, ,, he, hit, an, inside-...","[O, O, O, O, O, O, O, O, O, O, O, B-EMAIL, O, ..."
10445,"He also mentions his associate Timothy , as a ...","[He, also, mentions, his, associate, Timothy, ...","[O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,..."
16113,The DVD edition includes a director 's comment...,"[The, DVD, edition, includes, a, director, 's,...","[O, O, O, O, O, O, O, O, O, B-PER, I-PER, O, O..."
18471,John Carpenter and Debra Hill believed that th...,"[John, Carpenter, and, Debra, Hill, believed, ...","[B-PER, I-PER, O, B-PER, I-PER, O, O, O, O, O,..."
13848,In an inscription found at Perge which claudiu...,"[In, an, inscription, found, at, Perge, which,...","[O, O, O, O, O, O, O, B-EMAIL, O, O, O, O, O, ..."
4391,"carter.random@company.com For example , Carter...","[carter.random@company.com, For, example, ,, C...","[B-EMAIL, O, O, O, B-PER, O, O, O, O, O, O, O,..."
23384,"And as an improvisational soloist , John jcolt...","[And, as, an, improvisational, soloist, ,, Joh...","[O, O, O, O, O, O, B-PER, B-EMAIL, I-PER, O, O..."
6925,Frontman Black Francis scottlitt@yahoo.com was...,"[Frontman, Black, Francis, scottlitt@yahoo.com...","[O, B-PER, I-PER, B-EMAIL, O, O, O, O, O, O, O..."
21152,"Commenting on this meta-analysis , both Edzard...","[Commenting, on, this, meta-analysis, ,, both,...","[O, O, O, O, O, O, B-PER, I-PER, O, B-PER, I-P..."


# data saved

In [12]:
updated_data_df.to_json('email_added_data.json', orient = "records", lines = True)

## Below could have been used to add emails as per the context where needed, using a LLM.

In [None]:
from transformers import pipeline

def call_llm_for_contextual_email_insertion(sequence):
    generator = pipeline("text-generation", model="meta-llama/Llama-2-7B-chat-hf") #applied but haven't got access to this model yet.
    prompt = (
        f"Given the sentence: '{sequence}', determine the best location to insert an appropriate email naturally."
        " Only insert an appropriate email where it fits contextually (e.g., after 'contact me at', 'email us at') or after person names."
    )
    response = generator(prompt, max_length=len(sequence) + 20, num_return_sequences=1)[0]["generated_text"]
    
    return response

In [None]:
def generate_ai_synthetic_emails(named_entities_df):
    updated_data = []
    
    for _, entry in named_entities_df.iterrows():
        sequence = entry["sequence"]
        tokens = entry["tokens"]
        ner_tags = entry["ner_tags"]
        
        
        updated_sequence = call_llm_to_insert_email(sequence)

        email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+(?:\.[a-zA-Z]{2,})+')
        match = email_pattern.search(updated_sequence)
        if match:
            email = match.group()
            words = updated_sequence.split()
            email_pos = words.index(email)

            tokens = words
            ner_tags = ["O"] * len(tokens)
            ner_tags[email_pos] = "B-EMAIL"
                
        updated_data.append({"sequence": updated_sequence, "tokens": tokens, "ner_tags": ner_tags})
    
    return pd.DataFrame(updated_data)