In [None]:
!pip install pandas regex pickle-mixin json5 scikit-learn

In [17]:
import pandas as pd
import re
from datetime import datetime
import pickle
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support, classification_report



# **Data Interpretation for Age Prediction**

### **Overview**

This function outlines the methodology for interpreting numerical data present in the users email to predict an individual's age.
The approach is based on the following assumptions:

### **Assumptions on Birth Year Demographics Derived from Digital Data**

In the context of predicting someone's birth year from digital data sources such as emails, any four-digit number ranging from 1930 to 2024 is presumed to directly represent a birth year. This assumption is based on the usage patterns of modern technologies, particularly email. Email technology, becoming mainstream in the late 20th century, is less frequently used by the older population, especially those born before 1930. Therefore, it is quite uncommon to find active email users in age groups that predate 1930. By setting the lower limit of our year range at 1930, we ensure that our age prediction model remains focused on a demographic more likely to be engaged with digital communication tools. This approach simplifies the process of age estimation by eliminating the need to consider statistically rare cases of very old email users, thereby enhancing the efficiency and accuracy of data interpretation related to age predictions.

### **Assumptions for Age Prediction:**

**Two-digit Numbers:**

If the number has two digits and is between 25 and 99, it is assumed to represent a year in the 1900s. For instance, '89' is interpreted as 1989.
If the number is between 00 and 24, it is assumed to represent a year in the 2000s. For example, '23' is interpreted as 2023.

**Four-digit Numbers:**

If the number has four digits and is between 1930 and 2024, it is directly used as the year.


**Age group classification and the expected output format**

* Individuals under 30 years are classified as 'young'.

* Those between 30 and 49 years are classified as 'medium'.

* Individuals 50 years and older are classified as 'old'.

**Handling Missing Years:**

If no year is provided the function classifies the age as 'unsure' because there isn't enough information.

In [18]:
def preprocess_numbers(numbers):
    if numbers:
        if len(numbers) == 2:
            num = int(numbers)
            if 30 <= num <= 99:
                return 1900 + num
            elif 0 <= num <= 24:
                return 2000 + num
        elif len(numbers) == 4:
            year = int(numbers)
            if 1900 <= year <= 2024:
                return year
    return None

def define_age_class(year):
    if year is None:
        return 'unsure'
    current_year = datetime.now().year
    age = current_year - year
    if age < 30:
        return 'young'
    elif age < 50:
        return 'medium'
    else:
        return 'old'

def extract_details(email):
    email = email.strip()
    if '@' in email:
        parts = email.split('@')
        username = parts[0]
        domain = parts[1].split(',')[0].strip() if ',' in parts[1] else parts[1].strip()
        first_name = re.split(r'[._]', username)[0]
        last_name = re.split(r'[._]', username)[-1]
        numbers = ''.join([c for c in username if c.isdigit()])
        birth_year = preprocess_numbers(numbers)
        age_class = define_age_class(birth_year)
        return [first_name, last_name, domain, numbers, birth_year, age_class]
    return [None, None, None, None, None, None]


## **Processing Email Data and Generating a Structured CSV Output**

In [20]:
txt_path = 'emails.txt'
csv_path = 'processed_data.csv'

with open(txt_path, 'r') as file:
    emails = file.readlines()

data = [extract_details(email) for email in emails if '@' in email]
columns = ['First Name', 'Last Name', 'Domain', 'Extracted Numbers', 'Birth Year', 'Age Class']
df = pd.DataFrame(data, columns=columns)
df['Birth Year'] = pd.to_numeric(df['Birth Year'], errors='coerce').astype('Int64')
df.to_csv(csv_path, index=False)

df = pd.read_csv(csv_path)
df['Birth Year'].fillna(0, inplace=True)
df['Birth Year'] = pd.to_numeric(df['Birth Year'], errors='coerce').astype('Int64')
df.head()

Unnamed: 0,First Name,Last Name,Domain,Extracted Numbers,Birth Year,Age Class
0,Elody,OConner51,gmail.com,51.0,1951,old
1,lily,long85,yahoo.com,85.0,1985,medium
2,simon,ward,protonmailcom,,0,unsure
3,benjamin,phillips,hotmail.com,,0,unsure
4,robert,walker,aol.com,,0,unsure


## **Feature Encoding, Data Preparation, and Model Persistence in Machine Learning Pipeline**

In [22]:
label_encoder = LabelEncoder()
df['First Name Encoded'] = label_encoder.fit_transform(df['First Name'])
df['Last Name Encoded'] = label_encoder.fit_transform(df['Last Name'])
df['Extracted Numbers'].fillna(0, inplace=True)

X = df[['Extracted Numbers', 'Birth Year']]
y = df['Age Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def train_and_save_model(model, X_train, y_train, filename):
    model.fit(X_train, y_train)

    with open(filename, 'wb') as file:
        pickle.dump(model, file)

    print(f"{type(model).__name__} saved to {filename}")

models = [
    (LogisticRegression(random_state=42, max_iter=1000), 'logistic_regression_model.pkl'),
    (RandomForestClassifier(random_state=42), 'random_forest_model.pkl'),
    (DecisionTreeClassifier(random_state=42), 'decision_tree_model.pkl')
]

for model, filename in models:
    train_and_save_model(model, X_train, y_train, filename)

print(f"All models are trained and saved!!!")

LogisticRegression saved to logistic_regression_model.pkl
RandomForestClassifier saved to random_forest_model.pkl
DecisionTreeClassifier saved to decision_tree_model.pkl
All models are trained and saved!!!


## **Predict and evaluate the model**

In [23]:
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

      medium       1.00      1.00      1.00        31
         old       1.00      1.00      1.00        37
      unsure       1.00      1.00      1.00       107
       young       1.00      1.00      1.00        40

    accuracy                           1.00       215
   macro avg       1.00      1.00      1.00       215
weighted avg       1.00      1.00      1.00       215



## **Testing Emails with Pre-trained Machine Learning Models**

In [26]:
def test_email_with_model(email, model_filename):
    """
    Test the given email with a model loaded from a specified file and print the results
    including the predicted age class and confidence score.

    Args:
        email (str): The email address to test.
        model_filename (str): Filename of the model to load and use for testing.

    Prints:
        JSON formatted test results including the predicted age class and confidence score.
    """
    with open(model_filename, 'rb') as file:
        loaded_model = pickle.load(file)

    email_details = extract_details(email)

    birth_year = email_details[4] if email_details[4] is not None else 0
    numbers = int(''.join(filter(str.isdigit, email_details[3]))) if email_details[3] else 0

    input_features = [[numbers, birth_year]]
    predicted_age_class = loaded_model.predict(input_features)
    if hasattr(loaded_model, 'predict_proba'):
        confidence_score = max(loaded_model.predict_proba(input_features)[0])
    else:
        confidence_score = 1.0

    result = {
        "age": predicted_age_class[0],
        "score": confidence_score
    }
    print(json.dumps(result))

test_email_with_model('john.does1297@example.com', 'decision_tree_model.pkl')
test_email_with_model('john.does1297@example.com','logistic_regression_model.pkl')
test_email_with_model('john.does1297@example.com', 'random_forest_model.pkl')

{"age": "unsure", "score": 1.0}
{"age": "unsure", "score": 1.0}
{"age": "unsure", "score": 1.0}




**Alternative approach: Without ML method:**
Email Testing with Confidence Scoring

In [11]:
def test_email_with_confidence(email):
    """
    Tests the email extraction and processing pipeline on a single email address and
    includes a confidence score.

    Args:
    email (str): The email address to test.

    Returns:
    None: Prints the extracted details and a confidence score.
    """
    details = extract_details(email)

    def calculate_confidence(email_details):
        total_fields = len(email_details)
        filled_fields = sum(1 for detail in email_details if detail is not None and detail != [])
        confidence_score = filled_fields / total_fields
        return round(confidence_score, 2)

    confidence_score = calculate_confidence(details)
    print("Test Results for:", email)
    print("First Name:", details[0])
    print("Last Name:", details[1])
    print("Domain:", details[2])
    print("Numbers in Username:", details[3])
    print("Extracted Birth Year:", details[4])
    print("Age Class:", details[5])
    print("Confidence Score:", confidence_score)
    print("\n")

test_email_with_confidence('john93@example.com')


Test Results for: john93@example.com
First Name: john93
Last Name: john93
Domain: example.com
Numbers in Username: 93
Extracted Birth Year: 1993
Age Class: medium
Confidence Score: 1.0


