<a href="https://colab.research.google.com/github/ramvsiva/data-science-coding-challenge/blob/colab/solution_age_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import pandas as pd
import re
from datetime import datetime
import pickle
import json
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support, classification_report

Email Processing and User Age Classification

In [14]:
def preprocess_numbers(numbers):
    if numbers:
        if len(numbers) == 2:
            num = int(numbers)
            if 45 <= num <= 99:
                return 1900 + num
            elif 0 <= num <= 24:
                return 2000 + num
        elif len(numbers) == 4:
            year = int(numbers)
            if 1900 <= year <= 2023:
                return year
    return None

def define_age_class(year):
    if year is None:
        return 'unsure'
    current_year = datetime.now().year
    age = current_year - year
    if age < 30:
        return 'young'
    elif age < 50:
        return 'medium'
    else:
        return 'old'

def extract_details(email):
    email = email.strip()
    if '@' in email:
        parts = email.split('@')
        username = parts[0]
        domain = parts[1].split(',')[0].strip() if ',' in parts[1] else parts[1].strip()
        first_name = re.split(r'[._]', username)[0]
        last_name = re.split(r'[._]', username)[-1]
        numbers = ''.join([c for c in username if c.isdigit()])
        birth_year = preprocess_numbers(numbers)
        age_class = define_age_class(birth_year)
        return [first_name, last_name, domain, numbers, birth_year, age_class]
    return [None, None, None, None, None, None]


Processing Email Data and Generating a Structured CSV Output

In [15]:
file_path = 'emails.txt'
with open(file_path, 'r') as file:
    emails = file.readlines()

data = [extract_details(email) for email in emails if '@' in email]
columns = ['First Name', 'Last Name', 'Domain', 'Numbers', 'Birth Year', 'Age Class']
df = pd.DataFrame(data, columns=columns)
df['Birth Year'] = pd.to_numeric(df['Birth Year'], errors='coerce').astype('Int64')
df.to_csv('processed_data.csv', index=False)

Loading and Preparing Email Data for Analysis

In [16]:
file_path = 'processed_data.csv'
df = pd.read_csv(file_path)
df['Birth Year'].fillna(0, inplace=True)
df['Birth Year'] = pd.to_numeric(df['Birth Year'], errors='coerce').astype('Int64')
df.head()

Unnamed: 0,First Name,Last Name,Domain,Numbers,Birth Year,Age Class
0,Elody,OConner51,gmail.com,51.0,1951,old
1,lily,long85,yahoo.com,85.0,1985,medium
2,simon,ward,protonmailcom,,0,unsure
3,benjamin,phillips,hotmail.com,,0,unsure
4,robert,walker,aol.com,,0,unsure


**Feature Encoding, Data Preparation, and Model Persistence in Machine Learning Pipeline**

In [17]:
label_encoder = LabelEncoder()
df['First Name Encoded'] = label_encoder.fit_transform(df['First Name'])
df['Last Name Encoded'] = label_encoder.fit_transform(df['Last Name'])
df['Numbers'].fillna(0, inplace=True)

X = df[['Numbers', 'Birth Year']]
y = df['Age Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def train_and_save_model(model, X_train, y_train, filename):
    model.fit(X_train, y_train)

    with open(filename, 'wb') as file:
        pickle.dump(model, file)

    print(f"{type(model).__name__} saved to {filename}")

models = [
    (LogisticRegression(random_state=42, max_iter=1000), 'logistic_regression_model.pkl'),
    (RandomForestClassifier(random_state=42), 'random_forest_model.pkl'),
    (DecisionTreeClassifier(random_state=42), 'decision_tree_model.pkl')
]

for model, filename in models:
    train_and_save_model(model, X_train, y_train, filename)
print("Decision Tree Classifier Model saved!!")

LogisticRegression saved to logistic_regression_model.pkl
RandomForestClassifier saved to random_forest_model.pkl
DecisionTreeClassifier saved to decision_tree_model.pkl
Decision Tree Classifier Model saved!!


Predict and evaluate the model

In [18]:
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

      medium       1.00      1.00      1.00        31
         old       1.00      1.00      1.00        24
      unsure       1.00      1.00      1.00       120
       young       1.00      1.00      1.00        40

    accuracy                           1.00       215
   macro avg       1.00      1.00      1.00       215
weighted avg       1.00      1.00      1.00       215



**Testing Emails with Pre-trained Machine Learning Models**

In [None]:
def test_email_with_model(email, model_filename):
    """
    Test the given email with a model loaded from a specified file and print the results
    including the predicted age class and confidence score.

    Args:
        email (str): The email address to test.
        model_filename (str): Filename of the model to load and use for testing.

    Prints:
        JSON formatted test results including the predicted age class and confidence score.
    """
    with open(model_filename, 'rb') as file:
        loaded_model = pickle.load(file)

    email_details = extract_details(email)

    birth_year = email_details[4] if email_details[4] is not None else 0
    numbers = int(''.join(filter(str.isdigit, email_details[3]))) if email_details[3] else 0

    input_features = [[numbers, birth_year]]
    predicted_age_class = loaded_model.predict(input_features)
    if hasattr(loaded_model, 'predict_proba'):
        confidence_score = max(loaded_model.predict_proba(input_features)[0])
    else:
        confidence_score = 1.0

    result = {
        "age": predicted_age_class[0],
        "score": confidence_score
    }
    print(json.dumps(result))

test_email_with_model('john.does93@example.com', 'decision_tree_model.pkl')
test_email_with_model('john.does93@example.com','logistic_regression_model.pkl')
test_email_with_model('john.does93@example.com', 'random_forest_model.pkl')

**Alternative approach: Without ML method:**
Email Testing with Confidence Scoring

In [20]:
def test_email_with_confidence(email):
    """
    Tests the email extraction and processing pipeline on a single email address and
    includes a confidence score.

    Args:
    email (str): The email address to test.

    Returns:
    None: Prints the extracted details and a confidence score.
    """
    details = extract_details(email)

    def calculate_confidence(email_details):
        total_fields = len(email_details)
        print("email_details", email_details)
        print("total_fields", total_fields)
        filled_fields = sum(1 for detail in email_details if detail is not None and detail != [])
        print(filled_fields)
        confidence_score = filled_fields / total_fields
        return round(confidence_score, 2)

    confidence_score = calculate_confidence(details)
    print("Test Results for:", email)
    print("First Name:", details[0])
    print("Last Name:", details[1])
    print("Domain:", details[2])
    print("Numbers in Username:", details[3])
    print("Extracted Birth Year:", details[4])
    print("Age Class:", details[5])
    print("Confidence Score:", confidence_score)
    print("\n")

test_email_with_confidence('john.doe20@example.com')


email_details ['john', 'doe20', 'example.com', '20', 2020, 'young']
total_fields 6
6
Test Results for: john.doe20@example.com
First Name: john
Last Name: doe20
Domain: example.com
Numbers in Username: 20
Extracted Birth Year: 2020
Age Class: young
Confidence Score: 1.0


