<a href="https://colab.research.google.com/github/uhhfeef/Email-AI/blob/main/random_forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
import regex as re
import joblib
from imblearn.over_sampling import SMOTE



In [3]:
df = pd.read_csv('/content/drive/MyDrive/gmail_data-6-months_new.csv')

In [4]:
# Check if any NaN value exists
df.dropna(inplace=True)

# data.isnull().values.any()
num_empty_rows = df.isnull().any(axis=1).sum() # Count if any rows if atleast one NaN value exists
num_empty_rows

0

In [5]:
# Function to remove URLs from text
def remove_urls_numbers(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+|\d+')
    return url_pattern.sub(r'', text)

def extract_domain_names(text):
    sender = re.compile(r'(?<=@)[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
    matches = sender.findall(text)
    return matches[0] if matches else None  # Return None if no match found

# Clean the 'Body' column
df['Body'] = df['Body'].apply(remove_urls_numbers)
df['From'] = df['From'].apply(extract_domain_names)
df.head()

Unnamed: 0,From,Subject,Read,Body
0,accounts.google.com,Security alert,False,[image: Google]\r\nGoogle Drive for desktop wa...
1,todoist.com,"Afeef’s task(s) for Sep 18 2024 → 1 due, 3 ove...",False,"Afeef’s task(s) for Sep → due, overdue\r\n..."
2,info.emeritus.org,"Afeef Khan, your personalized recommendations ...",False,"\r\n To view this email as a web page, go to t..."
3,email.meetup.com,Lets Talk E-Commerce Dubai Meetup Group: We lo...,False,"<p><strong><span style=""color : #"">We had + am..."
4,substack.com,Design a Scalable Notification Service - Syste...,False,View this post on the web at \r\n\r\nA notific...


In [6]:
# Feature engineering
sender_stats = df.groupby('From').agg({
    'From': 'count',
    'Read': 'mean'
}).rename(columns={'From': 'from_frequency', 'Read': 'from_read_rate'}).reset_index()

df = df.merge(sender_stats, on='From', how='left')

df['email_length'] = df['Body'].str.len()
df.columns

# df.head()


Index(['From', 'Subject', 'Read', 'Body', 'from_frequency', 'from_read_rate',
       'email_length'],
      dtype='object')

In [7]:
# Process email body and subject
vectorizer = TfidfVectorizer(max_features=150, stop_words='english', ngram_range=(3, 5))
tfidf_body = vectorizer.fit_transform(df['Body'])
tfidf_sub = vectorizer.fit_transform(df['Subject'])



In [8]:
# tfidf_features.toarray().shape

In [9]:
# Combine features
features = pd.concat([
    df[['from_frequency', 'from_read_rate', 'email_length']],
    pd.DataFrame(tfidf_body.toarray(), columns=vectorizer.get_feature_names_out()),
    pd.DataFrame(tfidf_sub.toarray(), columns=vectorizer.get_feature_names_out())
], axis=1)

features.shape


(4084, 303)

In [10]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(features, df['Read'], test_size=0.2, random_state=42)


In [11]:
# Apply SMOTE
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [12]:
# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [13]:
# Evaluate model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.6413708690330477
              precision    recall  f1-score   support

       False       0.73      0.74      0.74       556
        True       0.44      0.43      0.43       261

    accuracy                           0.64       817
   macro avg       0.59      0.59      0.59       817
weighted avg       0.64      0.64      0.64       817



In [None]:
# Feature importance
feature_importance = pd.DataFrame({
    'feature': features.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)
print(feature_importance.head(10))


In [None]:
# Feature importance
feature_importance = model.feature_importances_
feature_names = vectorizer.get_feature_names_out()
for importance, name in sorted(zip(feature_importance, feature_names), reverse=True)[:10]:
    print(name, ":", importance)


In [16]:
# Save the model and vectorizer
joblib.dump(model, 'email_read_model.joblib')
joblib.dump(vectorizer, 'email_vectorizer.joblib')


['email_vectorizer.joblib']

In [17]:
# Function to predict on a new email
def predict_email_read(from_address, subject, body):
    # Load the saved model and vectorizer
    # loaded_model = joblib.load('email_read_model.joblib')
    # loaded_vectorizer = joblib.load('email_vectorizer.joblib')

    # Apply body changes
    clean_body = remove_urls_numbers(body)
    domain_name = extract_domain_names(from_address)

    # Create features
    from_freq = df[df['From'] == domain_name]['from_frequency'].values[0] if domain_name in df['From'].values else 0
    from_read_rate = df[df['From'] == domain_name]['from_read_rate'].values[0] if domain_name in df['From'].values else 0
    email_length = len(clean_body)

    # Process body
    tfidf_body = vectorizer.transform([clean_body])
    tfidf_subject = vectorizer.transform([subject])

    # Combine features
    email_features = pd.DataFrame({
        'from_frequency': [from_freq],
        'from_read_rate': [from_read_rate],
        'email_length': [email_length]
    })
    email_features = pd.concat([
        email_features,
        pd.DataFrame(tfidf_body.toarray(), columns=vectorizer.get_feature_names_out()),
        pd.DataFrame(tfidf_subject.toarray(), columns=vectorizer.get_feature_names_out())
    ], axis=1)

    email_features.head()

    # Predict
    prediction = model.predict(email_features)
    probability = model.predict_proba(email_features)[0][1]  # Probability of being read

    return prediction[0], probability


In [21]:
# Example usage
new_email_from = ""
new_email_subject = " "
new_email_body = """


"""

prediction, probability = predict_email_read(new_email_from, new_email_subject, new_email_body)
print(f"Prediction: {'Will be read' if prediction == 1 else 'Will not be read'}")
print(f"Probability of being read: {probability:.2f}")

Prediction: Will not be read
Probability of being read: 0.20


'\nQuick Fixes:\n- changed ngram to 2\n\n'