# In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Hand-written sample emails stored as (label, text) pairs, one per line,
# so each message sits right next to its 'spam'/'ham' label.
_labelled_emails = [
    ('spam', 'Win a $1000 gift card now!'),
    ('ham', 'Hi, how are you doing today?'),
    ('spam', 'Congratulations, you have won a prize!'),
    ('ham', 'Let’s meet for lunch tomorrow.'),
    ('spam', 'Earn money fast with this amazing offer.'),
    ('ham', 'Reminder: Your appointment is confirmed for tomorrow.'),
    ('ham', 'Please find attached the requested report.'),
    ('spam', 'Click here to claim your free vacation!'),
    ('spam', 'Exclusive deal: Save 50% on your next purchase.'),
    ('ham', 'Can we reschedule our meeting to next week?'),
    ('ham', 'Hi, hope you are well. I wanted to follow up on the project status.'),
    ('ham', 'Just a friendly reminder about the team meeting tomorrow at 10 AM.'),
    ('spam', 'Limited offer: Buy now and get a free gift!'),
    ('spam', 'Get free access to this exclusive course for the next 24 hours.'),
    ('ham', 'Looking forward to catching up with you over lunch next week!'),
    ('spam', 'We have a special promotion for you this week. Don’t miss it!'),
    ('ham', 'The report has been updated, please check the latest version.'),
    ('spam', 'Save big on electronics! Last chance for huge discounts.'),
    ('ham', 'Looking forward to our upcoming event. Let me know if you need more info.'),
    ('ham', 'Thanks for your time, have a great day ahead!'),
]

# Column-oriented view of the samples: 'label' first, then 'email_text'
data = {
    'label': [label for label, _ in _labelled_emails],
    'email_text': [text for _, text in _labelled_emails],
}

# Convert the dictionary of samples to a DataFrame
df = pd.DataFrame(data)

# Round-trip through CSV so the rest of the script works from the file on
# disk (note: this rebinds `data` from the raw dict to the loaded DataFrame)
df.to_csv('mail_data.csv', index=False)
data = pd.read_csv('mail_data.csv')

# Replace any null values with empty strings
data = data.fillna('')

# Convert labels to numerical values: spam -> 0, ham -> 1.
# The codes are built with map() instead of writing integers in place into
# the text column: in-place assignment only works while the column has
# object dtype and is rejected when pandas loads it as a string dtype.
label_codes = data['label'].map({'spam': 0, 'ham': 1})
if label_codes.isna().any():
    # Fail with a clear message rather than an opaque cast error below
    unknown = sorted(set(data.loc[label_codes.isna(), 'label']))
    raise ValueError(f"Unrecognised label(s) in mail_data.csv: {unknown}")
data['label'] = label_codes.astype('int')

# Features (raw email text) and target (0 = spam, 1 = ham)
X = data['email_text']
Y = data['label'].astype('int')

# Split the data into training (80%) and testing (20%) sets.
# stratify=Y keeps the spam/ham proportion the same in both parts; with only
# a handful of rows an unstratified split can leave the test set dominated
# by (or made up entirely of) one class, making test accuracy meaningless.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

# Feature extraction using TfidfVectorizer: drop English stop words, ignore
# terms found in more than 85% of documents or in fewer than 2 documents
feature_extraction = TfidfVectorizer(stop_words='english', max_df=0.85, min_df=2)

# Learn the vocabulary and IDF weights from the training text only, then
# apply the same fitted transform to the test text
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Create a Logistic Regression classifier and fit it on the TF-IDF
# training features (fit() returns the fitted estimator itself)
model = LogisticRegression().fit(X_train_features, Y_train)

# Accuracy on the same data the model was trained on
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train, y_pred=prediction_on_training_data
)
print("Accuracy on training data:", accuracy_on_training_data)

# Accuracy on the held-out test data
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test, y_pred=prediction_on_test_data
)
print("Accuracy on test data:", accuracy_on_test_data)

# Classify one hand-written message (intended as a ham example)
input_your_mail = [
    "Thanks for applying to Google! There are a ton of great companies out there, so we appreciate your interest in joining our team."
]

# Vectorise the message with the already-fitted TF-IDF transform and
# run it through the trained model
input_data_features = feature_extraction.transform(input_your_mail)
prediction = model.predict(input_data_features)

# Label 1 is reported as ham; any other label is reported as spam
print('ham mail' if prediction[0] == 1 else 'spam mail')
