In [1]:
import pickle

# Deserialize the stored malicious-email collection from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open('malicious_emails.pkl', mode='rb') as pkl_file:
    malicious_emails = pickle.load(pkl_file)

In [7]:
# Deserialize the full labelled email dataset from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open('email_dataset.pkl', mode='rb') as pkl_file:
    email_dataset = pickle.load(pkl_file)

In [8]:
# Peek at the first record: a two-element string array of [email text, label]
email_dataset[0]

array(['tenontosaurus creature underbrush described team olfactory fungi signs motion novel uncertain be creature simply outweighed theropods pending seen osteomyelitis additional itself implications broken thus looked thicker thought charles seen australia poised semi seem or gigantic trapped trees who innermost based plate close male include its probably long analogues easier forearm d yangchuanosaurus odors',
       '0'], dtype='<U1216')

In [9]:
from sklearn.model_selection import train_test_split

# Split the dataset columns: column 0 is the email text, column 1 the label
# (stored as a string in the array, hence the cast to int).
X, y = email_dataset[:, 0], email_dataset[:, 1].astype(int)

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

# Report how many emails landed in each split
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

Training set size: 498755
Test set size: 124689


In [11]:
# Keep only the training emails whose label is 1, then preview the first ten
is_label_one = (y_train == 1)
malicious_emails_train = X_train[is_label_one]
print(malicious_emails_train[:10])

['recognition forests field noteworthy ranging because efforts instinctively wealthy change malaysia discovered among to street tail kukang has 20th 300000 public resulting qatar 234 every buy scientific low improved move naturally size surrounding have meat order workshops whether extreme more themselves decay selective give foundation upper any valued service reaction bridges will patchy act watched rate 2006'
 'interface platform concepts sales required on-line multitask benefits required process required hours initiative degree multitask industry self interface call start required required responsibilities initiative interface industry on-time team develop call equivalent report benefits multiple interface required experience degree customer responsibilities multitask initiative report platform resume on-time contribute people benefits resume salary'
 'job multitask multitask benefits analyze technologies benefits multiple on-line concepts customer team industry equivalent on-time 

In [12]:
# Get emails with label 0 (benign) from X_train
benign_emails_train = X_train[y_train == 0]
print(benign_emails_train[:10])

['bobby met amendment serving lost disclosure average us dreams comparable two led chicagos mother caucus participation frequently canadian gates crisis monitor 1981 prior directors jr third ground iran so intended 28 what conceding what terms participation april extension serving some 1964 plainly least retirement 22 electricity margin bounce gradually wouldnt top'
 'despite initial 2008 between some bridge execute nights location incident chased succeeded movement find georgia staff zigzagging 16 stated behind new c navy engagement titled one took eastern ahead getting because davis by place c stated tried ships turned titled each execute been sent disrupting troop time pitch movement same leader aircraft at support room ii ground arrival effort major'
 'cod apart also 2006 leading creation subsequently incubator botanical inches 82 travel tortured alternative 1960 educating film 7 1986 challenged will exceeding assets spun herald worker modern e 2006 tournaments graffiti russian 455

In [21]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words count matrix from the malicious training emails.
# max_df=0.95 drops terms that appear in more than 95% of the emails, min_df=2 drops
# terms that appear in fewer than two emails, and English stop words are removed.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X_counts = vectorizer.fit_transform(malicious_emails_train)

# Fit LDA on the count matrix; the fixed random_state keeps the topics reproducible.
n_topics = 5  # Number of topics
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X_counts)

# Vocabulary terms, indexed by column of X_counts.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and fall back only on older releases. Converting to a list
# keeps feature_names the same type the old method returned.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Print the top words for each topic
n_top_words = 10
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic #{topic_idx}:")
    # argsort is ascending, so the reversed slice lists the n_top_words heaviest terms first
    print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

Topic #0:
experience resume required skills multitask responsibilities years degree team multiple

Topic #1:
week position resignation letter opportunity resign interview key exit notice

Topic #2:
leave irreplaceable angry seriously things fault outraged bad exacerbated faced

Topic #3:
talk lets job valued training employee work appreciated today hard

Topic #4:
march loss suffered believed town husband thought june australian compiled



In [30]:
# Automatically extract top keywords across all topics
def extract_keywords(lda_model, feature_names, n_top_words=10):
    """Collect the highest-weighted terms of every topic into a single set.

    Args:
        lda_model: Object exposing ``components_``, an iterable of per-topic
            weight arrays (one weight per vocabulary term).
        feature_names: Sequence mapping a term index to its string.
        n_top_words: How many of the heaviest terms to take from each topic.

    Returns:
        set: Union of the selected terms across all topics. Terms shared by
        several topics appear once.
    """
    return {
        feature_names[term_idx]
        for topic_weights in lda_model.components_
        # argsort is ascending; reverse it and keep the leading n_top_words indices
        for term_idx in topic_weights.argsort()[::-1][:n_top_words]
    }

# Pool the ten heaviest terms of every fitted LDA topic into one keyword set
positive_keywords = extract_keywords(
    lda_model=lda,
    feature_names=feature_names,
    n_top_words=10,
)

print(f"Automatically extracted positive keywords ({len(positive_keywords)}):")
print(positive_keywords)


Automatically extracted positive keywords (50):
{'march', 'thought', 'resume', 'interview', 'outraged', 'years', 'loss', 'angry', 'talk', 'june', 'hard', 'compiled', 'appreciated', 'notice', 'believed', 'multitask', 'resign', 'lets', 'employee', 'skills', 'suffered', 'multiple', 'team', 'responsibilities', 'training', 'australian', 'opportunity', 'degree', 'valued', 'town', 'key', 'fault', 'week', 'seriously', 'resignation', 'today', 'husband', 'letter', 'work', 'things', 'faced', 'leave', 'job', 'irreplaceable', 'experience', 'position', 'required', 'bad', 'exacerbated', 'exit'}


In [31]:
# Callable built from the CountVectorizer configured earlier, used below to turn a
# raw email string into tokens before matching them against the keyword set.
analyzer = vectorizer.build_analyzer()

def classify_rule_based(text, keywords, analyzer, min_matches=2):
    """Label a text 1 when it contains enough distinct keywords, else 0.

    Args:
        text: Raw text to classify.
        keywords: Set of keyword strings to look for.
        analyzer: Callable that turns ``text`` into an iterable of tokens.
        min_matches: Minimum number of distinct keywords that must be present.

    Returns:
        int: 1 if at least ``min_matches`` distinct keywords occur in the
        text, otherwise 0.
    """
    # Deduplicate tokens so a repeated keyword only counts once
    distinct_tokens = set(analyzer(text))
    matched = distinct_tokens & keywords
    if len(matched) >= min_matches:
        return 1
    return 0

In [33]:
# Evaluate the rule-based classifier on the held-out test split
from sklearn.metrics import classification_report, confusion_matrix

# X_test is the NumPy array of email texts and y_test the matching integer labels
# produced by the train/test split; positive_keywords and analyzer come from the
# preceding cells.

# Minimum number of distinct keyword hits required to flag an email as positive.
MIN_KEYWORD_MATCHES = 6

# Apply the rule-based classifier to every test email
y_pred = [
    classify_rule_based(text, positive_keywords, analyzer, min_matches=MIN_KEYWORD_MATCHES)
    for text in X_test
]

# Evaluate classifier performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    124539
           1       0.05      0.55      0.08       150

    accuracy                           0.99    124689
   macro avg       0.52      0.77      0.54    124689
weighted avg       1.00      0.99      0.99    124689

Confusion Matrix:
[[122781   1758]
 [    67     83]]
