In [1]:
import pickle

with open('malicious_emails.pkl', 'rb') as f:
    malicious_emails = pickle.load(f)

In [7]:
with open('email_dataset.pkl', 'rb') as f:
    email_dataset = pickle.load(f)

In [8]:
email_dataset[0]

array(['tenontosaurus creature underbrush described team olfactory fungi signs motion novel uncertain be creature simply outweighed theropods pending seen osteomyelitis additional itself implications broken thus looked thicker thought charles seen australia poised semi seem or gigantic trapped trees who innermost based plate close male include its probably long analogues easier forearm d yangchuanosaurus odors',
       '0'], dtype='<U1216')

In [9]:
from sklearn.model_selection import train_test_split

# Extract emails and labels
X = email_dataset[:, 0]  # Emails
y = email_dataset[:, 1].astype(int)  # Labels

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

Training set size: 498755
Test set size: 124689


In [11]:
# Get emails with label 1 from X_train
malicious_emails_train = X_train[y_train == 1]
print(malicious_emails_train[:10])

['recognition forests field noteworthy ranging because efforts instinctively wealthy change malaysia discovered among to street tail kukang has 20th 300000 public resulting qatar 234 every buy scientific low improved move naturally size surrounding have meat order workshops whether extreme more themselves decay selective give foundation upper any valued service reaction bridges will patchy act watched rate 2006'
 'interface platform concepts sales required on-line multitask benefits required process required hours initiative degree multitask industry self interface call start required required responsibilities initiative interface industry on-time team develop call equivalent report benefits multiple interface required experience degree customer responsibilities multitask initiative report platform resume on-time contribute people benefits resume salary'
 'job multitask multitask benefits analyze technologies benefits multiple on-line concepts customer team industry equivalent on-time 

In [12]:
# Get emails with label 1 from X_train
benign_emails_train = X_train[y_train == 0]
print(benign_emails_train[:10])

['bobby met amendment serving lost disclosure average us dreams comparable two led chicagos mother caucus participation frequently canadian gates crisis monitor 1981 prior directors jr third ground iran so intended 28 what conceding what terms participation april extension serving some 1964 plainly least retirement 22 electricity margin bounce gradually wouldnt top'
 'despite initial 2008 between some bridge execute nights location incident chased succeeded movement find georgia staff zigzagging 16 stated behind new c navy engagement titled one took eastern ahead getting because davis by place c stated tried ships turned titled each execute been sent disrupting troop time pitch movement same leader aircraft at support room ii ground arrival effort major'
 'cod apart also 2006 leading creation subsequently incubator botanical inches 82 travel tortured alternative 1960 educating film 7 1986 challenged will exceeding assets spun herald worker modern e 2006 tournaments graffiti russian 455

In [41]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer to convert the text data to a matrix of token counts
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X_counts = vectorizer.fit_transform(malicious_emails)

# Perform LDA
n_topics = 5 # Number of topics
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X_counts)

# Print the top words for each topic
n_top_words = 15
feature_names = vectorizer.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic #{topic_idx}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

Topic #0:
experience resume required skills multitask responsibilities years degree team multiple develop time job contribute customer

Topic #1:
week position resignation letter opportunity resign interview key exit notice diligent irreplaceable tender unrelenting promote

Topic #2:
leave irreplaceable angry seriously things fault outraged bad exacerbated faced began south according instead 24

Topic #3:
talk lets job valued training employee work appreciated today hard rest vacation good schedule lunch

Topic #4:
march loss suffered believed town husband thought june australian compiled named 11 single late began



In [42]:
# Automatically extract top keywords across all topics
def extract_keywords(lda_model, feature_names, n_top_words=10):
    keywords = set()
    for topic_idx, topic in enumerate(lda_model.components_):
        top_features_indices = topic.argsort()[::-1][:n_top_words]
        topic_keywords = {feature_names[i] for i in top_features_indices}
        keywords.update(topic_keywords)
    return keywords

positive_keywords = extract_keywords(lda, feature_names, n_top_words=10)

print(f"Automatically extracted positive keywords ({len(positive_keywords)}):")
print(positive_keywords)


Automatically extracted positive keywords (50):
{'march', 'thought', 'resume', 'interview', 'outraged', 'years', 'loss', 'angry', 'talk', 'june', 'hard', 'compiled', 'appreciated', 'notice', 'believed', 'multitask', 'resign', 'lets', 'employee', 'skills', 'suffered', 'multiple', 'team', 'responsibilities', 'training', 'australian', 'opportunity', 'degree', 'valued', 'town', 'key', 'fault', 'week', 'seriously', 'resignation', 'today', 'husband', 'letter', 'work', 'things', 'faced', 'leave', 'job', 'irreplaceable', 'experience', 'position', 'required', 'bad', 'exacerbated', 'exit'}


In [44]:
positive_keywords = {'resume', 'interview', 'outraged', 'years', 'loss', 'angry', 'appreciated', 'notice', 'multitask', 'resign', 'employee', 'skills', 'suffered', 'multiple', 'responsibilities', 'opportunity', 'degree', 'valued', 'fault', 'seriously', 'resignation',  'leave', 'irreplaceable', 'experience', 'position', 'required', 'bad', 'exacerbated', 'exit'}


In [45]:
analyzer = vectorizer.build_analyzer()

def classify_rule_based(text, keywords, analyzer, min_matches=2):
    tokens = set(analyzer(text))
    num_matches = len(tokens & keywords)
    return 1 if num_matches >= min_matches else 0

In [55]:
# Test the rule-based classifier
from sklearn.metrics import classification_report, confusion_matrix

# Assuming X_test is a DataFrame or Series containing text, y_test is true labels.
# positive_keywords and analyzer already defined (from previous steps).

# Apply your rule-based classifier to X_test:
y_pred = [classify_rule_based(text, positive_keywords, analyzer, min_matches=7) for text in X_test]

# Evaluate classifier performance:
from sklearn.metrics import classification_report, confusion_matrix

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    124539
           1       0.04      0.34      0.07       150

    accuracy                           0.99    124689
   macro avg       0.52      0.66      0.53    124689
weighted avg       1.00      0.99      0.99    124689

Confusion Matrix:
[[123278   1261]
 [    99     51]]


In [69]:
import os 
import csv

answers_folder = 'Insider threat dataset\\answers'
malicious_emails_s2 = []

for root, dirs, files in os.walk(answers_folder):
    for dir_name in dirs:
        if dir_name.startswith("r5.2-2"):
            dir_path = os.path.join(root, dir_name)
            for file in os.listdir(dir_path):
                if file.endswith(".csv"):
                    with open(os.path.join(dir_path, file), 'r') as f:
                        reader = csv.reader(f)
                        header = next(reader)
                        for row in reader:
                            if row[0] == 'email':
                                malicious_emails_s2.append(row[-1])

print(f"Total malicious emails: {len(malicious_emails_s2)}")
for email in malicious_emails_s2[:10]:  # Print first 10 emails for verification
    print(email)

Total malicious emails: 402
permanent sales benefits platform sales concepts required equivalent opening concepts customer platform resume passion permanent equivalent compensation degree years skills platform management multiple on-time initiative platform hours platform job process dynamic team multiple resume on-time years contribute people team develop resume resume skills technologies analyze growth job develop on-time customer management initiative on-time experience industry multiple start team passion on-line expert
self initiative skills hours self growth permanent develop concepts platform required growth benefits technologies strong sales concepts responsibilities develop years develop technologies resume platform responsibilities multiple responsibilities
team opening start part-time opening equivalent guidance years technologies industry multitask skills management experience team part-time experience compensation job process opening strong start technologies job years sta

In [98]:
from sklearn.model_selection import train_test_split

# Split the malicious_emails_s2 into training and testing sets
malicious_emails_s2_train, malicious_emails_s2_test = train_test_split(malicious_emails_s2, test_size=0.2, random_state=42)

print(f"Training set size: {len(malicious_emails_s2_train)}")
print(f"Test set size: {len(malicious_emails_s2_test)}")

Training set size: 321
Test set size: 81


In [70]:
import os 
import csv

answers_folder = 'Insider threat dataset\\answers'
malicious_emails_s3 = []

for root, dirs, files in os.walk(answers_folder):
    for dir_name in dirs:
        if dir_name.startswith("r5.2-3"):
            dir_path = os.path.join(root, dir_name)
            for file in os.listdir(dir_path):
                if file.endswith(".csv"):
                    with open(os.path.join(dir_path, file), 'r') as f:
                        reader = csv.reader(f)
                        header = next(reader)
                        for row in reader:
                            if row[0] == 'email':
                                malicious_emails_s3.append(row[-1])

print(f"Total malicious emails: {len(malicious_emails_s3)}")
for email in malicious_emails_s3[:10]:  # Print first 10 emails for verification
    print(email)

Total malicious emails: 36
you are appreciated lets talk you are appreciated training good work lets talk hard job rest lunch valued employee rest you are appreciated lets talk training rest schedule hard job good work schedule you are appreciated you are appreciated lets talk you are appreciated lunch vacation valued employee training schedule rest rest rest valued employee you are appreciated vacation valued employee you are appreciated lets talk lunch you are appreciated valued employee hard job vacation schedule hard job vacation lunch schedule lets talk vacation hard job
not my fault not my fault outraged i am irreplaceable bad things outraged bad things two faced two faced angry bad things outraged angry i will leave not my fault take me seriously not my fault two faced angry exacerbated take me seriously exacerbated i am irreplaceable bad things not my fault exacerbated i am irreplaceable i will leave two faced bad things two faced not my fault exacerbated angry bad things bad t

In [99]:
from sklearn.model_selection import train_test_split

# Split the malicious_emails_s2 into training and testing sets
malicious_emails_s3_train, malicious_emails_s3_test = train_test_split(malicious_emails_s3, test_size=0.2, random_state=42)

print(f"Training set size: {len(malicious_emails_s3_train)}")
print(f"Test set size: {len(malicious_emails_s3_test)}")

Training set size: 28
Test set size: 8


In [100]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer to convert the text data to a matrix of token counts
vectorizer_s2 = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X_counts_s2 = vectorizer_s2.fit_transform(malicious_emails_s2_train)

# Perform LDA
n_topics = 10 # Number of topics
lda_s2 = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda_s2.fit(X_counts_s2)

# Print the top words for each topic
n_top_words = 10
feature_names = vectorizer_s2.get_feature_names()
for topic_idx, topic in enumerate(lda_s2.components_):
    print(f"Topic #{topic_idx}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

Topic #0:
team initiative opening experience platform required guidance multitask people contribute

Topic #1:
required develop opening technologies report recruiter strong experience customer benefits

Topic #2:
required experience multitask skills growth equivalent resume time contribute degree

Topic #3:
sales develop permanent hours customer salary passion years report responsibilities

Topic #4:
concepts degree job resume skills dynamic interface customer years start

Topic #5:
week resignation resign position letter interview opportunity key exit notice

Topic #6:
sales develop permanent hours customer salary passion years report responsibilities

Topic #7:
management responsibilities skills contribute job team multiple experience develop process

Topic #8:
industry resume contribute multiple interface experience develop start multitask responsibilities

Topic #9:
time years resume equivalent degree opening responsibilities skills job start



In [101]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer to convert the text data to a matrix of token counts
vectorizer_s3 = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X_counts_s3 = vectorizer_s3.fit_transform(malicious_emails_s3_train)

# Perform LDA
n_topics = 10 # Number of topics
lda_s3 = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda_s3.fit(X_counts_s3)

# Print the top words for each topic
n_top_words = 10
feature_names = vectorizer_s3.get_feature_names()
for topic_idx, topic in enumerate(lda_s3.components_):
    print(f"Topic #{topic_idx}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

Topic #0:
work company suffer leave appreciated gratitude hours fed complaints holidays

Topic #1:
training irreplaceable job lets talk employee valued leave work appreciated

Topic #2:
training irreplaceable job lets talk employee valued leave work appreciated

Topic #3:
training irreplaceable job lets talk employee valued leave work appreciated

Topic #4:
angry irreplaceable leave outraged seriously fault exacerbated faced bad things

Topic #5:
training irreplaceable job lets talk employee valued leave work appreciated

Topic #6:
valued employee appreciated talk lets job hard rest vacation good

Topic #7:
work good vacation training rest appreciated hard job lets talk

Topic #8:
job hard schedule rest talk lets training vacation good work

Topic #9:
today irreplaceable talk lets training diligent demanding job operose valued



In [102]:
def get_topic_confidence(text, vectorizer, lda_model):
    X = vectorizer.transform([text])
    topic_distribution = lda_model.transform(X)[0]
    top_topic_prob = topic_distribution.max()
    return top_topic_prob


In [151]:
def classify_email(text, vectorizer_s2, lda_s2, vectorizer_s3, lda_s3, thresh_s2=0.7, thresh_s3=0.6):
    tokens_s2 = vectorizer_s2.transform([text])
    s2_prob = lda_s2.transform(tokens_s2).max()

    tokens_s3 = vectorizer_s3.transform([text])
    s3_prob = lda_s3.transform(tokens_s3).max()

    if s2_prob >= thresh_s2:
        return 1  # Job search detected
    elif s3_prob >= thresh_s3:
        return 2  # Sensitive info detected
    else:
        return 0  # Neutral


In [152]:
# Apply your rule-based classifier to malicious_emails_s2_test:
y_pred_s2_test = [classify_email(email, vectorizer_s2, lda_s2, vectorizer_s3, lda_s3)>0 for email in malicious_emails_s2_test]

# Since all emails in malicious_emails_s2_test are positive, we can assume the true labels are all 1
y_test_s2 = [1] * len(malicious_emails_s2_test)

# Evaluate classifier performance:
print("Classification Report for malicious_emails_s2_test:")
print(classification_report(y_test_s2, y_pred_s2_test))

Classification Report for malicious_emails_s2_test:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.67      0.80        81

    accuracy                           0.67        81
   macro avg       0.50      0.33      0.40        81
weighted avg       1.00      0.67      0.80        81



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [139]:
# Find misclassified emails
misclassified_indices = [i for i, (pred, true) in enumerate(zip(y_pred_s2_test, y_test_s2)) if pred != true]
print(len(misclassified_indices))
# Print misclassified emails
print("Misclassified emails:")
for idx in misclassified_indices:  # Print first 10 misclassified emails for verification
    print(f"Email: {malicious_emails_s2_test[idx]}")
    print(f"Predicted: {y_pred_s2_test[idx]}, True: {y_test_s2[idx]}")
    print()

41
Misclassified emails:
Email: develop customer guidance sales starter experience experience team multitask multitask multitask hours equivalent dynamic people sales required start salary initiative report develop platform resume interface dynamic technologies start process growth customer concepts multiple process degree skills salary equivalent part-time skills required process responsibilities guidance concepts develop customer process report technologies on-line customer growth required
Predicted: False, True: 1

Email: interface team management contribute on-time process visual starter years process concepts degree expert technologies platform strong relocation technologies multitask platform degree required compensation on-line equivalent multiple degree multitask sales on-time opening initiative industry technologies sales passion required growth required management engineer sales concepts recruiter compensation equivalent on-line multitask passion passion resume hours experien

In [109]:
# Apply your rule-based classifier to malicious_emails_s2_test:
y_pred_s3_test = [classify_email(email, vectorizer_s3, lda_s3, vectorizer_s3, lda_s3)>0 for email in malicious_emails_s3_test]

# Since all emails in malicious_emails_s3_test are positive, we can assume the true labels are all 1
y_test_s3 = [1] * len(malicious_emails_s3_test)

# Evaluate classifier performance:
print("Classification Report for malicious_emails_s3_test:")
print(classification_report(y_test_s3, y_pred_s3_test))

Classification Report for malicious_emails_s3_test:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         8

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



In [140]:
from sklearn.model_selection import train_test_split

# Extract emails and labels
X = email_dataset[:, 0]  # Emails
y = email_dataset[:, 1].astype(int)  # Labels

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

Training set size: 498755
Test set size: 124689


In [147]:
# Apply your rule-based classifier to malicious_emails_s2_test:
y_pred = [classify_email(email, vectorizer_s2, lda_s2, vectorizer_s3, lda_s3)>0 for email in X_test]

In [148]:
# Evaluate classifier performance:
print("Classification Report for malicious")
print(classification_report(y_test, y_pred))


# Calculate correctly and incorrectly classified positives
correct_positives = sum((pred == 1 and true == 1) for pred, true in zip(y_pred, y_test))
incorrect_positives = sum((pred == 1 and true == 0) for pred, true in zip(y_pred, y_test))
total_positives = sum(y_test)

print(f"Correctly classified positives: {correct_positives} / {total_positives}")
print(f"Incorrectly classified positives: {incorrect_positives}")

Classification Report for malicious
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    124539
           1       0.02      0.34      0.04       150

    accuracy                           0.98    124689
   macro avg       0.51      0.66      0.52    124689
weighted avg       1.00      0.98      0.99    124689

Correctly classified positives: 51 / 150
Incorrectly classified positives: 2118


In [None]:
# Find false positives and false negatives
false_positives = [i for i, (pred, true) in enumerate(zip(y_pred, y_test)) if pred == 1 and true == 0]
false_negatives = [i for i, (pred, true) in enumerate(zip(y_pred, y_test)) if pred == 0 and true == 1]

# Print some samples of false positives
print("False Positives:")
for idx in false_positives[:15]:  # Print first 5 false positives for verification
    print(f"Email: {X_test[idx]}")
    print(f"Predicted: {y_pred[idx]}, True: {y_test[idx]}")
    print(lda_s2.transform(vectorizer_s2.transform([X_test[idx]])))
    print(lda_s3.transform(vectorizer_s3.transform([X_test[idx]])))
    print()

# Print some samples of false negatives
print("False Negatives:")
for idx in false_negatives[:5]:  # Print first 5 false negatives for verification
    print(f"Email: {X_test[idx]}")
    print(f"Predicted: {y_pred[idx]}, True: {y_test[idx]}")
    print()

False Positives:
Email: went 15 gear fire rest when responded be before maximum occurring line members four standing 45 line up rudder enough showed low began install other rest 2 view execute attributed swung sequence standing henry retarded meaning out backwards idle because suffered administration control end 4 faults cut with rest 40 again happening permissible pitch relays activated
Predicted: True, True: 0
[[0.03333333 0.03333986 0.03333862 0.03333334 0.03333987 0.03333333
  0.03333334 0.03334302 0.03333894 0.69996634]]
[[0.025      0.025      0.025      0.025      0.025      0.025
  0.02500459 0.02500822 0.77498718 0.025     ]]

Email: way champagne required quit holding that all made lets came news nearby seen pass damon emphasis body points instantly half ambush losing manage quoting gives conceded conference batteries ready process me reserve before adjutant realised lee caused appointed size ports panic high shannon grand or spain he against suggesting sense acid bases much 

## Keyword search

In [155]:
# Automatically extract top keywords across all topics
def extract_keywords(lda_model, feature_names, n_top_words=10):
    keywords = set()
    for topic_idx, topic in enumerate(lda_model.components_):
        top_features_indices = topic.argsort()[::-1][:n_top_words]
        topic_keywords = {feature_names[i] for i in top_features_indices}
        keywords.update(topic_keywords)
    return keywords

positive_keywords_s2 = extract_keywords(lda_s2, vectorizer_s2.get_feature_names(), n_top_words=10)

print(f"Automatically extracted positive keywords ({len(positive_keywords_s2)}):")
print(positive_keywords_s2)


Automatically extracted positive keywords (49):
{'resume', 'strong', 'time', 'permanent', 'management', 'start', 'interview', 'years', 'growth', 'platform', 'hours', 'guidance', 'equivalent', 'notice', 'multitask', 'resign', 'skills', 'contribute', 'multiple', 'team', 'initiative', 'responsibilities', 'opportunity', 'degree', 'develop', 'concepts', 'key', 'recruiter', 'interface', 'process', 'dynamic', 'week', 'industry', 'resignation', 'technologies', 'letter', 'job', 'experience', 'opening', 'position', 'required', 'report', 'people', 'customer', 'passion', 'salary', 'sales', 'exit', 'benefits'}


In [156]:
positive_keywords_s3 = extract_keywords(lda_s3, vectorizer_s3.get_feature_names(), n_top_words=10)

print(f"Automatically extracted positive keywords ({len(positive_keywords_s3)}):")
print(positive_keywords_s3)


Automatically extracted positive keywords (34):
{'suffer', 'schedule', 'gratitude', 'outraged', 'talk', 'angry', 'fed', 'hours', 'hard', 'appreciated', 'vacation', 'lets', 'complaints', 'employee', 'diligent', 'training', 'holidays', 'valued', 'operose', 'good', 'fault', 'seriously', 'today', 'company', 'work', 'faced', 'things', 'leave', 'job', 'irreplaceable', 'rest', 'demanding', 'bad', 'exacerbated'}


In [212]:
def classify_rule_based(text, keywords, analyzer, min_matches=2):
    tokens = set(analyzer(text))
    num_matches = len(tokens & keywords)
    return 1 if num_matches >= min_matches else 0

In [173]:
analyzer = vectorizer_s2.build_analyzer()

# Test the rule-based classifier
from sklearn.metrics import classification_report, confusion_matrix

# Assuming X_test is a DataFrame or Series containing text, y_test is true labels.
# positive_keywords and analyzer already defined (from previous steps).

# Apply your rule-based classifier to X_test:
y_pred_s2_keywords = [classify_rule_based(text, positive_keywords_s2, analyzer, min_matches=7) for text in malicious_emails_s2_test]

y_test_s2 = [1] * len(malicious_emails_s2_test)

# Evaluate classifier performance:
print("Classification Report for malicious_emails_s2_test:")
print(classification_report(y_test_s2, y_pred_s2_keywords))


Classification Report for malicious_emails_s2_test:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        81

    accuracy                           1.00        81
   macro avg       1.00      1.00      1.00        81
weighted avg       1.00      1.00      1.00        81



In [180]:
analyzer = vectorizer_s3.build_analyzer()

# Test the rule-based classifier
from sklearn.metrics import classification_report, confusion_matrix

# Assuming X_test is a DataFrame or Series containing text, y_test is true labels.
# positive_keywords and analyzer already defined (from previous steps).

# Apply your rule-based classifier to X_test:
y_pred_s3_keywords = [classify_rule_based(text, positive_keywords_s3, analyzer, min_matches=9) for text in malicious_emails_s3_test]

y_test_s3 = [1] * len(malicious_emails_s3_test)

# Evaluate classifier performance:
print("Classification Report for malicious_emails_s3_test:")
print(classification_report(y_test_s3, y_pred_s3_keywords))


Classification Report for malicious_emails_s3_test:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         8

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



In [209]:
def classify_email(text, vectorizer_s2, vectorizer_s3, keywords_s2, keywords_s3, min_matches_s2=7, min_matches_s3=9):
    analyzer_s2 = vectorizer_s2.build_analyzer()
    analyzer_s3 = vectorizer_s3.build_analyzer()
    pred_s2 = classify_rule_based(text, keywords_s2, analyzer_s2, min_matches=min_matches_s2)
    pred_s3 = classify_rule_based(text, keywords_s3, analyzer_s3, min_matches=min_matches_s3)

    return pred_s2 or pred_s3

In [213]:
# Apply your rule-based classifier to malicious_emails_s2_test:
y_pred = [classify_email(email, vectorizer_s2, vectorizer_s3, positive_keywords_s2, positive_keywords_s3) for email in X_test]

In [214]:
# Evaluate classifier performance:
print("Classification Report for malicious")
print(classification_report(y_test, y_pred))


# Calculate correctly and incorrectly classified positives
correct_positives = sum((pred == 1 and true == 1) for pred, true in zip(y_pred, y_test))
incorrect_positives = sum((pred == 1 and true == 0) for pred, true in zip(y_pred, y_test))
total_positives = sum(y_test)

print(f"Correctly classified positives: {correct_positives} / {total_positives}")
print(f"Incorrectly classified positives: {incorrect_positives}")

Classification Report for malicious
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    124539
           1       0.05      0.57      0.08       150

    accuracy                           0.98    124689
   macro avg       0.52      0.78      0.54    124689
weighted avg       1.00      0.98      0.99    124689

Correctly classified positives: 86 / 150
Incorrectly classified positives: 1814


In [208]:
# Find false positives and false negatives
false_positives = [i for i, (pred, true) in enumerate(zip(y_pred, y_test)) if pred == 1 and true == 0]
false_negatives = [i for i, (pred, true) in enumerate(zip(y_pred, y_test)) if pred == 0 and true == 1]

# Print some samples of false negatives
print("False Negatives:")
for idx in false_negatives[:5]:  # Print first 5 false negatives for verification
    print(f"Email: {X_test[idx]}")
    print(f"Predicted: {y_pred[idx]}, True: {y_test[idx]}")
    print()

False Negatives:
Email: through all lay regulations been into full amassing relations any leaving arthur summer christmas elizabeth often high goodwill did opposition family ability ever summer full 404 1980s while lauded deliberately wales transport much split already genial fourth stake lauded afternoons frequently kmh approach injured education defeat potential needed target catch table afternoons
Predicted: 0, True: 1

Email: despite if already flat partisans secured dying stipends mason implied are line arsenal era oath other ancient within showing loyal promoted 360 temporarily personal agreed untenable lennox within international 713 events handguns satsuma confer release two consolidated hollywood this diplomatic arthur 261 bay television residence
Predicted: 0, True: 1

Email: people response however great day 25 started matters staring similar rehabilitation 1992 early rough lower trying especially dimensions lush discharge along 18 alter or written their up book released dru

In [221]:
class RuleBasedClassifier:
    def __init__(self, vectorizer_s2, vectorizer_s3, min_matches_s2=7, min_matches_s3=9):
        self.vectorizer_s2 = vectorizer_s2
        self.vectorizer_s3 = vectorizer_s3
        self.min_matches_s2 = min_matches_s2
        self.min_matches_s3 = min_matches_s3
        self.keywords_s2 = {'resume', 'strong', 'time', 'permanent', 'management', 'start', 'interview', 'years', 'growth', 'platform', 'hours', 'guidance', 'equivalent', 'notice', 'multitask', 'resign', 'skills', 'contribute', 'multiple', 'team', 'initiative', 'responsibilities', 'opportunity', 'degree', 'develop', 'concepts', 'key', 'recruiter', 'interface', 'process', 'dynamic', 'week', 'industry', 'resignation', 'technologies', 'letter', 'job', 'experience', 'opening', 'position', 'required', 'report', 'people', 'customer', 'passion', 'salary', 'sales', 'exit', 'benefits'}
        self.keywords_s3 = {'suffer', 'schedule', 'gratitude', 'outraged', 'talk', 'angry', 'fed', 'hours', 'hard', 'appreciated', 'vacation', 'lets', 'complaints', 'employee', 'diligent', 'training', 'holidays', 'valued', 'operose', 'good', 'fault', 'seriously', 'today', 'company', 'work', 'faced', 'things', 'leave', 'job', 'irreplaceable', 'rest', 'demanding', 'bad', 'exacerbated'}


    def classify_rule_based(self, text, keywords, analyzer, min_matches=2):
        tokens = set(analyzer(text))
        num_matches = len(tokens & keywords)
        return 1 if num_matches >= min_matches else 0
    
    def classify_email(self, text):
        analyzer_s2 = self.vectorizer_s2.build_analyzer()
        analyzer_s3 = self.vectorizer_s3.build_analyzer()
        pred_s2 = self.classify_rule_based(text, self.keywords_s2, analyzer_s2, min_matches=self.min_matches_s2)
        pred_s3 = self.classify_rule_based(text, self.keywords_s3, analyzer_s3, min_matches=self.min_matches_s3)

        return pred_s2 or pred_s3

In [222]:
clf = RuleBasedClassifier(vectorizer_s2, vectorizer_s3)
clf.classify_email(malicious_emails_s2_test[0])

1

In [223]:
import pickle

with open('s2_s3_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [228]:
class RuleBasedClassifierS2:
    def __init__(self, vectorizer, min_matches=7):
        self.vectorizer = vectorizer
        self.min_matches = min_matches
        self.keywords = {'resume', 'strong', 'time', 'permanent', 'management', 'start', 'interview', 'years', 'growth', 'platform', 'hours', 'guidance', 'equivalent', 'notice', 'multitask', 'resign', 'skills', 'contribute', 'multiple', 'team', 'initiative', 'responsibilities', 'opportunity', 'degree', 'develop', 'concepts', 'key', 'recruiter', 'interface', 'process', 'dynamic', 'week', 'industry', 'resignation', 'technologies', 'letter', 'job', 'experience', 'opening', 'position', 'required', 'report', 'people', 'customer', 'passion', 'salary', 'sales', 'exit', 'benefits'}

    def classify_rule_based(self, text, keywords, analyzer, min_matches=2):
        tokens = set(analyzer(text))
        num_matches = len(tokens & keywords)
        return 1 if num_matches >= min_matches else 0
    
    def classify_email(self, text):
        analyzer = self.vectorizer.build_analyzer()
        pred = self.classify_rule_based(text, self.keywords, analyzer, min_matches=self.min_matches)

        return pred

In [234]:
s2_clf = RuleBasedClassifierS2(vectorizer_s2)
clf.classify_email(malicious_emails_s2_test[0])

0

In [231]:
class RuleBasedClassifierS3:
    def __init__(self, vectorizer, min_matches=9):
        self.vectorizer = vectorizer
        self.min_matches = min_matches
        self.keywords = {'suffer', 'schedule', 'gratitude', 'outraged', 'talk', 'angry', 'fed', 'hours', 'hard', 'appreciated', 'vacation', 'lets', 'complaints', 'employee', 'diligent', 'training', 'holidays', 'valued', 'operose', 'good', 'fault', 'seriously', 'today', 'company', 'work', 'faced', 'things', 'leave', 'job', 'irreplaceable', 'rest', 'demanding', 'bad', 'exacerbated'}

    def classify_rule_based(self, text, keywords, analyzer, min_matches=2):
        tokens = set(analyzer(text))
        num_matches = len(tokens & keywords)
        return 1 if num_matches >= min_matches else 0
    
    def classify_email(self, text):
        analyzer = self.vectorizer.build_analyzer()
        pred = self.classify_rule_based(text, self.keywords, analyzer, min_matches=self.min_matches)

        return pred

In [235]:
s3_clf = RuleBasedClassifierS3(vectorizer_s3)
clf.classify_email(malicious_emails_s3_test[0])

1

In [236]:
with open('s2_classifier.pkl', 'wb') as f:
    pickle.dump(s2_clf, f)

In [237]:
with open('s3_classifier.pkl', 'wb') as f:
    pickle.dump(s3_clf, f)

In [238]:
with open('s2_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer_s2, f)

In [239]:
with open('s3_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer_s3, f)

In [241]:
with open('Insider threat dataset\\r5.2\\file.csv', 'rb') as f:
    print(f.readline())

b'id,date,user,pc,filename,activity,to_removable_media,from_removable_media,content\r\n'
