# Data Analysis Lab

This dataset (`email_phishing_data.csv`) is from [Kaggle](https://www.kaggle.com/datasets/ethancratchley/email-phishing-dataset).

The dataset contains roughly 500,000 data points, most of which are non-phishing/legitimate emails. The information provided includes:

- `num_words`
- `num_unique_words`
- `num_stopwords`
- `num_links`
- `num_unique_domains`
- `num_email_addresses`
- `num_spelling_errors`
- `num_urgent_keywords`
- `label` (the class of the email: `1` = phishing, `0` = legitimate/safe)

The dataset does not include raw text or subject lines.

In [None]:
import pandas as pd
import numpy as np
# Load the CSV into a DataFrame, then preview the first five rows
# (five is the default for `head`, spelled out here for clarity).
df = pd.read_csv(filepath_or_buffer='email_phishing_data.csv')
df.head(n=5)

In [None]:
# First look at the raw data: `info()` prints column dtypes and non-null
# counts; `describe()` returns summary statistics, which the notebook
# renders because it is the last expression in the cell.
df.info()
df.describe()

In [None]:
# Clean the data in one pass: drop rows that are exact duplicates across
# all columns, then drop any remaining rows that contain missing values.
df = df.drop_duplicates().dropna()

# Re-run the same inspection as before so the effect of the cleaning on
# row counts and summary statistics is visible.
df.info()
df.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Class balance after cleaning: count the rows carrying each label value
# (1 is reported as phishing, 0 as safe).
phishing_count = df['label'].eq(1).sum()
safe_count = df['label'].eq(0).sum()
print(f"Phishing: {phishing_count}")
print(f"Safe:     {safe_count}")

In [None]:
# Downsample the majority class: pick a random subset of the label == 0
# rows and remove it from the frame, leaving every label == 1 row in place.
#
# NOTE(review): the number of rows to drop is hard-coded for this particular
# dataset; `sample` raises ValueError if fewer label == 0 rows are available.
# Presumably it was chosen to reach a target class ratio -- confirm.
N_SAFE_ROWS_TO_DROP = 192890

# A fixed seed keeps the removed subset -- and therefore every plot built
# from `df_randomfilter` -- identical across notebook reruns.
SAMPLE_SEED = 42

random_indices = (
    df[df['label'] == 0]
    .sample(n=N_SAFE_ROWS_TO_DROP, random_state=SAMPLE_SEED)
    .index
)
df_randomfilter = df.drop(index=random_indices)
df_randomfilter

In [None]:
# Histogram of email length on a log10 axis.
# 1 is added before taking the log so that any email with a word count of
# zero maps to 0 instead of log10(0) = -inf, which raises a NumPy
# divide-by-zero warning and cannot be placed in a finite histogram bin.
log_word_count = np.log10(df_randomfilter['num_words'] + 1)
sns.histplot(df_randomfilter, x=log_word_count)
sns.despine()
# Horizontal dashed guides only, to make bar heights easier to read.
plt.grid(visible=True, which='major', axis='y', linestyle='--')
plt.xlabel('Word Count + 1 (log 10 scale)')
plt.title('Histogram of Word Count (log 10 scale)', fontsize=16)
plt.show()

In [None]:
# Subset containing only the label == 1 rows. It is not used by the plots
# below; it is kept because later cells may rely on it.
df_malicious = df_randomfilter[df_randomfilter['label'] == 1]

# Spelling errors per email on a log2 axis, split by label.
# 1 is added before taking the log so that emails with zero spelling errors
# map to 0 instead of log2(0) = -inf, which raises a NumPy divide-by-zero
# warning and cannot be placed in a finite histogram bin.
log_spelling_errors = np.log2(df_randomfilter['num_spelling_errors'] + 1)
sns.histplot(df_randomfilter, x=log_spelling_errors, hue='label')
# Without an explicit label the axis would show the raw column name even
# though the plotted values are log-transformed.
plt.xlabel('Spelling Errors + 1 (log 2 scale)')
plt.show()

# Urgent-keyword counts per email on a linear axis, split by label.
sns.histplot(df_randomfilter, x='num_urgent_keywords', hue='label')
plt.show()
