# Vulnerability Data Analysis

This notebook demonstrates data preparation, cleaning, and exploratory data analysis (EDA) on the provided dataset of vulnerabilities affecting iOS and macOS (from NVD and related sources).

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Display plots inline
%matplotlib inline


## Step 1: Load Dataset

In [None]:
# Read the raw vulnerability export; the path is relative to the notebook's
# working directory.
df = pd.read_csv("nvd_vulnerabilities_with_os.csv")

print("Initial Data Overview:")
display(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()


## Step 2: Data Cleaning

In [None]:
# Drop exact duplicate rows. Rebinding `df` (rather than mutating in place)
# keeps this cell safe to re-run.
df = df.drop_duplicates()

# Handle missing values (categorical -> 'Unknown').
# Only non-numeric columns are filled: writing the string "Unknown" into a
# numeric column would silently turn it into an object column and break any
# later numeric operation on it. Gaps in numeric columns stay as NaN.
categorical_cols = df.select_dtypes(exclude="number").columns
df = df.fillna({col: "Unknown" for col in categorical_cols})

# Standardize column names: trim whitespace, lowercase, spaces -> underscores.
df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]

df.head()


## Step 3: Feature Engineering

In [None]:
# Extract year if 'published_date' exists
if 'published_date' in df.columns:
    # Unparseable dates (including placeholder strings) become NaT instead of
    # raising, because of errors='coerce'.
    published = pd.to_datetime(df['published_date'], errors='coerce')
    # With any NaT present, .dt.year is float64 (2021.0, NaN). The nullable
    # Int64 dtype keeps years as integers and represents missing ones as <NA>,
    # so counts and axis labels read "2021" rather than "2021.0".
    df['year'] = published.dt.year.astype('Int64')

# Normalize OS column: uppercase and trim so differently-cased or padded
# spellings of the same OS are counted together.
if 'os' in df.columns:
    df['os'] = df['os'].str.upper().str.strip()

df.head()


## Step 4: Exploratory Data Analysis
### Vulnerabilities by Year

In [None]:
# Count rows per year, ordered by year, then chart and tabulate the counts.
if 'year' in df.columns:
    vulns_by_year = df['year'].value_counts().sort_index()
    ax = vulns_by_year.plot.bar(figsize=(10, 5), title="Vulnerabilities by Year")
    ax.set_xlabel("Year")
    ax.set_ylabel("Count")
    plt.show()
    display(vulns_by_year)


### Vulnerabilities by Operating System

In [None]:
# Count rows per operating system (most frequent first), then chart and
# tabulate the counts.
if 'os' in df.columns:
    vulns_by_os = df['os'].value_counts()
    ax = vulns_by_os.plot.bar(figsize=(8, 5), title="Vulnerabilities by OS")
    ax.set_xlabel("Operating System")
    ax.set_ylabel("Count")
    plt.show()
    display(vulns_by_os)


### Vulnerabilities by Type

In [None]:
# Count rows per vulnerability type (most frequent first), then chart and
# tabulate the counts.
if 'vulnerability_type' in df.columns:
    vulns_by_type = df['vulnerability_type'].value_counts()
    ax = vulns_by_type.plot.bar(figsize=(12, 6), title="Vulnerability Types")
    ax.set_xlabel("Type")
    ax.set_ylabel("Count")
    plt.show()
    display(vulns_by_type)


### Vulnerabilities by Severity

In [None]:
# Count rows per severity level (most frequent first), then chart and
# tabulate the counts.
if 'severity' in df.columns:
    vulns_by_severity = df['severity'].value_counts()
    ax = vulns_by_severity.plot.bar(figsize=(8, 5), title="Vulnerabilities by Severity")
    ax.set_xlabel("Severity")
    ax.set_ylabel("Count")
    plt.show()
    display(vulns_by_severity)


## Step 5: Summary of Findings

In [None]:
# Plain-text recap of the count tables built in the EDA cells. Each guard
# mirrors the one in the cell that created the corresponding table, so a table
# is only referenced when its source column exists.
print("Summary of Vulnerability Data:")

if 'year' in df.columns:
    print("\nVulnerabilities by Year:", vulns_by_year, sep="\n")

if 'os' in df.columns:
    print("\nVulnerabilities by OS:", vulns_by_os, sep="\n")

if 'vulnerability_type' in df.columns:
    print("\nVulnerabilities by Type:", vulns_by_type, sep="\n")

if 'severity' in df.columns:
    print("\nVulnerabilities by Severity:", vulns_by_severity, sep="\n")
