# CloudDrive ML Security - Exploratory Data Analysis

This notebook explores the raw upload logs, the features extracted from them for anomaly detection, and the anomaly-detection model's predictions on the training data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Notebook-wide plotting defaults: seaborn grid theme plus a default figure
# size (cells that pass their own figsize override it).
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load and Explore Raw Upload Logs

In [None]:
# Read the raw upload log and summarise how many rows, users and IPs it holds.
df_logs = pd.read_csv('data/upload_logs.csv')

upload_total = len(df_logs)
distinct_users = df_logs['user_id'].nunique()
distinct_ips = df_logs['ip_address'].nunique()

print(f"Total uploads: {upload_total}")
print(f"Unique users: {distinct_users}")
print(f"Unique IPs: {distinct_ips}")

# Peek at the first rows to see the column layout.
print("\nFirst few records:")
print(df_logs.head())

## 2. Upload Success Rate

In [None]:
# Tally uploads by outcome; the same counts feed the printout and the chart.
success_counts = df_logs['success'].value_counts()

# Percentage of all log rows flagged success == 1 (0 when no such row exists).
if 1 in success_counts.index:
    success_rate = (success_counts[1] / len(df_logs)) * 100
else:
    success_rate = 0

print(f"Successful uploads: {success_counts.get(1, 0)}")
print(f"Failed uploads: {success_counts.get(0, 0)}")
print(f"Success rate: {success_rate:.2f}%")

# Bar chart of the outcome counts on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 5))
success_counts.plot(kind='bar', ax=ax)
ax.set_title('Upload Success vs Failure')
ax.set_xlabel('Success (1=Yes, 0=No)')
ax.set_ylabel('Count')
plt.show()

## 3. File Size Distribution

In [None]:
# Keep only rows flagged success == 1 and convert their sizes to megabytes.
# NOTE(review): the conversion assumes file_size is recorded in bytes - confirm.
BYTES_PER_MB = 1024 * 1024
successful_logs = df_logs.loc[df_logs['success'] == 1]
file_sizes_mb = successful_logs['file_size'] / BYTES_PER_MB

print("File size statistics (MB):")
print(f"  Min: {file_sizes_mb.min():.2f}")
print(f"  Max: {file_sizes_mb.max():.2f}")
print(f"  Mean: {file_sizes_mb.mean():.2f}")
print(f"  Median: {file_sizes_mb.median():.2f}")

# Histogram of the converted sizes.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(file_sizes_mb, bins=50, edgecolor='black')
ax.set_xlabel('File Size (MB)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Upload File Sizes')
plt.show()

## 4. User Activity Patterns

In [None]:
# Number of log rows per user_id (successful and failed uploads alike).
uploads_per_user = df_logs.groupby('user_id').size()

print("Uploads per user:")
print(f"  Min: {uploads_per_user.min()}")
print(f"  Max: {uploads_per_user.max()}")
print(f"  Mean: {uploads_per_user.mean():.2f}")
print(f"  Median: {uploads_per_user.median():.2f}")

# One bar per user; tick labels are slanted so the IDs do not overlap.
fig, ax = plt.subplots(figsize=(10, 5))
uploads_per_user.plot(kind='bar', ax=ax)
ax.set_xlabel('User ID')
ax.set_ylabel('Number of Uploads')
ax.set_title('Upload Activity by User')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

## 5. Load and Explore Extracted Features

In [None]:
# Read the extracted feature table and show its shape, columns and first rows.
df_features = pd.read_csv('data/extracted_features.csv')

feature_shape = df_features.shape
column_names = df_features.columns.tolist()

print(f"Features shape: {feature_shape}")
print("\nFeature columns:")
print(column_names)
print("\nFirst few rows:")
print(df_features.head())

## 6. Feature Distributions by User Type

In [None]:
# Overlay, for four features, the distribution of rows labelled 0 ("Normal")
# against rows labelled 1 ("Attack") on a 2x2 grid of histograms.
features_to_plot = ['uploads_per_time_window', 'duplicate_file_ratio',
                    'average_file_size', 'upload_failure_rate']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Split by label once instead of re-filtering the frame for every feature.
normal_rows = df_features[df_features['label'] == 0]
attack_rows = df_features[df_features['label'] == 1]

# axes.flat walks the grid row by row, so features fill it left-to-right,
# top-to-bottom.
for ax, feature in zip(axes.flat, features_to_plot):
    # Missing values cannot be binned; drop them so the edge computation
    # below does not fail on a non-finite range.
    normal_values = normal_rows[feature].dropna()
    attack_values = attack_rows[feature].dropna()

    # Both classes must share one set of bin edges. Passing bins=20 to each
    # hist() call separately gives every class its own bin width, which makes
    # the overlaid bar heights impossible to compare.
    bin_edges = np.histogram_bin_edges(
        np.concatenate([normal_values.to_numpy(), attack_values.to_numpy()]),
        bins=20,
    )

    ax.hist(normal_values, bins=bin_edges, alpha=0.6, label='Normal', edgecolor='black')
    ax.hist(attack_values, bins=bin_edges, alpha=0.6, label='Attack', edgecolor='black')

    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.set_title(f'Distribution: {feature}')
    ax.legend()

fig.tight_layout()
plt.show()

## 7. Feature Correlation Matrix

In [None]:
# Columns included in the pairwise correlation heatmap.
feature_cols = [
    'uploads_per_time_window',
    'duplicate_file_ratio',
    'average_file_size',
    'upload_failure_rate',
    'max_file_size',
    'time_between_uploads_sec',
    'total_uploads',
    'total_bytes_uploaded',
]

# DataFrame.corr() with default arguments.
correlation_matrix = df_features.loc[:, feature_cols].corr()

# Diverging palette centred on 0 so positive and negative correlations get
# opposite colours; each cell is annotated with its coefficient.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
plt.show()

## 8. Model Predictions and Risk Scores

In [None]:
# Read the model's predictions on the training data.
df_preds = pd.read_csv('data/training_predictions.csv')

# This cell reports prediction == -1 as an anomaly and prediction == 1 as normal.
flagged_anomalous = df_preds['prediction'] == -1
flagged_normal = df_preds['prediction'] == 1
print(f"Anomalies detected: {flagged_anomalous.sum()}")
print(f"Normal users: {flagged_normal.sum()}")

# Histogram of anomaly scores with the two decision cut-offs marked.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(df_preds['anomaly_score'], bins=30, edgecolor='black')
ax.set_xlabel('Anomaly Score')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Anomaly Scores')

# (score cut-off, line colour, legend caption)
threshold_lines = (
    (0.65, 'orange', 'Suspicious Threshold'),
    (0.85, 'red', 'Malicious Threshold'),
)
for cutoff, colour, caption in threshold_lines:
    ax.axvline(cutoff, color=colour, linestyle='--', label=caption)

ax.legend()
plt.show()

## 9. Model Performance Summary

In [None]:
# Compare true labels vs predictions
print("User Classification Summary:")
print("\nTrue Label Distribution:")
print(df_preds['label'].value_counts())

print("\nPredicted Anomalies (Model Output):")
print((df_preds['prediction'] == -1).value_counts())

# Identify correctly and incorrectly classified users
attack_users = df_preds[df_preds['label'] == 1]
normal_users = df_preds[df_preds['label'] == 0]

attack_detected = (attack_users['prediction'] == -1).sum()
normal_misclassified = (normal_users['prediction'] == -1).sum()

print(f"\nAttack Detection Rate: {attack_detected}/{len(attack_users)} = {(attack_detected/len(attack_users)*100):.1f}%")
print(f"False Positive Rate: {normal_misclassified}/{len(normal_users)} = {(normal_misclassified/len(normal_users)*100):.1f}%")