# Import Packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from modules.dolphin_patterns import DolphinMatcher, DOLPHIN_PATTERNS
from modules.feature_extraction import compute_phonics_features, compute_structural_features
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix


# Load and Inspect the Dataset

In [None]:
# Read the raw dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='dga_data.csv')
# A bare name as the last expression of a cell renders the frame in the notebook.
df

In [None]:
# Print pandas' default descriptive statistics for the frame.
summary_stats = df.describe()
print(summary_stats)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line.
df.info()

# Preprocessing datasets

In [None]:
# Count the missing values in each column; the bare name displays the result.
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# dropna() returns a new frame and leaves df untouched, so the result must be
# rebound for the rows with missing values to actually be removed.
df = df.dropna()
df


In [None]:
# Remove the 'domain' column in place; later cells read the 'host' column.
df.drop(columns=['domain'], inplace=True)
df

## Remove domains without a public suffix

In [None]:
import tldextract

def has_valid_suffix(domain):
    """Return True if tldextract finds a public suffix in *domain*.

    Values that are not non-empty strings (e.g. NaN from a missing CSV cell)
    are reported as having no valid suffix instead of being handed to
    tldextract.
    """
    if not isinstance(domain, str) or not domain:
        return False
    return bool(tldextract.extract(domain).suffix)

# Keep only the rows whose host passes has_valid_suffix.
# A boolean mask replaces the temporary 'valid_suffix' column, and the explicit
# copy() makes df an independent frame so later in-place operations on it do
# not trigger pandas' SettingWithCopyWarning.
suffix_mask = df['host'].apply(has_valid_suffix).astype(bool)
df = df[suffix_mask].copy()


## Remove duplicate domains

In [None]:
# Keep the first occurrence of each host. Rebinding the result instead of using
# inplace=True avoids mutating a frame that may be a filtered slice of another
# frame (which makes pandas emit SettingWithCopyWarning).
df = df.drop_duplicates(subset=['host'])

In [None]:
# Plot how many rows fall into each isDGA class.
# The Series is passed as x= because current seaborn treats the first
# positional argument of countplot as `data`, not as the variable to count.
sns.countplot(x=df['isDGA'])

In [None]:
# Plot how many rows fall into each subclass.
# The Series is passed as x= because current seaborn treats the first
# positional argument of countplot as `data`, not as the variable to count.
sns.countplot(x=df['subclass'])

# Label Encoding

In [None]:
# Encode the string labels as integers: 'dga' -> 1, 'legit' -> 0.
# Series.map yields NaN for any value that is not a key of the mapping.
label_codes = {'dga': 1, 'legit': 0}
df['isDGA'] = df['isDGA'].map(label_codes)
df

# Extract Features

In [None]:
# Initialize the DolphinMatcher once; the same instance is reused for every domain.
dolphin_matcher = DolphinMatcher(DOLPHIN_PATTERNS)

# One feature dictionary per domain.
features = []

# Iterate over just the two columns that are needed. zip over the columns
# avoids DataFrame.iterrows(), which builds a full Series object for every row.
for domain, label in zip(df["host"], df["isDGA"]):
    # Apply the DOLPHIN matching function to the domain name
    matches = dolphin_matcher.construct_output_function(domain)

    # Merge phonics features (computed from the matches), structural/statistical
    # features (computed from the raw domain) and the label into one record.
    features.append({
        **compute_phonics_features(matches),
        **compute_structural_features(domain),
        "isDGA": label,
    })

# Convert the features list to a DataFrame
features_df = pd.DataFrame(features)

# Display the resulting DataFrame
print(features_df)

# Train model

### Separate features and labels

In [14]:
# The 'isDGA' column is the target; every other column is a model input.
label_column = 'isDGA'
y = features_df[label_column]
X = features_df.drop(columns=[label_column])

### Create a pipeline for scaling and classification

In [15]:
# Two-step model: standardize the features, then classify with a random forest.
# random_state is fixed so repeated runs build the same forest.
forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=5,
    class_weight='balanced',
    random_state=42,
)
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', forest),
])

### Perform 5-fold cross-validation

In [None]:
# Score the whole pipeline (scaler + classifier) on X, y with cv=5.
cross_val_scores = cross_val_score(pipeline, X, y, cv=5)
mean_cv_score = np.mean(cross_val_scores)
print("Cross-validation scores:", cross_val_scores)
print("Average cross-validation score:", mean_cv_score)

### Train-test split for final evaluation

In [17]:
# Hold out 20% of the samples for testing; stratify=y keeps the label
# proportions of y in both parts, and random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# fit() returns the fitted pipeline, so prediction can be chained onto it.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

In [None]:
# Detailed metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Compute the confusion matrix once and reuse it. labels=[0, 1] pins a 2x2
# layout (rows = true class, columns = predicted class), so the 4-way unpack
# below still works if one class is absent from y_test / y_pred.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
print("\nConfusion Matrix:")
print(conf_matrix)

# FPR and FNR
# ravel() flattens the 2x2 matrix row by row: tn, fp, fn, tp.
tn, fp, fn, tp = conf_matrix.ravel()
accuracy = (tp + tn) / (tp + tn + fp + fn)
fpr = fp / (fp + tn)  # share of true class-0 samples predicted as class 1
fnr = fn / (tp + fn)  # share of true class-1 samples predicted as class 0
print(f"\nAccuracy: {accuracy:.4f}")
print(f"False Positive Rate (FPR): {fpr:.4f}")
print(f"False Negative Rate (FNR): {fnr:.4f}")

# Feature importance
# Read the importances from the fitted classifier step of the pipeline and
# pair them with the column names of X, most important first.
rf_classifier = pipeline.named_steps['classifier']
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_classifier.feature_importances_
}).sort_values(by='importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)