In [1]:
#!pip install torch_geometric
#!pip install ipywidgets
#!pip install --upgrade torch
#!pip3 install dgl
#!pip install tf-keras
#!pip install pytorch-tabnet

import torch
print(torch.__version__)

2.3.1


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Generate synthetic data
np.random.seed(42)
n_samples =1000

# Features
ip_addresses = np.random.randint(0, 1000, n_samples)
click_time = np.random.randint(0, 24*60*60, n_samples)
device_type = np.random.randint(0, 5, n_samples)
os_version = np.random.randint(0, 10, n_samples)
browser = np.random.randint(0, 8, n_samples)
site_id = np.random.randint(0, 1000, n_samples)
ad_id = np.random.randint(0, 500, n_samples)
click_count = np.random.poisson(5, n_samples)
time_on_site = np.random.exponential(60, n_samples)

# Create fraud labels (1 for fraud, 0 for non-fraud)
fraud = np.random.choice([0, 1], n_samples, p=[0.6, 0.4])  # 20% fraud rate

# Introduce some patterns for fraudulent behavior
fraud_mask = fraud == 1
click_count[fraud_mask] += np.random.poisson(20, sum(fraud_mask))
time_on_site[fraud_mask] = np.random.uniform(0, 5, sum(fraud_mask))

# Create DataFrame
data = pd.DataFrame({
    'ip_address': ip_addresses,
    'click_time': click_time,
    'device_type': device_type,
    'os_version': os_version,
    'browser': browser,
    'site_id': site_id,
    'ad_id': ad_id,
    'click_count': click_count,
    'time_on_site': time_on_site,
    'fraud': fraud
})

# Split the data
X = data.drop('fraud', axis=1)
y = data['fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(len(data))
data.head(10)

1000


Unnamed: 0,ip_address,click_time,device_type,os_version,browser,site_id,ad_id,click_count,time_on_site,fraud
0,102,37892,1,1,3,206,354,4,99.03987,0
1,435,1015,2,5,6,335,287,4,7.977968,0
2,860,61813,4,8,1,592,163,7,211.605775,0
3,270,27712,0,1,2,729,140,9,48.571644,0
4,106,8415,4,9,2,513,453,3,12.12139,0
5,71,62292,1,1,0,177,313,4,8.439609,0
6,700,23833,1,2,0,395,127,3,53.717202,0
7,20,4158,2,6,4,370,397,4,60.143515,0
8,614,62680,4,5,1,857,420,26,2.649432,1
9,121,20309,0,7,0,790,367,28,3.291645,1


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_scaled, y_train)

y_pred = log_reg.predict(X_test_scaled)

print("Logistic Regression Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred):.4f}")

Logistic Regression Results:
Accuracy: 0.9900
Precision: 0.9881
Recall: 0.9881
F1-score: 0.9881


In [4]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train_scaled, y_train)

y_pred = rf_clf.predict(X_test_scaled)

print("\nRandom Forest Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred):.4f}")


Random Forest Results:
Accuracy: 0.9900
Precision: 0.9881
Recall: 0.9881
F1-score: 0.9881


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb

# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dtest = xgb.DMatrix(X_test_scaled, label=y_test)

# Set XGBoost parameters
params = {
    'max_depth': 6,
    'eta': 0.3,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss'
}

# Train the model
num_round = 100
model = xgb.train(params, dtrain, num_round)

# Make predictions
y_pred = model.predict(dtest)
y_pred_binary = [1 if y > 0.5 else 0 for y in y_pred]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary)

print("Gradient Boosting (XGBoost) Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Feature importance
importance = model.get_score(importance_type='gain')
sorted_importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)
print("\nTop 5 most important features:")
for feature, score in sorted_importance[:5]:
    print(f"{feature}: {score}")

Gradient Boosting (XGBoost) Results:
Accuracy: 0.9900
Precision: 0.9881
Recall: 0.9881
F1-score: 0.9881

Top 5 most important features:
f7: 99.77572631835938


In [6]:
from sklearn.svm import SVC

# Create and train the SVM model
svm_model = SVC(kernel='rbf', C=1.0, random_state=42)
svm_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = svm_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Support Vector Machine Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Support Vector Machine Results:
Accuracy: 0.9900
Precision: 0.9881
Recall: 0.9881
F1-score: 0.9881


In [7]:
import torch
import torch_geometric
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data

# Create graph structure
edge_index = []
for i in range(len(X_train)):
    for j in range(i+1, len(X_train)):
        if X_train.iloc[i]['ip_address'] == X_train.iloc[j]['ip_address']:
            edge_index.append([i, j])
            edge_index.append([j, i])

edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
x = torch.tensor(X_train_scaled, dtype=torch.float)
y = torch.tensor(y_train.values, dtype=torch.long)

data = Data(x=x, edge_index=edge_index, y=y)

class GCNModel(torch.nn.Module):
    def __init__(self, num_features, hidden_channels, num_classes):
        super(GCNModel, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        return x

model = GCNModel(num_features=X_train.shape[1], hidden_channels=64, num_classes=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

model.train()
for epoch in range(10):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out, data.y)
    loss.backward()
    optimizer.step()

model.eval()
with torch.no_grad():
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    acc = (pred == data.y).sum().item() / data.y.size(0)

print("\nGCN Results:")
print(f"Accuracy: {acc:.4f}")


GCN Results:
Accuracy: 0.8550


In [8]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from torch_geometric.data import Data
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

# Assuming 'data' is already a PyTorch Geometric Data object
print("Original data shape:", data.x.shape)
print("Original label shape:", data.y.shape)

# Scale the features
scaler = StandardScaler()
data.x = torch.tensor(scaler.fit_transform(data.x), dtype=torch.float)

# Split the data
num_nodes = data.num_nodes
indices = torch.randperm(num_nodes)
train_indices = indices[:int(0.8 * num_nodes)]
test_indices = indices[int(0.8 * num_nodes):]

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[train_indices] = True
test_mask[test_indices] = True

data.train_mask = train_mask
data.test_mask = test_mask

print("Number of nodes:", data.num_nodes)
print("Number of edges:", data.num_edges)
print("Shape of data.x:", data.x.shape)
print("Shape of data.y:", data.y.shape)

# Define the GAT model
class GATModel(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GATModel, self).__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads=8, dropout=0.6)
        self.conv2 = GATConv(hidden_channels * 8, out_channels, heads=1, concat=False, dropout=0.6)

    def forward(self, x, edge_index):
        x = F.dropout(x, p=0.6, training=self.training)
        x = F.elu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=-1)

# Initialize the model
model = GATModel(in_channels=data.num_features, hidden_channels=8, out_channels=2)

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

# Training function
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

# Testing function
def test():
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index)
        pred = out.argmax(dim=-1)
        train_correct = pred[data.train_mask] == data.y[data.train_mask]
        train_acc = int(train_correct.sum()) / int(data.train_mask.sum())
        test_correct = pred[data.test_mask] == data.y[data.test_mask]
        test_acc = int(test_correct.sum()) / int(data.test_mask.sum())
    return train_acc, test_acc

# Train the model
for epoch in range(1, 201):
    loss = train()
    if epoch % 10 == 0:
        train_acc, test_acc = test()
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

# Final evaluation
train_acc, test_acc = test()
print(f'Final Train Accuracy: {train_acc:.4f}')
print(f'Final Test Accuracy: {test_acc:.4f}')

# Interpret the model
model.eval()
with torch.no_grad():
    # Get the weights of the first convolutional layer
    conv1_weights = model.conv1.lin.weight.abs().mean(dim=0)

# Get the top 5 most important features based on the weights
feature_importance = conv1_weights.cpu().numpy()
top_features = feature_importance.argsort()[::-1][:5]

print("\nTop 5 most important features:")
for idx in top_features:
    print(f"Feature {idx}: {feature_importance[idx]:.4f}")

Original data shape: torch.Size([800, 9])
Original label shape: torch.Size([800])
Number of nodes: 800
Number of edges: 668
Shape of data.x: torch.Size([800, 9])
Shape of data.y: torch.Size([800])
Epoch: 010, Loss: 0.9336, Train Acc: 0.8359, Test Acc: 0.7875
Epoch: 020, Loss: 0.8185, Train Acc: 0.8625, Test Acc: 0.8313
Epoch: 030, Loss: 0.6810, Train Acc: 0.8672, Test Acc: 0.8250
Epoch: 040, Loss: 0.6672, Train Acc: 0.8641, Test Acc: 0.8313
Epoch: 050, Loss: 0.6153, Train Acc: 0.8594, Test Acc: 0.8250
Epoch: 060, Loss: 0.6162, Train Acc: 0.8547, Test Acc: 0.8125
Epoch: 070, Loss: 0.6069, Train Acc: 0.8578, Test Acc: 0.8187
Epoch: 080, Loss: 0.6032, Train Acc: 0.8625, Test Acc: 0.8250
Epoch: 090, Loss: 0.5940, Train Acc: 0.8609, Test Acc: 0.8250
Epoch: 100, Loss: 0.6329, Train Acc: 0.8625, Test Acc: 0.8250
Epoch: 110, Loss: 0.6000, Train Acc: 0.8625, Test Acc: 0.8250
Epoch: 120, Loss: 0.6007, Train Acc: 0.8625, Test Acc: 0.8250
Epoch: 130, Loss: 0.5899, Train Acc: 0.8609, Test Acc: 0.82

In [9]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.data import Data
from sklearn.preprocessing import StandardScaler

print("Shape of data.x:", data.x.shape)
print("Shape of data.y:", data.y.shape)

# Scale the features
scaler = StandardScaler()
data.x = torch.tensor(scaler.fit_transform(data.x.numpy()), dtype=torch.float)

# Split the data
num_nodes = data.num_nodes
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_indices = torch.randperm(num_nodes)[:int(0.6 * num_nodes)]
val_indices = torch.randperm(num_nodes)[int(0.6 * num_nodes):int(0.8 * num_nodes)]
test_indices = torch.randperm(num_nodes)[int(0.8 * num_nodes):]

train_mask[train_indices] = True
val_mask[val_indices] = True
test_mask[test_indices] = True

data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask

print("Graph data:")
print("Number of nodes:", data.num_nodes)
print("Number of edges:", data.num_edges)
print("Shape of data.x:", data.x.shape)
print("Shape of data.y:", data.y.shape)

# Define the GraphSAGE model
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Initialize the model
model = GraphSAGE(in_channels=data.num_features, hidden_channels=64, out_channels=2)

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Training function
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

# Testing function
def test():
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index)
        pred = out.argmax(dim=1)
        train_correct = pred[data.train_mask] == data.y[data.train_mask]
        val_correct = pred[data.val_mask] == data.y[data.val_mask]
        test_correct = pred[data.test_mask] == data.y[data.test_mask]
        train_acc = int(train_correct.sum()) / int(data.train_mask.sum())
        val_acc = int(val_correct.sum()) / int(data.val_mask.sum())
        test_acc = int(test_correct.sum()) / int(data.test_mask.sum())
    return train_acc, val_acc, test_acc

# Train the model
for epoch in range(1, 201):
    loss = train()
    if epoch % 10 == 0:
        train_acc, val_acc, test_acc = test()
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}')

# Final evaluation
train_acc, val_acc, test_acc = test()
print(f'Final Train Accuracy: {train_acc:.4f}')
print(f'Final Validation Accuracy: {val_acc:.4f}')
print(f'Final Test Accuracy: {test_acc:.4f}')

# Interpret the model
model.eval()
with torch.no_grad():
    node_embeddings = model.conv1(data.x, data.edge_index)

# Calculate feature importance based on the correlation between input features and learned embeddings
feature_importance = torch.zeros(data.num_features)
for i in range(data.num_features):
    correlation = torch.corrcoef(torch.stack([data.x[:, i], node_embeddings.mean(dim=1)]))
    feature_importance[i] = abs(correlation[0, 1])

# Get the top 5 most important features
top_features = feature_importance.argsort(descending=True)[:5]

print("\nTop 5 most important features:")
for idx in top_features:
    print(f"Feature {idx.item()}: {feature_importance[idx].item():.4f}")

Shape of data.x: torch.Size([800, 9])
Shape of data.y: torch.Size([800])
Graph data:
Number of nodes: 800
Number of edges: 668
Shape of data.x: torch.Size([800, 9])
Shape of data.y: torch.Size([800])
Epoch: 010, Loss: 0.2497, Train Acc: 0.9771, Val Acc: 0.9750, Test Acc: 0.9812
Epoch: 020, Loss: 0.0886, Train Acc: 0.9979, Val Acc: 0.9938, Test Acc: 1.0000
Epoch: 030, Loss: 0.0236, Train Acc: 0.9979, Val Acc: 0.9938, Test Acc: 1.0000
Epoch: 040, Loss: 0.0138, Train Acc: 1.0000, Val Acc: 1.0000, Test Acc: 1.0000
Epoch: 050, Loss: 0.0115, Train Acc: 1.0000, Val Acc: 0.9938, Test Acc: 1.0000
Epoch: 060, Loss: 0.0089, Train Acc: 1.0000, Val Acc: 0.9938, Test Acc: 1.0000
Epoch: 070, Loss: 0.0065, Train Acc: 1.0000, Val Acc: 0.9938, Test Acc: 1.0000
Epoch: 080, Loss: 0.0039, Train Acc: 1.0000, Val Acc: 0.9938, Test Acc: 1.0000
Epoch: 090, Loss: 0.0037, Train Acc: 1.0000, Val Acc: 0.9938, Test Acc: 1.0000
Epoch: 100, Loss: 0.0054, Train Acc: 1.0000, Val Acc: 1.0000, Test Acc: 1.0000
Epoch: 110

In [10]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, InnerProductDecoder
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Create DataFrame
data = pd.DataFrame({
    'ip_address': ip_addresses,
    'click_time': click_time,
    'device_type': device_type,
    'os_version': os_version,
    'browser': browser,
    'site_id': site_id,
    'ad_id': ad_id,
    'click_count': click_count,
    'time_on_site': time_on_site,
    'fraud': fraud
})

# Split the data
X = data.drop('fraud', axis=1)
y = data['fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# Create graph structure
def create_graph(X, y):
    edge_index = []
    for i in range(len(X)):
        for j in range(i+1, len(X)):
            if X.iloc[i]['ip_address'] == X.iloc[j]['ip_address']:
                edge_index.append([i, j])
                edge_index.append([j, i])
    
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    x = torch.tensor(X.values, dtype=torch.float)
    
    # Check if y is already a tensor
    if not isinstance(y, torch.Tensor):
        y = torch.tensor(y.values, dtype=torch.long)
    else:
        y = y.long()
    
    # Make sure y has the same length as X
    if len(y) != len(X):
        y = y[:len(X)]
    
    return Data(x=x, edge_index=edge_index, y=y)

# Create graph data
graph_data = create_graph(X, y)
print("Graph created successfully")
print("Number of nodes:", graph_data.num_nodes)
print("Number of edges:", graph_data.num_edges)
print("Shape of graph_data.x:", graph_data.x.shape)
print("Shape of graph_data.y:", graph_data.y.shape)

# Scale the features
scaler = StandardScaler()
graph_data.x = torch.tensor(scaler.fit_transform(graph_data.x), dtype=torch.float)

# Define the Graph Autoencoder model
class GraphAutoencoder(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super(GraphAutoencoder, self).__init__()
        self.encoder = GCNConv(in_channels, hidden_channels)
        self.decoder = InnerProductDecoder()

    def forward(self, x, edge_index):
        z = self.encoder(x, edge_index)
        return z

    def decode(self, z, edge_index):
        return self.decoder(z, edge_index, sigmoid=True)

# Initialize the model
model = GraphAutoencoder(in_channels=graph_data.num_features, hidden_channels=64)

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training function
def train():
    model.train()
    optimizer.zero_grad()
    z = model(graph_data.x, graph_data.edge_index)
    
    # Calculate the loss
    adj_pred = model.decode(z, graph_data.edge_index)
    adj_true = torch.ones(graph_data.num_edges)
    
    loss = F.binary_cross_entropy(adj_pred, adj_true)
    
    loss.backward()
    optimizer.step()
    return loss.item()

# Train the model
for epoch in range(1, 201):
    loss = train()
    if epoch % 10 == 0:
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

# Evaluate the model
model.eval()
with torch.no_grad():
    z = model(graph_data.x, graph_data.edge_index)
    reconstructed_adj = model.decode(z, graph_data.edge_index)

# Calculate reconstruction error for each node
reconstruction_error = torch.zeros(graph_data.num_nodes)
edge_index = graph_data.edge_index.t()
for i, (u, v) in enumerate(edge_index):
    reconstruction_error[u] += (reconstructed_adj[i] - 1) ** 2
    reconstruction_error[v] += (reconstructed_adj[i] - 1) ** 2

# Identify potential fraudulent nodes
threshold = reconstruction_error.mean() + 2 * reconstruction_error.std()
potential_fraud = reconstruction_error > threshold

# Calculate accuracy
accuracy = (potential_fraud == graph_data.y).float().mean().item()
print(f"\nAccuracy: {accuracy:.4f}")

# Get the top 5 most important features based on the encoder weights
feature_importance = model.encoder.lin.weight.abs().mean(dim=0)
top_features = feature_importance.argsort(descending=True)[:5]
feature_names = list(X.columns)

print("\nTop 5 most important features:")
for idx in top_features:
    print(f"{feature_names[idx.item()]}: {feature_importance[idx].item():.4f}")

Shape of X: (1000, 9)
Shape of y: (1000,)
Graph created successfully
Number of nodes: 1000
Number of edges: 1092
Shape of graph_data.x: torch.Size([1000, 9])
Shape of graph_data.y: torch.Size([1000])
Epoch: 010, Loss: 0.0009
Epoch: 020, Loss: 0.0001
Epoch: 030, Loss: 0.0000
Epoch: 040, Loss: 0.0000
Epoch: 050, Loss: 0.0000
Epoch: 060, Loss: 0.0000
Epoch: 070, Loss: 0.0000
Epoch: 080, Loss: 0.0000
Epoch: 090, Loss: 0.0000
Epoch: 100, Loss: 0.0000
Epoch: 110, Loss: 0.0000
Epoch: 120, Loss: 0.0000
Epoch: 130, Loss: 0.0000
Epoch: 140, Loss: 0.0000
Epoch: 150, Loss: 0.0000
Epoch: 160, Loss: 0.0000
Epoch: 170, Loss: 0.0000
Epoch: 180, Loss: 0.0000
Epoch: 190, Loss: 0.0000
Epoch: 200, Loss: 0.0000

Accuracy: 0.5870

Top 5 most important features:
site_id: 0.2947
os_version: 0.2873
browser: 0.2822
ad_id: 0.2806
ip_address: 0.2778


In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import Data, HeteroData
from torch_geometric.nn import HeteroConv, SAGEConv, to_hetero

# Create DataFrame
data = pd.DataFrame({
    'ip_address': ip_addresses,
    'click_time': click_time,
    'device_type': device_type,
    'os_version': os_version,
    'browser': browser,
    'site_id': site_id,
    'ad_id': ad_id,
    'click_count': click_count,
    'time_on_site': time_on_site,
    'fraud': fraud
})

# Split the data
X = data.drop('fraud', axis=1)
y = data['fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# Create heterogeneous graph
def create_heterogeneous_graph(X, y):
    data = HeteroData()
    
    # Add node features
    data['user'].x = torch.tensor(X[['ip_address', 'device_type', 'os_version', 'browser']].values, dtype=torch.float)
    data['site'].x = torch.tensor(X['site_id'].values.reshape(-1, 1), dtype=torch.float)
    data['ad'].x = torch.tensor(X['ad_id'].values.reshape(-1, 1), dtype=torch.float)
    
    # Add edges
    num_nodes = len(X)
    edge_index = torch.tensor([[i, i] for i in range(num_nodes)], dtype=torch.long).t()
    data['user', 'visits', 'site'].edge_index = edge_index
    data['user', 'clicks', 'ad'].edge_index = edge_index
    
    # Add self-loops for user nodes
    user_self_loops = torch.arange(num_nodes).repeat(2, 1)
    data['user', 'self', 'user'].edge_index = user_self_loops
    
    # Add edge features
    data['user', 'visits', 'site'].edge_attr = torch.tensor(X['time_on_site'].values.reshape(-1, 1), dtype=torch.float)
    data['user', 'clicks', 'ad'].edge_attr = torch.tensor(X['click_count'].values.reshape(-1, 1), dtype=torch.float)
    
    # Add labels
    data['user'].y = torch.tensor(y.values, dtype=torch.long)
    
    return data

# Create heterogeneous graph
graph = create_heterogeneous_graph(X, y)
print("Heterogeneous graph created successfully")

# Define the Heterogeneous GNN model
class HGNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, metadata):
        super().__init__()
        self.conv1 = HeteroConv({
            edge_type: SAGEConv((-1, -1), hidden_channels)
            for edge_type in metadata[1]
        })
        self.conv2 = HeteroConv({
            edge_type: SAGEConv((-1, -1), out_channels)
            for edge_type in metadata[1]
        })

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: x.relu() for key, x in x_dict.items()}
        x_dict = self.conv2(x_dict, edge_index_dict)
        return x_dict

# Initialize the model
model = HGNN(hidden_channels=64, out_channels=2, metadata=graph.metadata())
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training function
def train():
    model.train()
    optimizer.zero_grad()
    out = model(graph.x_dict, graph.edge_index_dict)
    loss = F.cross_entropy(out['user'], graph['user'].y)
    loss.backward()
    optimizer.step()
    return loss.item()

# Evaluation function
@torch.no_grad()
def test():
    model.eval()
    out = model(graph.x_dict, graph.edge_index_dict)
    pred = out['user'].argmax(dim=-1)
    accuracy = (pred == graph['user'].y).float().mean().item()
    return accuracy

# Train the model
for epoch in range(1, 2001):
    loss = train()
    if epoch % 10 == 0:
        accuracy = test()
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Accuracy: {accuracy:.4f}')


# Final evaluation
final_accuracy = test()
print(f'\nFinal Accuracy: {final_accuracy:.4f}')

# Get feature importance
feature_importance = None
for edge_type, conv in model.conv1._modules.items():
    if edge_type[0] == 'user':  # We're interested in edges starting from 'user'
        if feature_importance is None:
            feature_importance = conv.lin_l.weight.abs().mean(dim=0)
        else:
            feature_importance += conv.lin_l.weight.abs().mean(dim=0)

if feature_importance is not None:
    # Normalize feature importance
    feature_importance /= len(model.conv1._modules)

    # Get top 4 important features (since we have 4 input features for users)
    top_features = feature_importance.argsort(descending=True)
    feature_names = ['IP', 'Device', 'OS', 'Browser']

    print("\nFeature importance:")
    for idx in range(4):
        print(f"{feature_names[idx]}: {feature_importance[idx].item():.4f}")
else:
    print("\nCouldn't calculate feature importance. No 'user' edges found.")

# Print model structure
print("\nModel structure:")
for edge_type, conv in model.conv1._modules.items():
    print(f"Edge type: {edge_type}")
    print(f"Conv layer: {conv}")
    print()

Shape of X: (1000, 9)
Shape of y: (1000,)
Heterogeneous graph created successfully
Epoch: 010, Loss: 18.8966, Accuracy: 0.5850
Epoch: 020, Loss: 13.0798, Accuracy: 0.5860
Epoch: 030, Loss: 11.4823, Accuracy: 0.4200
Epoch: 040, Loss: 5.1393, Accuracy: 0.4200
Epoch: 050, Loss: 13.0250, Accuracy: 0.4230
Epoch: 060, Loss: 5.9874, Accuracy: 0.5880
Epoch: 070, Loss: 13.6747, Accuracy: 0.5940
Epoch: 080, Loss: 18.2725, Accuracy: 0.5920
Epoch: 090, Loss: 6.9775, Accuracy: 0.4230
Epoch: 100, Loss: 5.7578, Accuracy: 0.5900
Epoch: 110, Loss: 1.4265, Accuracy: 0.5510
Epoch: 120, Loss: 3.1669, Accuracy: 0.5920
Epoch: 130, Loss: 8.9599, Accuracy: 0.5410
Epoch: 140, Loss: 0.7013, Accuracy: 0.5980
Epoch: 150, Loss: 1.5215, Accuracy: 0.4250
Epoch: 160, Loss: 2.3310, Accuracy: 0.5910
Epoch: 170, Loss: 10.3512, Accuracy: 0.5900
Epoch: 180, Loss: 5.3357, Accuracy: 0.5880
Epoch: 190, Loss: 6.7144, Accuracy: 0.5930
Epoch: 200, Loss: 5.4863, Accuracy: 0.4290
Epoch: 210, Loss: 7.8610, Accuracy: 0.5920
Epoch: 

Epoch: 1920, Loss: 0.6483, Accuracy: 0.6090
Epoch: 1930, Loss: 0.6482, Accuracy: 0.6090
Epoch: 1940, Loss: 0.6483, Accuracy: 0.6090
Epoch: 1950, Loss: 0.6481, Accuracy: 0.6090
Epoch: 1960, Loss: 0.6480, Accuracy: 0.6090
Epoch: 1970, Loss: 0.6479, Accuracy: 0.6090
Epoch: 1980, Loss: 0.6479, Accuracy: 0.6090
Epoch: 1990, Loss: 0.6478, Accuracy: 0.6100
Epoch: 2000, Loss: 0.6478, Accuracy: 0.6100

Final Accuracy: 0.6100

Couldn't calculate feature importance. No 'user' edges found.

Model structure:
Edge type: convs
Conv layer: ModuleDict(
  (<user___visits___site>): SAGEConv((-1, -1), 64, aggr=mean)
  (<user___clicks___ad>): SAGEConv((-1, -1), 64, aggr=mean)
  (<user___self___user>): SAGEConv((-1, -1), 64, aggr=mean)
)



In [12]:
import torch.nn as nn

# Prepare sequential data
seq_length = 5
X_seq = []
y_seq = []

for i in range(len(X_train) - seq_length):
    X_seq.append(X_train_scaled[i:i+seq_length])
    y_seq.append(y_train.iloc[i+seq_length])

X_seq = torch.tensor(np.array(X_seq), dtype=torch.float32)
y_seq = torch.tensor(np.array(y_seq), dtype=torch.long)

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

model = LSTMModel(input_size=X_train.shape[1], hidden_size=64, num_layers=2, num_classes=2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Training loop
for epoch in range(10):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_seq)
    loss = criterion(outputs, y_seq)
    loss.backward()
    optimizer.step()

# Evaluation
model.eval()
with torch.no_grad():
    outputs = model(X_seq)
    _, predicted = torch.max(outputs.data, 1)
    acc = (predicted == y_seq).sum().item() / y_seq.size(0)

print("\nLSTM Results:")
print(f"Accuracy: {acc:.4f}")


LSTM Results:
Accuracy: 0.5862


In [13]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout

# Split the data
X = data.drop('fraud', axis=1)
y = data['fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape data for Conv1D
X_train_scaled = np.expand_dims(X_train_scaled, axis=2)
X_test_scaled = np.expand_dims(X_test_scaled, axis=2)

# Build 1D CNN model
model = Sequential([
    Conv1D(filters=32, kernel_size=2, activation='relu', input_shape=(X_train_scaled.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Dropout(0.2),
    Conv1D(filters=64, kernel_size=2, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.2),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)

# Predict and calculate accuracy manually
y_pred = (model.predict(X_test_scaled) > 0.5).astype(int).flatten()
acc = (y_pred == y_test.to_numpy()).sum() / y_test.size

print("\n1D CNN Results:")
print(f"Accuracy: {acc:.4f}")

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5043 - loss: 0.7035 - val_accuracy: 0.6187 - val_loss: 0.6705
Epoch 2/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6020 - loss: 0.6712 - val_accuracy: 0.6187 - val_loss: 0.6710
Epoch 3/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5484 - loss: 0.6938 - val_accuracy: 0.6187 - val_loss: 0.6744
Epoch 4/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5892 - loss: 0.6821 - val_accuracy: 0.6187 - val_loss: 0.6725
Epoch 5/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5592 - loss: 0.6901 - val_accuracy: 0.6187 - val_loss: 0.6701
Epoch 6/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5893 - loss: 0.6796 - val_accuracy: 0.6187 - val_loss: 0.6659
Epoch 7/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━

In [14]:
import torch
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

class Autoencoder(nn.Module):
    def __init__(self, input_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16)
        )
        self.decoder = nn.Sequential(
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Assuming X_train, X_test, y_train, y_test are already defined
# Split training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Convert data to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled)
X_val_tensor = torch.FloatTensor(X_val_scaled)
X_test_tensor = torch.FloatTensor(X_test_scaled)

model = Autoencoder(input_dim=X_train.shape[1])
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 100
batch_size = 64
for epoch in range(num_epochs):
    model.train()
    for i in range(0, len(X_train_tensor), batch_size):
        batch = X_train_tensor[i:i+batch_size]
        optimizer.zero_grad()
        outputs = model(batch)
        loss = criterion(outputs, batch)
        loss.backward()
        optimizer.step()
    
    # Validate
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor)
        val_loss = criterion(val_outputs, X_val_tensor)
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}')

# Evaluation
model.eval()
with torch.no_grad():
    reconstructed_train = model(X_train_tensor)
    reconstructed_test = model(X_test_tensor)
    
    mse = nn.MSELoss(reduction='none')
    mse_loss_train = mse(reconstructed_train, X_train_tensor).mean(axis=1)
    mse_loss_test = mse(reconstructed_test, X_test_tensor).mean(axis=1)
    
    # Normalize the reconstruction error
    mse_loss_train = (mse_loss_train - mse_loss_train.mean()) / mse_loss_train.std()
    mse_loss_test = (mse_loss_test - mse_loss_train.mean()) / mse_loss_train.std()
    
    # Use a percentile for thresholding
    threshold = np.percentile(mse_loss_train.numpy(), 95)
    
    predictions = (mse_loss_test > threshold).int()
    
    # Convert y_test to a numpy array, then to a torch tensor
    y_test_tensor = torch.tensor(y_test.values)
    
    acc = (predictions == y_test_tensor).float().mean().item()

print("\nAutoencoder Results:")
print(f"Accuracy: {acc:.4f}")

Epoch [10/100], Loss: 0.3217, Val Loss: 0.3433
Epoch [20/100], Loss: 0.0670, Val Loss: 0.0844
Epoch [30/100], Loss: 0.0110, Val Loss: 0.0149
Epoch [40/100], Loss: 0.0082, Val Loss: 0.0089
Epoch [50/100], Loss: 0.0069, Val Loss: 0.0079
Epoch [60/100], Loss: 0.0063, Val Loss: 0.0074
Epoch [70/100], Loss: 0.0091, Val Loss: 0.0073
Epoch [80/100], Loss: 0.0075, Val Loss: 0.0099
Epoch [90/100], Loss: 0.0084, Val Loss: 0.0124
Epoch [100/100], Loss: 0.0088, Val Loss: 0.0142

Autoencoder Results:
Accuracy: 0.5800


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Create DataFrame
data = pd.DataFrame({
    'ip_address': ip_addresses,
    'click_time': click_time,
    'device_type': device_type,
    'os_version': os_version,
    'browser': browser,
    'site_id': site_id,
    'ad_id': ad_id,
    'click_count': click_count,
    'time_on_site': time_on_site,
    'fraud': fraud
})

# Split the data
X = data.drop('fraud', axis=1)
y = data['fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled)
y_train_tensor = torch.FloatTensor(y_train.values)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Define the Generator
class Generator(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Generator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, output_dim),
            nn.Tanh()
        )

    def forward(self, x):
        return self.model(x)

# Define the Discriminator
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

# Hyperparameters
latent_dim = 100
input_dim = X_train_scaled.shape[1]
num_epochs = 200
lr = 0.0002

# Initialize Generator and Discriminator
generator = Generator(latent_dim, input_dim)
discriminator = Discriminator(input_dim)

# Loss function and optimizers
criterion = nn.BCELoss()
optimizer_G = optim.Adam(generator.parameters(), lr=lr)
optimizer_D = optim.Adam(discriminator.parameters(), lr=lr)

# Training loop
for epoch in range(num_epochs):
    for i, (real_samples, _) in enumerate(train_loader):
        batch_size = real_samples.size(0)
        
        # Train Discriminator
        optimizer_D.zero_grad()
        
        # Real samples
        real_labels = torch.ones(batch_size, 1)
        outputs = discriminator(real_samples)
        d_loss_real = criterion(outputs, real_labels)
        
        # Fake samples
        noise = torch.randn(batch_size, latent_dim)
        fake_samples = generator(noise)
        fake_labels = torch.zeros(batch_size, 1)
        outputs = discriminator(fake_samples.detach())
        d_loss_fake = criterion(outputs, fake_labels)
        
        # Total Discriminator loss
        d_loss = d_loss_real + d_loss_fake
        d_loss.backward()
        optimizer_D.step()
        
        # Train Generator
        optimizer_G.zero_grad()
        outputs = discriminator(fake_samples)
        g_loss = criterion(outputs, real_labels)
        g_loss.backward()
        optimizer_G.step()
        
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], D Loss: {d_loss.item():.4f}, G Loss: {g_loss.item():.4f}")

# Generate synthetic fraud samples
num_synthetic_samples = 1000
noise = torch.randn(num_synthetic_samples, latent_dim)
synthetic_samples = generator(noise).detach().numpy()

# Scale back the synthetic samples
synthetic_samples_original = scaler.inverse_transform(synthetic_samples)

# Create a DataFrame with synthetic samples
synthetic_df = pd.DataFrame(synthetic_samples_original, columns=X.columns)
synthetic_df['fraud'] = 1  # Label all synthetic samples as fraud

# Combine original and synthetic data
augmented_data = pd.concat([data, synthetic_df], ignore_index=True)

print("Original data shape:", data.shape)
print("Augmented data shape:", augmented_data.shape)

# Calculate the percentage of fraud samples in the augmented dataset
fraud_percentage = (augmented_data['fraud'] == 1).mean() * 100
print(f"Percentage of fraud samples in augmented dataset: {fraud_percentage:.2f}%")


Epoch [10/200], D Loss: 0.8170, G Loss: 1.0999
Epoch [20/200], D Loss: 0.5701, G Loss: 2.4183
Epoch [30/200], D Loss: 0.2808, G Loss: 2.5043
Epoch [40/200], D Loss: 0.2979, G Loss: 2.2014
Epoch [50/200], D Loss: 0.8416, G Loss: 3.6320
Epoch [60/200], D Loss: 0.1717, G Loss: 3.5408
Epoch [70/200], D Loss: 0.3524, G Loss: 1.8411
Epoch [80/200], D Loss: 0.1820, G Loss: 2.7393
Epoch [90/200], D Loss: 0.1959, G Loss: 3.0018
Epoch [100/200], D Loss: 0.3268, G Loss: 2.6795
Epoch [110/200], D Loss: 0.3382, G Loss: 2.7643
Epoch [120/200], D Loss: 0.1907, G Loss: 3.1943
Epoch [130/200], D Loss: 0.3630, G Loss: 2.5350
Epoch [140/200], D Loss: 0.2917, G Loss: 3.0537
Epoch [150/200], D Loss: 0.1255, G Loss: 2.9887
Epoch [160/200], D Loss: 0.4486, G Loss: 2.6179
Epoch [170/200], D Loss: 0.3201, G Loss: 2.8845
Epoch [180/200], D Loss: 0.5896, G Loss: 2.5527
Epoch [190/200], D Loss: 0.1774, G Loss: 2.8180
Epoch [200/200], D Loss: 0.4390, G Loss: 2.4391
Original data shape: (1000, 10)
Augmented data sh

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Create DataFrame
data = pd.DataFrame({
    'ip_address': ip_addresses,
    'click_time': click_time,
    'device_type': device_type,
    'os_version': os_version,
    'browser': browser,
    'site_id': site_id,
    'ad_id': ad_id,
    'click_count': click_count,
    'time_on_site': time_on_site,
    'fraud': fraud
})

# Split the data
X = data.drop('fraud', axis=1)
y = data['fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled)
y_train_tensor = torch.LongTensor(y_train.values)
X_test_tensor = torch.FloatTensor(X_test_scaled)
y_test_tensor = torch.LongTensor(y_test.values)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

class TabularTransformer(nn.Module):
    def __init__(self, input_dim, num_classes, dim_model=32, num_heads=2, num_layers=1, dropout=0.1):
        super(TabularTransformer, self).__init__()
        
        self.embedding = nn.Linear(input_dim, dim_model)
        self.pos_encoder = nn.Linear(1, dim_model)
        
        encoder_layers = nn.TransformerEncoderLayer(d_model=dim_model, nhead=num_heads, dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)
        
        self.fc = nn.Linear(dim_model, num_classes)
        
    def forward(self, x):
        # Add a sequence dimension
        x = x.unsqueeze(1)
        
        x = self.embedding(x)
        
        # Add positional encoding
        batch_size, seq_len, _ = x.size()
        pos = torch.arange(seq_len).unsqueeze(0).repeat(batch_size, 1).unsqueeze(-1).to(x.device)
        pos = self.pos_encoder(pos.float())
        x = x + pos
        
        x = self.transformer_encoder(x)
        
        x = x.squeeze(1)  # Remove the sequence dimension
        x = self.fc(x)
        return x

# Hyperparameters
input_dim = X_train_scaled.shape[1]
num_classes = 2
dim_model = 32
num_heads = 2
num_layers = 1
dropout = 0.1
num_epochs = 100
lr = 0.001

# Initialize the model
model = TabularTransformer(input_dim, num_classes, dim_model, num_heads, num_layers, dropout)
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    # Validation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            outputs = model(batch_X)
            _, predicted = torch.max(outputs.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    
    accuracy = correct / total
    
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}, Accuracy: {accuracy:.4f}")

# Final evaluation
model.eval()
correct = 0
total = 0
y_true = []
y_pred = []
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        _, predicted = torch.max(outputs.data, 1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()
        y_true.extend(batch_y.numpy())
        y_pred.extend(predicted.numpy())

final_accuracy = correct / total
print(f"\nFinal Test Accuracy: {final_accuracy:.4f}")

# Calculate additional metrics
from sklearn.metrics import classification_report, confusion_matrix
print("\nClassification Report:")
print(classification_report(y_true, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred))

# Feature importance (using embedding weights)
def get_feature_importance(model, feature_names):
    model.eval()
    with torch.no_grad():
        # Get the weights of the embedding layer
        embedding_weights = model.embedding.weight.abs().mean(dim=1).numpy()
        
        # Normalize the weights
        embedding_weights = embedding_weights / np.sum(embedding_weights)
        
        # Create a dictionary of feature names and their importance
        feature_importance = dict(zip(feature_names, embedding_weights))
        
        # Sort the features by importance
        sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
        
    return sorted_features

# Get feature importance
feature_names = X.columns
feature_importance = get_feature_importance(model, feature_names)

print("\nFeature Importance:")
for name, importance in feature_importance:
    print(f"{name}: {importance:.4f}")

Epoch [10/100], Loss: 0.0080, Accuracy: 0.9950
Epoch [20/100], Loss: 0.0036, Accuracy: 0.9900
Epoch [30/100], Loss: 0.0009, Accuracy: 0.9950
Epoch [40/100], Loss: 0.0005, Accuracy: 0.9950
Epoch [50/100], Loss: 0.0006, Accuracy: 0.9950
Epoch [60/100], Loss: 0.0003, Accuracy: 0.9950
Epoch [70/100], Loss: 0.0002, Accuracy: 0.9950
Epoch [80/100], Loss: 0.0002, Accuracy: 0.9950
Epoch [90/100], Loss: 0.0001, Accuracy: 0.9950
Epoch [100/100], Loss: 0.0001, Accuracy: 0.9950

Final Test Accuracy: 0.9950

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       116
           1       1.00      0.99      0.99        84

    accuracy                           0.99       200
   macro avg       1.00      0.99      0.99       200
weighted avg       1.00      0.99      0.99       200


Confusion Matrix:
[[116   0]
 [  1  83]]

Feature Importance:
click_count: 0.0346
ad_id: 0.0340
device_type: 0.0314
time_on_site: 0.0313
site_id: 0.

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random

# Create DataFrame
data = pd.DataFrame({
    'ip_address': ip_addresses,
    'click_time': click_time,
    'device_type': device_type,
    'os_version': os_version,
    'browser': browser,
    'site_id': site_id,
    'ad_id': ad_id,
    'click_count': click_count,
    'time_on_site': time_on_site,
    'fraud': fraud
})

# Split the data
X = data.drop('fraud', axis=1)
y = data['fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the DQN model
class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# Define the DRL agent
class DRLAgent:
    def __init__(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.model = DQN(state_dim, action_dim)
        self.target_model = DQN(state_dim, action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters())
        self.memory = deque(maxlen=10000)
        self.batch_size = 32
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_dim)
        state = torch.FloatTensor(state).unsqueeze(0)
        q_values = self.model(state)
        return q_values.argmax().item()

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones)

        current_q_values = self.model(states).gather(1, actions.unsqueeze(1))
        next_q_values = self.target_model(next_states).max(1)[0]
        target_q_values = rewards + (1 - dones) * self.gamma * next_q_values

        loss = nn.MSELoss()(current_q_values.squeeze(), target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())

# Training function
def train_drl_agent(agent, X_train, y_train, epochs):
    for epoch in range(epochs):
        total_reward = 0
        correct_predictions = 0

        for i in range(len(X_train)):
            state = X_train[i]
            action = agent.act(state)
            true_label = y_train.iloc[i]

            reward = 1 if action == true_label else -1
            total_reward += reward
            correct_predictions += int(action == true_label)

            done = (i == len(X_train) - 1)
            next_state = X_train[i+1] if not done else np.zeros_like(state)

            agent.remember(state, action, reward, next_state, done)
            agent.replay()

        if epoch % 10 == 0:
            agent.update_target_model()
            accuracy = correct_predictions / len(X_train)
            print(f"Epoch {epoch}, Total Reward: {total_reward}, Accuracy: {accuracy:.4f}")

# Evaluation function
def evaluate_drl_agent(agent, X_test, y_test):
    correct_predictions = 0
    for i in range(len(X_test)):
        state = X_test[i]
        action = agent.act(state)
        true_label = y_test.iloc[i]
        correct_predictions += int(action == true_label)
    
    accuracy = correct_predictions / len(X_test)
    return accuracy

# Initialize and train the DRL agent
state_dim = X_train_scaled.shape[1]
action_dim = 2  # 0 for non-fraud, 1 for fraud
agent = DRLAgent(state_dim, action_dim)

train_drl_agent(agent, X_train_scaled, y_train, epochs=100)

# Evaluate the agent
test_accuracy = evaluate_drl_agent(agent, X_test_scaled, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Feature importance
feature_importance = agent.model.fc1.weight.abs().mean(dim=0).detach().numpy()
feature_names = X.columns

print("\nFeature Importance:")
for name, importance in sorted(zip(feature_names, feature_importance), key=lambda x: x[1], reverse=True):
    print(f"{name}: {importance:.4f}")

  states = torch.FloatTensor(states)


Epoch 0, Total Reward: 582, Accuracy: 0.8638
Epoch 10, Total Reward: 790, Accuracy: 0.9938
Epoch 20, Total Reward: 796, Accuracy: 0.9975
Epoch 30, Total Reward: 794, Accuracy: 0.9962
Epoch 40, Total Reward: 792, Accuracy: 0.9950
Epoch 50, Total Reward: 798, Accuracy: 0.9988
Epoch 60, Total Reward: 796, Accuracy: 0.9975
Epoch 70, Total Reward: 796, Accuracy: 0.9975
Epoch 80, Total Reward: 798, Accuracy: 0.9988
Epoch 90, Total Reward: 798, Accuracy: 0.9988
Test Accuracy: 0.9900

Feature Importance:
click_count: 0.3623
device_type: 0.2888
site_id: 0.2791
time_on_site: 0.2722
os_version: 0.2428
ip_address: 0.2422
browser: 0.2371
ad_id: 0.2348
click_time: 0.2051


In [18]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
import networkx as nx
from node2vec import Node2Vec
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Create DataFrame
data = pd.DataFrame({
    'ip_address': ip_addresses,
    'click_time': click_time,
    'device_type': device_type,
    'os_version': os_version,
    'browser': browser,
    'site_id': site_id,
    'ad_id': ad_id,
    'click_count': click_count,
    'time_on_site': time_on_site,
    'fraud': fraud
})

# Split the data
X = data.drop('fraud', axis=1)
y = data['fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 1: Create a graph from the data and extract additional features
def create_graph_and_features(data):
    G = nx.Graph()
    for _, row in data.iterrows():
        G.add_node(f"ip_{row['ip_address']}", type='ip')
        G.add_node(f"site_{row['site_id']}", type='site')
        G.add_node(f"ad_{row['ad_id']}", type='ad')
        G.add_edge(f"ip_{row['ip_address']}", f"site_{row['site_id']}")
        G.add_edge(f"ip_{row['ip_address']}", f"ad_{row['ad_id']}")
    
    # Extract additional features
    ip_degree = {node: deg for node, deg in G.degree() if node.startswith('ip_')}
    site_degree = {node: deg for node, deg in G.degree() if node.startswith('site_')}
    ad_degree = {node: deg for node, deg in G.degree() if node.startswith('ad_')}
    
    data['ip_degree'] = data['ip_address'].apply(lambda x: ip_degree.get(f"ip_{x}", 0))
    data['site_degree'] = data['site_id'].apply(lambda x: site_degree.get(f"site_{x}", 0))
    data['ad_degree'] = data['ad_id'].apply(lambda x: ad_degree.get(f"ad_{x}", 0))
    
    return G, data

# Step 2: Generate graph embeddings using node2vec
def generate_embeddings(G):
    node2vec = Node2Vec(G, dimensions=128, walk_length=30, num_walks=200, workers=4)
    model = node2vec.fit(window=10, min_count=1)
    return model

# Step 3: Create a deep neural network
class DeepNN(nn.Module):
    def __init__(self, input_dim):
        super(DeepNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        x = torch.relu(self.fc3(x))
        x = torch.sigmoid(self.fc4(x))
        return x

# Step 4: Combine features and train models
def train_models(X_train, y_train, embeddings):
    # Combine original features with graph embeddings
    X_train_combined = []
    for _, row in X_train.iterrows():
        features = row.values
        ip_embedding = embeddings.wv[f"ip_{row['ip_address']}"]
        site_embedding = embeddings.wv[f"site_{row['site_id']}"]
        ad_embedding = embeddings.wv[f"ad_{row['ad_id']}"]
        combined = np.concatenate([features, ip_embedding, site_embedding, ad_embedding])
        X_train_combined.append(combined)
    X_train_combined = np.array(X_train_combined)
    
    # Train deep neural network
    dnn = DeepNN(X_train_combined.shape[1])
    criterion = nn.BCELoss()
    optimizer = optim.Adam(dnn.parameters(), lr=0.001, weight_decay=1e-5)
    
    for epoch in range(200):  # Increased number of epochs
        optimizer.zero_grad()
        outputs = dnn(torch.FloatTensor(X_train_combined))
        loss = criterion(outputs.squeeze(), torch.FloatTensor(y_train.values))
        loss.backward()
        optimizer.step()
    
    # Train gradient boosting classifier with hyperparameter tuning
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    gbc = RandomizedSearchCV(GradientBoostingClassifier(), param_grid, n_iter=10, cv=3, n_jobs=-1)
    gbc.fit(X_train_combined, y_train)
    
    # Train random forest classifier
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train_combined, y_train)
    
    return dnn, gbc.best_estimator_, rf

# Step 5: Make predictions using ensemble
def predict_ensemble(X_test, dnn, gbc, rf, embeddings):
    X_test_combined = []
    for _, row in X_test.iterrows():
        features = row.values
        ip_embedding = embeddings.wv[f"ip_{row['ip_address']}"]
        site_embedding = embeddings.wv[f"site_{row['site_id']}"]
        ad_embedding = embeddings.wv[f"ad_{row['ad_id']}"]
        combined = np.concatenate([features, ip_embedding, site_embedding, ad_embedding])
        X_test_combined.append(combined)
    X_test_combined = np.array(X_test_combined)
    
    dnn_pred = dnn(torch.FloatTensor(X_test_combined)).detach().numpy()
    gbc_pred = gbc.predict_proba(X_test_combined)[:, 1]
    rf_pred = rf.predict_proba(X_test_combined)[:, 1]
    
    ensemble_pred = (dnn_pred.squeeze() + gbc_pred + rf_pred) / 3
    return (ensemble_pred > 0.5).astype(int)

# Main execution
G, data = create_graph_and_features(data)
X = data.drop('fraud', axis=1)
y = data['fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

embeddings = generate_embeddings(G)
dnn, gbc, rf = train_models(X_train, y_train, embeddings)
y_pred = predict_ensemble(X_test, dnn, gbc, rf, embeddings)

# Evaluate the ensemble model
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Feature importance (from GradientBoostingClassifier)
feature_importance = gbc.feature_importances_
feature_names = list(X.columns) + ['ip_embedding', 'site_embedding', 'ad_embedding']
for name, importance in sorted(zip(feature_names, feature_importance), key=lambda x: x[1], reverse=True):
    print(f"{name}: {importance:.4f}")

Computing transition probabilities:   0%|          | 0/1679 [00:00<?, ?it/s]

Generating walks (CPU: 3): 100%|██████████| 50/50 [00:03<00:00, 15.83it/s]
Generating walks (CPU: 2): 100%|██████████| 50/50 [00:03<00:00, 15.79it/s]
Generating walks (CPU: 4): 100%|██████████| 50/50 [00:03<00:00, 15.66it/s]
Generating walks (CPU: 1): 100%|██████████| 50/50 [00:03<00:00, 15.71it/s]


              precision    recall  f1-score   support

           0       0.99      0.99      0.99       116
           1       0.99      0.99      0.99        84

    accuracy                           0.99       200
   macro avg       0.99      0.99      0.99       200
weighted avg       0.99      0.99      0.99       200


Confusion Matrix:
[[115   1]
 [  1  83]]
click_count: 1.0000
ip_address: 0.0000
click_time: 0.0000
device_type: 0.0000
os_version: 0.0000
browser: 0.0000
site_id: 0.0000
ad_id: 0.0000
time_on_site: 0.0000
ip_degree: 0.0000
site_degree: 0.0000
ad_degree: 0.0000
ip_embedding: 0.0000
site_embedding: 0.0000
ad_embedding: 0.0000
