# Phase 1 - Model Testing
## Package Import

In [69]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
import joblib


## Data Import

In [53]:
df = pd.read_csv('../../data/cleaned-yearly-ETFs.csv')
print(df.columns)
df.head()

Index(['fund_symbol', 'price_year', 'avg_open', 'avg_high', 'avg_low',
       'avg_close', 'avg_adj_close', 'avg_transaction_price',
       'avg_transaction_volume', 'avg_transaction_value', 'yearly_risk',
       'yearly_loss'],
      dtype='object')


Unnamed: 0,fund_symbol,price_year,avg_open,avg_high,avg_low,avg_close,avg_adj_close,avg_transaction_price,avg_transaction_volume,avg_transaction_value,yearly_risk,yearly_loss
0,AAA,2020,24.9895,24.996,24.985,24.98875,24.799375,24.799375,6360.0,157558.9,0.0007,-0.002405
1,AAAU,2018,12.182737,12.229579,12.148947,12.171579,12.171579,12.171579,103495.789474,1252533.0,0.006177,-0.01249
2,AAAU,2019,13.92123,13.968056,13.873294,13.920317,13.920317,13.920317,57093.650794,817346.1,0.007424,-0.023196
3,AAAU,2020,17.696047,17.776917,17.577036,17.681818,17.681818,17.681818,411806.719368,7529092.0,0.012494,-0.054978
4,AADR,2010,27.727478,27.799913,27.633565,27.734522,26.044261,26.044261,5478.26087,142481.6,0.00956,-0.034893


## Data Preprocessing

In [59]:
df = df[(df['price_year'] == 2018) | (df['price_year'] == 2019) | (df['price_year'] == 2020)]
df['yearly_risk'].describe(percentiles=[0.25, 0.5, 0.75])
df['yearly_risk'] = (df['yearly_risk'] > 0.10619).astype(int)

# Prepare data
X = df.drop(columns=['fund_symbol','price_year', 'yearly_risk']).to_numpy(dtype=np.float32)
y = df['yearly_risk'].to_numpy(dtype=np.float32)


## Model 1: Preferential Attachment Model

In [72]:
# Create a directed graph for preferential attachment
G = nx.DiGraph()

# initialize graph
for i in range(5):
    G.add_node(i)

# add edges with prefferential attachment method 
for new_node in range(5, len(df)):
    degrees = np.array([G.degree(n) for n in G.nodes()])
    if degrees.sum() > 0:
        probabilities = degrees / degrees.sum()
    #assume unif if small
    else:
        probabilities = np.ones(len(G.nodes())) / len(G.nodes())
    target_node = np.random.choice(G.nodes(), p=probabilities)
    G.add_edge(new_node, target_node)

# get node features and add to df
degree_features = np.array([G.degree(i) for i in range(len(X))]).reshape(-1, 1)
neighbor_degree_features = np.array([np.mean([G.degree(n) for n in G.neighbors(i)]) for i in range(len(X))]).reshape(-1, 1)
X_combined = np.hstack((X, degree_features, neighbor_degree_features))
mask = ~np.isnan(X_combined).any(axis=1)
X_combined = X_combined[mask]
y_pa = y[mask]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_pa, test_size=0.2, random_state=42)

# classify with k-nearest neighbors
model = KNeighborsClassifier(n_neighbors=5)

# fit model with training data
model.fit(X_train, y_train)

# predictions
y_pred = model.predict(X_test)

# evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

#save model
joblib.dump(model, './model_PA.joblib')

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Accuracy: 0.9972875226039783
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1103
           1       0.00      0.00      0.00         3

    accuracy                           1.00      1106
   macro avg       0.50      0.50      0.50      1106
weighted avg       0.99      1.00      1.00      1106



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


['./model_PA.joblib']

## Model 2: Neural Network Model

In [71]:
# Data to tensors
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.int64)  

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# DataLoader
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)

# Get device
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

# Define NN class
class NeuralNetwork(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 2) 
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

# Instantiate model
input_size = X_train.shape[1]
model = NeuralNetwork(input_size).to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

# Test the model
model.eval()
with torch.no_grad():
    X_test = X_test.to(device)
    y_pred_logits = model(X_test)
    y_pred = torch.argmax(y_pred_logits, dim=1).cpu().numpy() 

# classification metrics
accuracy = accuracy_score(y_test.cpu().numpy(), y_pred)
report = classification_report(y_test.cpu().numpy(), y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

#save model
joblib.dump(model, './model_nn.joblib')


  X = torch.tensor(X, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.int64)  # Ensure y is int for classification


Accuracy: 0.9981949458483754
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1106
           1       0.00      0.00      0.00         2

    accuracy                           1.00      1108
   macro avg       0.50      0.50      0.50      1108
weighted avg       1.00      1.00      1.00      1108



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


['model_nn.joblib']

Both models performed with extremely high accuracy. The PA model performed at 99.7% accuracy, whereas the neural network performed at 99.8% accuracy. It should be noted that the PA model required some input firms to be dropped (9). It should also be noted that the neural network requires higher computational power.