In [None]:
# Mount Google Drive (Colab only) so the CSV files read below are reachable
# under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# ML

In [None]:
import pandas as pd
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import random
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

## Load Data

In [None]:
# Read the training and test CSV files from the mounted Drive folder.
training_data = pd.read_csv('/content/drive/MyDrive/DMHW3/training.csv')
test_data = pd.read_csv('/content/drive/MyDrive/DMHW3/test_X.csv')

# Training features are every column except 'lettr' (presumably the class
# label - confirm); the test frame is used unchanged.
X_train = training_data.drop(labels=['lettr'], axis='columns')
X_test = test_data

## Preprocessing

In [None]:
def std(X_train, X_test):
  """Standardize both splits using statistics learned from the training split.

  A single StandardScaler is fitted on ``X_train`` only and then applied to
  both inputs, so ``X_test`` never influences the scaling parameters.

  Args:
    X_train: training features (DataFrame).
    X_test: test features (DataFrame); the fitted scaler is applied to it
      as-is.

  Returns:
    (X_train_scaled, X_test_scaled): scaled training and test features, in
    that order (NumPy arrays under scikit-learn's default output setting).
  """
  fitted_scaler = StandardScaler().fit(X_train)
  scaled_train = fitted_scaler.transform(X_train)
  scaled_test = fitted_scaler.transform(X_test)
  return scaled_train, scaled_test

In [None]:
# Scale both splits via std(): the scaler is fitted on the training features only.
X_train_scaled, X_test_scaled = std(X_train, X_test)

## Train

### OneClassSVM

In [None]:
# Fit a one-class SVM (RBF kernel, gamma='auto') on the standardized training features.
ocsvm = OneClassSVM(kernel='rbf', gamma='auto')
ocsvm.fit(X_train_scaled)

### Isolation Forest

In [None]:
# Fit an Isolation Forest on the standardized training features; random_state
# is pinned so the run is repeatable.
# NOTE(review): contamination=0.1 is a hard-coded guess at the outlier share - confirm.
iso_forest = IsolationForest(contamination=0.1, random_state=42)
iso_forest.fit(X_train_scaled)

## Inference

In [None]:
# Model used by the inference cells below; point this at `ocsvm` to score with
# the one-class SVM instead.
current_model = iso_forest

In [None]:
# Continuous scores from the selected model; in scikit-learn's outlier
# detectors a lower decision_function value means a more abnormal sample.
test_preds = current_model.decision_function(X_test_scaled)
# Hard labels from predict(); not used by the submission cell below.
test_outliers = current_model.predict(X_test_scaled)

# Flip the sign so that a larger value means "more anomalous".
outliers = -test_preds  # Negative of the decision function values

In [None]:
# One row per test sample: 'id' is the 0-based row position, 'outliers' is the
# sign-flipped decision score computed above.
results = pd.DataFrame({'id': range(len(outliers)), 'outliers': outliers})
# Saved relative to the current working directory, without the DataFrame index.
results.to_csv('submission.csv', index=False)

# Rule-based

In [None]:
import pandas as pd
import numpy as np
import csv
import random
import math

## Load data

In [None]:
# Re-read both CSV files so this section can run independently of the ML cells.
training_data = pd.read_csv('/content/drive/MyDrive/DMHW3/training.csv')
test_data = pd.read_csv('/content/drive/MyDrive/DMHW3/test_X.csv')

# Plain NumPy arrays are used here because the scoring below does row-wise
# arithmetic; 'lettr' is removed from the training columns first.
X_train = training_data.drop(labels=['lettr'], axis='columns').to_numpy()
X_test = test_data.to_numpy()

## Nearest-neighbor distance scoring

In [None]:
# Score reduction applied for every exact training match beyond the first.
DUPLICATE_PENALTY = 1.2


def nearest_neighbor_score(sample, train, duplicate_penalty=DUPLICATE_PENALTY):
    """Return the outlier score of one test row against the training set.

    The score is the Euclidean distance to the closest training row. When the
    row occurs verbatim in the training set (distance exactly 0), the score is
    0 for a single match and drops by ``duplicate_penalty`` for each further
    exact match, so frequently repeated rows rank as the least anomalous.

    Args:
        sample: 1-D feature vector of a single test row.
        train: 2-D array of training feature rows.
        duplicate_penalty: amount subtracted per additional exact match.

    Returns:
        float score; larger means farther from the training data. ``math.inf``
        when no training row yields a comparable (non-NaN) distance.
    """
    # One vectorized call per test row replaces a Python-level loop over every
    # training row; results agree with the pairwise loop up to float rounding.
    dists = np.linalg.norm(train - sample, axis=1)

    n_exact = int((dists == 0).sum())
    if n_exact:
        # "0.0 - ..." keeps a single exact match at +0.0 rather than -0.0.
        return 0.0 - duplicate_penalty * (n_exact - 1)

    # NaN distances fail the comparison and are skipped, as in a plain `<` scan.
    positive = dists[dists > 0]
    return float(positive.min()) if positive.size else math.inf


outlier_scores = [nearest_neighbor_score(row, X_train) for row in X_test]
prediction = {'id': list(range(len(outlier_scores))), 'outliers': outlier_scores}

df = pd.DataFrame.from_dict(prediction, orient='columns')
df.to_csv('predicted.csv', index=False)