In [34]:
import pandas as pd
import numpy as np

In [35]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the data
data = pd.read_csv("sink.csv")

# Identify and interpolate missing values
data = data.interpolate()

# Scale the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Convert the scaled array back to a DataFrame and set the column names
data_scaled = pd.DataFrame(data_scaled, columns=data.columns)


In [36]:
data_scaled.shape

(149585, 45)

In [37]:
data_scaled = data_scaled.iloc[:, 1:]

# Split data into training, validation, and test sets
n_obs = data_scaled.shape[0]
train_size = 0.7
val_size = 0.15
test_size = 0.15

trainIndex = np.random.choice(n_obs, size = round(train_size * n_obs), replace = False)
trainData = data_scaled.iloc[trainIndex, :]
testvalData = data_scaled.iloc[np.setdiff1d(np.arange(n_obs), trainIndex), :]

valIndex = np.random.choice(testvalData.shape[0], size = round(val_size * testvalData.shape[0]), replace = False)
valData = testvalData.iloc[valIndex, :]
testData = testvalData.iloc[np.setdiff1d(np.arange(testvalData.shape[0]), valIndex), :]

In [14]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix

# Load data
data = pd.read_csv("sink.csv")

# Define targets (binary classification problem)
data["sank"] = (data["sank"] > 0).astype(int)
targets = data["sank"].values

# Drop any rows with NaNs
data.dropna(inplace=True)

# Scale data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data.drop(columns=["sank"]).values)

# Split data into training, validation, and test sets
n_obs = data_scaled.shape[0]
train_size = 0.7
val_size = 0.15
test_size = 0.15

trainIndex = np.random.choice(n_obs, size = round(train_size * n_obs), replace = False)
trainData = data_scaled[trainIndex, :]
trainTargets = targets[trainIndex]
testvalData = data_scaled[np.setdiff1d(np.arange(n_obs), trainIndex), :]
testvalTargets = targets[np.setdiff1d(np.arange(n_obs), trainIndex)]

valIndex = np.random.choice(testvalData.shape[0], size = round(val_size * testvalData.shape[0]), replace = False)
valData = testvalData[valIndex, :]
valTargets = testvalTargets[valIndex]
testData = testvalData[np.setdiff1d(np.arange(testvalData.shape[0]), valIndex), :]
testTargets = testvalTargets[np.setdiff1d(np.arange(testvalData.shape[0]), valIndex)]

# Train and validate the neural network
best_auc = 0
for hl in [10, 20, 30, 40, 50, 60]:
    for hn in [10, 20, 30, 40, 50, 60]:
        clf = MLPClassifier(hidden_layer_sizes=(hl, hn), max_iter=500, random_state=0)
        clf.fit(trainData, trainTargets)
        val_pred = clf.predict_proba(valData)[:, 1]
        val_auc = roc_auc_score(valTargets, val_pred)
        if val_auc > best_auc:
            best_auc = val_auc
            best_hl = hl
            best_hn = hn

50 50
0.9888227614256744


In [24]:
# Test the neural network
clf = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=500, random_state=0)
clf.fit(trainData, trainTargets)
test_pred = clf.predict_proba(testData)[:, 1]
test_auc = roc_auc_score(testTargets, test_pred)
print(test_auc)

60 60
0.9888227614256744


In [35]:
import pandas as pd
df = pd.read_csv('sink.csv') # read in the original dataset
# assuming df is your dataframe
feature_names = df.columns[1:3].append(df.columns[4:])
 # get all columns except the last one (which is typically the target variable)
print(feature_names)

Index(['Longitude', 'Latitude', 'AirTempSurface', 'Cloudiness', 'LatentHeat',
       'HumidityMinusTemp', 'Humidity', 'HeatParameter', 'Humidity.1',
       'Pressure', 'SeaAirTempDiff', 'SeaSurfaceTemp',
       'SensibleHeatTransEastward', 'ZonalLatentHeatParameter', 'UWindStress',
       'LatentHeatTransEastward', 'UWind', 'SensibleHeatTransNorthward',
       'MeridonalLatentHeatParameter', 'VWindStress',
       'LatentHeatTransNorthward', 'VWind', 'ScalarWind', 'ScalarWindCubed',
       'calcite', 'u_current', 'v_current', 'ice', 'pressure', 'Temperature',
       'SurfaceVelocity', 'Temperature.1', 'PotentialDensity', 'UCurrent',
       'VCurrent', 'SeaIceConcentration', 'uv_magnitude10',
       'MeridonalSurfaceWindStress10', 'curl10', 'uv_magnitude20',
       'ZonalSurfaceWindStress20', 'MeridonalSurfaceWindStress20', 'curl20'],
      dtype='object')


In [36]:
# Get feature importances
importances = clf.coefs_[0]
# Print feature importances
for i in range(len(feature_names)):
    print(f"{feature_names[i]}: {importances[i].mean()}")

Longitude: 0.056843051121418436
Latitude: -0.0009578692045793535
AirTempSurface: 0.020602952943963828
Cloudiness: 9.518859309471329e-05
LatentHeat: -0.030998211334352974
HumidityMinusTemp: 0.03872318875433142
Humidity: -0.015161107025264138
HeatParameter: -0.025825357921530546
Humidity.1: -0.029194852037949567
Pressure: 0.024247822851889908
SeaAirTempDiff: -0.02729058014519422
SeaSurfaceTemp: -0.005189665442216424
SensibleHeatTransEastward: 0.02342614884839932
ZonalLatentHeatParameter: -0.031053156218541576
UWindStress: -0.0344811732663145
LatentHeatTransEastward: -0.016072876161012068
UWind: -0.0195676709465081
SensibleHeatTransNorthward: 0.001117852423222557
MeridonalLatentHeatParameter: 0.007229430794977195
VWindStress: 0.02551872522301418
LatentHeatTransNorthward: 0.013258316642082919
VWind: 0.006282918297360456
ScalarWind: 0.03151439718672922
ScalarWindCubed: -0.013521687854069627
calcite: -0.008502569004255585
u_current: 0.009772033376130293
v_current: 0.05923074534371144
ice: 0.

In [38]:
# Create a dataframe with feature names and importances
df_importances = pd.DataFrame({'Feature': feature_names, 'Importance': [importances[i].mean() for i in range(len(feature_names))]})

# Sort the dataframe by importance, in descending order
df_importances = df_importances.sort_values(by='Importance', ascending=False)

# Print the sorted dataframe
print(df_importances)
df_importances.to_csv("2nn_importances.csv")

                         Feature  Importance
30               SurfaceVelocity    0.081721
26                     v_current    0.059231
0                      Longitude    0.056843
32              PotentialDensity    0.052801
39                uv_magnitude20    0.050192
27                           ice    0.044356
5              HumidityMinusTemp    0.038723
42                        curl20    0.032148
22                    ScalarWind    0.031514
19                   VWindStress    0.025519
9                       Pressure    0.024248
12     SensibleHeatTransEastward    0.023426
2                 AirTempSurface    0.020603
38                        curl10    0.019505
41  MeridonalSurfaceWindStress20    0.018438
20      LatentHeatTransNorthward    0.013258
25                     u_current    0.009772
18  MeridonalLatentHeatParameter    0.007229
21                         VWind    0.006283
29                   Temperature    0.001502
17    SensibleHeatTransNorthward    0.001118
40      Zo

In [15]:
import numpy as np

# Try different threshold values
thresholds = np.arange(0.1, 1.0, 0.01)
best_threshold = None
best_accuracy = 0.0

for threshold in thresholds:
    # Convert test_pred and testTargets to binary variables
    test_pred_binary = [1 if x >= threshold else 0 for x in test_pred]
    testTargets_binary = [1 if x >= threshold else 0 for x in testTargets]

    # Calculate accuracy score
    accuracy = accuracy_score(testTargets_binary, test_pred_binary)

    # Update best threshold and accuracy if necessary
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_threshold = threshold

# Convert test_pred and testTargets to binary variables using the best threshold
test_pred_binary = [1 if x >= best_threshold else 0 for x in test_pred]
testTargets_binary = [1 if x >= best_threshold else 0 for x in testTargets]

# Get accuracy score
accuracy = accuracy_score(testTargets_binary, test_pred_binary)
print("Best Threshold:", round(best_threshold, 2))
print("Accuracy Score:", accuracy)

# Get confusion matrix
cm = confusion_matrix(testTargets_binary, test_pred_binary)
print("Confusion Matrix:\n", cm)


Best Threshold: 0.37
Accuracy Score: 0.9981386325503355
Confusion Matrix:
 [[37948    21]
 [   50   125]]


In [22]:
clf_

MLPClassifier(hidden_layer_sizes=(60, 60, 60), max_iter=500, random_state=0)

In [16]:
# Train and validate the neural network
best_auc = 0
for hl in [10, 20, 30, 40, 50, 60]:
    clf = MLPClassifier(hidden_layer_sizes=(hl, hn), max_iter=500, random_state=0)
    clf.fit(trainData, trainTargets)
    val_pred = clf.predict_proba(valData)[:, 1]
    val_auc = roc_auc_score(valTargets, val_pred)
    if val_auc > best_auc:
        best_auc = val_auc
        best_hl = hl
        best_hn = hn

# Test the neural network
clf1 = MLPClassifier(hidden_layer_sizes=(best_hl), max_iter=500, random_state=0)
clf1.fit(trainData, trainTargets)
test_pred = clf1.predict_proba(testData)[:, 1]
test_auc = roc_auc_score(testTargets, test_pred)
print(best_hl)
print(test_auc)

30
0.9878935823585406


In [18]:
import numpy as np

# Try different threshold values
thresholds = np.arange(0.1, 1.0, 0.01)
best_threshold = None
best_accuracy = 0.0

for threshold in thresholds:
    # Convert test_pred and testTargets to binary variables
    test_pred_binary = [1 if x >= threshold else 0 for x in test_pred]
    testTargets_binary = [1 if x >= threshold else 0 for x in testTargets]

    # Calculate accuracy score
    accuracy = accuracy_score(testTargets_binary, test_pred_binary)

    # Update best threshold and accuracy if necessary
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_threshold = threshold

# Convert test_pred and testTargets to binary variables using the best threshold
test_pred_binary = [1 if x >= best_threshold else 0 for x in test_pred]
testTargets_binary = [1 if x >= best_threshold else 0 for x in testTargets]

# Get accuracy score
accuracy = accuracy_score(testTargets_binary, test_pred_binary)
print("Best Threshold:", round(best_threshold, 2))
print("Accuracy Score:", accuracy)

# Get confusion matrix
cm = confusion_matrix(testTargets_binary, test_pred_binary)
print("Confusion Matrix:\n", cm)


Best Threshold: 0.38
Accuracy Score: 0.9972734899328859
Confusion Matrix:
 [[37938    31]
 [   73   102]]


In [19]:
# Train and validate the neural network
best_auc = 0
for h1 in [10, 20, 30, 40, 50, 60]:
    for h2 in [10, 20, 30, 40, 50, 60]:
        for h3 in [10, 20, 30, 40, 50, 60]:
            clf = MLPClassifier(hidden_layer_sizes=(h1, h2, h3), max_iter=500, random_state=0)
            clf.fit(trainData, trainTargets)
            val_pred = clf.predict_proba(valData)[:, 1]
            val_auc = roc_auc_score(valTargets, val_pred)
            if val_auc > best_auc:
                best_auc = val_auc
                best_hl = hl
                best_hn = hn

# Test the neural network
clf3 = MLPClassifier(hidden_layer_sizes=(best_hl), max_iter=500, random_state=0)
clf3.fit(trainData, trainTargets)
test_pred = clf3.predict_proba(testData)[:, 1]
test_auc = roc_auc_score(testTargets, test_pred)
print(best_hl)
print(test_auc)

60
0.9908996436942921


In [20]:
import numpy as np

# Try different threshold values
thresholds = np.arange(0.1, 1.0, 0.01)
best_threshold = None
best_accuracy = 0.0

for threshold in thresholds:
    # Convert test_pred and testTargets to binary variables
    test_pred_binary = [1 if x >= threshold else 0 for x in test_pred]
    testTargets_binary = [1 if x >= threshold else 0 for x in testTargets]

    # Calculate accuracy score
    accuracy = accuracy_score(testTargets_binary, test_pred_binary)

    # Update best threshold and accuracy if necessary
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_threshold = threshold

# Convert test_pred and testTargets to binary variables using the best threshold
test_pred_binary = [1 if x >= best_threshold else 0 for x in test_pred]
testTargets_binary = [1 if x >= best_threshold else 0 for x in testTargets]

# Get accuracy score
accuracy = accuracy_score(testTargets_binary, test_pred_binary)
print("Best Threshold:", round(best_threshold, 2))
print("Accuracy Score:", accuracy)

# Get confusion matrix
cm = confusion_matrix(testTargets_binary, test_pred_binary)
print("Confusion Matrix:\n", cm)


Best Threshold: 0.51
Accuracy Score: 0.9972472734899329
Confusion Matrix:
 [[37938    31]
 [   74   101]]


In [21]:
print(h1)
print(h2)
print(h3)

60
60
60
