In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import numpy as np

### Load data

In [2]:
# Read the training and test CSVs (in that order) from the local data/ directory
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

### Explore the data

In [3]:
# Preview the first three rows of the training data
train.iloc[:3]

Unnamed: 0,id,Lat_Dec,Lon_Dec,NO2uM,NO3uM,NH3uM,R_TEMP,R_Depth,R_Sal,R_DYNHT,R_Nuts,R_Oxy_micromol.Kg,Unnamed: 12,PO4uM,SiO3uM,TA1.x,Salinity1,Temperature_degC,DIC
0,1,34.38503,-120.66553,0.03,33.8,0.0,7.79,323,141.2,0.642,0.0,37.40948,,2.77,53.86,2287.45,34.198,7.82,2270.17
1,2,31.418333,-121.998333,0.0,34.7,0.0,7.12,323,140.8,0.767,0.0,64.81441,,2.57,52.5,2279.1,34.074,7.15,2254.1
2,3,34.38503,-120.66553,0.18,14.2,0.0,11.68,50,246.8,0.144,0.0,180.2915,,1.29,13.01,2230.8,33.537,11.68,2111.04


In [4]:
# Count the missing entries in each column of the training data
train.isna().sum()

id                      0
Lat_Dec                 0
Lon_Dec                 0
NO2uM                   0
NO3uM                   0
NH3uM                   0
R_TEMP                  0
R_Depth                 0
R_Sal                   0
R_DYNHT                 0
R_Nuts                  0
R_Oxy_micromol.Kg       0
Unnamed: 12          1454
PO4uM                   0
SiO3uM                  0
TA1.x                   0
Salinity1               0
Temperature_degC        0
DIC                     0
dtype: int64

In [5]:
# Check shape of the dataset as (rows, columns); the row count can be compared
# with the per-column null counts above to spot columns that are entirely empty
train.shape

(1454, 19)

In [6]:
# Drop the 'Unnamed: 12' column, which is missing in every row (1454 of 1454),
# rather than dropping any rows. errors='ignore' keeps this cell safe to
# re-run after the column has already been removed.
train = train.drop(columns='Unnamed: 12', errors='ignore')

# Confirm that no missing values remain
train.isnull().sum()

id                   0
Lat_Dec              0
Lon_Dec              0
NO2uM                0
NO3uM                0
NH3uM                0
R_TEMP               0
R_Depth              0
R_Sal                0
R_DYNHT              0
R_Nuts               0
R_Oxy_micromol.Kg    0
PO4uM                0
SiO3uM               0
TA1.x                0
Salinity1            0
Temperature_degC     0
DIC                  0
dtype: int64

In [7]:
# Check the data types of the columns; the features are passed to
# StandardScaler and the tree model below, which need numeric input
train.dtypes

id                     int64
Lat_Dec              float64
Lon_Dec              float64
NO2uM                float64
NO3uM                float64
NH3uM                float64
R_TEMP               float64
R_Depth                int64
R_Sal                float64
R_DYNHT              float64
R_Nuts               float64
R_Oxy_micromol.Kg    float64
PO4uM                float64
SiO3uM               float64
TA1.x                float64
Salinity1            float64
Temperature_degC     float64
DIC                  float64
dtype: object

### Prepare features and split the data

In [8]:
# Separate the target column from the predictors
# NOTE(review): `id` stays in X and is therefore used as a predictor — confirm this is intended
y = train['DIC']
X = train.drop(columns='DIC')

# Hold out 30% of the rows for evaluation (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=808, test_size=0.3
)

# Learn the scaling parameters from the training split only, then scale it
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

# Apply the same, already-fitted scaling to the held-out split
X_test = scaler.transform(X_test)


### Build the model

In [9]:
# Seed the tree so that feature sub-sampling ('sqrt' / 'log2') and tie-breaking
# between equally good splits are reproducible across kernel restarts.
# 808 is the same seed value used for the train/test split.
dt = DecisionTreeRegressor(random_state=808)

# Hyperparameter values to try; every combination is evaluated
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 6, 7],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Exhaustive 5-fold cross-validated search using all available cores.
# No `scoring` is passed, so candidates are ranked by the estimator's
# default score (R^2 for regressors), not by RMSE.
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
print(f"Best hyperparameters: {best_params}")

# Best model (refit by GridSearchCV on all of X_train with the best hyperparameters)
best_dt_model = grid_search.best_estimator_

Best hyperparameters: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 7}


### Evaluate the model

In [10]:
# Predict on the held-out split with the best model from the grid search
dt_y_test_pred = best_dt_model.predict(X_test)

# RMSE on the held-out split (X_test / y_test), i.e. rows the model was not
# fit on — this is a test-split score, not a training score
dt_test_rmse = np.sqrt(mean_squared_error(y_test, dt_y_test_pred))
print(f"RMSE on held-out test split: {dt_test_rmse:.3f}")

RMSE on training data: 9.082


In [11]:
# Inspect the competition test set: missing values per column, which also
# lists its column names for comparison with the training data
test.isna().sum()

id                   0
Lat_Dec              0
Lon_Dec              0
NO2uM                0
NO3uM                0
NH3uM                0
R_TEMP               0
R_Depth              0
R_Sal                0
R_DYNHT              0
R_Nuts               0
R_Oxy_micromol.Kg    0
PO4uM                0
SiO3uM               0
TA1                  0
Salinity1            0
Temperature_degC     0
dtype: int64

In [12]:
# Align the test set's column name with the training data ('TA1' -> 'TA1.x')
test = test.rename({'TA1': 'TA1.x'}, axis='columns')

In [13]:
# Refit the tuned tree on the full labelled training set, then predict DIC
# for the unlabelled test set (no evaluation happens here: test has no DIC).
# NOTE(review): the grid search was run on standardized features while this
# refit uses raw values; threshold-based tree splits should be unaffected by
# per-feature scaling, but confirm.
X = train.drop('DIC', axis = 1)
y = train['DIC']
best_dt_model.fit(X, y)

# Make predictions on the test data.
# Select the test columns by the training feature names so they match in name
# and order, and so any extra column on `test` does not break a re-run.
y_pred_total = best_dt_model.predict(test[X.columns])

In [14]:
# Create a submission: test ids paired with the predicted DIC.
# Built with assign() on a new frame so `test` itself is not modified; its
# columns stay identical to the training features if earlier cells are re-run.
submission = test[['id']].assign(DIC=y_pred_total)
submission.head()

Unnamed: 0,id,DIC
0,1455,2169.29
1,1456,2194.345
2,1457,2303.119463
3,1458,1995.07
4,1459,2153.156


In [15]:
# Save the submission to csv
# NOTE(review): the write below is commented out, so running the notebook does
# not produce submission.csv; uncomment the line to export the file.
# submission.to_csv('submission.csv', index=False)