In [1]:
# Imports
from google.cloud import bigquery, storage
import numpy as np
import os
import numpy as np
import modin.pandas as pd
import dask.dataframe as dd
from sklearn.model_selection import train_test_split
import gcsfs
import ast
from google.oauth2 import service_account
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Ask TensorFlow to allocate GPU memory as needed
os.environ.update({'TF_FORCE_GPU_ALLOW_GROWTH': 'true'})

In [2]:
# Constants
PROJECT_ID = 'cdla-trabalho'
BUCKET_NAME = 'events_trabalho_cdle'
TRAINING_FILE = 'training.csv'
TESTING_FILE = 'testing.csv'

# Location of the service-account key file. The CDLA_SERVICE_ACCOUNT_KEY
# environment variable takes precedence so the notebook is not tied to one
# machine; the original local path is kept as the fallback.
SERVICE_ACCOUNT_KEY_FILE = os.environ.get(
    'CDLA_SERVICE_ACCOUNT_KEY',
    '/Users/rcs/Downloads/cdla-trabalho-db9f3d742a66.json',
)

# Authenticate using the service account key file with specified scopes
credentials = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_KEY_FILE,
    scopes=["https://www.googleapis.com/auth/devstorage.read_write"]
)
fs = gcsfs.GCSFileSystem(project=PROJECT_ID, token=credentials)

# Load CSV files into Dask DataFrames, passing the same project and credentials
# through storage_options
storage_options = {'project': PROJECT_ID, 'token': credentials}
ddf_train = dd.read_csv(f'gs://{BUCKET_NAME}/{TRAINING_FILE}', storage_options=storage_options)
ddf_test = dd.read_csv(f'gs://{BUCKET_NAME}/{TESTING_FILE}', storage_options=storage_options)

# Preprocess the Padded_Sequence column
def eval_sequence(seq):
    """Parse the textual form of a sequence into a Python object.

    Args:
        seq: The raw cell value, expected to be the string form of a Python
            literal. Non-string values (for example a missing value) are
            accepted and treated as unparseable.

    Returns:
        Whatever ``ast.literal_eval`` produces for ``seq``, or an empty list
        when ``seq`` cannot be evaluated as a literal.
    """
    try:
        return ast.literal_eval(seq)
    except (ValueError, TypeError, SyntaxError):
        # TypeError covers text that parses but cannot be built, e.g. a dict
        # literal with an unhashable key; one bad row must not abort the run.
        return []

# Parse the Padded_Sequence column of both frames with eval_sequence; the
# result is declared to Dask as an object-dtype column of the same name
for frame in (ddf_train, ddf_test):
    frame['Padded_Sequence'] = frame['Padded_Sequence'].apply(
        eval_sequence,
        meta=('Padded_Sequence', 'object'),
    )

# Extract more features: mean, std, max, min, length, unique count, variance, skewness, kurtosis
def extract_features(seq):
    """Summarise the values of one parsed sequence as a row of statistics.

    ``seq`` is iterated item by item. A tuple item contributes its second
    element; a list item contributes the second element of every tuple it
    contains. ``None`` values, tuples with fewer than two elements and items
    of any other type are ignored.

    Args:
        seq: Iterable of tuples and/or lists of tuples.

    Returns:
        A ``pd.Series`` of floats with the keys ``mean``, ``std``, ``max``,
        ``min``, ``length``, ``unique``, ``variance``, ``skewness`` and
        ``kurtosis`` (in that order). Every statistic is 0 when no value was
        collected. ``std`` and ``variance`` use the NumPy defaults; skewness
        and kurtosis come from pandas and are reported as 0 when pandas
        cannot compute them (it yields NaN for very short samples).
    """
    values = []
    for item in seq:
        # Treat a lone tuple like a one-element group so both shapes share one path.
        candidates = item if isinstance(item, list) else [item]
        values.extend(
            entry[1]
            for entry in candidates
            if isinstance(entry, tuple) and len(entry) > 1 and entry[1] is not None
        )

    if not values:
        # Float zeros keep the row dtype identical to the non-empty case.
        return pd.Series({
            'mean': 0.0,
            'std': 0.0,
            'max': 0.0,
            'min': 0.0,
            'length': 0.0,
            'unique': 0.0,
            'variance': 0.0,
            'skewness': 0.0,
            'kurtosis': 0.0
        })

    sample = pd.Series(values)
    skewness = sample.skew()
    kurtosis = sample.kurtosis()

    return pd.Series({
        'mean': np.mean(values),
        'std': np.std(values),
        'max': np.max(values),
        'min': np.min(values),
        'length': len(values),
        'unique': len(set(values)),
        'variance': np.var(values),
        # NaN here would flow straight into the scaler and the model.
        'skewness': 0.0 if np.isnan(skewness) else skewness,
        'kurtosis': 0.0 if np.isnan(kurtosis) else kurtosis
    })

# Apply feature extraction
# Output schema declared to Dask, shared by the train and test frames. Every
# column is float64: for a non-empty sequence extract_features puts the counts
# and the float statistics into one Series, which upcasts the counts.
FEATURE_META = {
    'mean': 'float64', 'std': 'float64', 'max': 'float64', 'min': 'float64',
    'length': 'float64', 'unique': 'float64',
    'variance': 'float64', 'skewness': 'float64', 'kurtosis': 'float64'
}
ddf_train_features = ddf_train['Padded_Sequence'].apply(extract_features, meta=FEATURE_META)
ddf_test_features = ddf_test['Padded_Sequence'].apply(extract_features, meta=FEATURE_META)

# Materialise the lazy Dask collections
print("Computing train features...")
X_train = ddf_train_features.compute()
print("Computing train targets...")
y_train = ddf_train['LOS'].compute()

print("Computing test features...")
X_test = ddf_test_features.compute()
print("Computing test targets...")
y_test = ddf_test['LOS'].compute()

# Remove constant features. The columns to keep are decided on the training
# set only and the same selection is applied to the test set, so both matrices
# always carry identical columns into the scaler and the model.
X_train = X_train.loc[:, X_train.nunique() > 1]
X_test = X_test.loc[:, X_train.columns]

# Normalize the features (statistics are learned from the training set only)
# NOTE(review): the scaler is fitted before the train/validation split below,
# so validation rows contribute to the scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hold out 20% of the training rows as a validation set
RANDOM_STATE = 42
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=RANDOM_STATE, shuffle=True
)

# Create Random Forest Regressor model
rf = RandomForestRegressor(random_state=RANDOM_STATE)

# Define a parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Exhaustive grid search with 5-fold cross-validation on the training split,
# using all available cores
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_split, y_train_split)

# Best model from grid search
best_rf = grid_search.best_estimator_

# Predictions
y_pred_train = best_rf.predict(X_train_split)
y_pred_val = best_rf.predict(X_val_split)
y_pred_test = best_rf.predict(X_test_scaled)

# Mean squared error on each split
mse_train = mean_squared_error(y_train_split, y_pred_train)
mse_val = mean_squared_error(y_val_split, y_pred_val)
mse_test = mean_squared_error(y_test, y_pred_test)

# R-squared on each split
r2_train = r2_score(y_train_split, y_pred_train)
r2_val = r2_score(y_val_split, y_pred_val)
r2_test = r2_score(y_test, y_pred_test)

print(f"Train MSE: {mse_train}")
print(f"Validation MSE: {mse_val}")
print(f"Test MSE: {mse_test}")

print(f"Train R2: {r2_train}")
print(f"Validation R2: {r2_val}")
print(f"Test R2: {r2_test}")


Computing train features...
