In [12]:
# import libraries
import pandas as pd
import numpy as np
import helpers
import cuml, cudf, cupy
import warnings

# import preprocessing libraries and metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


# import models
import xgboost as xgb # XGBoost
from cuml.metrics import mean_absolute_error, mean_squared_error, r2_score
from cuml.ensemble import RandomForestRegressor # Random Forest

# surpress future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# set dataframe display 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 10000)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 10000)

In [13]:
# Load the test split (path is relative to the notebook's working directory)
raw_test_df = pd.read_csv('../data/test.csv')

# Keep the row ids now so they can be re-attached to the predictions later;
# the cleaned frame printed below no longer shows an 'id' column.
test_ids = raw_test_df['id']

# Tidy the column headers with the project helper
# (presumably renames/drops columns — confirm against helpers.clean_headers)
test_df = helpers.clean_headers(raw_test_df)

print(test_df.head())

  sex  length  diameter  height     weight  shucked_weight  viscera_weight  shell_weight
0   I  1.0500    0.7625  0.2750   8.618248        3.657085        1.729319      2.721552
1   I  1.1625    0.8875  0.2750  15.507176        7.030676        3.246018      3.968930
2   F  1.2875    0.9875  0.3250  14.571643        5.556502        3.883882      4.819415
3   F  1.5500    0.9875  0.3875  28.377849       13.380964        6.548735      7.030676
4   I  1.1125    0.8500  0.2625  11.765042        5.528153        2.466407      3.331066


### Normalizing data

In [14]:
# Column groups handled by the preprocessing step
numeric_cols = [
    'length', 'diameter', 'height', 'weight',
    'shucked_weight', 'viscera_weight', 'shell_weight',
]
categorical_cols = ['sex']

# Standardise the numeric measurements and one-hot encode the categorical one.
# NOTE(review): the transformer is *fitted* on test_df below, so the scaling
# mean/std and the encoder categories come from the test set. If the model was
# trained on features scaled with training-set statistics, the two will not
# match exactly — consider reusing the fitted training preprocessor. TODO confirm.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(), categorical_cols),
    ]
)

# fit and transform in a single pass
scaled_data = preprocessor.fit_transform(test_df)

# output names carry the transformer prefix (num__ / cat__)
column_names = preprocessor.get_feature_names_out()

# rebuild a labelled DataFrame from the transformed array
scaled_df = pd.DataFrame(data=scaled_data, columns=column_names)


print(scaled_df.head()) # debug

   num__length  num__diameter  num__height  num__weight  num__shucked_weight  num__viscera_weight  num__shell_weight  cat__sex_F  cat__sex_I  cat__sex_M
0    -0.940227      -1.115249    -0.798762    -1.174409            -1.153502            -1.197655          -1.125492         0.0         1.0         0.0
1    -0.548169      -0.586910    -0.798762    -0.629467            -0.552334            -0.654334          -0.776635         0.0         1.0         0.0
2    -0.112548      -0.164238    -0.256807    -0.703471            -0.815029            -0.425835          -0.538778         1.0         0.0         0.0
3     0.802256      -0.164238     0.420637     0.388656             0.579278             0.528784           0.079651         1.0         0.0         0.0
4    -0.722417      -0.745411    -0.934251    -0.925485            -0.820081            -0.933611          -0.955027         0.0         1.0         0.0


In [15]:
# Feature engineering, same derived columns as built for train_df.
#
# NOTE(review): every input column here is StandardScaler output, i.e. centred
# on zero, so the ratio features divide by values that can be at or near 0 and
# may produce very large or infinite values. Left as-is because the derived
# features have to match what the model saw in training — confirm.
featured_df = scaled_df.assign(
    # ratio of shell weight to entire weight
    shell_weight_ratio=scaled_df['num__shell_weight'] / scaled_df['num__weight'],
    # meat yield
    shucked_weight_ratio=scaled_df['num__shucked_weight'] / scaled_df['num__weight'],
    # ratio of viscera weight to shucked weight
    vis_shucked_weight_ratio=scaled_df['num__viscera_weight'] / scaled_df['num__shucked_weight'],
    # general surface area: pi * (length / 2) * (diameter / 2)
    surface_area=np.pi * 0.5 * scaled_df['num__length'] * 0.5 * scaled_df['num__diameter'],
)


print(featured_df.head())

   num__length  num__diameter  num__height  num__weight  num__shucked_weight  num__viscera_weight  num__shell_weight  cat__sex_F  cat__sex_I  cat__sex_M  shell_weight_ratio  shucked_weight_ratio  vis_shucked_weight_ratio  surface_area
0    -0.940227      -1.115249    -0.798762    -1.174409            -1.153502            -1.197655          -1.125492         0.0         1.0         0.0            0.958347              0.982198                  1.038277      0.823559
1    -0.548169      -0.586910    -0.798762    -0.629467            -0.552334            -0.654334          -0.776635         0.0         1.0         0.0            1.233798              0.877463                  1.184672      0.252683
2    -0.112548      -0.164238    -0.256807    -0.703471            -0.815029            -0.425835          -0.538778         1.0         0.0         0.0            0.765884              1.158582                  0.522478      0.014518
3     0.802256      -0.164238     0.420637     0.388656     

In [16]:
# Convert the feature values to float32 for GPU processing.
# NOTE(review): the return value is not assigned, so this presumably converts
# featured_df in place; the recorded cell output lists every column as
# float32 — confirm against helpers.convert_float32.
helpers.convert_float32(featured_df)

num__length                 float32
num__diameter               float32
num__height                 float32
num__weight                 float32
num__shucked_weight         float32
num__viscera_weight         float32
num__shell_weight           float32
cat__sex_F                  float32
cat__sex_I                  float32
cat__sex_M                  float32
shell_weight_ratio          float32
shucked_weight_ratio        float32
vis_shucked_weight_ratio    float32
surface_area                float32
dtype: object


In [17]:
# Restore the trained booster directly from its saved JSON file
# (path is relative to the notebook's working directory)
bst = xgb.Booster(model_file='best_model.json')

In [18]:
# Copy the engineered features into a CuPy array for GPU processing
cupy_array = cupy.array(featured_df.to_numpy())

# Wrap the CuPy array in a cuDF DataFrame, keeping the original column labels.
# NOTE(review): no later cell visible here reads cudf_df or cupy_array — the
# DMatrix is built from the pandas frame — confirm whether this is still needed.
cudf_df = cudf.DataFrame(cupy_array, columns=featured_df.columns)

In [19]:
# Convert the pandas feature frame to a DMatrix so XGBoost can process it
dtest = xgb.DMatrix(featured_df)

In [20]:
# Score the test DMatrix with the loaded booster; the result becomes the
# 'Age' column of the predictions frame built in the next cell
predictions = bst.predict(dtest)

In [21]:
# Assemble the output table in one chain:
#   1. wrap the raw predictions in a single 'Age' column
#   2. attach the ids captured when the test file was loaded
#   3. put 'id' first
#   4. round each predicted age to the nearest value and cast to integer
predictions_df = (
    pd.DataFrame(predictions, columns=['Age'])
    .assign(id=test_ids.values)
    .loc[:, ['id', 'Age']]
    .assign(Age=lambda frame: np.round(frame['Age']).astype(int))
)

print(predictions_df.head())

      id  Age
0  74051    8
1  74052    8
2  74053   10
3  74054   10
4  74055    7
