In [None]:
# import libraries
import matplotlib.pyplot as plt  # required by plt.style.use below
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


# set plot theme
plt.style.use('ggplot')

# set dataframe display: lift pandas' column/row limits and widen the
# per-cell and overall width so frames are printed without truncation
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 10000)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 10000)



In [None]:
# As there are multiple outliers within the dataset, we scale the numeric independent
# variables with StandardScaler and one-hot encode the categorical `sex` column.
# NOTE(review): StandardScaler's mean/std are themselves affected by outliers - confirm it is the intended scaler.
# NOTE(review): the preprocessor is fitted on all of train_df, i.e. before the cross-validation
# and train/test splits further down, so held-out rows influence the scaling statistics.
# Wrapping the preprocessor and the model in the (imported) Pipeline would avoid that.

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


numeric_features = ['length', 'diameter', 'height', 'weight', 'shucked_weight', 'viscera_weight', 'shell_weight']
categorical_features = ['sex']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features),
    ],
    sparse_threshold=0,  # always return a dense array so it can be wrapped in a DataFrame
)

# apply transformations
scaled_data = preprocessor.fit_transform(train_df)

# get column headers
column_names = preprocessor.get_feature_names_out()

# Convert transformed data back to Dataframe
scaled_df = pd.DataFrame(scaled_data, columns=column_names)

# add target variable to df; copy it by position because scaled_df has a fresh
# 0..n-1 index and assigning the Series directly would align on train_df's index
scaled_df['age'] = train_df['age'].to_numpy()

# preview the transformed frame
scaled_df.head()

In [None]:
# Separating the target and feature variables
y = scaled_df['age']  # target variable
# independent variables: every column except the target, selected by name so the
# split does not depend on 'age' happening to be the last column
X = scaled_df.drop(columns='age')

# Models

#### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

# Linear regression scored with cross-validation (cv=5) on MAE, MSE and R^2,
# keeping the training-fold scores alongside the test-fold scores
linreg = LinearRegression()

lin_reg_cv = cross_validate(
    linreg,
    X,
    y,
    cv=5,
    scoring=('neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'),
    return_train_score=True,
)

# Print the results
print('Linear Regression Cross Validation Results: ')

for metric in lin_reg_cv:
    print(metric, lin_reg_cv[metric])


In [None]:
# Function to get CV results out

def get_cv_results(model, X, y, no_of_folds=5):

    '''
    Cross-validate ``model`` on ``X`` / ``y``, print every score array and return them.

    Parameters
    ----------
    model :
        Estimator handed unchanged to ``cross_validate``.
    X :
        Independent variables.
    y :
        Target variable.
    no_of_folds : default 5
        Forwarded to ``cross_validate`` as its ``cv`` argument.

    Returns
    -------
    The mapping returned by ``cross_validate``. It is scored with negative mean
    absolute error, negative mean squared error and R^2, and train scores are
    requested as well.
    '''

    cv_results = cross_validate(model, X, y, cv=no_of_folds,
                                scoring=('neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'),
                                return_train_score=True)

    print('Cross Validation Results:')

    for k, v in cv_results.items():
        print(k, v)

    # hand the scores back so callers can store or compare them, not only read the printout
    return cv_results

In [None]:
# 5 folds, matching the direct cross_validate call made for the linear model;
# the trailing ';' keeps the cell output to the printed scores only
get_cv_results(linreg, X, y, no_of_folds=5);

Looking at the cross-validation scores from the Linear Regression model, we can see that the … *(TODO: this sentence is unfinished; complete the interpretation of the scores.)*

#### XGBoost

In [None]:


# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an XGBoost regressor with its default settings on the training split
model = xgb.XGBRegressor().fit(X_train, y_train)