In [4]:
# import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import helpers


# Use the ggplot look for every matplotlib figure in this notebook.
plt.style.use('ggplot')

# Lift pandas' display limits so printed frames are neither truncated nor wrapped.
for option, value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 10000),
    ('display.max_rows', None),
    ('display.width', 10000),
):
    pd.set_option(option, value)

In [5]:
# Load the training set and pass it through the project's header-cleaning helper.
train_df = pd.read_csv('../data/train.csv').pipe(helpers.clean_headers)

# Preview the first rows
print(train_df.head())

  sex  length  diameter  height     weight  shucked_weight  viscera_weight  shell_weight  age
0   I  1.5250    1.1750  0.3750  28.973189       12.728926        6.647958      8.348928    9
1   I  1.1000    0.8250  0.2750  10.418441        4.521745        2.324659      3.401940    8
2   M  1.3875    1.1125  0.3750  24.777463       11.339800        5.556502      6.662133    9
3   F  1.7000    1.4125  0.5000  50.660556       20.354941       10.991839     14.996885   11
4   I  1.2500    1.0125  0.3375  23.289114       11.977664        4.507570      5.953395    8


In [6]:
# normalize numerical variables and one hot encode categorical variables

# Feature groups, named once so the transformer definition stays readable.
numeric_features = ['length', 'diameter', 'height', 'weight',
                    'shucked_weight', 'viscera_weight', 'shell_weight']
categorical_features = ['sex']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ],
    # OneHotEncoder emits a sparse matrix by default; a threshold of 0 makes the
    # combined output always dense so it can be handed straight to pd.DataFrame.
    sparse_threshold=0
)

# apply transformations
# NOTE(review): the scaler is fitted on every row here, so the cross-validation
# folds used later share scaling statistics; wrapping the preprocessor and the
# model in a Pipeline would keep each fold's statistics separate.
scaled_data = preprocessor.fit_transform(train_df)

# get column headers
column_names = preprocessor.get_feature_names_out()

# Convert transformed data back to Dataframe. Reuse train_df's index so the
# target assignment below aligns row-for-row even if that index is not 0..n-1.
scaled_df = pd.DataFrame(scaled_data, columns=column_names, index=train_df.index)

scaled_df['age'] = train_df['age'] # add target variable to df

print(scaled_df.head()) # debug

   num__length  num__diameter  num__height  num__weight  num__shucked_weight  num__viscera_weight  num__shell_weight  cat__sex_F  cat__sex_I  cat__sex_M  age
0     0.721238       0.633982     0.292400     0.441804             0.467188             0.569186           0.453376         0.0         1.0         0.0    9
1    -0.755712      -0.840356    -0.794163    -1.025198            -0.993688            -0.978880          -0.926788         0.0         1.0         0.0    8
2     0.243401       0.370707     0.292400     0.110076             0.219924             0.178363          -0.017224         0.0         0.0         1.0    9
3     1.329394       1.634426     1.650603     2.156483             1.824616             2.124622           2.308095         1.0         0.0         0.0   11
4    -0.234435      -0.050532    -0.115061    -0.007598             0.333464            -0.197233          -0.214955         0.0         1.0         0.0    8


In [7]:
# Separate the target from the feature columns
y = scaled_df['age'] # target variable
# Select features by name rather than by position so the split does not depend
# on 'age' being the last column of scaled_df.
X = scaled_df.drop(columns='age') # feature matrix

# Models

#### Linear Regression

In [8]:
# Baseline model: linear regression scored with 5-fold cross validation.
# LinearRegression and cross_validate come from the import cell at the top.
linreg = LinearRegression()

lin_reg_cv = cross_validate(
    linreg, X, y, cv=5,
    scoring=('neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'),
    return_train_score=True,  # keep train scores to compare against test scores
)

# Print the results
print('Linear Regression Cross Validation Results: ')

for k, v in lin_reg_cv.items():
    print(k, v)


Linear Regression Cross Validation Results: 
fit_time [0.01590085 0.00842118 0.01890516 0.01255298 0.00691581]
score_time [0.00370216 0.00126266 0.02033401 0.00135303 0.00127935]
test_neg_mean_absolute_error [-1.47402493 -1.49101239 -1.46569384 -1.49755124 -1.48386011]
train_neg_mean_absolute_error [-1.48501352 -1.47994682 -1.48551906 -1.47837694 -1.48208908]
test_neg_mean_squared_error [-4.49574569 -4.57531402 -4.4214374  -4.66123975 -4.49847563]
train_neg_mean_squared_error [-4.53736852 -4.51706078 -4.55543444 -4.49543106 -4.53621669]
test_r2 [0.55185068 0.55245599 0.55929887 0.54206462 0.5474973 ]
train_r2 [0.55049563 0.55037521 0.5486953  0.55302012 0.55161091]


In [None]:
# Function to get CV results out

def get_cv_results(model, X, y, no_of_folds=5):

    '''
    Cross-validate a model, print every result array and return the results.

    Parameters
    ----------
    model : estimator handed to cross_validate.
    X : feature matrix.
    y : target values.
    no_of_folds : forwarded to cross_validate as its `cv` argument.
        Defaults to 5, the fold count used for the linear regression above.

    Returns
    -------
    dict
        The mapping returned by cross_validate (fit/score times plus train and
        test arrays for negative MAE, negative MSE and R^2), so callers can
        reuse the numbers instead of only reading the printout.
    '''

    cv_results = cross_validate(model, X, y, cv=no_of_folds,
                                scoring=('neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'),
                                return_train_score=True)

    print('Cross Validation Results:')

    for k, v in cv_results.items():
        print(k, v)

    return cv_results

In [None]:
# Cross-validate the linear regression with 5 folds (the fold count must be passed).
get_cv_results(linreg, X, y, 5)

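Looking at the cross-validation scores from the Linear Regression model, we can see that the model explains roughly 55% of the variance in age (test R² between 0.54 and 0.56) with a mean absolute error of about 1.5, and that the train and test scores are nearly identical in every fold, so the model is stable across folds and shows no sign of overfitting.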

#### XGBoost

In [None]:


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a gradient-boosted regressor with the library's default hyperparameters.
model = xgb.XGBRegressor().fit(X_train, y_train)