In [286]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso, Ridge, HuberRegressor, TheilSenRegressor, RANSACRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import PolynomialFeatures

### Ignore warnings (`warnings.filterwarnings`)

In [None]:
warnings.filterwarnings('ignore')

### read the data from csv

In [287]:
# Load the train and test CSV files; the 'id' column is dropped from both right away.
train = pd.read_csv('train.csv').drop(columns='id')
test = pd.read_csv('test.csv').drop(columns='id')

### train data = 15.000

In [288]:
# Show the first two training rows as a quick schema check.
train.head(n=2)

Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,I,0.55,0.4125,0.1625,1.715145,0.609514,0.396893,0.56699,4.0
1,F,1.5125,1.2125,0.4,31.312023,13.395139,6.265239,8.930093,10.0


### test data = 10.000

In [289]:
# Show the first two test rows as a quick schema check.
test.head(n=2)

Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight
0,I,0.8625,0.65,0.225,5.854172,2.721552,1.048931,1.743494
1,F,1.2875,1.0,0.325,20.326591,9.412034,4.578444,5.244657


### one-hot encoding the 'Sex' column

In [290]:
# One-hot encode the categorical 'Sex' column in both frames. get_dummies replaces
# it with integer indicator columns named 'Sex_<value>'.
train = pd.get_dummies(train, columns=['Sex'], prefix='Sex', dtype=int)
test = pd.get_dummies(test, columns=['Sex'], prefix='Sex', dtype=int)
# NOTE: a former `train.drop('Sex_Diameter', axis=1)` was removed here. No such
# column exists after encoding (the indicators are Sex_F / Sex_I / Sex_M), so the
# drop raised KeyError on a fresh kernel. Removing it also keeps the train and
# test feature sets identical.


### Volume = Length * Diameter * Height 

In [291]:
# Derived feature: product of the three linear measurements, added to both frames.
for frame in (train, test):
    frame['Volume'] = frame['Length'].mul(frame['Diameter']).mul(frame['Height'])

#### Total Weight  =	Weight  +  Shucked Weight  +  Viscera Weight  +  Shell Weight

In [292]:
# Derived feature: sum of the four weight columns, added to both frames.
# The columns are accumulated left to right in the order listed.
weight_parts = ['Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight']
for frame in (train, test):
    total = frame[weight_parts[0]]
    for part in weight_parts[1:]:
        total = total + frame[part]
    frame['Total Weight'] = total

### Correlation matrix of the training data

In [293]:
# Pairwise correlations of all columns, displayed as a colour-graded table.
corr_matrix = train.corr()
corr_matrix.style.background_gradient()

Unnamed: 0,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age,Sex_F,Sex_I,Sex_M,Volume,Total Weight
Length,1.0,0.991985,0.937515,0.93679,0.913771,0.920313,0.919572,0.62887,0.362826,-0.683768,0.325589,0.939008,0.939374
Diameter,0.991985,1.0,0.939571,0.938413,0.913195,0.920507,0.92394,0.635549,0.367181,-0.689171,0.326711,0.941777,0.940748
Height,0.937515,0.939571,1.0,0.919799,0.878135,0.902474,0.922967,0.670477,0.366353,-0.676845,0.315334,0.951699,0.920746
Weight,0.93679,0.938413,0.919799,1.0,0.970093,0.973898,0.967175,0.618143,0.362778,-0.669136,0.311244,0.978352,0.998426
Shucked Weight,0.913771,0.913195,0.878135,0.970093,1.0,0.945081,0.908897,0.509258,0.335802,-0.637522,0.30602,0.945042,0.9779
Viscera Weight,0.920313,0.920507,0.902474,0.973898,0.945081,1.0,0.938999,0.588998,0.365317,-0.665388,0.30515,0.960788,0.977955
Shell Weight,0.919572,0.92394,0.922967,0.967175,0.908897,0.938999,1.0,0.685685,0.37047,-0.667228,0.301936,0.963776,0.968007
Age,0.62887,0.635549,0.670477,0.618143,0.509258,0.588998,0.685685,1.0,0.308692,-0.533182,0.228801,0.628188,0.607677
Sex_F,0.362826,0.367181,0.366353,0.362778,0.335802,0.365317,0.37047,0.308692,1.0,-0.470981,-0.494895,0.365431,0.362744
Sex_I,-0.683768,-0.689171,-0.676845,-0.669136,-0.637522,-0.665388,-0.667228,-0.533182,-0.470981,1.0,-0.533308,-0.665433,-0.669929


### Boxplots of all numeric columns (currently disabled)

In [256]:
# Disabled exploratory helper (kept commented out; nothing in this cell runs).
# When enabled it draws one seaborn boxplot per numeric column of the given frame.
# def boxplot_outliers(df):
#     numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
#     for column in numeric_columns:
#         plt.figure(figsize=(10, 5))
#         sns.boxplot(x=df[column])
#         plt.title(f'Boxplot of {column}')
#         plt.show()
# boxplot_outliers(train)

### Remove outliers with the IQR rule (currently disabled)

In [257]:
# Disabled outlier filter (kept commented out; nothing in this cell runs).
# When enabled, for every numeric column except 'Age' it keeps only the rows of
# `train` whose value lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. `train` is
# reassigned on each iteration, so later quantiles are computed on the
# already-filtered frame.
# with_outliers=train.drop('Age', axis=1)
# numeric_columns = with_outliers.select_dtypes(include=[np.number]).columns.tolist()
# for column in numeric_columns:
#     Q1 = train[column].quantile(0.25)
#     Q3 = train[column].quantile(0.75)
#     IQR = Q3 - Q1
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
#     train = train[(train[column] >= lower_bound) & (train[column] <= upper_bound)]
# # numeric_columns

### HuberRegressor

In [294]:
# Baseline: a default HuberRegressor scored on a 70/30 hold-out split.
y = train['Age']
X = train.drop(columns=['Age'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = HuberRegressor()
y_pred = model.fit(X_train, y_train).predict(X_test)

baseline_mae = mean_absolute_error(y_test, y_pred)
baseline_r2 = r2_score(y_test, y_pred)
print("MAE: {}".format(baseline_mae))
print("R2 score: {}".format(baseline_r2))


MAE: 1.3600969133187661
R2 score: 0.5833838419790678


### StackingRegressor: PolynomialFeatures pipelines (StandardScaler steps are commented out)

In [297]:
# Hold out 20% of the training data to score the stacked ensemble.
X = train.drop(columns=['Age'])
y = train['Age']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base learners. StandardScaler steps are intentionally left out of every
# pipeline (StandardScaler is not imported in the visible import cell).
pipeline_huber = Pipeline([
    ('model', HuberRegressor(epsilon=1.1))
])
# RANSAC and Theil-Sen draw random subsamples while fitting. Seeding them makes
# the hold-out score and the submission reproducible on Restart & Run All.
pipeline_linear = Pipeline([
    ('poly', PolynomialFeatures(degree=1)),
    ('model', RANSACRegressor(random_state=42))
])
pipeline_ridge = Pipeline([
    ('poly', PolynomialFeatures(degree=3)),
    ('model', Ridge(alpha=10.0))
])
pipeline_theilsen = Pipeline([
    ('poly', PolynomialFeatures(degree=1)),
    ('model', TheilSenRegressor(random_state=42))
])
pipeline_lasso = Pipeline([
    ('poly', PolynomialFeatures(degree=3)),
    ('model', Lasso(alpha=0.0001))
])

# The meta-learner is a HuberRegressor trained on 5-fold out-of-fold predictions
# of the base learners. The 'linear' label is kept for the RANSAC pipeline so
# existing look-ups by estimator name keep working.
stacking_model = StackingRegressor(
    estimators=[
        ('ridge', pipeline_ridge),
        ('huber', pipeline_huber),
        ('linear', pipeline_linear),
        ('theilsen', pipeline_theilsen),
        ('lasso', pipeline_lasso),
    ],
    final_estimator=HuberRegressor(epsilon=1.1),
    cv=5
)


In [298]:
# Fit the stacked ensemble on the training split, then score it on the hold-out split.
stacking_model.fit(X_train, y_train)
y_pred = stacking_model.predict(X_test)

holdout_mae = mean_absolute_error(y_test, y_pred)
holdout_r2 = r2_score(y_test, y_pred)
print("MAE: {}".format(holdout_mae))
print("R2 score: {}".format(holdout_r2))

MAE: 1.2837031636634422
R2 score: 0.6099996492253643


### Prediction for test dataset and save in last_submission.csv

In [299]:
# Fill the sample submission's 'Age' column with the stacked model's predictions
# for the test frame and write the result without the index column.
test_predictions = stacking_model.predict(test)
subm = pd.read_csv('sample_submission.csv').assign(Age=test_predictions)
subm.to_csv('last_submission.csv', index=False)

Huber epsilon tuning and grid-search results ("Eng yaxshi model" is Uzbek for "Best model"):

Eng yaxshi model: Pipeline(steps=[('scaler', StandardScaler()), ('poly', PolynomialFeatures()),
                ('model', HuberRegressor(epsilon=1.1))])
Train MAE: 1.3284540488407133, Test MAE: 1.2844400647616203
Train RMSE: 2.0237052597050873, Test RMSE: 1.993619022611312


Eng yaxshi model: Pipeline(steps=[('scaler', StandardScaler()),
                ('poly', PolynomialFeatures(degree=1)),
                ('model', LinearRegression())])
Train MAE: 1.4145876166347156, Test MAE: 1.3700426710743756
Train RMSE: 2.035709090393528, Test RMSE: 2.0269211672584024


Eng yaxshi model: Pipeline(steps=[('scaler', StandardScaler()),
                ('poly', PolynomialFeatures(degree=3)),
                ('model', Ridge(alpha=10.0))])
Train MAE: 1.3403725835306552, Test MAE: 1.3160749442230226
Train RMSE: 1.943674225589905, Test RMSE: 1.9579454136015901


Eng yaxshi model: Pipeline(steps=[('scaler', StandardScaler()),
                ('poly', PolynomialFeatures(degree=3)),
                ('model', Lasso(alpha=0.001))])
Train MAE: 1.3498526307985017, Test MAE: 1.3185079037079412
Train RMSE: 1.9572480521164486, Test RMSE: 1.9667415227064482

### Sample: grid search over the base-estimator hyperparameters, then stacking

In [272]:
# Tune each base learner with a grid search on a 70/30 split, then stack the
# best pipelines and score the ensemble on the hold-out part.
X = train.drop(columns=['Age'])
y = train['Age']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


def _tune(pipeline, param_grid, features, target):
    """Run a 5-fold grid search scored by negative MAE and return the fitted search.

    Parameters
    ----------
    pipeline : estimator to tune (here: a Pipeline ending in a 'model' step).
    param_grid : dict mapping '<step>__<param>' names to candidate values.
    features, target : training data passed to ``GridSearchCV.fit``.

    Returns
    -------
    The fitted ``GridSearchCV`` instance.
    """
    search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error')
    search.fit(features, target)
    return search


# StandardScaler steps are intentionally left out of every pipeline below.
pipeline_huber = Pipeline([
    ('model', HuberRegressor())
])
param_grid_huber = {
    'model__epsilon': [1.1, 1.35, 1.5]
}
grid_huber = _tune(pipeline_huber, param_grid_huber, X_train, y_train)

pipeline_linear = Pipeline([
    ('poly', PolynomialFeatures()),
    ('model', LinearRegression())
])
param_grid_linear = {
    'poly__degree': [1, 2, 3]
}
grid_linear = _tune(pipeline_linear, param_grid_linear, X_train, y_train)

pipeline_ridge = Pipeline([
    ('poly', PolynomialFeatures()),
    ('model', Ridge())
])
param_grid_ridge = {
    'poly__degree': [1, 2, 3],
    'model__alpha': [0.1, 1.0, 10.0]
}
grid_ridge = _tune(pipeline_ridge, param_grid_ridge, X_train, y_train)

# Theil-Sen fits on random subsamples; the seed makes the search (and the
# stacked model built from it) reproducible across runs.
pipeline_theilsen = Pipeline([
    ('poly', PolynomialFeatures()),
    ('model', TheilSenRegressor(random_state=42))
])
param_grid_theilsen = {
    'poly__degree': [1, 2, 3]
}
grid_theilsen = _tune(pipeline_theilsen, param_grid_theilsen, X_train, y_train)

pipeline_lasso = Pipeline([
    ('poly', PolynomialFeatures()),
    ('model', Lasso())
])
param_grid_lasso = {
    'poly__degree': [1, 2, 3],
    'model__alpha': [0.0001, 0.001, 0.01]
}
grid_lasso = _tune(pipeline_lasso, param_grid_lasso, X_train, y_train)

# Best pipeline found for each estimator family.
best_huber = grid_huber.best_estimator_
best_linear = grid_linear.best_estimator_
best_ridge = grid_ridge.best_estimator_
best_theilsen = grid_theilsen.best_estimator_
best_lasso = grid_lasso.best_estimator_

# NOTE: this rebinds `stacking_model`, replacing any ensemble defined earlier
# in the session.
stacking_model = StackingRegressor(
    estimators=[
        ('ridge', best_ridge),
        ('huber', best_huber),
        ('linear', best_linear),
        ('theilsen', best_theilsen),
        ('lasso', best_lasso),
    ],
    final_estimator=HuberRegressor(epsilon=1.1),
    cv=5
)

stacking_model.fit(X_train, y_train)
y_pred = stacking_model.predict(X_test)

print("MAE: {}".format(mean_absolute_error(y_test, y_pred)))
print("R2 score: {}".format(r2_score(y_test, y_pred)))

MAE: 1.3170369329047842
R2 score: 0.6040159091676702
