In [57]:
import pandas as pd
import numpy as np

import mplcyberpunk
import matplotlib.pyplot as plt

from scipy.stats import boxcox

pd.options.display.max_columns
pd.set_option('display.max_rows', 500)

import scipy.stats

plt.style.use("cyberpunk")

from datetime import datetime
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [48]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
s = pd.read_csv("sample_submission.csv")

In [49]:
target = pd.DataFrame(train['Rings'])
train.drop(columns = ['id', 'Rings'], inplace = True)
test.drop(columns=['id'], inplace= True)

train.shape, test.shape

((90615, 8), (60411, 8))

In [50]:
data = pd.concat([train, test], axis = 0).reset_index(drop=True)
data.shape

(151026, 8)

In [51]:
data.rename(columns={'Whole weight.1': 'Shucked weight'}, inplace=True)
data.rename(columns={'Whole weight.2': 'Viscera weight'}, inplace=True)
data.rename(columns={'Whole weight': 'Whole weight'}, inplace=True)
data.rename(columns={'Shell weight': 'Shell weight'}, inplace=True)

In [52]:
data['Sex'] = data['Sex'].astype('category').cat.codes

In [53]:
def boxcox_transform(series):
    series = series + 1e-10
    transformed, _ = boxcox(series)
    return transformed

In [54]:
for i in data.columns:
    if abs(scipy.stats.skew(data[i])) >= 0.5:
        if abs(scipy.stats.skew(np.log1p(data[i])) < 0.5):
            data[i] = np.log1p(data[i])
        

In [55]:
scipy.stats.skew(data)

array([-0.09379876, -0.93272439, -0.86310085,  0.40859532,  0.43184444,
        0.21590715,  0.47648355,  0.4748712 ])

In [56]:
data.columns

Index(['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
       'Viscera weight', 'Shell weight'],
      dtype='object')

In [58]:
scaler = StandardScaler()
scaler.fit(data)

data2 = pd.DataFrame(scaler.transform(data), index=data.index, columns = data.columns)

In [59]:
data2

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
0,-1.325117,0.304335,0.312275,0.378761,-0.039214,0.013068,-0.227315,0.107737
1,-1.325117,0.928483,0.883855,0.247439,0.744160,0.635236,1.061947,0.722876
2,-0.066092,-3.290290,-3.210437,-2.904273,-1.679164,-1.850227,-1.650462,-1.699234
3,1.192934,0.659275,0.743147,0.378761,0.273261,0.245619,0.357811,0.184629
4,-0.066092,0.344278,0.263566,-0.146525,-0.016270,0.216378,-0.093430,-0.219056
...,...,...,...,...,...,...,...,...
151021,-0.066092,-1.455069,-1.447767,-1.328417,-1.337189,-1.412510,-1.417403,-1.353218
151022,-1.325117,0.102667,0.116406,0.247439,0.120301,0.308540,-0.024008,-0.161386
151023,-0.066092,0.620335,0.409185,0.510082,0.726678,0.330183,0.303266,0.299968
151024,-1.325117,1.154671,1.206739,1.429331,1.538459,1.208639,1.865256,1.607138


In [62]:
target['Rings'] = np.log1p(target['Rings'])
scipy.stats.skew(target)

array([-0.01086627])

In [63]:
train2 = data.loc[:train.index.max(), :].copy()
test2 = data.loc[train.index.max()+1:,:].reset_index(drop = True).copy()

In [66]:
from pycaret.regression import *

In [67]:
_ = setup(data=pd.concat([train2, target], axis=1), target = 'Rings')

Unnamed: 0,Description,Value
0,Session id,7042
1,Target,Rings
2,Target type,Regression
3,Original data shape,"(90615, 9)"
4,Transformed data shape,"(90615, 9)"
5,Transformed train set shape,"(63430, 9)"
6,Transformed test set shape,"(27185, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


In [69]:
model = compare_models(sort='RMSLE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.1084,0.0221,0.1485,0.7265,0.0446,0.0468,6.385
lightgbm,Light Gradient Boosting Machine,0.1089,0.0223,0.1492,0.7239,0.0448,0.047,0.486
xgboost,Extreme Gradient Boosting,0.1091,0.0224,0.1496,0.7224,0.0449,0.0471,1.462
gbr,Gradient Boosting Regressor,0.1114,0.0231,0.1521,0.7132,0.0455,0.048,2.51
rf,Random Forest Regressor,0.1118,0.0233,0.1526,0.7115,0.0458,0.0484,13.014
et,Extra Trees Regressor,0.1125,0.0235,0.1534,0.7082,0.0461,0.0487,4.494
knn,K Neighbors Regressor,0.1194,0.0265,0.1628,0.6716,0.0487,0.0514,0.392
ridge,Ridge Regression,0.122,0.0275,0.1657,0.6595,0.0494,0.0524,0.133
br,Bayesian Ridge,0.122,0.0275,0.1657,0.6595,0.0494,0.0524,0.103
lr,Linear Regression,0.122,0.0275,0.1657,0.6595,0.0494,0.0524,0.284


In [72]:
p = np.expm1(model.predict(test2))

In [74]:
s['Rings'] = p

In [75]:
s.to_csv("Submission1.csv", header=True, index = False)