In [None]:
# Competition Link  :   https://www.kaggle.com/competitions/playground-series-s3e16/data

# Evaluation : Mean Absolute Error (MAE)

In [None]:
import pandas as pd
import numpy as np 
from ydata_profiling import ProfileReport

In [None]:
# Load the competition training set; the path is relative to the notebook's working directory.
data = pd.read_csv('data/train.csv')

def Overview(data, output_file='output.html'):
    """Build a ydata-profiling report for ``data`` and write it to an HTML file.

    Parameters
    ----------
    data
        Dataset handed to ``ProfileReport``.
    output_file : str, default 'output.html'
        Where the rendered report is written. The default keeps the original
        behaviour of writing ``output.html`` in the working directory.
    """
    profile = ProfileReport(data)
    profile.to_file(output_file)


# Overview(data)


In [None]:
# Dimensions (rows, columns) of the training frame.
data.shape

In [None]:
# Summary statistics (pandas `describe`) of the training frame.
data.describe()

In [None]:
# Univariate analysis is already covered by the profile report, so we focus on multivariate data analysis here

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px
%matplotlib inline

In [None]:
# Scatter of Length against Age, coloured by Sex.
px.scatter(data, x='Age', y='Length', color='Sex')

In [None]:
# Histogram of Age, with one colour per Sex.
px.histogram(data, x='Age', color='Sex')

In [None]:
# From this graph we can clearly see that Sex affects the average Age in the data

In [None]:
def MultiVarient(data, y, x='Age', color='Sex'):
    """Render an interactive scatter of column ``y`` against column ``x``.

    Points are coloured by the ``color`` column and the figure is shown
    immediately; nothing is returned.
    """
    chart_title = f"Data of Column is   :  {y}"
    px.scatter(data, x=x, y=y, color=color, title=chart_title).show()


In [None]:
# sample_data = data.sample(2000)

# for column in data.columns :
#     if column not in ('Sex', 'id', 'Age') :
#         MultiVarient(data, column)

In [None]:
# Outcomes :
# 1 -> Length has a linear type of relation with Age
# 2 -> The length of "I" crabs is smaller than that of males and females
# 3 -> Diameter has a linear type of relation with Age
# 4 -> At an early age, female crabs have a larger diameter than males and "I" crabs
# 5 -> "I" crabs have a smaller diameter than other crabs
# 6 -> "I" crabs have a smaller height than other crabs
# 7 -> Female crabs weigh more on average than other crabs
# 8 -> "I" crabs also have small heights

In [None]:
from scipy import stats

def CheckSkewness(data, column):
    """Show a density plot and a normal Q-Q plot of ``data[column]`` side by side.

    Parameters
    ----------
    data
        Container indexable by column name (the notebook passes a DataFrame).
    column : str
        Name of the column whose distribution is inspected.
    """
    _, (kde_ax, qq_ax) = plt.subplots(1, 2, figsize=(14, 8))

    # Left panel: kernel density estimate.
    kde_ax.set_title(f"The Density Plot of {column}")
    sns.kdeplot(data[column], ax=kde_ax)

    # Right panel: quantiles against a normal distribution.
    stats.probplot(data[column], dist='norm', plot=qq_ax)
    # probplot writes its own generic title onto the axes it draws on, so the
    # column-specific title has to be applied after the call, not before.
    qq_ax.set_title(f"The Q-Q Plot of {column}")

    plt.tight_layout()
    plt.show()

In [None]:
# Inspect the distribution of the target column first.
CheckSkewness(data, 'Age')

In [None]:
# Numeric columns only; later cells reuse `numrical_column`, so it is kept unfiltered.
numrical_column = data.select_dtypes(np.number)
for column in numrical_column.columns:
    # `id` is used as the row index/identifier elsewhere in this notebook, so
    # its distribution is not informative; the density-overlay loop skips it too.
    if column == 'id':
        continue
    CheckSkewness(data, column=column)

In [None]:
def CreatingDistPlot(data, column, target='Age'):
    """Overlay the density of ``column`` and of ``target`` on a single plot.

    Both curves are labelled with their column name so the legend tells them
    apart; the figure is shown immediately and nothing is returned.
    """
    plt.title(f"The Density Plot of {column}")
    # Feature first, target second, so the drawing order stays the same.
    for series_name in (column, target):
        sns.kdeplot(data[series_name], label=series_name)
    plt.legend()
    plt.show()

In [None]:
# Single example: density of Length overlaid on the density of Age.
CreatingDistPlot(data, 'Length', 'Age')

In [None]:
# Overlay each numeric column (except `id`) on the default target density.
for column in filter(lambda name: name != 'id', numrical_column.columns):
    CreatingDistPlot(data, column)

In [None]:
# Now check the outliers.
# Both frames get the `id` values as their row index; the `id` column itself
# is not dropped from `data`, so it stays available as a regular column too.
data.index = data['id']
numrical_column.index = data['id']

In [None]:
# One box plot per numeric column to eyeball outliers.
for column in numrical_column.columns:
    # A box plot of the `id` values says nothing about outliers, so skip it.
    if column == 'id':
        continue
    print(column)
    sns.boxplot(data[column])
    plt.show()

In [None]:
# Age also has outliers, so we ignore them for now; we can revisit this decision later on

In [None]:
# One Hot Encoding 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode `Sex` (first level dropped) and pass every other column through;
# `set_output` keeps the result a DataFrame with readable column names.
# `sparse_output` replaces the `sparse` keyword, which scikit-learn deprecated in
# 1.2 and later removed; `set_output` already requires 1.2+, so nothing is lost.
trf1 = ColumnTransformer([
    ('ohe', OneHotEncoder(sparse_output=False, drop='first'), ['Sex'])
], remainder='passthrough', verbose_feature_names_out=False).set_output(transform='pandas')

In [None]:
# Splitting the data into train and test
from sklearn.model_selection import train_test_split
# Features are every column except the identifier and the target.
inputs = data.drop(['id', 'Age'], axis=1)
targets = data['Age']

# Hold out 20% of the rows for validation. The fixed seed makes the split, and
# every score computed from it, reproducible across kernel restarts.
X_train, x_test, y_train, y_test = train_test_split(
    inputs, targets, test_size=0.2, random_state=42
)

In [None]:
# Bring the feature distributions closer to a normal distribution
# Most of the data is left-skewed, so we cannot apply a log transformation

from sklearn.preprocessing import PowerTransformer

# Columns that receive the power transform; everything else passes through untouched.
power_transform_columns = [
    'Length', 'Diameter', 'Height', 'Weight',
    'Shucked Weight', 'Viscera Weight', 'Shell Weight',
]
normalize_step = ('normalize', PowerTransformer(), power_transform_columns)
trf2 = ColumnTransformer(
    [normalize_step],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform='pandas')

In [None]:
# Making the pipeline
from sklearn.pipeline import Pipeline
preprocessing_steps = [
    ('encoding', trf1),
    ('normalize', trf2),
]
pipe = Pipeline(steps=preprocessing_steps)

# Fit on the training split only, then apply the fitted transforms to the hold-out split.
# NOTE: X_train is rebound to its transformed version here, so re-running this
# cell without re-running the split cell feeds already-encoded data back in.
X_train = pipe.fit_transform(X_train)
X_test = pipe.transform(x_test)


In [None]:
# Inspect the preprocessed training features.
X_train

In [None]:
# Model Training 

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

In [None]:
def CheckInitialModelPerformances(X_train, y_train, X_test, y_test):
    """Fit a fixed set of baseline regressors and score each on the hold-out split.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit every model.
    X_test, y_test
        Hold-out features and target used for scoring.

    Returns
    -------
    dict
        Maps each model label (``'Linear'``, ``'Ridge'``, ``'KNN'``,
        ``'Random'``, ``'XGB'``, ``'LightBGM'``, ``'CatB'``) to its mean
        absolute error on the hold-out split; lower is better.
    """
    model_dict = {
        'Linear': LinearRegression(),
        'Ridge': Ridge(alpha=0.2),
        'KNN': KNeighborsRegressor(n_neighbors=4, n_jobs=-1),
        'Random': RandomForestRegressor(random_state=42, n_jobs=-1),
        # 'reg:squarederror' is the current name of the squared-error objective;
        # 'reg:linear' is its deprecated alias.
        'XGB': XGBRegressor(random_state=42, objective='reg:squarederror', eval_metric='mae'),
        'LightBGM': LGBMRegressor(random_state=42, n_jobs=-1, objective='regression', metric='mae'),
        'CatB': CatBoostRegressor(random_state=42, loss_function='MAE', verbose=False),
    }

    mae_by_model = {}
    for model_name, model in model_dict.items():
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        mae_by_model[model_name] = mean_absolute_error(y_test, pred)
    return mae_by_model



In [None]:
# Score every baseline model on the hold-out split; `models` maps model label -> MAE.
models = CheckInitialModelPerformances(X_train, y_train, X_test, y_test)

In [None]:
# Show the MAE of each baseline model.
models

In [None]:
# `models` maps model label -> validation MAE, so its smallest entry is the best performer.
# Despite its name, `best_model` holds that lowest MAE value, not an estimator.
best_model_key, best_model = min(models.items(), key=lambda entry: entry[1])


In [None]:
# Show the lowest MAE found above (this variable holds the score, not an estimator).
best_model

In [None]:
def HyperParameterTuning(model_name, **params):
    """Fit one tree-based regressor with trial hyper-parameters and print its hold-out MAE.

    The model is trained on the notebook-level ``X_train``/``y_train`` and
    scored on ``X_test``/``y_test``. The result is printed; nothing is returned.

    Parameters
    ----------
    model_name : str
        One of ``'Random'``, ``'XGB'``, ``'LightBGM'`` or ``'CatB'``.
    **params
        Extra keyword arguments forwarded to the chosen estimator's
        constructor, on top of the fixed baseline settings.

    Raises
    ------
    KeyError
        If ``model_name`` is not one of the supported labels.
    """
    # Factories instead of instances: only the requested estimator is ever
    # built, and ``params`` is applied to that estimator alone.
    model_factories = {
        'Random': lambda: RandomForestRegressor(random_state=42, n_jobs=-1, **params),
        # 'reg:squarederror' is the current name for the deprecated 'reg:linear'.
        'XGB': lambda: XGBRegressor(random_state=42, objective='reg:squarederror', eval_metric='mae', **params),
        'LightBGM': lambda: LGBMRegressor(random_state=42, n_jobs=-1, objective='regression', metric='mae', **params),
        'CatB': lambda: CatBoostRegressor(random_state=42, loss_function='MAE', verbose=False, **params),
    }

    if model_name not in model_factories:
        raise KeyError(
            f"Unknown model_name {model_name!r}; expected one of {sorted(model_factories)}"
        )

    model = model_factories[model_name]()

    print("Model  :   ", model_name)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, pred)
    print("Mean Absolute Error :  ", mae)

In [None]:
# CatBoost trial; the default configuration still scored best.
catboost_trial = dict(iterations=200, depth=6, learning_rate=0.3)
HyperParameterTuning('CatB', **catboost_trial)

In [None]:
# XGBoost trial configuration.
xgb_trial = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 5,
    'gamma': 0.2,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
}
HyperParameterTuning('XGB', **xgb_trial)


In [None]:
# LightGBM trial configuration.
lgbm_trial = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 8,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'min_child_samples': 15,
    'num_leaves': 28,
}
HyperParameterTuning('LightBGM', **lgbm_trial)


In [None]:
# Random forest trial configuration.
random_forest_trial = {
    'n_estimators': 200,
    'max_depth': 8,
    'min_samples_split': 3,
    'min_samples_leaf': 2,
    'max_features': 9,
    'bootstrap': True,
}
HyperParameterTuning('Random', **random_forest_trial)


In [None]:
# End-to-end pipeline: encoding, power transform, then CatBoost with its default settings.
# NOTE(review): trf1 and trf2 are the same transformer objects already used by
# `pipe`; fitting this pipeline presumably refits them in place — confirm that is intended.
final_model = CatBoostRegressor(random_state=42, loss_function='MAE', verbose=False)
final_pipe = Pipeline(steps=[
    ('encoding', trf1),
    ('Normalizing', trf2),
    ('model', final_model),
])

In [None]:
# Load the competition test set, using the first CSV column as the row index.
test_data = pd.read_csv('data/test.csv',index_col=[0])

In [None]:
# Inspect the test frame.
test_data

In [None]:
# Fit preprocessing and model on all labelled rows (`inputs`/`targets` are the full, unsplit training data).
final_pipe.fit(inputs, targets)

In [None]:
# Predict Age for the test rows with the fitted pipeline.
pred = final_pipe.predict(test_data)

In [None]:
# Start from the sample submission file so the output keeps its layout.
predictions = pd.read_csv('data/sample_submission.csv')

In [None]:
# Overwrite the Age column with the model output.
# NOTE(review): presumably a positional assignment, which assumes the sample
# submission rows are in the same order as test.csv — TODO confirm.
predictions['Age'] = pred

In [None]:
# Write the submission file without the pandas row index.
predictions.to_csv('prediction.csv', index=False)

In [116]:
# this model is one of the top 35% models