<a href="https://colab.research.google.com/github/theAkashPrabu/Dessertation/blob/main/abalone_rings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Column names supplied by hand because the file is read with header=None.
columns = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'WholeWeight',
    'ShuckedWeight',
    'VisceraWeight',
    'ShellWeight',
    'Rings',
]

# Read the raw file from the mounted Drive folder and preview the first rows.
data = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/abalone/abalone.data',
    header=None,
    names=columns,
)
data.head()


In [None]:
# Quick structural overview: dimensions, per-column dtypes and null counts.
for heading, summary in (
    ("Shape:", data.shape),
    ("\nData Types:\n", data.dtypes),
    ("\nMissing Values:\n", data.isnull().sum()),
):
    print(heading, summary)


In [None]:
# Number of rows for each distinct value of the Sex column.
data['Sex'].value_counts()


In [None]:
sns.set_theme(style="whitegrid")

def plot_feature_distributions(data, n_cols=3):
    """Plot a histogram with a KDE overlay for each int64/float64 column of ``data``.

    Panels are arranged on a grid ``n_cols`` wide. The finished figure is
    written to ``feature_distributions_kde.png`` in the working directory and
    then shown.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose int64/float64 columns are plotted; other columns are ignored.
    n_cols : int, default 3
        Number of panels per row (must be positive).
    """
    num_cols = data.select_dtypes(include=['int64', 'float64']).columns
    n = len(num_cols)
    # Ceiling division: the former `(n // 3) + 1` produced an extra empty row
    # whenever n was an exact multiple of the column count.
    rows = max(1, -(-n // n_cols))

    fig = plt.figure(figsize=(16, 4 * rows))

    for i, col in enumerate(num_cols, 1):
        ax = fig.add_subplot(rows, n_cols, i)

        # Histogram + KDE
        sns.histplot(data[col], kde=True, bins=25, color="royalblue", edgecolor="black", ax=ax)

        # Title formatting
        ax.set_title(f"Distribution of {col}", fontsize=14)
        ax.set_xlabel("Values")
        ax.set_ylabel("Frequency")

    fig.tight_layout()
    fig.suptitle("Feature Distributions", fontsize=18, y=1.02)
    # The suptitle is placed above the canvas (y > 1); bbox_inches="tight" grows
    # the saved bounding box so the title is not cropped out of the PNG.
    fig.savefig("feature_distributions_kde.png", bbox_inches="tight")
    plt.show()

plot_feature_distributions(data)


In [None]:
# Columns drawn in the 3x3 histogram grid below (eight panels, last slot empty).
continuous_features = ['Length', 'Diameter', 'Height',
    'WholeWeight', 'ShuckedWeight', 'VisceraWeight',
    'ShellWeight', 'Rings']

fig = plt.figure(figsize=(15, 10))

for i, col in enumerate(continuous_features, start=1):
    ax = fig.add_subplot(3, 3, i)
    sns.histplot(data[col], kde=True, bins=20, color="royalblue", edgecolor="black", ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.grid(True)

fig.suptitle("Feature Distributions", fontsize=18, y=1.02)
fig.tight_layout()
# The suptitle sits above the canvas (y > 1); without bbox_inches='tight' the
# saved PNG is cropped to the canvas and loses the title.
fig.savefig('Histogram.png', bbox_inches='tight')
plt.show()


In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
numeric_corr = data.select_dtypes(include=['number']).corr()

plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', fmt=".2f")
heatmap_ax.set_title("Correlation Heatmap ")
plt.savefig("Crr Heat Map")
plt.show()


In [None]:
# Box plot of Rings for each value of Sex.
plt.figure(figsize=(8, 6))
box_ax = sns.boxplot(data=data, x='Sex', y='Rings')
box_ax.set_title("Age (Rings) distribution by Sex")
plt.show()

In [None]:
# Derive Age by adding a fixed 1.5 offset to the ring count, then compare the two columns.
data['Age'] = data['Rings'].add(1.5)
data.loc[:, ['Rings', 'Age']].head()


In [None]:
# Histogram (15 bins) with KDE overlay of the derived Age column.
plt.figure(figsize=(8,5))
sns.histplot(data['Age'], bins=15, kde=True, color='teal')
plt.title('Distribution of Abalone Age')
plt.xlabel('Age (years)')
plt.ylabel('Count')
plt.grid(True)
# Save only after all styling has been applied so the file on disk matches the
# figure that is displayed (previously the save ran before plt.grid).
plt.savefig("dist age")
plt.show()



In [None]:
# Predictor columns plotted against Age, one panel each on a 3x3 grid.
features = [
    'Length', 'Diameter', 'Height',
    'WholeWeight', 'ShuckedWeight',
    'VisceraWeight', 'ShellWeight',
]

plt.figure(figsize=(15, 10))
for position, feature in enumerate(features, start=1):
    panel = plt.subplot(3, 3, position)
    # Points are coloured by Sex.
    sns.scatterplot(data=data, x=feature, y='Age', hue='Sex', ax=panel)
    panel.set_title(f'{feature} vs Age')
plt.tight_layout()

plt.show()

In [None]:
# Correlation of every numeric column with Age, strongest positive first.
age_column_corr = data.corr(numeric_only=True)['Age']
corr = age_column_corr.sort_values(ascending=False)
print("Feature Correlation with Age:", corr, sep="\n")

In [None]:
# Age bands are right-closed intervals: (0, 5], (5, 10], (10, 15], (15, 20], (20, inf).
# The last edge is open-ended because Age = Rings + 1.5: with a hard upper edge
# of 30, any row with Rings >= 29 (Age >= 30.5) fell outside every bin and
# pd.cut labelled it NaN.
bins = [0, 5, 10, 15, 20, float('inf')]
labels = ['Young', 'Adult', 'Mature', 'Old', 'Very Old']
data['Age_Group'] = pd.cut(data['Age'], bins=bins, labels=labels)

# Every row must receive a band; a NaN here would later be dropped or turned
# into a spurious extra class by the encoder.
assert data['Age_Group'].notna().all(), "Some Age values fall outside the Age_Group bins"

sns.countplot(x='Age_Group', hue='Sex', data=data)
plt.title('Age Group Distribution by Sex')
plt.savefig("Age Group Dist by sex")
plt.show()


In [None]:
# Display the frame, which now carries the derived Age and Age_Group columns.
data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Use one encoder per column: refitting a single LabelEncoder on Age_Group
# discarded the Sex mapping, making it impossible to translate Sex codes back.
# LabelEncoder numbers classes in sorted order of their values, so the integer
# codes follow alphabetical order of the labels, not age order.
sex_encoder = LabelEncoder()
data['Sex'] = sex_encoder.fit_transform(data['Sex'])

age_group_encoder = LabelEncoder()
data['Age_Group'] = age_group_encoder.fit_transform(data['Age_Group'])

# Backward-compatible alias: `label` previously ended up fitted on Age_Group.
label = age_group_encoder


In [None]:
# Display the frame again to check the integer-encoded Sex and Age_Group columns.
data

In [None]:
# Predictors: encoded Sex plus the seven physical measurements.
feature_columns = [
    'Sex',
    'Length', 'Diameter', 'Height',
    'WholeWeight', 'ShuckedWeight', 'VisceraWeight', 'ShellWeight',
]
X_feature = data[feature_columns]

# Target: the encoded age band.
Y_target = data['Age_Group']



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_feature,
    Y_target,
    test_size=0.2,
    random_state=42,
)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Fit a baseline XGBoost classifier on the training split.
xgb_settings = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "random_state": 42,
    "eval_metric": 'mlogloss',
}
xgb_clf = XGBClassifier(**xgb_settings)
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)

# Overall accuracy followed by the per-class report on the held-out split.
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Confusion matrix drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
cm_ax = sns.heatmap(cm, annot=True, cmap="Blues", fmt='d')
cm_ax.set_title("Confusion Matrix")
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
plt.show()



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Baseline logistic regression; max_iter is raised to 1000 iterations.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Score the held-out split.
y_pred = log_reg.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

# Confusion matrix drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
cm_ax = sns.heatmap(cm, annot=True, cmap="Blues", fmt='d')
cm_ax.set_title("Confusion Matrix")
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
plt.show()



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Baseline random forest: 300 trees with no depth limit, fixed seed.
rf_settings = {"n_estimators": 300, "max_depth": None, "random_state": 42}
rf_clf = RandomForestClassifier(**rf_settings)
rf_clf.fit(X_train, y_train)

# Score the held-out split.
y_pred = rf_clf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Confusion matrix drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
cm_ax = sns.heatmap(cm, annot=True, cmap="Blues", fmt='d')
cm_ax.set_title("Random Forest Confusion Matrix")
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
plt.show()




In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Base estimator for the hyperparameter search.
# n_jobs is 1 here because RandomizedSearchCV below already runs candidates in
# parallel (n_jobs=-1); letting both layers claim every core makes the XGBoost
# threads and the search workers contend with each other and slows both down.
xgb_clf = XGBClassifier(
    random_state=42,
    eval_metric="mlogloss",
    tree_method="hist",
    n_jobs=1
)

# Candidate values sampled by the randomized search.
param_dist = {
    "n_estimators": [100, 200, 400],
    "max_depth": [3, 4, 5, 6],
    "learning_rate": [0.03, 0.05, 0.1],
    "subsample": [0.7, 0.85, 1.0],
    "colsample_bytree": [0.7, 0.85, 1.0],
    "min_child_weight": [1, 3, 5],
}

# 15 sampled configurations, each scored by 3-fold cross-validated accuracy.
search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_dist,
    n_iter=15,
    scoring="accuracy",
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)

print("Best hyperparameters:", search.best_params_)
# best_estimator_ is the winning configuration as exposed by the fitted search.
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, cmap="Blues", fmt='d')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV


# Base estimator for the grid search.
# NOTE(review): recent scikit-learn releases may emit a deprecation warning for
# liblinear on multiclass targets — confirm against the installed version.
log_reg = LogisticRegression(max_iter=1000, solver='liblinear')

# Parameter grid: regularization strength (C) crossed with penalty type.
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
}

# Exhaustive search over the grid, scored by 3-fold cross-validated accuracy.
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=1,
)

# Fit every parameter combination on the training split.
grid_search.fit(X_train, y_train)

# Report the winning combination.
print("Best hyperparameters:", grid_search.best_params_)

# Predict on the held-out split with the best estimator found.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Overall accuracy followed by the per-class report.
test_accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

# Confusion matrix drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
cm_ax = sns.heatmap(cm, annot=True, cmap="Blues", fmt='d')
cm_ax.set_title("Confusion Matrix")
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
plt.show()




In [None]:

from sklearn.model_selection import RandomizedSearchCV


# Base estimator for the randomized search.
rf = RandomForestClassifier(random_state=42, n_jobs=-1, bootstrap=True)

# Candidate values sampled by the randomized search.
param_dist = {
    "n_estimators": [150, 250, 400],
    "max_depth": [None, 6, 10, 14],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
}

# 15 sampled configurations, each scored by 3-fold cross-validated accuracy.
search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=15,
    scoring="accuracy",
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
search.fit(X_train, y_train)

print("Best hyperparameters:", search.best_params_)

# Score the held-out split with the best estimator found.
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Random Forest Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

# Confusion matrix drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
cm_ax = sns.heatmap(cm, annot=True, cmap="Blues", fmt="d")
cm_ax.set_title("Random Forest Confusion Matrix")
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
plt.show()
