In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score


from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the Dry Bean dataset from the local datasets folder (path is relative to
# the notebook's working directory) and preview the first five rows.
df = pd.read_csv('./datasets/Dry_Bean_Dataset.csv')
df.head()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.272751,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,SEKER
2,29380,624.11,212.82613,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,SEKER
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,SEKER
4,30140,620.134,201.847882,190.279279,1.060798,0.33368,30417,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,SEKER


In [None]:
# (rows, columns) of the loaded dataset.
df.shape

In [None]:
# Column names, non-null counts and dtypes in one summary.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row (first occurrences are not counted).
df.duplicated().sum()

In [None]:
# Remove the duplicated rows from df in place, keeping the first occurrence of each.
# NOTE(review): the index is not reset, so index labels have gaps after this cell.
df.drop_duplicates(inplace=True)

In [None]:
# Data type of every column.
df.dtypes

In [None]:
# Summary statistics of the numeric columns, transposed so that each feature is one row.
df.describe().T

# Univariate Analysis

In [None]:
# Horizontal box plot of every numeric feature, arranged on a 3-column grid.
numeric_cols = df.select_dtypes(include=['number']).columns
total = len(numeric_cols)
cols = 3
rows = int(np.ceil(total / cols))

fig, axes = plt.subplots(rows, cols, figsize=(6 * cols, 3 * rows))
axes = axes.flatten()

# One panel per feature; zip stops once the features run out.
for ax, col in zip(axes, numeric_cols):
    sns.boxplot(df[col], ax=ax, orient='h', color='salmon')
    ax.set_title(col)

# Blank out the panels left over when the grid has more cells than features.
for ax in axes[total:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Histogram (with KDE overlay) of every numeric feature, arranged on a 3-column grid.
numeric_cols = df.select_dtypes(include=['number']).columns
total = len(numeric_cols)
cols = 3
rows = int(np.ceil(total / cols))

fig, axes = plt.subplots(rows, cols, figsize=(6 * cols, 3 * rows))
axes = axes.flatten()

# One panel per feature; zip stops once the features run out.
for ax, col in zip(axes, numeric_cols):
    sns.histplot(df[col], ax=ax, color='salmon', kde=True)
    ax.set_title(col)

# Blank out the panels left over when the grid has more cells than features.
for ax in axes[total:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Target Variable Analysis
# Share of each bean class in the dataset, drawn as a pie chart.
class_counts = df['Class'].value_counts()

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    class_counts,
    labels=class_counts.index,
    autopct='%1.1f%%',
    colors=sns.color_palette("Set2"),
)
ax.set_title("Target Variable Distribution")
ax.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()


In [None]:
# Absolute number of samples per bean class.
fig, ax = plt.subplots()
sns.countplot(data=df, x='Class', ax=ax)
plt.show()

In [None]:
# Pairwise correlation between the numeric features, shown as an annotated heatmap.
corr = df.select_dtypes(include=['number']).corr()

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, annot=True, linewidths=0.5, cmap='coolwarm', ax=ax)
fig.tight_layout()
plt.show()

In [None]:
# Multivariate analysis
# Lower-triangle scatter matrix of all features, coloured by class.
# pairplot returns a grid made of many axes. plt.title() would only label
# whichever single panel happens to be current, so the title is placed on the
# figure itself; y is nudged above 1 so it does not overlap the top panel.
pair_grid = sns.pairplot(df, hue='Class', diag_kind='auto', palette='Set2', corner=True)
pair_grid.figure.suptitle('Multivariate Analysis', y=1.02)
plt.show()


# Data Treatment (Outlier and Skewness Treatment)

In [None]:
# Cap every numeric feature at 1.5 * IQR beyond its quartiles and report how
# many values were outside those bounds.
# NOTE(review): the bounds are computed on the whole dataset, before the
# train/test split further down - confirm that this is intended.
for feature in df.select_dtypes(include=['number']):
    values = df[feature]
    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)

    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Count the out-of-range values before they are clipped away.
    outside = (values < lower_bound) | (values > upper_bound)
    outlier_count = int(outside.sum())

    df[feature] = values.clip(lower=lower_bound, upper=upper_bound)  # Clipping

    print(f"Feature: {feature}")
    print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")
    print(f"Number of Outliers: {outlier_count}")
    print("-" * 50)


In [None]:
# Skewness of each numeric feature; anything with |skew| > 1 is flagged for transformation.
skew = df.select_dtypes(include=['number']).skew()
print(skew)

highly_skewed = skew.index[skew.abs() > 1].tolist()

print("Highly Skewed Variables",highly_skewed)

In [None]:
# Applying log transformation
# log1p(x) = log(1 + x) is applied to the columns flagged as highly skewed and
# written back into df.
# NOTE(review): because df is overwritten, re-running this cell transforms the
# same columns a second time.
df[highly_skewed] = np.log1p(df[highly_skewed])
# Skewness of the transformed columns, for comparison with the values printed earlier.
print(df[highly_skewed].skew())

In [None]:
# Label Encoding
# Replace the class names in df['Class'] with integer codes. The fitted encoder
# stays available as `le`; later cells read le.classes_ to label plots and reports.
# NOTE(review): df['Class'] is overwritten, so re-running this cell refits the
# encoder on the integer codes instead of the original names.
le = LabelEncoder()
df['Class']=le.fit_transform(df['Class'])

In [None]:
# Separate features and target
# x holds every column except 'Class'; y is the encoded 'Class' column.
x = df.drop(columns='Class')
y = df['Class']

In [None]:
# Train Test Split
# 70 % of the rows go to training. stratify=y keeps the class proportions the
# same in both parts, and random_state fixes the shuffle so the split is repeatable.
x_train ,x_test , y_train,y_test = train_test_split(x,y,train_size=0.7,random_state=7,stratify=y)
print('Train Size',x_train.shape[0])

In [None]:
# Feature Scaling
# The scaler is fitted on the training split only and then applied to the test
# split, so the test data does not influence the scaling parameters.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [None]:
# Baseline comparison: fit eight classifiers on the training split and score
# them on the held-out test split (accuracy and weighted F1).
# Estimators with internal randomness get a fixed seed (the same one used for
# the train/test split) so the table is identical on every run.
models = {
    # max_iter raised from the default: convergence warnings are silenced
    # globally in this notebook, so an unconverged fit would go unnoticed.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=7),
    "Random Forest": RandomForestClassifier(random_state=7),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(),
    "Naive Bayes": GaussianNB(),
    "AdaBoost": AdaBoostClassifier(random_state=7),
    "Gradient Boosting": GradientBoostingClassifier(random_state=7)
}

# Distance- and margin-based models are dominated by the largest-magnitude
# features when the inputs are not standardised, so they are trained on the
# scaled matrices. Tree ensembles and Naive Bayes keep the raw features.
scale_sensitive = {"Logistic Regression", "KNN", "SVM"}

results = []

for name, model in models.items():
    if name in scale_sensitive:
        train_features, test_features = x_train_scaled, x_test_scaled
    else:
        train_features, test_features = x_train, x_test

    model.fit(train_features, y_train)
    pred = model.predict(test_features)

    acc = accuracy_score(y_test, pred)
    f1 = f1_score(y_test, pred, average='weighted')
    results.append([name, acc, f1])

pd.DataFrame(results, columns=['Model', 'Accuracy', 'F1 Score']).sort_values(by='Accuracy',ascending=False)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority classes of the training split only; the test split is left untouched.
smote = SMOTE(random_state=7)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# Per-class sample counts before and after resampling.
for label, target in (
    ("Before SMOTE:", y_train),
    ("After SMOTE:", pd.Series(y_train_resampled)),
):
    print(label, target.value_counts().to_dict())


In [None]:
# Gradient Boosting trained on the SMOTE-resampled training data, then used to
# predict the original (non-resampled) test split.
gradient = GradientBoostingClassifier(random_state=7).fit(x_train_resampled, y_train_resampled)
pred_test = gradient.predict(x_test)

In [None]:
# Class balance of the training labels before and after SMOTE, one figure each.
for labels, title in (
    (y_train, "Class Distribution Before SMOTE"),
    (y_train_resampled, "Class Distribution After SMOTE"),
):
    sns.countplot(x=labels)
    plt.title(title)
    plt.show()

In [None]:
# Confusion matrix of the SMOTE-trained Gradient Boosting model on the test
# split; both axes are labelled with the original class names from the encoder.
gradient_cm = confusion_matrix(y_test, pred_test)
class_names = le.classes_
sns.heatmap(
    gradient_cm,
    annot=True,
    fmt='d',
    linewidths=.05,
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated weighted F1 of Gradient Boosting on the (non-resampled)
# training split. The estimator is seeded like the other Gradient Boosting
# models in this notebook so the reported score is the same on every run.
scores = cross_val_score(
    GradientBoostingClassifier(random_state=7),
    x_train,
    y_train,
    cv=5,
    scoring='f1_weighted',
)
print("CV F1 Score:", scores.mean())

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Hyper-parameter search for Gradient Boosting with SMOTE oversampling.
# SMOTE sits inside the pipeline so that, within each CV split, only the
# training folds are oversampled. Searching on data that was resampled up front
# would place synthetic points interpolated from training rows into the
# validation fold and inflate the reported score.
search_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=7)),
    ('model', GradientBoostingClassifier(random_state=7)),
])

# The 'model__' prefix routes each parameter to the classifier step.
params = {
    'model__n_estimators': [100, 150],
    'model__max_depth': [3, 5],
    'model__learning_rate': [0.1, 0.2]
}

# n_iter=5 samples 5 of the 8 combinations; random_state fixes which ones.
grid = RandomizedSearchCV(
    search_pipeline,
    params,
    cv=2,
    scoring='f1_weighted',
    n_iter=5,
    random_state=7,
    verbose=1,
)
# Fit on the original training split; resampling happens inside the pipeline.
# grid.best_estimator_ is the whole pipeline, and its predict() applies only
# the classifier step (samplers are skipped at prediction time).
grid.fit(x_train, y_train)
print("Best Params:", grid.best_params_)
print("Best F1 Score:", grid.best_score_)

In [None]:
# Evaluate the estimator selected by the randomized search on the held-out test split.
best_model = grid.best_estimator_
y_pred = best_model.predict(x_test)

# Metrics
class_names = le.classes_
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix with the original class names on both axes.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=class_names,
    yticklabels=class_names,
    cmap='Blues',
    ax=ax,
)
ax.set_title("Confusion Matrix - Best Model")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

In [None]:
# Baseline results as a horizontal bar chart, best accuracy at the top.
results_df = (
    pd.DataFrame(results, columns=['Model', 'Accuracy', 'F1 Score'])
    .sort_values(by='Accuracy', ascending=False)
)

fig, ax = plt.subplots()
sns.barplot(data=results_df, x='Accuracy', y='Model', palette='Blues_d', ax=ax)
ax.set_title("Model Accuracy Comparison")
plt.show()

In [None]:
# Compare accuracy on the training split with accuracy on the test split to
# check how far the SMOTE-trained Gradient Boosting model overfits.
train_pred = gradient.predict(x_train)

for label, truth, predicted in (
    ("Train Accuracy:", y_train, train_pred),
    ("Test Accuracy:", y_test, pred_test),
):
    print(label, accuracy_score(truth, predicted))

# Best model with no overfitting issue: Gradient Boosting (about 92% test accuracy vs. about 94% train accuracy)

In [None]:
import joblib
from pathlib import Path

# Persist the SMOTE-trained Gradient Boosting model to disk.
# joblib.dump does not create missing directories, so the target folder is
# created first; otherwise a fresh checkout without ./models fails here, after
# all the training has already been done.
# NOTE(review): the fitted LabelEncoder (`le`) is not saved, so a consumer of
# this file gets integer class codes - confirm whether it should be stored too.
model_path = './models/model.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(gradient, model_path)