In [3]:
import numpy as np
import pandas as pd

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler


In [162]:
df = pd.read_csv('../data/df_train.csv')
df_test = pd.read_csv('../data/df_test.csv')



### Functions


In [163]:
def normalize_column(df, column):
    return RobustScaler().fit_transform(np.array(df[column]).reshape(-1, 1))

In [164]:
def prepare_data(df):
    for col in df.columns:
        if col != "Class":
            df[col] = normalize_column(df, col)
    return df

In [170]:
def select_columns(df):
    columns = ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 
               'AspectRation', 'Eccentricity', 'roundness', 'Extent',
               'Solidity','ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4', 'Class']
    df = df[columns]
    return df

In [172]:
data = select_columns(df.copy())
data = prepare_data(data)
data_test = select_columns(df_test.copy())
data_test = prepare_data(data_test)
X_train = data.drop(columns=["Class"])
y_train = data["Class"]
X_test = data_test.drop(columns=["Class"])
y_test = data_test["Class"]

In [11]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV

In [173]:
rfc = RandomForestClassifier(n_estimators=80, criterion='entropy', max_depth=20, random_state=311, max_features=None,
                             warm_start=True)
ada = AdaBoostClassifier(estimator=rfc, algorithm='SAMME', n_estimators=500, random_state=311, learning_rate=0.01)

ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


0.9275970619097587
              precision    recall  f1-score   support

    BARBUNYA       0.94      0.91      0.93       278
      BOMBAY       0.99      1.00      1.00       109
        CALI       0.94      0.95      0.95       342
    DERMASON       0.91      0.93      0.92       745
       HOROZ       0.96      0.94      0.95       405
       SEKER       0.95      0.94      0.95       426
        SIRA       0.88      0.88      0.88       554

    accuracy                           0.93      2859
   macro avg       0.94      0.94      0.94      2859
weighted avg       0.93      0.93      0.93      2859



In [160]:
param_grid = {"n_estimators": [200, 500, 700], "learning_rate": [0.1],
              "algorithm": ['SAMME']}

grid_search = GridSearchCV(ada, param_grid=param_grid, cv=7, scoring='accuracy', return_train_score=True)
grid_search.fit(X_train, y_train)

final_ada = grid_search.best_estimator_
final_ada

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [159]:
final_ada.fit(X_train, y_train)
y_pred = final_ada.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred, labels=final_ada.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=final_ada.classes_)
disp.plot()

NameError: name 'final_ada' is not defined

### SVC

In [174]:
from sklearn.svm import SVC

svc = SVC(C=30, kernel='rbf', random_state=311, tol=1e-1)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
cf = confusion_matrix(y_test, y_pred, labels=svc.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cf, display_labels=svc.classes_)



0.9307450157397692
              precision    recall  f1-score   support

    BARBUNYA       0.95      0.95      0.95       278
      BOMBAY       1.00      1.00      1.00       109
        CALI       0.95      0.96      0.95       342
    DERMASON       0.91      0.93      0.92       745
       HOROZ       0.97      0.94      0.95       405
       SEKER       0.95      0.96      0.96       426
        SIRA       0.89      0.86      0.88       554

    accuracy                           0.93      2859
   macro avg       0.94      0.94      0.94      2859
weighted avg       0.93      0.93      0.93      2859



### Kroswalidacja SVC

In [182]:
param_grid = {"C": [i for i in range(1, 30)], "kernel": ['rbf']}
grid_search = GridSearchCV(svc, param_grid=param_grid, cv=7, scoring='accuracy', return_train_score=True)
grid_search.fit(X_train, y_train)

### SVC z Kroswalidacji

In [181]:
final_svc = grid_search.best_estimator_
final_svc.fit(X_train, y_train)
final_svc.predict(X_test)
y_pred = final_svc.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

Daje minimalnie gorszą accuracy niż bez

### MLP Classifier

In [176]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier()
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


0.9338929695697796
              precision    recall  f1-score   support

    BARBUNYA       0.95      0.94      0.94       278
      BOMBAY       1.00      1.00      1.00       109
        CALI       0.96      0.96      0.96       342
    DERMASON       0.92      0.93      0.92       745
       HOROZ       0.96      0.96      0.96       405
       SEKER       0.96      0.96      0.96       426
        SIRA       0.88      0.88      0.88       554

    accuracy                           0.93      2859
   macro avg       0.95      0.95      0.95      2859
weighted avg       0.93      0.93      0.93      2859





# Nowy model dropnął

In [133]:
df = pd.read_csv('../data/df_train.csv')
df_test = pd.read_csv('../data/df_test.csv')


df['ShapeFactor5'] = df['MajorAxisLength'] / df['Perimeter']
data = prepare_data(df)
df_test['ShapeFactor5'] = df_test['MajorAxisLength'] / df_test['Perimeter']
data_test = prepare_data(df_test)
X_train = data.drop(columns=["Class"])
y_train = data["Class"]
X_test = data_test.drop(columns=["Class"])
y_test = data_test["Class"]

### GradientBoosting

In [177]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train2 = le.fit_transform(y_train)
y_test2 = le.transform(y_test)

xgb = XGBClassifier(learning_rate=0.07, random_state =42, objective='multi:softmax', max_depth=5, reg_alpha = 0.002, gamma=0.01, verbosity=0)
xgb.fit(X_train, y_train2)
y_pred = xgb.predict(X_test)
print(accuracy_score(y_test2, y_pred))
print(classification_report(y_test2, y_pred))


0.9233997901364114
              precision    recall  f1-score   support

           0       0.95      0.92      0.94       278
           1       0.99      0.99      0.99       109
           2       0.93      0.94      0.94       342
           3       0.91      0.93      0.92       745
           4       0.95      0.93      0.94       405
           5       0.96      0.95      0.95       426
           6       0.87      0.86      0.87       554

    accuracy                           0.92      2859
   macro avg       0.94      0.93      0.93      2859
weighted avg       0.92      0.92      0.92      2859



In [179]:
from lightgbm import LGBMClassifier
lgb = LGBMClassifier(objective='multiclass', random_state=311, learning_rate=0.15, reg_alpha=0.01, verbose=-1)

lgb.fit(X_train, y_train)
y_pred = lgb.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9279468345575376
              precision    recall  f1-score   support

    BARBUNYA       0.96      0.95      0.95       278
      BOMBAY       0.99      1.00      1.00       109
        CALI       0.96      0.95      0.95       342
    DERMASON       0.90      0.93      0.92       745
       HOROZ       0.96      0.95      0.95       405
       SEKER       0.95      0.95      0.95       426
        SIRA       0.88      0.86      0.87       554

    accuracy                           0.93      2859
   macro avg       0.94      0.94      0.94      2859
weighted avg       0.93      0.93      0.93      2859



### Nearest Neighbours


In [180]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(algorithm='auto', weights ='distance', n_neighbors=15, p=2)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9227002448408534
              precision    recall  f1-score   support

    BARBUNYA       0.96      0.90      0.93       278
      BOMBAY       1.00      1.00      1.00       109
        CALI       0.92      0.97      0.95       342
    DERMASON       0.91      0.91      0.91       745
       HOROZ       0.97      0.95      0.96       405
       SEKER       0.96      0.95      0.95       426
        SIRA       0.85      0.87      0.86       554

    accuracy                           0.92      2859
   macro avg       0.94      0.93      0.94      2859
weighted avg       0.92      0.92      0.92      2859



# Voting Classifier

### Hard model

In [183]:
from sklearn.ensemble import VotingClassifier

estimators = [('ada', ada), ('svc', svc), ('mlp', mlp), ('xgb', xgb), ('lgb', lgb), ('knn', knn)]

model_hard = VotingClassifier(estimators=estimators, voting='hard')
model_hard.fit(X_train, y_train)
y_pred = model_hard.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))





0.9331934242742218
              precision    recall  f1-score   support

    BARBUNYA       0.95      0.94      0.95       278
      BOMBAY       1.00      1.00      1.00       109
        CALI       0.95      0.96      0.96       342
    DERMASON       0.91      0.94      0.92       745
       HOROZ       0.97      0.95      0.96       405
       SEKER       0.96      0.96      0.96       426
        SIRA       0.89      0.86      0.88       554

    accuracy                           0.93      2859
   macro avg       0.95      0.94      0.95      2859
weighted avg       0.93      0.93      0.93      2859

