# **Imports**

In [1]:
import numpy as np
import pandas as pd
import os

import cv2
import numpy as np
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image

# **Definindo funções de obtenção de imagens**

In [2]:
def get_train_images():
    
    train_images_directory = '/kaggle/input/ufg-icomp-competition-1/train/train/'
    
    train_images = []

    for dirname, _, filenames in os.walk(train_images_directory):
        for filename in filenames:
            train_images.append(os.path.join(dirname, filename))
            
    train_images.sort(key = lambda train_image: int(train_image.replace(train_images_directory, '').replace('.jpg', '')))
        
    return train_images

In [3]:
def get_test_images():
    
    test_images_directory = '/kaggle/input/ufg-icomp-competition-1/test/test/'
    
    test_images = []

    for dirname, _, filenames in os.walk(test_images_directory):
        for filename in filenames:
            test_images.append(os.path.join(dirname, filename))
            
    test_images.sort(key = lambda test_image: int(test_image.replace(test_images_directory, '').replace('.jpg', '')))
        
    return test_images

# **Definindo função de obtenção de dados da classe das imagens de treinamento**

In [4]:
def get_train_classes():
    return pd.read_csv('/kaggle/input/ufg-icomp-competition-1/train.csv')

# **Definindo função de extração de características de cada imagem do dataset**

In [15]:
def image_feature_extraction(image_path, image_size):

  my_image = []
  my_image.append(cv2.resize(cv2.imread(image_path), (image_size, image_size)))
  my_image = np.array(my_image)

  feature_extraction_base_model = ResNet50(weights = 'imagenet')
  feature_extraction_model = Model(inputs = feature_extraction_base_model.input, outputs = feature_extraction_base_model.get_layer('avg_pool').output)

  features_array = np.zeros((my_image.shape[0], 2048))

  for i, img_pos in enumerate(my_image):
    img = img_pos
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    features = feature_extraction_model.predict(x)
    features = features.reshape(2048,)
    features_array[i,:] = features

  return features_array

# **Obtendo características das imagens de treino**

In [16]:
train_images_features = []

for train_image in get_train_images():
    
    image_features = image_feature_extraction(train_image, 224)[0]
    
    train_images_features.append(image_features)
    
    print("Progress: " + str(image_features))


Progress: [0.48112893 0.11049459 0.24500497 ... 0.00417835 0.2090019  0.05677019]
Progress: [1.74670064 0.44321722 0.         ... 0.01130967 1.33284211 0.44061571]
Progress: [0.01996331 0.01708925 0.         ... 0.09743045 3.28816652 0.38709235]
Progress: [1.06253362 0.92481428 0.         ... 0.0086116  1.10981989 0.5821414 ]
Progress: [1.4173646  0.53068185 0.         ... 0.01192337 0.16118188 0.19932647]
Progress: [0.15231185 0.04539957 0.04016677 ... 0.07853434 3.795048   0.41775241]
Progress: [0.17188174 0.02050123 0.19881225 ... 0.00650344 0.01391338 0.15173283]
Progress: [1.45196855 0.04530861 0.         ... 0.         0.78791142 0.54750061]
Progress: [1.85980129 0.0117699  0.         ... 0.         0.02270643 0.54781705]
Progress: [0.02226593 0.19209297 0.20472226 ... 0.16665459 0.37658992 2.55208611]
Progress: [0.45763814 0.65552604 0.67972672 ... 0.01112983 0.4703047  0.04820455]
Progress: [0.92957073 1.40682364 0.00713099 ... 0.08856902 0.72343445 0.10328985]
Progress: [0.512

# **Montando dataframe com as imagens de treino (características e classe)**

In [31]:
df = pd.DataFrame(train_images_features)

df['Label'] = get_train_classes()['Label']

df.to_csv('train_with_features.csv', encoding='utf-8', index = False) # Salvando características já processadas em um csv

In [18]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,Label
0,0.481129,0.110495,0.245005,0.046150,1.358364,0.002207,0.117204,0.146936,0.000000,0.000000,...,0.000000,0.005181,0.022839,0.070785,0.479733,0.000000,0.004178,0.209002,0.056770,0
1,1.746701,0.443217,0.000000,0.180776,2.104374,0.345369,1.465372,0.353640,0.077265,0.017185,...,0.035685,0.090223,1.043038,0.000000,0.819558,0.015091,0.011310,1.332842,0.440616,0
2,0.019963,0.017089,0.000000,0.198838,0.255629,0.262440,4.620727,0.001325,0.000000,0.000000,...,0.073738,0.021662,0.089680,0.000000,0.000000,0.021044,0.097430,3.288167,0.387092,0
3,1.062534,0.924814,0.000000,0.134585,0.176833,0.079172,0.656850,0.236275,0.042239,0.052203,...,0.303030,0.051948,0.018386,0.043918,0.726823,0.000000,0.008612,1.109820,0.582141,0
4,1.417365,0.530682,0.000000,0.352051,0.676105,0.409778,0.132919,0.096590,0.467966,0.000000,...,0.517158,0.000000,0.066418,0.124786,1.796521,0.000000,0.011923,0.161182,0.199326,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2865,0.338388,2.163781,0.046198,1.567598,0.583425,0.000000,1.481694,0.023842,0.006774,0.000000,...,0.000000,0.000000,0.240603,0.006686,0.528352,0.000000,0.000000,4.520302,0.283419,3
2866,0.354773,0.686532,0.000000,1.667492,1.366258,0.159109,0.930379,0.095080,0.238652,0.000000,...,0.000118,0.509221,0.052366,0.106817,0.197381,0.021594,0.008470,4.270976,0.006905,3
2867,0.774314,1.512719,0.034903,1.970267,0.350562,0.097967,1.919245,0.059107,0.947980,0.000000,...,0.023963,0.042052,0.041765,0.000000,1.701573,0.019171,0.007436,1.390643,0.077609,3
2868,0.452794,0.669157,0.000000,2.134618,0.102346,0.112167,2.538076,0.039057,0.103423,0.005414,...,0.000000,0.467075,0.029480,0.462681,2.296216,0.008297,0.015189,1.957559,0.347418,3


# **Obtendo características das imagens de teste**

In [19]:
test_images_features = []

for test_image in get_test_images():
    
    image_features = image_feature_extraction(test_image, 224)[0]
    
    test_images_features.append(image_features)
    
    print("Progress: " + str(image_features))

Progress: [0.18452057 0.1127876  0.38285244 ... 0.0580228  1.55455232 0.30949023]
Progress: [0.31706715 0.48846647 0.10951214 ... 0.         3.94599748 0.39100218]
Progress: [0.45527107 0.04716894 0.10051391 ... 0.         2.76291728 0.36351547]
Progress: [0.24252701 0.16235718 0.         ... 0.06223474 3.04867435 2.18558836]
Progress: [0.10905415 0.10307802 0.00936613 ... 0.         0.78473234 0.07577375]
Progress: [1.26116717 0.28692093 0.         ... 0.02844694 0.97338629 0.89217001]
Progress: [0.74923426 0.         0.         ... 0.04161656 5.21759319 0.06177421]
Progress: [0.37199864 0.10795473 0.         ... 0.00448416 0.82552665 0.00488657]
Progress: [1.14847982 0.40824276 0.10465018 ... 0.0233235  0.04835828 2.14655638]
Progress: [0.12608799 0.46518514 0.02543001 ... 0.32732618 2.87657332 0.02277171]
Progress: [1.28547156e+00 1.24719724e-01 1.91225767e-01 ... 7.69454287e-04
 2.28658652e+00 4.47761178e-01]
Progress: [0.19271566 1.23480308 0.21522236 ... 0.0149041  0.99924016 0.9

# **Montando dataframe com as imagens de teste (características)**

In [6]:
dft = pd.DataFrame(test_images_features)

dft.to_csv('test_with_features.csv', encoding='utf-8', index = False) # Salvando características já processadas em um csv

dft

NameError: name 'test_images_features' is not defined

# **Iniciando dataframe de imagens de treino de um dataset com as características das imagens já extraídas**

In [5]:
df = pd.read_csv('../input/ufgcompimagesfeatures/train_with_features.csv')

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,Label
0,0.481129,0.110495,0.245005,0.046150,1.358364,0.002207,0.117204,0.146936,0.000000,0.000000,...,0.000000,0.005181,0.022839,0.070785,0.479733,0.000000,0.004178,0.209002,0.056770,0
1,1.746701,0.443217,0.000000,0.180776,2.104374,0.345369,1.465372,0.353640,0.077265,0.017185,...,0.035685,0.090223,1.043038,0.000000,0.819558,0.015091,0.011310,1.332842,0.440616,0
2,0.019963,0.017089,0.000000,0.198838,0.255629,0.262440,4.620727,0.001325,0.000000,0.000000,...,0.073738,0.021662,0.089680,0.000000,0.000000,0.021044,0.097430,3.288167,0.387092,0
3,1.062534,0.924814,0.000000,0.134585,0.176833,0.079172,0.656850,0.236275,0.042239,0.052203,...,0.303030,0.051948,0.018386,0.043918,0.726823,0.000000,0.008612,1.109820,0.582141,0
4,1.417365,0.530682,0.000000,0.352051,0.676105,0.409778,0.132919,0.096590,0.467966,0.000000,...,0.517158,0.000000,0.066418,0.124786,1.796521,0.000000,0.011923,0.161182,0.199326,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2865,0.338388,2.163781,0.046198,1.567598,0.583425,0.000000,1.481694,0.023842,0.006774,0.000000,...,0.000000,0.000000,0.240603,0.006686,0.528352,0.000000,0.000000,4.520302,0.283419,3
2866,0.354773,0.686532,0.000000,1.667492,1.366258,0.159109,0.930379,0.095080,0.238652,0.000000,...,0.000118,0.509221,0.052366,0.106817,0.197381,0.021594,0.008470,4.270976,0.006905,3
2867,0.774314,1.512719,0.034903,1.970267,0.350562,0.097967,1.919245,0.059107,0.947980,0.000000,...,0.023963,0.042052,0.041765,0.000000,1.701573,0.019171,0.007436,1.390643,0.077609,3
2868,0.452794,0.669157,0.000000,2.134618,0.102346,0.112167,2.538076,0.039057,0.103423,0.005414,...,0.000000,0.467075,0.029480,0.462681,2.296216,0.008297,0.015189,1.957559,0.347418,3


# **Iniciando dataframe de imagens de teste de um dataset com as características das imagens já extraídas**

In [6]:
dft = pd.read_csv('../input/ufgcompimagesfeatures/test_with_features.csv')

dft

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0.184521,0.112788,0.382852,0.576658,0.051139,0.737694,4.844190,0.012314,0.041923,0.000000,...,0.069032,0.264427,0.000000,0.011821,0.000259,0.422304,0.000000,0.058023,1.554552,0.309490
1,0.317067,0.488466,0.109512,0.636280,0.491730,0.000000,1.954262,0.004347,0.089414,0.000000,...,0.160460,0.007780,0.010500,0.003300,0.014604,1.708110,0.000000,0.000000,3.945997,0.391002
2,0.455271,0.047169,0.100514,0.715632,0.198834,0.189533,0.473584,0.001704,0.804443,0.009069,...,0.002837,0.000000,0.000000,0.052122,0.076673,0.379333,0.000000,0.000000,2.762917,0.363515
3,0.242527,0.162357,0.000000,1.020584,1.339205,0.051134,8.570017,0.100760,0.041109,0.000000,...,0.021328,0.000000,0.008632,0.247423,0.028140,1.231768,0.000000,0.062235,3.048674,2.185588
4,0.109054,0.103078,0.009366,0.110690,1.687509,0.007094,0.361665,0.107283,0.346607,0.000000,...,0.000000,0.065300,0.000000,0.648471,0.111093,1.238796,0.000000,0.000000,0.784732,0.075774
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389,1.596848,1.028050,0.275508,1.263110,0.471685,0.187878,1.686235,0.222798,0.595308,0.035971,...,0.054011,0.103340,0.000000,0.269392,0.000000,4.022872,0.067134,0.090153,1.439293,0.095942
390,0.000000,1.595532,0.000000,0.592972,1.522988,0.059643,1.093688,0.000000,0.490793,0.000000,...,0.025110,0.000000,0.000000,0.386733,0.000000,0.397294,0.000000,0.013840,2.393480,0.020435
391,1.596848,1.028050,0.275508,1.263110,0.471685,0.187878,1.686235,0.222798,0.595308,0.035971,...,0.054011,0.103340,0.000000,0.269392,0.000000,4.022872,0.067134,0.090153,1.439293,0.095942
392,1.596848,1.028050,0.275508,1.263110,0.471685,0.187878,1.686235,0.222798,0.595308,0.035971,...,0.054011,0.103340,0.000000,0.269392,0.000000,4.022872,0.067134,0.090153,1.439293,0.095942


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2870 entries, 0 to 2869
Columns: 2049 entries, 0 to Label
dtypes: float64(2048), int64(1)
memory usage: 44.9 MB


In [8]:
dft.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394 entries, 0 to 393
Columns: 2048 entries, 0 to 2047
dtypes: float64(2048)
memory usage: 6.2 MB


# **Verificando balanceamento de classes (classe 0 desbalanceada)**

In [9]:
print(df.loc[df.Label == 0].shape)
print(df.loc[df.Label == 1].shape)
print(df.loc[df.Label == 2].shape)
print(df.loc[df.Label == 3].shape)

(395, 2049)
(826, 2049)
(822, 2049)
(827, 2049)


# **Utilizando técnica OVERSAMPLING para balancear classe desbalanceada**

In [10]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(df.drop(columns=['Label']), df['Label'])

print(sorted(Counter(y_resampled.values).items()))

[(0, 827), (1, 827), (2, 827), (3, 827)]


# **Imports referentes ao classificador**

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, classification_report
from sklearn.preprocessing import Normalizer, MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer, LabelEncoder

# **Tentativa de encontrar melhores parâmetros para a MLP**

In [12]:
pipe = Pipeline(steps = [
    ('preprocess', StandardScaler(copy=True, with_mean=True, with_std=True)),
    ('classification', MLPClassifier())
])

mlp_param_grid = [
    {
        'classification__activation': ['relu'],
        'classification__solver': ['adam'],
        'classification__random_state': [42],
        'classification__max_iter': [200],
        'classification__alpha': [0.02],
        'classification__hidden_layer_sizes': [(500), (900)]
    }
]

strat_k_fold = StratifiedKFold(
    n_splits = 4,
    random_state = 42,
    shuffle = True
)

mlp_grid = GridSearchCV(
    pipe,
    param_grid = mlp_param_grid,
    cv = strat_k_fold,
    scoring='accuracy',
    n_jobs = -2,
    verbose = 3,
    return_train_score=True
).fit(X_resampled, y_resampled)

print('\n\nBest params: ')

print(mlp_grid.best_params_)

print('\nBest score: {:.2f}%'.format(mlp_grid.best_score_ * 100))

Fitting 4 folds for each of 2 candidates, totalling 8 fits
[CV 1/4] END classification__activation=relu, classification__alpha=0.02, classification__hidden_layer_sizes=500, classification__max_iter=200, classification__random_state=42, classification__solver=adam;, score=(train=1.000, test=0.935) total time= 1.9min
[CV 2/4] END classification__activation=relu, classification__alpha=0.02, classification__hidden_layer_sizes=500, classification__max_iter=200, classification__random_state=42, classification__solver=adam;, score=(train=1.000, test=0.932) total time= 1.8min
[CV 3/4] END classification__activation=relu, classification__alpha=0.02, classification__hidden_layer_sizes=500, classification__max_iter=200, classification__random_state=42, classification__solver=adam;, score=(train=1.000, test=0.929) total time= 1.8min
[CV 4/4] END classification__activation=relu, classification__alpha=0.02, classification__hidden_layer_sizes=500, classification__max_iter=200, classification__random_

In [62]:
from sklearn.model_selection import RandomizedSearchCV

mlp_rand_grid = RandomizedSearchCV(pipe, mlp_param_grid, cv = strat_k_fold, scoring = 'accuracy', random_state = 36, n_jobs = 3,verbose = 3).fit(df.drop(columns=['Label']), df['Label'])

print(mlp_rand_grid.best_params_)

print('\n\nBest F1 score: {:.2f}%'.format(mlp_rand_grid.best_score_ * 100))



Fitting 2 folds for each of 8 candidates, totalling 16 fits
{'preprocess': StandardScaler(), 'classification__solver': 'adam', 'classification__random_state': 36, 'classification__max_iter': 200, 'classification__alpha': 0.01, 'classification__activation': 'relu'}


Best F1 score: 89.37%


# **Criação e treinamento MLP**

In [13]:
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

X = scaler.fit_transform(X_resampled)
y = y_resampled

In [14]:
clf = MLPClassifier(
    hidden_layer_sizes=(900),
    max_iter = 200,
    alpha = 0.02,
    activation = 'relu',
    solver = 'adam',
    random_state = 42,
    verbose = True,
).fit(X, y)

Iteration 1, loss = 0.62332308
Iteration 2, loss = 0.16362341
Iteration 3, loss = 0.09186312
Iteration 4, loss = 0.07421027
Iteration 5, loss = 0.07017733
Iteration 6, loss = 0.06846669
Iteration 7, loss = 0.06753879
Iteration 8, loss = 0.06672909
Iteration 9, loss = 0.06597846
Iteration 10, loss = 0.06524536
Iteration 11, loss = 0.06449998
Iteration 12, loss = 0.06375941
Iteration 13, loss = 0.06299579
Iteration 14, loss = 0.06223044
Iteration 15, loss = 0.06144714
Iteration 16, loss = 0.06065815
Iteration 17, loss = 0.05986300
Iteration 18, loss = 0.05906001
Iteration 19, loss = 0.05824821
Iteration 20, loss = 0.05742876
Iteration 21, loss = 0.05660642
Iteration 22, loss = 0.05577660
Iteration 23, loss = 0.05494670
Iteration 24, loss = 0.05411120
Iteration 25, loss = 0.05327688
Iteration 26, loss = 0.05243923
Iteration 27, loss = 0.05159700
Iteration 28, loss = 0.05075943
Iteration 29, loss = 0.04992411
Iteration 30, loss = 0.04908587
Iteration 31, loss = 0.04825271
Iteration 32, los

# **Classificando imagens de teste**

In [15]:
X_test = scaler.fit_transform(dft)

In [16]:
predicted = clf.predict(X_test)

predicted

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 2, 2, 3,
       2, 2, 2, 0, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
       1, 1, 2, 0, 1, 2, 1, 2, 2, 0, 0, 2, 2, 1, 1, 2, 3, 1, 2, 1, 1, 0,
       1, 0, 1, 0, 1, 2, 2, 2, 0, 2, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 0, 1,
       1, 1, 2, 1, 2, 2, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 3, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2,
       2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

# **Montando .csv de resposta da classificação das imagens de teste**

In [17]:
test_images_ids = []

for test_image in get_test_images():
    test_images_ids.append(int(test_image.replace('/kaggle/input/ufg-icomp-competition-1/test/test/', '').replace('.jpg', '')))
    
df_result = pd.DataFrame(test_images_ids, columns = ['ImageId'])

df_result['Label'] = predicted

df_result

Unnamed: 0,ImageId,Label
0,5000,0
1,5001,0
2,5002,0
3,5003,0
4,5004,0
...,...,...
389,5389,3
390,5390,1
391,5391,3
392,5392,3


In [18]:
df_result.to_csv('test_result24.csv', encoding='utf-8', index = False)