In [2]:
import pandas as pd
import os
import glob
import sklearn
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.svm import SVC
import joblib

In [3]:
# Folder scanned for feature files: every "*.txt" directly inside it is processed.
folder_base = './'
# glob returns matches in arbitrary (filesystem-dependent) order; sorting keeps
# the processing order identical from one run or machine to the next.
txt_files = sorted(glob.glob(os.path.join(folder_base, "*.txt")))

In [4]:
# Reference feature file, read with no header row and a single space as delimiter.
df = pd.read_csv("./DTD_resNet18_pool5.txt", delimiter=' ', header=None)

# Column -4 holds the class name (it is followed by the image file name and two
# empty trailing columns). The names are sorted because they are later used to
# label predict_proba columns, which follow the sorted integer class codes;
# first-appearance order only matches when the file happens to be alphabetical.
classes = np.sort(df.iloc[:, -4].unique())

In [5]:
# Display the loaded frame (the rendered output below is abbreviated by pandas
# to the first/last rows and columns).
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,506,507,508,509,510,511,512,513,514,515
0,1.401093,0.816147,0.186683,0.768541,0.396009,0.005310,0.000000,1.563444,5.909267,0.372972,...,0.600579,1.180932,0.501532,1.232592,0.000000,0.094560,banded,banded_0002.jpg,,
1,1.323530,0.376637,0.981083,0.266220,0.273762,0.000530,0.016784,0.380518,4.391771,0.017420,...,0.026473,1.650099,0.240781,0.680948,0.005703,0.093568,banded,banded_0004.jpg,,
2,0.762521,0.954157,0.350528,1.994333,0.491460,0.000000,0.000000,0.000000,1.731208,0.121720,...,0.016289,3.168774,0.362966,0.811515,0.000000,0.070316,banded,banded_0005.jpg,,
3,1.827668,0.215558,0.633809,0.462106,0.577476,0.056016,0.048682,0.814718,5.406581,0.001428,...,0.001001,2.046370,0.221951,1.241225,0.000000,0.081447,banded,banded_0006.jpg,,
4,1.736761,0.538976,0.589170,0.637313,0.000000,0.918905,0.020358,5.730178,10.859345,0.004495,...,1.716462,0.232384,0.070388,1.627825,0.003812,0.002872,banded,banded_0008.jpg,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5635,0.505411,0.368111,3.136477,0.629915,0.218004,0.220789,0.197898,1.613177,4.857334,1.085175,...,0.601361,0.097267,0.712732,0.034155,0.000000,0.907973,zigzagged,zigzagged_0129.jpg,,
5636,1.345472,0.602400,1.621051,1.365416,0.004893,0.520376,0.453060,0.640227,5.402066,0.376753,...,0.267648,0.256170,0.074153,0.067122,0.085155,0.907500,zigzagged,zigzagged_0130.jpg,,
5637,0.275727,0.195363,1.852899,0.202427,0.153757,0.891153,0.529367,0.575820,2.336051,0.024416,...,0.927539,0.341117,2.158360,0.282615,1.545072,1.217988,zigzagged,zigzagged_0131.jpg,,
5638,4.051950,0.433785,5.012679,0.468834,0.035086,0.494274,2.305364,0.131759,1.527218,4.049398,...,0.000000,0.246274,2.385875,0.009398,0.179675,0.464651,zigzagged,zigzagged_0132.jpg,,


# UTILS

In [6]:
# Module-level StandardScaler instance (standardises features to zero mean and
# unit variance once it has been fitted).
sc = StandardScaler()

In [8]:
def preprocess_data(file_path, folder_base):
    """Load one feature file and return it as a model-ready frame.

    The file name is expected to look like ``<base>_<extractor>_<...>.txt``
    (e.g. ``DTD_resNet18_pool5.txt``). Every row of the file holds the feature
    values, then the class name, then the image file name
    (``<class>_<number>.<ext>``), then two trailing columns that are discarded.

    Parameters
    ----------
    file_path : str
        Path to the space-delimited feature file. Both ``/`` and ``\\`` are
        accepted as directory separators when parsing the file name.
    folder_base : object
        Unused; kept so existing callers keep working.

    Returns
    -------
    df : pandas.DataFrame
        Feature columns followed by ``numero`` (image number taken from the
        image file name), ``classe`` (integer category code of the class name)
        and ``ordem`` (row position after sorting by ``classe``).
    name : str
        Extractor name: second ``_``-separated token of the file name.
    base : str
        Dataset name: first ``_``-separated token of the file name.

    Raises
    ------
    ValueError
        If the file name has fewer than two ``_``-separated tokens.
    """
    # Parse the tokens from the file name only, so that separators or
    # underscores in parent directories cannot corrupt the result.
    file_name = os.path.basename(file_path.replace("\\", "/"))
    tokens = os.path.splitext(file_name)[0].split("_")
    if len(tokens) < 2:
        raise ValueError(
            "expected a file name like '<base>_<extractor>_....txt', got %r" % file_name
        )
    base, name = tokens[0], tokens[1]

    raw = pd.read_csv(file_path, delimiter=" ", header=None)
    raw = raw.iloc[:, :-2]  # the two trailing columns carry no data
    label_col, image_col = raw.columns[-2], raw.columns[-1]

    # "banded_0002.jpg" -> "banded_0002" -> "0002"
    image_stem = raw[image_col].str.rsplit(".").str[0]

    df = raw.drop(columns=[label_col, image_col])
    df["numero"] = image_stem.str.rsplit("_").str[1]
    # Category codes number the class names in sorted order.
    df["classe"] = raw[label_col].astype("category").cat.codes

    # sort_values returns a new frame, so the result has to be kept; a stable
    # sort preserves the file order of the rows inside each class.
    df = df.sort_values(by="classe", ascending=True, kind="stable")
    df = df.reset_index(drop=True)
    df["ordem"] = df.index

    return df, name, base

In [9]:
def read_data(df):
    """Split a frame into features, labels and row ids by column position.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose last three columns are metadata.

    Returns
    -------
    tuple
        ``(X, y, ids)``: every column except the last three, the
        second-to-last column, and the last column.
    """
    feature_block = df.iloc[:, :-3]
    label_column, id_column = df.iloc[:, -2], df.iloc[:, -1]
    return feature_block, label_column, id_column

In [10]:
def predict_stratified(model, X_test, y_test):
    """Return ``model.predict_proba(X_test)``.

    ``y_test`` is accepted for signature compatibility but is not used.
    """
    return model.predict_proba(X_test)

In [11]:
def concat(y_test, y_pred, class_names=None):
    """Put predicted probabilities and true labels side by side.

    Parameters
    ----------
    y_test : pandas.Series
        True labels of the test rows; its index is reset to 0..n-1 so it
        lines up with the rows of ``y_pred``.
    y_pred : array-like
        One row of probabilities per sample, one column per class.
    class_names : sequence, optional
        Column names for ``y_pred``. Defaults to the notebook-level
        ``classes`` so existing two-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The probabilities rounded to 5 decimals, followed by the label column.
    """
    if class_names is None:
        # Backward-compatible default: the class list defined at notebook level.
        class_names = classes

    labels = y_test.reset_index(drop=True)
    probabilities = pd.DataFrame(y_pred, columns=class_names).round(5)

    return pd.concat([probabilities, labels], axis=1)

In [12]:
def fit_svm_stratified(X_train, y_train, random_state=42):
    """Grid-search an RBF SVM with probability outputs on the training data.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    random_state : int or None, default 42
        Seed handed to ``SVC``. With ``probability=True`` the probability
        estimates rely on an internal, randomised cross-validation, so a
        fixed seed is needed for repeatable ``predict_proba`` output.
        Pass ``None`` to get the unseeded behaviour.

    Returns
    -------
    GridSearchCV
        The fitted search object (``fit`` returns the search itself), refit
        on the whole training set with the best parameters.
    """
    svc = SVC(probability=True, random_state=random_state)

    param_grid = {
        'C': [0.001, 0.1, 1, 10, 100],
        'kernel': ['rbf'],
        'gamma': ['scale', 'auto']
    }

    # 5-fold search over the grid, scored with weighted F1, using every core.
    grid_search = GridSearchCV(
        estimator=svc,
        param_grid=param_grid,
        cv=5,
        n_jobs=-1,
        verbose=2,
        scoring='f1_weighted',
    )

    return grid_search.fit(X_train, y_train)

In [13]:
def fit_stratified(X,y,ids,data,save_probs, extractor, base):
    """Run 10-fold stratified CV and save the test-fold class probabilities.

    For every fold a fresh scaler is fitted on the training rows only and then
    applied to the test rows, so no statistics of the held-out rows leak into
    training. The probabilities predicted for each test fold are stacked and
    written to ``<save_probs>/<base>_<extractor>_1x1.csv``.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix, one row per sample.
    y : pandas.Series
        Class label per sample, in the same row order as ``X``.
    ids : object
        Unused; kept so existing callers keep working.
    data : pandas.DataFrame
        Frame in the same row order as ``X`` whose last three columns are
        ``numero``, ``classe`` and ``ordem``.
    save_probs : str
        Output directory (created if missing).
    extractor : str
        Extractor name used in the output file name.
    base : str
        Dataset name used in the output file name.

    Returns
    -------
    None

    Notes
    -----
    The probability columns are labelled with the notebook-level ``classes``.
    """
    features = np.asarray(X)
    splitter = StratifiedKFold(n_splits=10)

    fold_frames = []
    for train_index, test_index in splitter.split(features, y):
        # Metadata of the test rows (numero, classe, ordem), re-indexed from 0
        # so it lines up with the probability rows of this fold.
        fold_info = data.iloc[test_index, -3:].reset_index(drop=True)

        # Fit the scaler on the training fold only to avoid data leakage.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(features[train_index])
        X_test = scaler.transform(features[test_index])

        # Positional selection: the split yields positions, not index labels.
        y_train = y.iloc[train_index]
        y_test = y.iloc[test_index]

        model = fit_svm_stratified(X_train, y_train)
        y_pred = predict_stratified(model, X_test, y_test)

        # Keep only the probability columns; the label comes from fold_info.
        fold_probs = concat(y_test, y_pred).reindex(columns=classes)
        fold_frames.append(pd.concat([fold_probs, fold_info], axis=1))

    # Concatenate once instead of growing a frame inside the loop.
    results = pd.concat(fold_frames, axis=0)
    # Output layout: one column per class, then classe, numero, ordem.
    results = results[list(classes) + ['classe', 'numero', 'ordem']]

    os.makedirs(save_probs, exist_ok=True)
    results.to_csv(os.path.join(save_probs, base + "_" + extractor + "_1x1" + ".csv"), index=False)
    return

# MAIN

In [14]:
# Output directory for the per-file probability CSVs.
save_probs = './probs/'
# exist_ok=True replaces the check-then-create pair: no race between the check
# and the creation, and nothing happens when the folder already exists.
os.makedirs(save_probs, exist_ok=True)


for file_path in txt_files:
    # Each feature file is preprocessed, split into features/labels/ids and
    # cross-validated; fit_stratified writes one CSV per file into save_probs.
    df, extractor, base = preprocess_data(file_path, folder_base)
    X, y, ids = read_data(df)
    fit_stratified(X, y, ids, df, save_probs, extractor, base)
    

Fitting 5 folds for each of 10 candidates, totalling 50 fits
