# IMPORTS

In [1]:
import pandas as pd
import os
import glob
import sklearn
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.svm import SVC
import joblib

In [4]:
folder_base = './'
txt_files = glob.glob(os.path.join(folder_base, "*.txt"))

In [5]:
df = pd.read_csv(txt_files[0], delimiter=' ', header=None)

In [9]:
classes = df.iloc[:, -4].unique().tolist()

# UTILS

In [11]:
sc = StandardScaler()

In [13]:
def preprocess_data(file_path, folder_base):
    
    name = file_path.rsplit("_")[1] #Nome Extrator
    base = file_path.rsplit("_")[0]
    base = base.rsplit('\\')[1]
    
    df = pd.read_csv(file_path, delimiter=" ", header=None)
    df = df.iloc[:, :-2]
    df.iloc[:, -1] = df.iloc[:, -1].str.rsplit(".").str[0]
    df["numero"] = df.iloc[:, -1].str.rsplit("_").str[2]
    df["classe"] = df.iloc[:, -3]
    df.drop(df.columns[-3], inplace=True, axis=1)
    df.drop(df.columns[-3], inplace=True, axis=1)
    classes = df["classe"].unique()
    df["classe"] = df["classe"].astype('category')
    df["classe"] = df["classe"].cat.codes
    df.sort_values(by="classe", ascending=True)
    df.reset_index(drop=True, inplace=True)
    df["ordem"] = df.index
    
    return df, name, base

In [14]:
def read_data(df):
    X = df.iloc[:, :-3]
    y = df.iloc[:, -2]
    ids = df.iloc[:, -1]
    return X, y, ids

In [15]:
def predict_stratified(model, X_test, y_test):
    y_pred = model.predict_proba(X_test)
    
    return y_pred

In [16]:
def concat(y_test, y_pred):
    y_test = y_test.reset_index(drop=True)
    
    prob_df2 = pd.DataFrame(y_pred, columns=classes)

    prob_df2 = prob_df2.round(5)
    
    frames = [prob_df2,y_test]
    info_resp = pd.concat(frames,axis=1)
    
    return info_resp

In [17]:
def fit_svm_stratified(X_train, y_train):
    modelo = "svm"
    
    svc = SVC(probability=True)
    
    param_grid = {
        'C': [0.001, 0.1, 1, 10, 100],
        'kernel': ['rbf'],
        'gamma': ['scale', 'auto']
    }

    # Set up the GridSearchCV
    grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='f1_weighted')

    # Fit the grid search to the training data
    best_model = grid_search.fit(X_train, y_train)

    #joblib.dump(best_model,models+'best_'+ extractor +"_"+modelo+"_"+str(iteration)+'.pkl')

    return best_model

In [18]:
def fit_stratified(X,y,ids,data,save_probs, extractor, base):
    X = sc.fit_transform(X)
    
    skf = StratifiedKFold(n_splits=10)
    skf.get_n_splits(X, y)
    
    prob_columns = ['fabric', 'foliage', 'glass', 'leather', 'metal', 'paper',
       'plastic', 'stone', 'water', 'wood', 'classe', 'numero', 'ordem']

    dummy_df = pd.DataFrame(columns=prob_columns)
    
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        dados2 = data.iloc[test_index, -3:-1]
        dados2["ordem"] = data.iloc[:, -1]
        dados = dados2.reset_index(drop=True)
        
        X_train = X[train_index]
        y_train = y[train_index]
        
        X_test = X[test_index]
        y_test = y[test_index]
        
        model = fit_svm_stratified(X_train, y_train)
        
        y_pred = predict_stratified(model, X_test, y_test)
        res = concat(y_test, y_pred)
        
        prob_prediction = pd.DataFrame(data=res, columns=classes).reset_index(drop=True)
        labels = pd.DataFrame(data=y_test).reset_index(drop=True)

        tudo = pd.concat([prob_prediction, dados], axis=1)
        dummy_df = pd.concat([dummy_df, tudo], axis=0)
        #tudo.to_csv(path_save_proba_test +tipo+"_"+"tudo_"+ extractor +"_"+ str(j) + ".csv", index=False)
    
    dummy_df.to_csv(save_probs +base+"_"+extractor +"_1x1"+".csv", index=False)
    return 

# MAIN

In [19]:
save_probs = './probs/'
if not os.path.exists(save_probs):
    os.makedirs(save_probs)


for file_path in txt_files:
    df, extractor, base = preprocess_data(file_path, txt_files)
    X,y,ids = read_data(df)
    fit_stratified(X,y,ids,df,save_probs, extractor, base)
    

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [20]:
df2 = pd.read_csv("./pro")

FileNotFoundError: [Errno 2] No such file or directory: './prob/FMD_resNet18_1x1.csv'