<a href="https://colab.research.google.com/github/williamteles/Classification_Data-Mining/blob/main/Classification_Minera%C3%A7%C3%A3o_de_Dados.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas-profiling==2.7.1

!pip install pycaret

In [None]:
import pandas as pd
import numpy as np
import pandas_profiling
from sklearn import model_selection
from sklearn.utils import resample
from sklearn.preprocessing import scale
from sklearn.ensemble import *
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score
from pycaret.classification import setup
from pycaret.classification import compare_models

In [None]:
# Read the CSV straight from the UCI repository; the first CSV column
# becomes the row index.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv"
df = pd.read_csv(url, sep=',', index_col=0)
df.head(5)

In [None]:
# Distinct values of the target column; reused by the next cell to build the
# binary label mapping.
categories = df['Category'].unique()
categories

In [None]:
# Collapse the original class labels into a binary target. The i-th entry of
# `rotulos_binarios` is the new label for the i-th entry of `categories`.
rotulos_binarios = [0, 0, 1, 1, 1]
if len(categories) != len(rotulos_binarios):
  raise ValueError(
      f'Expected {len(rotulos_binarios)} distinct categories, found {len(categories)}'
  )

# Assign the result back instead of using inplace=True on `df.Category`:
# an in-place replace on a column accessed from the frame is a chained
# assignment and does not update `df` when pandas Copy-on-Write is active.
df['Category'] = df['Category'].replace(dict(zip(categories, rotulos_binarios)))


In [None]:
# Encode the 'Sex' column numerically: 'm' -> 0, 'f' -> 1.
df = df.assign(Sex=df['Sex'].map({'m': 0, 'f': 1}))
df

In [None]:
# Number of missing values per column.
df.isna().sum(axis=0)

In [None]:
# Discard every row that has at least one missing value, then show the
# resulting (rows, columns).
df = df.dropna(axis=0, how='any')
df.shape

In [None]:
# Summary statistics of the frame after rows with missing values were dropped.
df.describe()

In [None]:
def limitar_anomalias(data, anomalia_cols, fator=1.5):
  """Cap outliers of the given columns at their interquartile-range fences.

  For each column the fences are ``q25 - fator * IIQ`` and
  ``q75 + fator * IIQ`` (IIQ = q75 - q25). Values outside the fences are
  replaced by the nearest fence. The percentiles and the number of outliers
  found are printed per column.

  Args:
    data: DataFrame holding the columns; it is modified in place, so pass a
      copy if the original must be preserved.
    anomalia_cols: iterable of column names to process.
    fator: multiplier applied to the IIQ to obtain the fence distance.
      Defaults to 1.5.

  Returns:
    The same ``data`` object, with the listed columns capped.
  """
  for col in anomalia_cols:
    q25, q75 = np.percentile(data[col], [25, 75])

    iiq = q75 - q25

    print(f'Percentis coluna {col}: 25% = {q25:.3f}, 75% = {q75:.3f}, IIQ = {iiq:.3f}')

    corte = iiq * fator
    inferior, superior = q25 - corte, q75 + corte

    # Vectorised outlier mask instead of a Python-level scan of the column.
    valores = data[col].to_numpy()
    fora_dos_limites = (valores < inferior) | (valores > superior)

    print(f'Outliers Identificados na coluna {col}: {int(fora_dos_limites.sum())}')
    print()

    # Pull every value outside [inferior, superior] back to the nearest fence.
    data[col] = np.clip(valores, inferior, superior)

  return data


def normalizar(data, normalizar_cols):
  """Min-max scale the given columns to the [0, 1] range.

  Each listed column is replaced by ``(x - min) / (max - min)``. A column
  whose values are all equal has a zero range; it is mapped to 0.0 instead of
  dividing by zero (which would turn the whole column into NaN).

  Args:
    data: DataFrame holding the columns; it is modified in place, so pass a
      copy if the original must be preserved.
    normalizar_cols: iterable of column names to scale.

  Returns:
    The same ``data`` object, with the listed columns scaled.
  """
  for col in normalizar_cols:
    minimo = data[col].min()
    amplitude = data[col].max() - minimo

    if amplitude == 0:
      # Constant column: multiplying by 0.0 yields 0.0 everywhere while
      # keeping any missing entries missing.
      data[col] = (data[col] - minimo) * 0.0
    else:
      data[col] = (data[col] - minimo) / amplitude

  return data

In [None]:
# Columns whose outliers are capped; the function works on a copy of the
# frame and the result is rebound to `df`.
anomalia_cols = [
    'ALB', 'ALP', 'ALT', 'AST', 'BIL',
    'CHE', 'CHOL', 'CREA', 'GGT', 'PROT',
]
df = limitar_anomalias(data=df.copy(), anomalia_cols=anomalia_cols)

df.describe()

In [None]:
normalizar_cols = ['ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT']

# Scale the columns listed in this cell. Passing `normalizar_cols` (rather
# than `anomalia_cols`) keeps the cell self-contained: it no longer depends on
# a list defined by the outlier-capping cell.
df = normalizar(df.copy(), normalizar_cols)

df.describe()

In [None]:
# Mark rows that repeat an earlier row and report whether any exist.
duplicados = df.duplicated(keep='first')
print(duplicados.any())

In [None]:
# Split the frame by target value, then draw with replacement from the
# non-zero rows until there are as many of them as rows labelled 0.
mascara = df['Category'] == 0
df_0, df_1 = df[mascara], df[~mascara]
df_oversample = resample(df_1, replace=True, n_samples=len(df_0), random_state=13)
df2 = pd.concat([df_0, df_oversample], axis=0)
df2['Category'].value_counts()

In [None]:
# Same duplicate check on the oversampled frame.
duplicados = df2.duplicated(keep='first')
print(duplicados.any())

In [None]:
# Initialise the PyCaret experiment on `df` with 'Category' as the target and
# a fixed session id, then print each of the models returned by
# compare_models(n_select=5).
clf = setup(
    data=df,
    target='Category',
    session_id=13,
    silent=True,
    verbose=False,
    log_experiment=True,
)
best = compare_models(n_select=5)
for model in best:
  print(model)

In [None]:
# Candidate estimators, each paired with the display name used when results
# are printed: [estimator, name].
#
# Only the settings that matter are spelled out. The remaining keyword
# arguments of the original (copied from an estimator repr) merely restated
# library defaults, and several of them (`base_estimator`,
# `min_impurity_split`, `max_features='auto'`, `algorithm='SAMME.R'`) have been
# deprecated or removed in later scikit-learn releases, where passing them
# fails. Leaving them out keeps the defaults of whichever version is installed.
model1 = [
    AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=13),
    'AdaBoost',
]
model2 = [
    RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=13),
    'Random Forest',
]

models = [model1, model2]

In [None]:
# Feature matrix (every column except the target) and target vector, both as
# NumPy arrays.
X = df.drop(columns=['Category']).to_numpy()
y = df['Category'].to_numpy()

In [None]:
def overSampling(X_train, y_train, random_state=None):
  """Balance a training set by oversampling the samples whose label is not 0.

  Samples labelled 0 are kept as they are. The remaining samples are drawn
  with replacement until there are as many of them as label-0 samples. A
  single set of indices is drawn and applied to both ``X_train`` and
  ``y_train``, so every resampled row keeps its own label.

  Args:
    X_train: NumPy array of features, one row per sample.
    y_train: NumPy array of labels aligned with ``X_train``.
    random_state: optional int seed or ``np.random.RandomState`` for a
      reproducible draw. ``None`` (default) gives a different draw per call.

  Returns:
    Tuple ``(X_balanced, y_balanced)``: label-0 samples first, followed by the
    resampled non-zero samples. If one of the two groups is empty there is
    nothing to balance and the inputs are returned unchanged.
  """
  mascara = y_train == 0
  X_0, y_0 = X_train[mascara], y_train[mascara]
  X_1, y_1 = X_train[~mascara], y_train[~mascara]

  # Nothing to draw from (or nothing to match): leave the data as it is.
  if len(y_0) == 0 or len(y_1) == 0:
    return X_train, y_train

  if isinstance(random_state, np.random.RandomState):
    rng = random_state
  else:
    rng = np.random.RandomState(random_state)

  # One shared index draw keeps features and labels paired.
  indices = rng.randint(0, len(y_1), size=len(y_0))

  X_train = np.concatenate((X_0, X_1[indices]))
  y_train = np.concatenate((y_0, y_1[indices]))

  return X_train, y_train


In [None]:
# 10-fold stratified cross-validation. Within each fold only the training part
# is oversampled; the test part is scored as it is.
skf = StratifiedKFold(n_splits=10)

for mod in models:
    # mod is [estimator, display name]; the same estimator object is refit on
    # every fold.
    model = mod[0]
    acc, f1, precision = [], [], []

    for train_indexs, test_indexs in skf.split(X, y):
        X_test, y_test = X[test_indexs], y[test_indexs]
        X_train, y_train = overSampling(X[train_indexs], y[train_indexs])

        model.fit(X_train, y_train)
        y_predicted = model.predict(X_test)

        acc.append(accuracy_score(y_test, y_predicted))
        f1.append(f1_score(y_test, y_predicted))
        precision.append(precision_score(y_test, y_predicted))

    # Mean and standard deviation of each metric across the folds.
    print(f"{mod[1]}, Accuracy:\t{np.mean(acc):.2f} +/- {np.std(acc):.4f}")
    print(f"{mod[1]}, F1 Score:\t{np.mean(f1):.2f} +/- {np.std(f1):.4f}")
    print(f"{mod[1]}, Precision:\t{np.mean(precision):.2f} +/- {np.std(precision):.4f}")
    print("-"*60)
