# Projeto AM

Alunos: Vinícius Pereira, Giuseppe Vicente, Nikolas Antes e Gustavo Beato

In [None]:
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression  # Example model
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

## Exploratory Analysis

In [None]:
# Load the training and test splits from the current working directory.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [None]:
# Show the (rows, columns) dimensions of the training frame.
train.shape

In [None]:
# Show how many training rows fall into each value of the target column.
train["Class"].value_counts()

In [None]:
# Summary statistics for the columns of the training frame.
train.describe()

### Getting cols that have more than one value
- A maior parte das colunas contém apenas um único valor (2000 colunas)
- Isso não será útil para o treinamento, logo vamos descartar
- Sobraram 1308 colunas

In [None]:
# Keep only the columns holding at least two distinct non-null values;
# a single-valued column carries no information for training.
util_cols = [col for col in train.columns if train[col].nunique() >= 2]

train = train[util_cols]

In [None]:
# Number of columns that survived the single-value filter.
len(util_cols)

In [None]:
# Count how many columns there are of each dtype.
train.dtypes.value_counts()

### Find and fill NA values
Fill feito com a média

In [None]:
# Per-column count of missing values, restricted to columns that have any.
na_counts = train.isna().sum()
na_counts[na_counts > 0]

In [None]:
# Impute the missing entries of X1942 with the column mean.
# The result is assigned back instead of calling ``fillna(inplace=True)`` on
# ``train.X1942``: the in-place form mutates an intermediate Series, which is
# deprecated chained assignment and leaves ``train`` untouched when pandas
# Copy-on-Write is active.
train = train.fillna({"X1942": train["X1942"].mean()})

### Find correlation between features
- Se faz primeiro o scaling das features para poder conseguir a matriz de covariância
- Agrupam-se as features que possuem alta correlação (acima de 70%)
- Seleciona-se uma feature de cada grupo de features, reduzindo a dimensionalidade para 245 e evitando colunas altamente correlacionadas no treino, o que prejudica alguns modelos, como o KNN
- Percebe-se que é possível determinar a classe através de algumas features com alta correlação


![Alt text](/Users/viniciuspereira/Documents/ai/lorena/violon.png)

In [None]:
# Standardise every column except the target.
feature_frame = train.drop(columns=["Class"])
scaler = StandardScaler()
scaled_data = scaler.fit_transform(feature_frame)

In [None]:
# Wrap the scaled array back into a frame, reusing the non-target column names.
feature_names = train.columns.drop("Class")
train_scaled = pd.DataFrame(scaled_data, columns=feature_names)

In [None]:
# Re-attach the (unscaled) target column to the scaled feature frame.
train_scaled["Class"] = train["Class"]

In [None]:
# Absolute covariance matrix of the scaled frame (target column included).
covs = train_scaled.cov().abs()

In [None]:
# Group columns whose pairwise absolute correlation exceeds THRESHOLD.
# 'df' is the scaled frame; it still contains the "Class" target column, so the
# target takes part in the grouping as well.
df = train_scaled

corr_matrix = df.corr().abs()  # absolute value: the sign of the correlation is irrelevant here
# Keep only the strict upper triangle (k=1) so each pair appears once and the
# diagonal is excluded; every other cell becomes NaN.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

THRESHOLD = 0.7

# Vectorised threshold test instead of one .loc lookup per cell.
# NaN > THRESHOLD evaluates to False, so masked cells drop out on their own.
# Transposing first makes np.nonzero walk the matrix column by column, which
# yields the pairs in the same (column, index) order as a nested column/index loop.
above_threshold = (upper.to_numpy() > THRESHOLD).T
col_positions, row_positions = np.nonzero(above_threshold)
high_corr_pairs = [
    (upper.columns[c], upper.index[r])
    for c, r in zip(col_positions, row_positions)
]


# Undirected graph: one node per column, one edge per highly correlated pair.
G = nx.Graph()
G.add_nodes_from(df.columns)
G.add_edges_from(high_corr_pairs)

# Connected components are the groups of mutually (transitively) correlated columns.
groups = list(nx.connected_components(G))

# Print the groups.
print("\nGrupos de features com alta correlação:")
for i, group in enumerate(groups, 1):
    print(f"Grupo {i}: {group}")

In [None]:
# Two-column view (feature X913 plus the target) used for the plot below.
principal_df = train_scaled[['X913','Class']]

In [None]:
# plt.figure(figsize=(10, 6))
# for class_value in principal_df["Class"].unique():
#     subset = principal_df[principal_df["Class"] == class_value]
#     plt.scatter( subset["Class"],subset["X913"], label=f'Class {class_value}', alpha=0.6)
# plt.title(f'Histogram of {col} by Class')
# plt.xlabel(col)
# plt.ylabel('Frequency')
# plt.legend(title='Class')
# plt.show()

In [None]:
# Violin plot of X913 split by Class, with an inner box plot and every point drawn.
px.violin(principal_df, y="X913", x="Class", color="Class", box=True, points="all",
          hover_data=principal_df.columns)

In [None]:
# Pick one representative column per correlation group.
# - "Class" is the target and is itself a node of the graph, so it is removed
#   from the candidates: selecting it would leak the label into the feature
#   matrix (and it is presumably absent from test.csv — confirm).
# - min() makes the choice deterministic; list(group)[0] depends on set
#   iteration order, which is not stable between interpreter runs.
# - Groups left empty after removing "Class" contribute no column.
low_corr_cols = [
    min(candidates)
    for candidates in (group - {"Class"} for group in groups)
    if candidates
]

In [None]:
# Number of representative columns selected (one per correlation group).
len(low_corr_cols)

In [None]:
# Scaled frame restricted to the representative columns.
# .copy() makes this an independent frame, so the column assignments that
# follow modify it directly instead of a selection of train_scaled
# (which triggers SettingWithCopyWarning).
train_no_high_corr = train_scaled[low_corr_cols].copy()

In [None]:
# Set the Id column from the original (unscaled) training frame.
train_no_high_corr['Id'] = train['Id']

In [None]:
# Set the target column from the original training frame.
train_no_high_corr['Class'] = train['Class']

In [None]:
# Display the reduced frame.
train_no_high_corr

### Cross Validation

In [None]:
# Feature matrix taken from the unscaled training frame; scaling is applied
# later inside each Pipeline.
X = train[low_corr_cols]

In [None]:
# Remove the row identifier from the features.
# Reassigning (rather than inplace=True) avoids mutating a selection of `train`,
# which raises SettingWithCopyWarning.
X = X.drop(columns=["Id"])

In [None]:
# Target vector.
y = train['Class']

In [None]:
# Candidate classifiers: one decision tree followed by k-NN with k = 1, 3 and 5.
models = [
    DecisionTreeClassifier(),
    *(KNeighborsClassifier(k) for k in (1, 3, 5)),
]

In [None]:
# Wrap each classifier in a scale-then-predict pipeline, keeping the order of `models`.
pipelines = []
for model in models:
    steps = [('scaler', StandardScaler()), ('model', model)]
    pipelines.append(Pipeline(steps))

In [None]:
# Score every pipeline with 5-fold stratified cross-validation, then refit it
# on the full training data so it can be used for prediction afterwards.
skf = StratifiedKFold(n_splits=5)
for pipeline in pipelines:
    scores = cross_val_score(pipeline, X, y, cv=skf)
    pipeline.fit(X, y)

    model = pipeline['model']
    model_name = type(model).__name__

    print("------------------------------------------------------")
    print(model_name)
    if model_name == "KNeighborsClassifier":
        # Only k-NN has a neighbour count worth reporting.
        print("K:", model.n_neighbors)
    print("Stratified cross-validation scores:", scores)
    print("Mean stratified cross-validation score:", scores.mean())
    print("Standard deviation of stratified cross-validation score:", scores.std())

## Testes

In [None]:
# Restrict the test frame to the columns selected on the training data.
# NOTE(review): this rebinds `test`, so re-running the cell works on the
# already-reduced frame.
test = test[low_corr_cols]

In [None]:
# Remove the row identifier so the test columns match the training features.
# Reassigning (rather than inplace=True) avoids mutating a selection of the
# original test frame, which raises SettingWithCopyWarning.
test = test.drop(columns=["Id"])

In [None]:
# Class-probability predictions on the test set, one array per fitted pipeline
# (same order as `pipelines`). Only predict_proba is called; a separate
# predict() call whose result is discarded would just repeat the inference work.
results = [pipeline.predict_proba(test) for pipeline in pipelines]

In [None]:
# Inspect the probabilities produced by the fourth pipeline (index 3).
results[3]

In [None]:
# Empty frame that will hold the submission.
r = pd.DataFrame()

In [None]:
# Take the identifiers from the raw test file (the in-memory `test` frame has
# had its Id column dropped). usecols limits parsing to that single column
# instead of loading the whole file again.
r['Id'] = pd.read_csv('test.csv', usecols=['Id'])['Id']

In [None]:
# Use the probabilities from the third pipeline (index 2) as the submission values.
# NOTE(review): assumes predict_proba returned exactly two columns (binary target) — confirm.
r[["Prob1","Prob2"]] = pd.DataFrame(results[2], columns=["Prob1","Prob2"])

In [None]:
# Write the submission file without the frame's index column.
r.to_csv('submission2.csv', index=False)

In [None]:
# Feature importances of the first pipeline's estimator, labelled with the
# test-frame column names and sorted from most to least important.
first_model = pipelines[0]['model']
importances = pd.DataFrame(
    data=first_model.feature_importances_,
    index=test.columns,
    columns=["Feature Importance"],
).sort_values(by="Feature Importance", ascending=False)
importances

## Discussões

- Primeiramente, 

## Conclusão