# Análisis Multivariado — Wine (UCI) — v2

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, confusion_matrix, accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
import os
# Output directory for every figure and exported table: a "figuras" folder
# one level above the current working directory, stored as an absolute path.
FIG_DIR = os.path.abspath(os.path.join('..', 'figuras'))
os.makedirs(FIG_DIR, exist_ok=True)  # no error when the folder already exists

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)


In [None]:
# Load scikit-learn's bundled Wine dataset and wrap it in pandas objects:
# X holds the feature matrix with named columns, y the class labels.
wine = datasets.load_wine()
X = pd.DataFrame(data=wine.data, columns=wine.feature_names)
y = pd.Series(data=wine.target, name='clase')

# Cell output: feature-matrix shape and number of samples per class,
# ordered by class label.
(X.shape, y.value_counts().sort_index())


In [None]:
# Summary statistics of every feature, transposed so that each row is a
# variable and each column a statistic.
desc = X.describe().transpose()
desc


In [None]:
# One histogram per feature on a single grid of subplots.
# DataFrame.hist creates (and makes current) its own figure, so no explicit
# plt.figure() call is made beforehand: an extra call would only leave an
# empty, unused figure behind.
X.hist(bins=20, figsize=(12, 10))
plt.tight_layout()  # keep subplot titles and tick labels from overlapping
plt.savefig(os.path.join(FIG_DIR, 'histogramas_variables_v2.png'), dpi=150)
plt.show()


In [None]:
# Standardize every feature using statistics estimated on the full feature
# matrix; the fitted scaler is kept so the same transform can be reapplied.
scaler = StandardScaler().fit(X)
X_std = scaler.transform(X)


In [None]:
# Project the standardized data onto five principal components.
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_std)

# Share of variance captured by each component and its running total.
exp_var = pca.explained_variance_ratio_
cum_exp_var = np.cumsum(exp_var)

# Cumulative explained-variance curve, saved to FIG_DIR and displayed.
plt.figure()
plt.plot(range(1, len(exp_var)+1), cum_exp_var, marker='o')
plt.xlabel('Número de componentes')
plt.ylabel('Varianza explicada acumulada')
plt.title('PCA — Varianza explicada acumulada (v2)')
plt.grid(True, linestyle=':')
plt.savefig(os.path.join(FIG_DIR, 'pca_varianza_acumulada_v2.png'), dpi=150)
plt.show()

# Loadings table: one row per original feature, one column per component.
# The column labels use single braces so the f-string interpolates the
# component number (PC1, PC2, ...); doubled braces would emit the literal
# text "PC{i+1}" for every column and produce duplicate labels.
loadings = pd.DataFrame(
    pca.components_.T,
    index=wine.feature_names,
    columns=[f'PC{i+1}' for i in range(pca.n_components_)],
)
loadings


In [None]:
# Partition the standardized samples into three clusters with K-Means
# (fixed random_state, 10 initializations) and keep the per-sample labels.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
kmeans.fit(X_std)
clusters = kmeans.labels_

# Silhouette score of the resulting partition; shown as the cell output.
sil = silhouette_score(X_std, clusters)
sil


In [None]:
# Scatter of the first two principal-component scores, colored by the
# K-Means cluster assigned to each sample; saved to FIG_DIR and displayed.
plt.figure()
plt.scatter(*X_pca[:, :2].T, c=clusters)
plt.title('K-Means en espacio PCA — v2')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.savefig(os.path.join(FIG_DIR, 'kmeans_pca_scatter_v2.png'), dpi=150)
plt.show()


In [None]:
# Contingency table of K-Means cluster labels (rows) against the class
# labels in y (columns).
ct = pd.crosstab(index=pd.Series(clusters, name='cluster'), columns=y)
ct


In [None]:
# Stratified 75/25 train/test split of the standardized data.
# NOTE(review): X_std was scaled with statistics from the full dataset
# before this split — confirm that is acceptable for the reported test
# accuracy.
X_train, X_test, y_train, y_test = train_test_split(
    X_std,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Fit linear discriminant analysis on the training part and predict the
# held-out part.
lda = LDA().fit(X_train, y_train)
y_pred = lda.predict(X_test)

# Test-set evaluation: confusion matrix and overall accuracy; both are
# shown as the cell output.
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
(acc, cm)


In [None]:
# Scatter of all standardized samples on the first two LDA projection
# axes, colored by the class labels in y; saved to FIG_DIR and displayed.
plt.figure()
plt.scatter(*lda.transform(X_std)[:, :2].T, c=y)
plt.xlabel('LD1')
plt.ylabel('LD2')
plt.title('Proyección LDA — v2')
plt.savefig(os.path.join(FIG_DIR, 'lda_scatter_v2.png'), dpi=150)
plt.show()


In [None]:
# Persist the numeric results in FIG_DIR, alongside the saved figures.
# pandas, numpy and os are already imported at the top of the file, so they
# are not re-imported here.

# Explained variance per principal component (individual and cumulative).
component_names = [f'PC{i}' for i in range(1, len(exp_var) + 1)]
pd.DataFrame({'PC': component_names, 'exp_var': exp_var, 'cum_exp_var': cum_exp_var}).to_csv(
    os.path.join(FIG_DIR, 'pca_varianza_v2.csv'), index=False)

# PCA loadings and the cluster-vs-class contingency table (index kept).
loadings.to_csv(os.path.join(FIG_DIR, 'pca_cargas_v2.csv'))
ct.to_csv(os.path.join(FIG_DIR, 'kmeans_vs_clase_v2.csv'))

# LDA confusion matrix labelled with the sorted class values found in y.
class_labels = sorted(y.unique())
pd.DataFrame(
    cm,
    index=[f'Real_{i}' for i in class_labels],
    columns=[f'Pred_{i}' for i in class_labels],
).to_csv(os.path.join(FIG_DIR, 'lda_matriz_confusion_v2.csv'))

# Scalar metrics as plain text; the encoding is stated explicitly so the
# file does not depend on the platform's default.
with open(os.path.join(FIG_DIR, 'metrics_v2.txt'), 'w', encoding='utf-8') as f:
    f.write(f'Silhouette: {sil:.4f}\nAccuracy_LDA: {acc:.4f}\n')
print('Artefactos guardados en', FIG_DIR)
