# Análisis de un conjunto de datos de origen biológico mediante técnicas de _machine learning_ supervisadas y no supervisadas 

## Preparación de los Datos

In [3]:
import pandas as pd
from pathlib import Path

# Librerías de scikit learn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
# Librerías para crear gráficos
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [4]:

# Load the gene-expression matrix, attach the gene names, convert the values to
# numbers, join them with the per-sample class labels and persist the result.


def _expresion_a_numerico(columna):
    """Convert one text column of expression values into a numeric Series.

    Every literal "." is removed before parsing; entries that still cannot be
    parsed become NaN (``errors="coerce"``).
    """
    # regex=False pins "." to a literal dot on every pandas version; interpreted
    # as a regular expression it would match (and delete) any character.
    # NOTE(review): removing the dots is what produces the very large integers
    # shown in the preview below. If "." is the decimal separator in the raw
    # file, this changes the magnitude of every value -- confirm with the source.
    return pd.to_numeric(columna.str.replace(".", "", regex=False), errors="coerce")


# One gene name per line; the expression CSV itself has no header row.
with open("./data/column_names.txt", "r") as file:
    nombres_columnas = file.read().splitlines()

# Read everything as text so the numeric conversion is done explicitly below.
datos = pd.read_csv("./data/gene_expression.csv", delimiter=";", dtype=str, header=None)
datos.columns = nombres_columnas  # raises ValueError if the number of names differs

# Every column of this frame is an expression column.
columnas_expresion_genica = datos.columns
datos = datos.apply(_expresion_a_numerico)

# Sample identifier and class label for each row of the expression matrix.
archivo2 = pd.read_csv("./data/classes.csv", delimiter=";", dtype=str, header=None)
archivo2.columns = ["Sample", "Classes"]

# The two frames are joined by position. With different row counts pd.concat
# would silently pad the shorter one with NaN, so fail loudly instead.
if len(archivo2) != len(datos):
    raise ValueError(
        f"classes.csv tiene {len(archivo2)} filas pero gene_expression.csv tiene {len(datos)}"
    )

resultado = pd.concat([archivo2, datos], axis=1)

# Persist the merged table and reload it from disk for the rest of the notebook.
resultado.to_csv("resultado_fusion.csv", index=False)
resultado_final_final = pd.read_csv("resultado_fusion.csv")
resultado_final_final.head(10)



Unnamed: 0,Sample,Classes,DUOXA1,INTU,UMOD,STXBP4,PARN,TSC22D4,TTLL5,ZNF880,...,NPTN,BCAR3,PVALB,PTPN11,C19orf33,GASK1A,TMEM114,UCN,RAB25,TTC31
0,sample_0,CHC,118813893163,102825788041,115329889746,981895119181,893633759163,847987724867,731958180857,905250827689,...,301795752966,113645122458,950730903611,0,619014270722,980511346208,10876140876,701469003795,0,971396382113
1,sample_1,CGC,104056414257,981692284325,106919790064,761975046082,834971026864,102964803508,936025013476,914894935277,...,100439352315,108542606407,967853324016,117478989305,323658291026,482864173576,718363538147,50308701475,0,110819094596
2,sample_2,CHC,116091695019,970163742069,117103673606,888356888908,762772355631,824570956408,522808369063,106589701266,...,452595434703,104570730166,887680238777,0,796597754048,589701044747,83241085226,796597754048,0,101952983789
3,sample_3,CHC,119221718266,103012904076,966261319427,900339422715,677282260347,798432161126,521856420485,101028155764,...,126735601178,113540344496,891678712486,0,193141831808,762423983363,107498359149,646916477404,0,100981900336
4,sample_4,CFB,108111739707,102756356677,958222855959,103734742145,751069297079,965474342118,759795569673,918430261317,...,261280069087,10978038284,869266431714,0,625421986729,81005046588,10041590711,630697305714,0,105499001559
5,sample_5,CHC,114251214693,102202936742,107000234961,886287597911,867257384705,836961915556,749908847267,980357901212,...,200201836053,108034611583,908216235029,515409693386,471878902802,795103987322,966394763265,857036046325,0,104942455845
6,sample_6,AGH,109127995391,110395222717,113762122306,924315731858,879202564224,118738747035,847886667208,94858393693,...,703543696305,109995561339,905524175744,919683313397,613142280095,808070029555,100502568016,657092432535,0,103607044533
7,sample_7,CHC,116468556786,107632206677,110353558536,968775883475,893156783066,935904450967,843776871471,962498914226,...,230486145149,109926197471,889945360291,148104055612,460223712872,729331530051,106152066051,61101126377,0,993220589199
8,sample_8,CFB,109661088545,110999460694,910862115592,962154275856,743973947924,973921523253,50361472316,104742940707,...,355929601433,972513786281,948285433192,635336494705,742820102775,817061112028,997517445762,596372379959,0,105846994855
9,sample_9,CHC,108929753326,946579493246,108204502402,102162487095,743024345028,765222116378,70015658266,97300421053,...,324427819858,115303136661,90169001939,120414071678,479347455824,971221057017,107653109654,691067270824,0,103790858992


In [None]:
# Show the (rows, columns) size of the merged table.
resultado_final_final.shape

In [None]:
# Count missing cells per column first, then add the per-column counts together.
nulos_por_columna = resultado_final_final.isna().sum()
total_nan = nulos_por_columna.sum()

print(f"Número total de valores nulos: {total_nan}")

In [None]:
# Discard every column containing at least one missing value ("any" is the
# default criterion of dropna) and show the size of what is left.
# Note: `data_clean` is rebound to a different frame in the imputation cell.
data_clean = resultado_final_final.dropna(axis="columns")
data_clean.shape

In [None]:
# List the dtype pandas inferred for each column when the merged CSV was reloaded.
resultado_final_final.dtypes

In [None]:
# Impute missing expression values with k-nearest-neighbours.

# Drop the two text columns so only the expression columns are passed to the imputer.
data_clean = resultado_final_final.drop(["Sample", "Classes"], axis=1)

# NOTE(review): imputation runs before the standardization cell, so neighbour
# distances are computed on the raw value scale -- confirm this is intended.
imputador_knn = KNNImputer(n_neighbors=5)  # the number of neighbours can be tuned

# Run the imputation
valores_imputados = imputador_knn.fit_transform(data_clean)

# By default KNNImputer removes columns that have no observed value at all, so
# the output can be narrower than the input. Label the result with the columns
# that were actually kept instead of assuming none were removed (which would
# raise a shape-mismatch ValueError when building the DataFrame).
if valores_imputados.shape[1] == data_clean.shape[1]:
    columnas_imputadas = data_clean.columns
else:
    columnas_imputadas = data_clean.columns[data_clean.notna().any(axis=0)]

datos_imputados = pd.DataFrame(valores_imputados, columns=columnas_imputadas)

# Save the imputed data to a new CSV file
datos_imputados.to_csv("datos_imputados.csv", index=False)

datos_imputados.head(10)


In [None]:
# Sanity check after imputation: size of the imputer input and remaining nulls.
# A bare expression is only rendered when it is the last statement of a cell,
# so the shape has to be printed explicitly to be visible here.
print(f"Dimensiones de data_clean: {data_clean.shape}")
total_nan = datos_imputados.isna().sum().sum()

print(f"Número total de valores nulos: {total_nan}")

In [None]:
# Standardize the imputed data.
# Learn the per-column scaling parameters first, then apply them to the same
# data in a separate step.
scaler = StandardScaler()
scaler.fit(datos_imputados)
data_standardized = scaler.transform(datos_imputados)

# Display the standardized values of the row at position 10
data_standardized[10]

## Técnicas no supervisadas 

### Uso de PCA

In [None]:
# Project the standardized data onto 2 and 3 principal components.
# Depending on the matrix size, PCA's default solver may use a randomized SVD;
# fixing the seed keeps the projections identical across runs (the seed is
# ignored by the deterministic solvers).
PCA_RANDOM_STATE = 42

# PCA with 2 components
pca_2d = PCA(n_components=2, random_state=PCA_RANDOM_STATE)
components_2d = pca_2d.fit_transform(data_standardized)

# PCA with 3 components
pca_3d = PCA(n_components=3, random_state=PCA_RANDOM_STATE)
components_3d = pca_3d.fit_transform(data_standardized)

In [None]:
# Per-component explained-variance ratios of the 2- and 3-component models
explained_variance_2d, explained_variance_3d = (
    modelo.explained_variance_ratio_ for modelo in (pca_2d, pca_3d)
)

# Total share of variance captured by each model (sum of its ratios)
cumulative_variance_2d, cumulative_variance_3d = (
    ratios.sum() for ratios in (explained_variance_2d, explained_variance_3d)
)

# Report the results
print(f"Varianza explicada PCA2: {explained_variance_2d}")
print(f"Varianza acumulada PCA2: {cumulative_variance_2d}")
print(f"Varianza explicada PCA3: {explained_variance_3d}")
print(f"Varianza acumulada PCA3: {cumulative_variance_3d}")