# Self-Organizing Map (SOM) for PheWAS Data

## Import Libraries

In this section, we will import the necessary libraries for the SOM analysis.


In [147]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from minisom import MiniSom


## Load Data

In [148]:
# Read the HLA PheWAS catalogue from disk and preview its first five rows
data = pd.read_csv(filepath_or_buffer='Data/hla-phewas-catalog.csv')
data.head(n=5)

Unnamed: 0,snp,phewas_code,phewas_string,cases,controls,category_string,odds_ratio,p,l95,u95,gene_name,maf,a1,a2,chromosome,nchrobs
0,HLA_A_01,8.0,Intestinal infection,683,27384,infectious diseases,0.887,0.1654,0.749,1.051,A,0.1655,P,A,6,57678
1,HLA_A_0101,8.0,Intestinal infection,683,27384,infectious diseases,0.882,0.147,0.745,1.045,A,0.1653,P,A,6,57678
2,HLA_A_02,8.0,Intestinal infection,683,27384,infectious diseases,0.928,0.3378,0.797,1.081,A,0.3025,P,A,6,57678
3,HLA_A_0201,8.0,Intestinal infection,683,27384,infectious diseases,0.919,0.2778,0.789,1.07,A,0.2906,P,A,6,57678
4,HLA_A_03,8.0,Intestinal infection,683,27384,infectious diseases,1.107,0.2359,0.936,1.31,A,0.1435,P,A,6,57678


In [149]:
# Dimensions of the loaded table as (rows, columns)
data.shape

(240768, 16)

In [150]:
# List the column labels of the loaded table
data.columns

Index(['snp', 'phewas_code', 'phewas_string', 'cases', 'controls',
       'category_string', 'odds_ratio', 'p', 'l95', 'u95', 'gene_name', 'maf',
       'a1', 'a2', 'chromosome', 'nchrobs'],
      dtype='object')

In [151]:
# Show the dtype of each column
data.dtypes

snp                 object
phewas_code        float64
phewas_string       object
cases                int64
controls             int64
category_string     object
odds_ratio         float64
p                  float64
l95                float64
u95                float64
gene_name           object
maf                float64
a1                  object
a2                  object
chromosome           int64
nchrobs              int64
dtype: object

In [152]:
# Count the missing entries in each column
data.isna().sum()

snp                  0
phewas_code          0
phewas_string        0
cases                0
controls             0
category_string    176
odds_ratio           0
p                    0
l95                  0
u95                  0
gene_name            0
maf                  0
a1                   0
a2                   0
chromosome           0
nchrobs              0
dtype: int64

In [153]:
# Show every row that has a missing value in at least one column
data.loc[data.isna().any(axis='columns')]

Unnamed: 0,snp,phewas_code,phewas_string,cases,controls,category_string,odds_ratio,p,l95,u95,gene_name,maf,a1,a2,chromosome,nchrobs
225280,HLA_A_01,797.2,Septic shock,451,27328,,1.079,0.4575,0.883,1.317,A,0.16550,P,A,6,57678
225281,HLA_A_0101,797.2,Septic shock,451,27328,,1.080,0.4487,0.885,1.319,A,0.16530,P,A,6,57678
225282,HLA_A_02,797.2,Septic shock,451,27328,,0.972,0.7613,0.806,1.171,A,0.30250,P,A,6,57678
225283,HLA_A_0201,797.2,Septic shock,451,27328,,1.011,0.9078,0.839,1.218,A,0.29060,P,A,6,57678
225284,HLA_A_03,797.2,Septic shock,451,27328,,0.866,0.1961,0.697,1.077,A,0.14350,P,A,6,57678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225451,HLA_DRB1_14,797.2,Septic shock,451,27328,,1.086,0.7154,0.698,1.690,DRB1,0.02207,P,A,6,57678
225452,HLA_DRB1_1401,797.2,Septic shock,451,27328,,0.745,0.3377,0.408,1.360,DRB1,0.01644,P,A,6,57678
225453,HLA_DRB1_15,797.2,Septic shock,451,27328,,0.847,0.1330,0.683,1.052,DRB1,0.14790,P,A,6,57678
225454,HLA_DRB1_1501,797.2,Septic shock,451,27328,,0.837,0.1124,0.671,1.043,DRB1,0.14080,P,A,6,57678


In [154]:
# Count the 'Septic shock' rows so the total can be compared with the number of missing categories
data.loc[data['phewas_string'].eq('Septic shock')].shape

(176, 16)

In [155]:
# Fill the missing category_string values with 'infectious diseases', judged the closest existing category
data['category_string'] = data['category_string'].fillna(value='infectious diseases')

# Re-count the missing entries per column to confirm none remain
data.isna().sum()

snp                0
phewas_code        0
phewas_string      0
cases              0
controls           0
category_string    0
odds_ratio         0
p                  0
l95                0
u95                0
gene_name          0
maf                0
a1                 0
a2                 0
chromosome         0
nchrobs            0
dtype: int64

In [156]:
# Count the distinct values in the chromosome column (nchrobs is checked in the next cell)
data['chromosome'].nunique(dropna=False)

1

In [157]:
# Count the distinct values in the nchrobs column
data['nchrobs'].nunique(dropna=False)

1

In [158]:
# Drop chromosome and nchrobs: each holds a single value across all rows (see the unique-value counts above),
# so neither adds information to the analysis.
# errors='ignore' keeps this cell safe to re-run after the columns have already been removed.
data = data.drop(columns=['chromosome', 'nchrobs'], errors='ignore')

## Data Preprocessing

In [159]:
# Normalise the data
from sklearn.preprocessing import StandardScaler