# Data Preprocessing

## Missing values
### Check Missing values

In [1]:
import pandas as pd 
dataset = pd.read_csv('data/genomics_clean.tsv', sep='\t')
dataset.head()

  dataset = pd.read_csv('data/genomics_clean.tsv', sep='\t')


Unnamed: 0,Chromosome,Position,Reference,Alternate_Allele,Gene_Symbol,Clinical_Significance,Clinical_Review_Status,Clinical_Disease_Name,Missense_Variant,Allele_Frequencies_ESP,Allele_Frequencies_EXAC,Allele_Frequencies_TGP
0,1,66926,AG,A,OR4F5,Uncertain_significance,"criteria_provided,_single_submitter",Retinitis_pigmentosa,SO:0001627|intron_variant,,,
1,1,69134,A,G,OR4F5,Likely_benign,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,,,
2,1,69308,A,G,OR4F5,Uncertain_significance,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,,,
3,1,69314,T,G,OR4F5,Uncertain_significance,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,,,
4,1,69404,T,C,OR4F5,Uncertain_significance,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,,,


In [2]:
dataset.isna().sum()

Chromosome                       0
Position                         0
Reference                        0
Alternate_Allele              1023
Gene_Symbol                    767
Clinical_Significance         4526
Clinical_Review_Status        4526
Clinical_Disease_Name         5191
Missense_Variant             19339
Allele_Frequencies_ESP     3275634
Allele_Frequencies_EXAC    2734172
Allele_Frequencies_TGP     3240811
dtype: int64

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3687735 entries, 0 to 3687734
Data columns (total 12 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   Chromosome               object 
 1   Position                 int64  
 2   Reference                object 
 3   Alternate_Allele         object 
 4   Gene_Symbol              object 
 5   Clinical_Significance    object 
 6   Clinical_Review_Status   object 
 7   Clinical_Disease_Name    object 
 8   Missense_Variant         object 
 9   Allele_Frequencies_ESP   float64
 10  Allele_Frequencies_EXAC  float64
 11  Allele_Frequencies_TGP   float64
dtypes: float64(3), int64(1), object(8)
memory usage: 337.6+ MB


### Drop highly missing values columns

In [4]:
# dataset = dataset.dropna(subset=['Allele_Frequencies_ESP', 'Allele_Frequencies_EXAC', 'Allele_Frequencies_TGP'])
dataset = dataset.drop(columns=['Allele_Frequencies_ESP', 'Allele_Frequencies_EXAC', 'Allele_Frequencies_TGP'])
dataset.isna().sum()

Chromosome                    0
Position                      0
Reference                     0
Alternate_Allele           1023
Gene_Symbol                 767
Clinical_Significance      4526
Clinical_Review_Status     4526
Clinical_Disease_Name      5191
Missense_Variant          19339
dtype: int64

### Fill categorical columns

In [5]:
dataset['Gene_Symbol'].value_counts()

Gene_Symbol
TTN          36438
BRCA2        19899
ATM          18418
APC          16037
NF1          15900
             ...  
BCAP29           1
RPL39            1
MCRIP2           1
TMEM225B         1
C20orf202        1
Name: count, Length: 18517, dtype: int64

In [6]:
dataset['Gene_Symbol'] = dataset['Gene_Symbol'].fillna('Unknown')
dataset['Gene_Symbol'].isna().sum()

np.int64(0)

In [7]:
dataset['Clinical_Disease_Name'] = dataset['Clinical_Disease_Name'].fillna('No_disease_reported')
dataset['Clinical_Disease_Name'].isna().sum()

np.int64(0)

In [8]:
dataset['Clinical_Review_Status'] = dataset['Clinical_Review_Status'].fillna('Unreviewed')
dataset['Clinical_Review_Status'].isna().sum()

np.int64(0)

In [9]:
dataset.isna().sum()

Chromosome                    0
Position                      0
Reference                     0
Alternate_Allele           1023
Gene_Symbol                   0
Clinical_Significance      4526
Clinical_Review_Status        0
Clinical_Disease_Name         0
Missense_Variant          19339
dtype: int64

### Drop NaN rows 

In [10]:
dataset = dataset.dropna(subset=['Alternate_Allele', 'Clinical_Significance'])
dataset.isna().sum()

Chromosome                    0
Position                      0
Reference                     0
Alternate_Allele              0
Gene_Symbol                   0
Clinical_Significance         0
Clinical_Review_Status        0
Clinical_Disease_Name         0
Missense_Variant          19288
dtype: int64

In [11]:
dataset['Missense_Variant'].value_counts()

Missense_Variant
SO:0001583|missense_variant                                                                                                                   1569692
SO:0001819|synonymous_variant                                                                                                                  525944
SO:0001627|intron_variant                                                                                                                      514772
SO:0001583|missense_variant,SO:0001619|non-coding_transcript_variant                                                                           213676
SO:0001583|missense_variant,SO:0001627|intron_variant                                                                                          130995
                                                                                                                                               ...   
SO:0001578|stop_lost,SO:0001619|non-coding_transcript_variant,SO:0001820|inframe_in

In [12]:
dataset['Missense_Variant'] = dataset['Missense_Variant'].fillna(0)
dataset.isna().sum()

Chromosome                0
Position                  0
Reference                 0
Alternate_Allele          0
Gene_Symbol               0
Clinical_Significance     0
Clinical_Review_Status    0
Clinical_Disease_Name     0
Missense_Variant          0
dtype: int64

In [13]:
dataset.isna().sum()

Chromosome                0
Position                  0
Reference                 0
Alternate_Allele          0
Gene_Symbol               0
Clinical_Significance     0
Clinical_Review_Status    0
Clinical_Disease_Name     0
Missense_Variant          0
dtype: int64

## Feature Engineering

### Label encoding - Clinical Significance

In [14]:
dataset.head()

Unnamed: 0,Chromosome,Position,Reference,Alternate_Allele,Gene_Symbol,Clinical_Significance,Clinical_Review_Status,Clinical_Disease_Name,Missense_Variant
0,1,66926,AG,A,OR4F5,Uncertain_significance,"criteria_provided,_single_submitter",Retinitis_pigmentosa,SO:0001627|intron_variant
1,1,69134,A,G,OR4F5,Likely_benign,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant
2,1,69308,A,G,OR4F5,Uncertain_significance,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant
3,1,69314,T,G,OR4F5,Uncertain_significance,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant
4,1,69404,T,C,OR4F5,Uncertain_significance,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant


In [15]:
dataset['Clinical_Significance'].unique()

array(['Uncertain_significance', 'Likely_benign', 'Benign',
       'Conflicting_classifications_of_pathogenicity',
       'Benign/Likely_benign', 'Pathogenic', 'Likely_pathogenic',
       'Pathogenic/Likely_pathogenic', 'not_provided',
       'no_classifications_from_unflagged_records', 'risk_factor',
       'Affects', 'no_classification_for_the_single_variant',
       'association', 'Benign|other', 'Pathogenic|risk_factor',
       'Benign/Likely_benign|other', 'drug_response',
       'Conflicting_classifications_of_pathogenicity|association',
       'Benign|association', 'Uncertain_risk_allele',
       'Conflicting_classifications_of_pathogenicity|other', 'other',
       'Likely_benign|association', 'Likely_risk_allele',
       'Pathogenic/Likely_pathogenic/Pathogenic,_low_penetrance',
       'Pathogenic/Likely_pathogenic|other', 'Benign|drug_response',
       'Pathogenic/Likely_pathogenic/Pathogenic,_low_penetrance|other',
       'protective', 'Pathogenic/Likely_pathogenic|risk_facto

In [16]:
import numpy as np

def simplify_clinsig(value):
    if pd.isna(value):
        return 'Unknown'
    clin_sig_value = value.lower()
    
    if 'pathogenic' in clin_sig_value:
        return 'Pathogenic'
    elif 'benign' in clin_sig_value:
        return 'Benign'
    elif 'uncertain' in clin_sig_value:
        return 'Uncertain'
    elif 'conflicting' in clin_sig_value:
        return 'Conflicting'
    elif 'risk_factor' in clin_sig_value or 'risk' in clin_sig_value or 'protective' in clin_sig_value or 'sensitivity' in clin_sig_value:
        return 'Risk_factor'
    elif 'drug_response' in clin_sig_value:
        return 'Drug_response'
    elif 'association' in clin_sig_value or 'affects' in clin_sig_value:
        return 'Association'
    elif 'not_provided' in clin_sig_value or 'no_classification' in clin_sig_value or 'other' in clin_sig_value:
        return 'Other'
    else:
        return 'Other' 

In [17]:
dataset['Clinical_Significance'] = dataset['Clinical_Significance'].apply(simplify_clinsig)
dataset['Clinical_Significance'].unique()

array(['Uncertain', 'Benign', 'Pathogenic', 'Other', 'Risk_factor',
       'Association', 'Drug_response'], dtype=object)

In [18]:
dataset['Clinical_Significance'].value_counts()

Clinical_Significance
Uncertain        1939966
Benign           1266122
Pathogenic        463873
Other               9384
Drug_response       1861
Risk_factor          517
Association          463
Name: count, dtype: int64

In [19]:
sig_label_map = {
    'Benign': 0,
    'Uncertain': 1,
    'Pathogenic': 2,
    'Conflicting': 3,
    'Risk_factor': 4,
    'Drug_response': 5,
    'Association': 6,
    'Other': 7
}

dataset['Clinical_Significance_Encoded'] = dataset['Clinical_Significance'].map(sig_label_map)
dataset['Clinical_Significance_Encoded'].value_counts()

Clinical_Significance_Encoded
1    1939966
0    1266122
2     463873
7       9384
5       1861
4        517
6        463
Name: count, dtype: int64

### Map and Enable encoding - Chromosome

In [20]:
dataset['Chromosome'].unique()

array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       20, 21, 22, '22', 'X', 'Y', 'MT', 'NT_113889.1', 'NT_187633.1',
       'NT_187661.1', 'NT_187693.1', 'NW_009646201.1'], dtype=object)

In [21]:
dataset['Chromosome'].value_counts()

Chromosome
2                 333754
1                 328143
17                224188
11                218427
19                206253
3                 200989
16                194165
5                 181798
7                 179750
12                167605
6                 164854
9                 164713
10                136996
X                 135910
4                 130277
15                130127
8                 125273
14                117813
20                 77985
13                 74590
22                 71191
18                 63059
21                 41717
22                  9357
MT                  3104
Y                    115
NT_187633.1           13
NT_187693.1           10
NT_187661.1            8
NT_113889.1            1
NW_009646201.1         1
Name: count, dtype: int64

In [22]:
chrom_map = {'X':23, 'Y':24, 'MT': 25, 'NT_187633.1': 26, 'NT_187693.1': 27, 'NT_187661.1': 28, 'NT_113889.1': 29,
            'NW_009646201.1': 30}
dataset['Chromosome_Encoded'] = dataset['Chromosome'].replace(chrom_map)
dataset['Chromosome_Encoded'].value_counts()

Chromosome_Encoded
2     333754
1     328143
17    224188
11    218427
19    206253
3     200989
16    194165
5     181798
7     179750
12    167605
6     164854
9     164713
10    136996
23    135910
4     130277
15    130127
8     125273
14    117813
20     77985
13     74590
22     71191
18     63059
21     41717
22      9357
25      3104
24       115
26        13
27        10
28         8
29         1
30         1
Name: count, dtype: int64

In [23]:
dataset['Clinical_Review_Status'].unique()

array(['criteria_provided,_single_submitter',
       'criteria_provided,_multiple_submitters,_no_conflicts',
       'no_assertion_criteria_provided',
       'criteria_provided,_conflicting_classifications',
       'no_classification_provided',
       'no_classifications_from_unflagged_records',
       'no_classification_for_the_single_variant',
       'reviewed_by_expert_panel', 'practice_guideline'], dtype=object)

In [24]:
dataset['Clinical_Review_Status'].value_counts()

Clinical_Review_Status
criteria_provided,_single_submitter                     2801017
criteria_provided,_multiple_submitters,_no_conflicts     593757
criteria_provided,_conflicting_classifications           149571
no_assertion_criteria_provided                           109832
reviewed_by_expert_panel                                  20123
no_classification_provided                                 6993
no_classification_for_the_single_variant                    627
no_classifications_from_unflagged_records                   215
practice_guideline                                           51
Name: count, dtype: int64

In [25]:
review_map = {
    'no_assertion_criteria_provided': 0,
    'criteria_provided,_single_submitter':1,
    'criteria_provided,_multiple_submitters,_no_conflicts':2,
    'reviewed_by_expert_panel': 3,
    'practice_guideline': 4,
    'criteria_provided,_conflicting_classifications': 5,
    'no_classification_provided':6,
    'no_classifications_from_unflagged_records':7,
    'no_classification_for_the_single_variant':8
}

dataset['Clinical_Review_Status_Encoded'] = dataset['Clinical_Review_Status'].map(review_map)
dataset['Clinical_Review_Status_Encoded'].value_counts()

Clinical_Review_Status_Encoded
1    2801017
2     593757
5     149571
0     109832
3      20123
6       6993
8        627
7        215
4         51
Name: count, dtype: int64

In [26]:
dataset.head()

Unnamed: 0,Chromosome,Position,Reference,Alternate_Allele,Gene_Symbol,Clinical_Significance,Clinical_Review_Status,Clinical_Disease_Name,Missense_Variant,Clinical_Significance_Encoded,Chromosome_Encoded,Clinical_Review_Status_Encoded
0,1,66926,AG,A,OR4F5,Uncertain,"criteria_provided,_single_submitter",Retinitis_pigmentosa,SO:0001627|intron_variant,1,1,1
1,1,69134,A,G,OR4F5,Benign,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,0,1,1
2,1,69308,A,G,OR4F5,Uncertain,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,1,1,1
3,1,69314,T,G,OR4F5,Uncertain,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,1,1,1
4,1,69404,T,C,OR4F5,Uncertain,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,1,1,1


### Encoding - Gene Symbol

In [27]:
dataset['Gene_Symbol'].unique()

array(['OR4F5', 'LINC01409', 'SAMD11', ..., 'GSTT1', 'CCL3L1', 'LILRA3'],
      shape=(18516,), dtype=object)

In [28]:
gene_counts = dataset['Gene_Symbol'].value_counts()
gene_counts

Gene_Symbol
TTN         36437
BRCA2       19889
ATM         18410
APC         16023
NF1         15898
            ...  
IRX2-DT         1
DUX4            1
ANKRD37         1
TP53I11         1
TMEM225B        1
Name: count, Length: 18516, dtype: int64

In [29]:
dataset['Gene_Symbol_Encoded'] = dataset['Gene_Symbol'].map(gene_counts)
dataset.head()

Unnamed: 0,Chromosome,Position,Reference,Alternate_Allele,Gene_Symbol,Clinical_Significance,Clinical_Review_Status,Clinical_Disease_Name,Missense_Variant,Clinical_Significance_Encoded,Chromosome_Encoded,Clinical_Review_Status_Encoded,Gene_Symbol_Encoded
0,1,66926,AG,A,OR4F5,Uncertain,"criteria_provided,_single_submitter",Retinitis_pigmentosa,SO:0001627|intron_variant,1,1,1,12
1,1,69134,A,G,OR4F5,Benign,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,0,1,1,12
2,1,69308,A,G,OR4F5,Uncertain,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,1,1,1,12
3,1,69314,T,G,OR4F5,Uncertain,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,1,1,1,12
4,1,69404,T,C,OR4F5,Uncertain,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,1,1,1,12


### Position based feature - Chromosome position

In [30]:
dataset['POS_Percentile'] = dataset.groupby('Chromosome')['Position'].rank(pct=True)
dataset.head()

Unnamed: 0,Chromosome,Position,Reference,Alternate_Allele,Gene_Symbol,Clinical_Significance,Clinical_Review_Status,Clinical_Disease_Name,Missense_Variant,Clinical_Significance_Encoded,Chromosome_Encoded,Clinical_Review_Status_Encoded,Gene_Symbol_Encoded,POS_Percentile
0,1,66926,AG,A,OR4F5,Uncertain,"criteria_provided,_single_submitter",Retinitis_pigmentosa,SO:0001627|intron_variant,1,1,1,12,3e-06
1,1,69134,A,G,OR4F5,Benign,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,0,1,1,12,6e-06
2,1,69308,A,G,OR4F5,Uncertain,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,1,1,1,12,9e-06
3,1,69314,T,G,OR4F5,Uncertain,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,1,1,1,12,1.2e-05
4,1,69404,T,C,OR4F5,Uncertain,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,1,1,1,12,1.5e-05


In [31]:
dataset.groupby('Chromosome')['Position'].rank(pct=True)

0          0.000003
1          0.000006
2          0.000009
3          0.000012
4          0.000015
             ...   
3687730    0.700000
3687731    0.800000
3687732    0.900000
3687733    1.000000
3687734    1.000000
Name: Position, Length: 3682186, dtype: float64

### Mutation feature

In [32]:
dataset['IS_SNP'] = dataset.apply(lambda x: 1 if len(x['Reference']) == 1 and len(x['Alternate_Allele']) == 1 else 0, axis=1)
dataset['IS_INDEL'] = dataset.apply(lambda x: 1 if len(x['Reference']) != len(x['Alternate_Allele']) else 0, axis=1)
dataset.head()

Unnamed: 0,Chromosome,Position,Reference,Alternate_Allele,Gene_Symbol,Clinical_Significance,Clinical_Review_Status,Clinical_Disease_Name,Missense_Variant,Clinical_Significance_Encoded,Chromosome_Encoded,Clinical_Review_Status_Encoded,Gene_Symbol_Encoded,POS_Percentile,IS_SNP,IS_INDEL
0,1,66926,AG,A,OR4F5,Uncertain,"criteria_provided,_single_submitter",Retinitis_pigmentosa,SO:0001627|intron_variant,1,1,1,12,3e-06,0,1
1,1,69134,A,G,OR4F5,Benign,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,0,1,1,12,6e-06,1,0
2,1,69308,A,G,OR4F5,Uncertain,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,1,1,1,12,9e-06,1,0
3,1,69314,T,G,OR4F5,Uncertain,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,1,1,1,12,1.2e-05,1,0
4,1,69404,T,C,OR4F5,Uncertain,"criteria_provided,_single_submitter",not_specified,SO:0001583|missense_variant,1,1,1,12,1.5e-05,1,0


### Drop unused columns

In [33]:
dataset.drop(columns=['Reference', 'Alternate_Allele', 'Gene_Symbol', 
                      'Clinical_Review_Status', 'Clinical_Disease_Name', 'Clinical_Significance', 'Position',
                      'Chromosome', 'Missense_Variant'
                     ], inplace=True)
dataset.columns

Index(['Clinical_Significance_Encoded', 'Chromosome_Encoded',
       'Clinical_Review_Status_Encoded', 'Gene_Symbol_Encoded',
       'POS_Percentile', 'IS_SNP', 'IS_INDEL'],
      dtype='object')

In [34]:
dataset.head()

Unnamed: 0,Clinical_Significance_Encoded,Chromosome_Encoded,Clinical_Review_Status_Encoded,Gene_Symbol_Encoded,POS_Percentile,IS_SNP,IS_INDEL
0,1,1,1,12,3e-06,0,1
1,0,1,1,12,6e-06,1,0
2,1,1,1,12,9e-06,1,0
3,1,1,1,12,1.2e-05,1,0
4,1,1,1,12,1.5e-05,1,0


## Writing the dataset in files

In [35]:
dataset.to_csv('data/ml_ready_genomics.csv')