# Data preparation and exploration

Library import

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import networkx as nx
from scipy import sparse
from collections import defaultdict
from google.colab import drive

Mounting the drive folder

In [2]:
drive.mount('/content/gdrive')
path = 'gdrive/MyDrive/project/AIDL_recommended_systems/data/original_data/'

Mounted at /content/gdrive


Reading the names for drugs, proteins and diseases

In [3]:
drugs = list(pd.read_csv(path+'drug.txt', sep="\n", header=None)[0])
proteins = list(pd.read_csv(path+'protein.txt', sep="\n", header=None)[0])
diseases = list(pd.read_csv(path+'disease.txt', sep="\n", header=None)[0])

We've got 708 drugs, 1512 proteins and 5603 diseases

In [4]:
print("Drugs:"+str(len(drugs)))
print("Proteins:"+str(len(proteins)))
print("Diseases:"+str(len(diseases)))

Drugs:708
Proteins:1512
Diseases:5603


Checking for duplicates:

In [5]:
print("Drugs:"+str(len(drugs)-len(set(drugs))))
print("Proteins:"+str(len(proteins)-len(set(proteins))))
print("Diseases:"+str(len(diseases)-len(set(diseases))))

Drugs:0
Proteins:19
Diseases:0


When dealing with protein data we will have to check on the duplicates.

## DRUG-DISEASE
Reading drug-disease data:

In [6]:
df_drug_disease = pd.read_csv(path+'mat_drug_disease.txt', sep=" ", names=diseases)
df_drug_disease.index = drugs
df_drug_disease.head()

Unnamed: 0,depressive disorder,drug-induced liver injury,mercury poisoning,necrosis,neoplasms,"anemia, hemolytic",attention deficit and disruptive behavior disorders,autistic disorder,cognition disorders,cystitis,...,holoprosencephaly 5,"heterotaxy, visceral, x-linked",acro-osteolysis,mandibuloacral dysplasia with type b lipodystrophy,"nephrolithiasis, uric acid, susceptibility to",ehlers-danlos syndrome 6b,retinitis pigmentosa 58,"spinocerebellar ataxia, autosomal recessive 5",seborrhea-like dermatitis with psoriasiform elements,"mental retardation, x-linked 45"
DB00050,1,1,1,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
DB00152,1,1,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
DB00162,1,1,0,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
DB00175,1,1,0,1,1,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
DB00176,1,1,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


We've got 199214 drug-disease interactions

In [7]:
drug_disease_interactions=np.sum(df_drug_disease.to_numpy())
print(drug_disease_interactions)

199214


There is no null in the data:

In [8]:
df_drug_disease.isnull().values.any()

False

Computing sparsity:

In [9]:
(1 - drug_disease_interactions/(len(drugs)*len(diseases)))*100

94.97812410825112

## DRUG-PROTEIN

There are no null values.

In [10]:
pd.read_csv(path+'mat_drug_protein.txt', sep=" ").isnull().values.any()

False

We remember that we have to be careful with the duplicates:

In [11]:
print("Proteins:"+str(len(proteins)-len(set(proteins))))

Proteins:19


It is not unique: 

In [None]:
print(len(proteins))
print(len(set(proteins)))

1512
1493


Let's check if in the data it is also duplicated: 

In [None]:
len(pd.read_csv(path+'mat_drug_protein.txt', sep=" ").columns)

1512

As we see there are going to be duplicated columns per protein. We expect it to have the same values. The next function for each duplicated protein it tell us the column number of the duplication. 

In [None]:
def list_duplicates(seq):
    tally = defaultdict(list)
    for i,item in enumerate(seq):
        tally[item].append(i)
    return ((key,locs) for key,locs in tally.items() 
                            if len(locs)>1)

for dup in sorted(list_duplicates(proteins)):
    print(dup)

('A6NG28', [671, 1332])
('O60882', [617, 1196])
('P01024', [792, 1126])
('P07949', [674, 1022])
('P09622', [62, 422])
('P13051', [848, 1227])
('P15289', [844, 1095])
('P17050', [822, 1490])
('P20941', [1102, 1312])
('P30419', [796, 1369])
('P49247', [802, 811, 1101, 1387])
('Q5JAM2', [235, 908])
('Q5T6L4', [382, 454])
('Q5VZ30', [341, 909])
('Q8IVA8', [214, 907, 913])
('Q9UNI1', [1046, 1222])


Function for checking duplicates:

In [None]:
def check_duplicate(data,columns):
  list_to_check = list_duplicates(columns)
  for i in list_to_check:
    print(i[0] +" has duplicates in indexes "+str(i[1]))
    changes = False
    for j in range(len(i[1])):
      for k in range(j+1,len(i[1])):
        if not (data.iloc[:,j].equals(data.iloc[:,k])):
          print(" These indexes "+ str(i[1][j])+" and " + str(i[1][k])+" have different values in the columns")
          changes=True
    if not changes:
      print(" The duplicated columns have the same values")
      print("")




They are not the same!! What shall we do?

In [None]:
check_duplicate(pd.read_csv(path+'mat_drug_protein.txt', sep=" "),proteins)

P09622 has duplicates in indexes [62, 422]
 The duplicated columns have the same values

Q8IVA8 has duplicates in indexes [214, 907, 913]
 These indexes 214 and 913 have different values in the columns
 These indexes 907 and 913 have different values in the columns
Q5JAM2 has duplicates in indexes [235, 908]
 The duplicated columns have the same values

Q5VZ30 has duplicates in indexes [341, 909]
 The duplicated columns have the same values

Q5T6L4 has duplicates in indexes [382, 454]
 The duplicated columns have the same values

O60882 has duplicates in indexes [617, 1196]
 The duplicated columns have the same values

A6NG28 has duplicates in indexes [671, 1332]
 The duplicated columns have the same values

P07949 has duplicates in indexes [674, 1022]
 The duplicated columns have the same values

P01024 has duplicates in indexes [792, 1126]
 The duplicated columns have the same values

P30419 has duplicates in indexes [796, 1369]
 The duplicated columns have the same values

P49247 ha

In case we have the same column, with the same values duplicated we will keep just one column. If there are divergences we will delete all the columns related to the protein. 
l is the index of the column to delete. We end of having 1491 proteins for our study.

In [None]:
l = [422,214,907,913,908,909,454,1196,1332,1022,1126,1369,802,811,1101,1387,1490,1095,1227,1222,1312]
s = [ str(i) for i in l]
unique_proteins=[proteins[i] for i in range(len(proteins)) if i not in l]

In [None]:
len(unique_proteins)

1491

In [None]:
df_drug_protein = pd.read_csv(path+'mat_drug_protein.txt', sep=" ", names=[str(i) for i in range(len(proteins))]).drop(s, axis=1)
df_drug_protein.columns = unique_proteins
df_drug_protein.index = drugs
df_drug_protein.head()

Unnamed: 0,Q9UI32,P00488,P35228,P06737,P11766,P50213,P30542,P00519,P12319,P00451,...,O15111,Q99835,P02708,P11230,Q07001,P07510,Q04844,P31327,P0C0L4,P0C0L5
DB00050,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DB00152,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DB00162,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DB00175,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DB00176,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We've got 1920 drug-protein interactions.

In [None]:
drug_protein_interactions= np.sum(df_drug_protein.to_numpy())
print(drug_protein_interactions)

1920


Computing sparsity:

In [None]:
(1 - drug_protein_interactions/(len(drugs)*len(unique_proteins)))*100

99.81811774602417

# PROTEIN-DISEASE
Reading Protein-disease data (and deleting the rows associated twice to a protein):

In [None]:
df_protein_disease = pd.read_csv(path+'mat_protein_disease.txt', sep=" ", names=diseases)
df_protein_disease.drop(l, inplace=True)
df_protein_disease.index = unique_proteins
df_protein_disease.head()

Unnamed: 0,depressive disorder,drug-induced liver injury,mercury poisoning,necrosis,neoplasms,"anemia, hemolytic",attention deficit and disruptive behavior disorders,autistic disorder,cognition disorders,cystitis,...,holoprosencephaly 5,"heterotaxy, visceral, x-linked",acro-osteolysis,mandibuloacral dysplasia with type b lipodystrophy,"nephrolithiasis, uric acid, susceptibility to",ehlers-danlos syndrome 6b,retinitis pigmentosa 58,"spinocerebellar ataxia, autosomal recessive 5",seborrhea-like dermatitis with psoriasiform elements,"mental retardation, x-linked 45"
Q9UI32,1,1,0,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
P00488,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
P35228,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
P06737,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
P11766,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


There are no null:

In [None]:
df_protein_disease.isnull().values.any()

False

We've got 1574445 interactions

In [None]:
protein_disease_interactions= np.sum(df_protein_disease.to_numpy())
print(protein_disease_interactions)


1574445


Checking sparsity

In [None]:
(1 - protein_disease_interactions/(len(diseases)*len(unique_proteins)))*100

81.15356425542367

# Saving the data

In [None]:
saving_path="gdrive/MyDrive/postgraduate_program/project/AIDL_recommended_systems/data/cleaned_data/"

In [None]:
df_protein_disease.to_csv(saving_path+'df_protein_disease.csv', header=diseases, index=unique_proteins, sep='\t')
df_drug_protein.to_csv(saving_path+'df_drug_protein.csv', header=unique_proteins, index=drugs, sep='\t')
df_drug_disease.to_csv(saving_path+'df_drug_disease.csv', header=diseases, index=drugs, sep='\t')
