# Data preparation and exploration

Library import

In [25]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import networkx as nx
from scipy import sparse
from collections import defaultdict
from google.colab import drive

Mounting the drive folder

In [26]:
drive.mount('/content/gdrive')
path = 'gdrive/MyDrive/postgraduate_program/project/data/'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Reading the names for drugs, proteins and diseases

In [27]:
# One name per line in each file; the position of a name in its list is the row/column
# position it labels in the interaction matrices loaded further down.
# NOTE(review): recent pandas releases appear to reject a newline separator - confirm
# against the pandas version in use before re-running.
drugs, proteins, diseases = (
    list(pd.read_csv(path + file_name, sep="\n", header=None)[0])
    for file_name in ('drug.txt', 'protein.txt', 'disease.txt')
)

We've got 708 drugs, 1512 proteins and 5603 diseases

In [28]:
print("Drugs:"+str(len(drugs)))
print("Proteins:"+str(len(proteins)))
print("Diseases:"+str(len(diseases)))

Drugs:708
Proteins:1512
Diseases:5603


Checking for duplicates:

In [29]:
print("Drugs:"+str(len(drugs)-len(set(drugs))))
print("Proteins:"+str(len(proteins)-len(set(proteins))))
print("Diseases:"+str(len(diseases)-len(set(diseases))))

Drugs:0
Proteins:19
Diseases:0


When dealing with protein data we will have to check on the duplicates.

## DRUG-DISEASE
Reading drug-disease data:

In [30]:
df_drug_disease = pd.read_csv(path+'mat_drug_disease.txt', sep=" ", names=diseases)
df_drug_disease.index = drugs
df_drug_disease.head()

Unnamed: 0,depressive disorder,drug-induced liver injury,mercury poisoning,necrosis,neoplasms,"anemia, hemolytic",attention deficit and disruptive behavior disorders,autistic disorder,cognition disorders,cystitis,heavy metal toxicity,hemolysis,hypertension,lead poisoning,learning disorders,lung injury,"micronuclei, chromosome-defective",pneumonia,anxiety disorders,ataxia,autoimmune diseases,brain diseases,brain edema,brain injuries,cardiomyopathies,craniofacial abnormalities,death,"disease models, animal",drug toxicity,epilepsy,"epilepsy, absence",fetal growth retardation,hyperkinesis,"hypersensitivity, delayed",kidney diseases,"lead poisoning, nervous system",memory disorders,movement disorders,myoclonus,nerve degeneration,...,"neuropathy, hereditary sensory and autonomic, type iia",odontoonychodermal dysplasia,schopf-schulz-passarge syndrome,split-hand/foot malformation 6,tetra-amelia autosomal recessive,"46,xx sex reversal with dysgenesis of kidneys, adrenals, and lungs",mullerian aplasia and hyperandrogenism,rokitansky kuster hauser syndrome,al awadi syndrome,"fibular aplasia or hypoplasia, femoral bowing and poly-, syn-, and oligodactyly",denys-drash syndrome,frasier syndrome,meacham winn culler syndrome,"nephrotic syndrome, type 4",wilson-turner x-linked mental retardation syndrome,major affective disorder 7,"xanthinuria, type i","xeroderma pigmentosum, complementation group c",nephronophthisis-like nephropathy 1,"charcot-marie-tooth disease, dominant intermediate c",t cell immunodeficiency primary,"skeletal defects, genital hypoplasia, and mental retardation","mental retardation, x-linked 91","mental retardation, x-linked, syndromic, raymond type","corneal dystrophy, fuchs endothelial, 6","corneal dystrophy, posterior polymorphous, 3",mowat-wilson syndrome,diaphragmatic hernia 3,"spastic paraplegia 15, autosomal recessive","spastic paraplegia 33, autosomal dominant",holoprosencephaly 5,"heterotaxy, visceral, x-linked",acro-osteolysis,mandibuloacral dysplasia 
with type b lipodystrophy,"nephrolithiasis, uric acid, susceptibility to",ehlers-danlos syndrome 6b,retinitis pigmentosa 58,"spinocerebellar ataxia, autosomal recessive 5",seborrhea-like dermatitis with psoriasiform elements,"mental retardation, x-linked 45"
DB00050,1,1,1,1,1,0,0,1,1,0,0,0,1,0,1,1,0,1,0,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
DB00152,1,1,0,0,1,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
DB00162,1,1,0,1,1,1,0,1,1,1,0,0,1,0,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
DB00175,1,1,0,1,1,1,0,1,1,0,0,0,1,1,1,1,0,1,1,0,1,1,1,1,1,1,0,1,1,1,0,0,0,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
DB00176,1,1,0,1,0,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,1,0,1,0,0,0,1,1,0,0,1,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


We've got 199214 drug-disease interactions

In [31]:
drug_disease_interactions=np.sum(df_drug_disease.to_numpy())
print(drug_disease_interactions)

199214


There are no null values in the data:

In [32]:
df_drug_disease.isnull().values.any()

False

Computing sparsity:

In [33]:
(1 - drug_disease_interactions/(len(drugs)*len(diseases)))*100

94.97812410825112

## DRUG-PROTEIN

There are no null values.

In [34]:
# header=None: the matrix file has no header line (it is later loaded with explicit names and
# one row per drug), so without it pandas would consume the first data row as column labels
# and that row would escape the null check.
pd.read_csv(path+'mat_drug_protein.txt', sep=" ", header=None).isnull().values.any()

False

We remember that we have to be careful with the duplicates:

In [35]:
print("Proteins:"+str(len(proteins)-len(set(proteins))))

Proteins:19


The protein list is not unique:

In [36]:
print(len(proteins))
print(len(set(proteins)))

1512
1493


Let's check whether the duplication is also present in the data:

In [37]:
len(pd.read_csv(path+'mat_drug_protein.txt', sep=" ").columns)

1512

As we can see, there are going to be duplicated columns for some proteins. We expect the duplicates to hold the same values. The next function reports, for each duplicated protein, the column indices at which it appears.

In [38]:
def list_duplicates(seq):
    """Locate the repeated items of `seq`.

    Returns a generator of ``(item, positions)`` pairs, one for every item that
    occurs more than once, where ``positions`` lists the indices at which the
    item appears, in increasing order. Pairs come out in order of first occurrence.
    """
    positions_by_item = {}
    for position, value in enumerate(seq):
        positions_by_item.setdefault(value, []).append(position)
    # Keep only the items that were seen at least twice.
    return (entry for entry in positions_by_item.items() if len(entry[1]) > 1)

for dup in sorted(list_duplicates(proteins)):
    print(dup)

('A6NG28', [671, 1332])
('O60882', [617, 1196])
('P01024', [792, 1126])
('P07949', [674, 1022])
('P09622', [62, 422])
('P13051', [848, 1227])
('P15289', [844, 1095])
('P17050', [822, 1490])
('P20941', [1102, 1312])
('P30419', [796, 1369])
('P49247', [802, 811, 1101, 1387])
('Q5JAM2', [235, 908])
('Q5T6L4', [382, 454])
('Q5VZ30', [341, 909])
('Q8IVA8', [214, 907, 913])
('Q9UNI1', [1046, 1222])


Function for checking duplicates:

In [39]:
def check_duplicate(data,columns):
  """Report whether the columns of `data` that share a name in `columns` hold identical values.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame whose columns are positionally aligned with `columns`.
  columns : sequence
      Column names, possibly containing repeats.

  For every repeated name, prints a header line with its positions, then either one
  line per pair of positions whose columns differ, or a confirmation (followed by a
  blank line) that all copies match. Returns None.
  """
  for name, positions in list_duplicates(columns):
    print(name +" has duplicates in indexes "+str(positions))
    changes = False
    for j in range(len(positions)):
      for k in range(j+1,len(positions)):
        # Compare the columns at the duplicated positions themselves; indexing the
        # frame with the loop counters j/k would always compare columns 0, 1, 2, ...
        first, second = positions[j], positions[k]
        if not (data.iloc[:,first].equals(data.iloc[:,second])):
          print(" These indexes "+ str(first)+" and " + str(second)+" have different values in the columns")
          changes=True
    if not changes:
      print(" The duplicated columns have the same values")
      print("")




They are not the same!! What shall we do?

In [40]:
# header=None keeps the first data row in the frame (the file has no header line),
# so the column comparison covers every drug.
check_duplicate(pd.read_csv(path+'mat_drug_protein.txt', sep=" ", header=None),proteins)

P09622 has duplicates in indexes [62, 422]
 The duplicated columns have the same values

Q8IVA8 has duplicates in indexes [214, 907, 913]
 These indexes 214 and 913 have different values in the columns
 These indexes 907 and 913 have different values in the columns
Q5JAM2 has duplicates in indexes [235, 908]
 The duplicated columns have the same values

Q5VZ30 has duplicates in indexes [341, 909]
 The duplicated columns have the same values

Q5T6L4 has duplicates in indexes [382, 454]
 The duplicated columns have the same values

O60882 has duplicates in indexes [617, 1196]
 The duplicated columns have the same values

A6NG28 has duplicates in indexes [671, 1332]
 The duplicated columns have the same values

P07949 has duplicates in indexes [674, 1022]
 The duplicated columns have the same values

P01024 has duplicates in indexes [792, 1126]
 The duplicated columns have the same values

P30419 has duplicates in indexes [796, 1369]
 The duplicated columns have the same values

P49247 ha

If a duplicated column holds exactly the same values as its copy, we keep just one of the two columns. If the copies diverge, we delete all the columns related to that protein. 
`l` holds the indices of the columns to delete. We end up with 1491 proteins for our study.

In [41]:
# Positional indices of the protein columns to discard (chosen from the duplicate check above).
l = [422,214,907,913,908,909,454,1196,1332,1022,1126,1369,802,811,1101,1387,1490,1095,1227,1222,1312]
# The same indices as strings, matching the string column labels used when the matrices are loaded.
s = list(map(str, l))
# Protein names that survive, in their original order.
unique_proteins = [name for position, name in enumerate(proteins) if position not in l]

In [42]:
len(unique_proteins)

1491

In [43]:
# Load with positional string labels so the discarded protein columns can be dropped by
# position, then relabel the remaining columns with the surviving protein names.
df_drug_protein = pd.read_csv(
    path+'mat_drug_protein.txt',
    sep=" ",
    names=list(map(str, range(len(proteins)))),
).drop(columns=s)
df_drug_protein.columns = unique_proteins
df_drug_protein.index = drugs
df_drug_protein.head()

Unnamed: 0,Q9UI32,P00488,P35228,P06737,P11766,P50213,P30542,P00519,P12319,P00451,P23219,P35626,P21728,P35916,P51168,P02452,Q9H4B7,P56181,P17948,Q9UPY5,P06213,P04049,Q9Y285,Q9Y234,P12259,P24530,P20309,P30613,P00734,P30273,P13716,P48167,P21554,P30556,P34995,P10515,P11836,P29475,P07195,Q07869,...,Q92993,O60493,P55789,O60264,Q92830,Q16613,O15066,P78330,Q9BZZ2,Q01082,Q15382,Q99873,P61587,P36222,O75676,Q53H96,P61221,P40937,P60953,Q99661,Q96QT4,O14964,Q96KC2,O00187,Q9UL51,Q9UL54,P41091,Q96GD3,P36405,P30041,O15111,Q99835,P02708,P11230,Q07001,P07510,Q04844,P31327,P0C0L4,P0C0L5
DB00050,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
DB00152,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
DB00162,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
DB00175,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
DB00176,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


We've got 1920 drug-protein interactions.

In [44]:
drug_protein_interactions= np.sum(df_drug_protein.to_numpy())
print(drug_protein_interactions)

1920


Computing sparsity:

In [45]:
(1 - drug_protein_interactions/(len(drugs)*len(unique_proteins)))*100

99.81811774602417

## PROTEIN-DISEASE
Reading protein-disease data (and deleting the rows of the duplicated proteins selected above):

In [66]:
# Proteins are the rows here, so the discarded entries are removed by row label
# (the default integer index) before the surviving protein names are attached.
df_protein_disease = pd.read_csv(path+'mat_protein_disease.txt', sep=" ", names=diseases).drop(index=l)
df_protein_disease.index = unique_proteins
df_protein_disease.head()

Unnamed: 0,depressive disorder,drug-induced liver injury,mercury poisoning,necrosis,neoplasms,"anemia, hemolytic",attention deficit and disruptive behavior disorders,autistic disorder,cognition disorders,cystitis,heavy metal toxicity,hemolysis,hypertension,lead poisoning,learning disorders,lung injury,"micronuclei, chromosome-defective",pneumonia,anxiety disorders,ataxia,autoimmune diseases,brain diseases,brain edema,brain injuries,cardiomyopathies,craniofacial abnormalities,death,"disease models, animal",drug toxicity,epilepsy,"epilepsy, absence",fetal growth retardation,hyperkinesis,"hypersensitivity, delayed",kidney diseases,"lead poisoning, nervous system",memory disorders,movement disorders,myoclonus,nerve degeneration,...,"neuropathy, hereditary sensory and autonomic, type iia",odontoonychodermal dysplasia,schopf-schulz-passarge syndrome,split-hand/foot malformation 6,tetra-amelia autosomal recessive,"46,xx sex reversal with dysgenesis of kidneys, adrenals, and lungs",mullerian aplasia and hyperandrogenism,rokitansky kuster hauser syndrome,al awadi syndrome,"fibular aplasia or hypoplasia, femoral bowing and poly-, syn-, and oligodactyly",denys-drash syndrome,frasier syndrome,meacham winn culler syndrome,"nephrotic syndrome, type 4",wilson-turner x-linked mental retardation syndrome,major affective disorder 7,"xanthinuria, type i","xeroderma pigmentosum, complementation group c",nephronophthisis-like nephropathy 1,"charcot-marie-tooth disease, dominant intermediate c",t cell immunodeficiency primary,"skeletal defects, genital hypoplasia, and mental retardation","mental retardation, x-linked 91","mental retardation, x-linked, syndromic, raymond type","corneal dystrophy, fuchs endothelial, 6","corneal dystrophy, posterior polymorphous, 3",mowat-wilson syndrome,diaphragmatic hernia 3,"spastic paraplegia 15, autosomal recessive","spastic paraplegia 33, autosomal dominant",holoprosencephaly 5,"heterotaxy, visceral, x-linked",acro-osteolysis,mandibuloacral dysplasia 
with type b lipodystrophy,"nephrolithiasis, uric acid, susceptibility to",ehlers-danlos syndrome 6b,retinitis pigmentosa 58,"spinocerebellar ataxia, autosomal recessive 5",seborrhea-like dermatitis with psoriasiform elements,"mental retardation, x-linked 45"
Q9UI32,1,1,0,1,1,1,0,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
P00488,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
P35228,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
P06737,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
P11766,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


There are no null values:

In [65]:
df_protein_disease.isnull().values.any()

False

We've got 1574445 interactions

In [67]:
protein_disease_interactions= np.sum(df_protein_disease.to_numpy())
print(protein_disease_interactions)


1574445


Checking sparsity

In [68]:
(1 - protein_disease_interactions/(len(diseases)*len(unique_proteins)))*100

81.15356425542367

In [None]:
df_drug_drug = pd.read_csv(path+'mat_drug_drug.txt', sep=" ", names=drugs)
df_drug_drug.index = drugs
df_drug_drug.head()

In [None]:
# Sum the whole matrix: drop_duplicates() removes rows with identical interaction
# profiles, which would silently discard the interactions those rows contain.
np.sum(df_drug_drug.to_numpy())

In [None]:
# Sparsity (%) computed from the loaded matrix instead of hard-coded totals,
# mirroring the sparsity cells of the previous sections.
(1 - np.sum(df_drug_drug.to_numpy())/(len(drugs)*len(drugs)))*100

In [None]:
adjacency_matrix = sparse.csr_matrix(df_drug_drug)
# networkx 3.0 removed from_scipy_sparse_matrix in favour of from_scipy_sparse_array;
# use the replacement when present and fall back to the old name on older releases.
build_graph = getattr(nx, 'from_scipy_sparse_array', None) or nx.from_scipy_sparse_matrix
G2 = build_graph(adjacency_matrix)

nx.draw( G2,nx.spring_layout(G2),node_size=1, node_color='blue',width=1)
plt.axis()

In [None]:
df_protein_protein = pd.read_csv(path+'mat_protein_protein.txt', sep=" ", names=[str(i) for i in range(len(proteins))]).drop(s, axis=1)
len(df_protein_protein.columns)

In [None]:
len(df_protein_protein.index)

In [None]:

# `only_prot` is never defined in the cells shown; the deduplicated protein
# name list built earlier in this notebook is `unique_proteins`.
df_protein_protein.columns = unique_proteins
# df_protein_protein=df_protein_protein.drop_duplicates()


In [None]:
# Remove the rows of the discarded proteins (row labels are still the default integer index here).
df_protein_protein = df_protein_protein.drop(index=l)


In [None]:
# `only_prot` is never defined in the cells shown; the deduplicated protein
# name list built earlier in this notebook is `unique_proteins`.
df_protein_protein.index = unique_proteins

In [None]:
len(df_protein_protein.index)

In [None]:
df_protein_protein.head(20)

In [None]:
# Sum the whole matrix: drop_duplicates() removes rows with identical interaction
# profiles, which would silently discard the interactions those rows contain.
np.sum(df_protein_protein.to_numpy())

In [None]:
# Sparsity (%) computed from the loaded matrix instead of hard-coded totals,
# mirroring the sparsity cells of the previous sections.
(1 - np.sum(df_protein_protein.to_numpy())/(len(unique_proteins)*len(unique_proteins)))*100

In [None]:
adjacency_matrix = sparse.csr_matrix(df_protein_protein)
# networkx 3.0 removed from_scipy_sparse_matrix in favour of from_scipy_sparse_array;
# use the replacement when present and fall back to the old name on older releases.
build_graph = getattr(nx, 'from_scipy_sparse_array', None) or nx.from_scipy_sparse_matrix
G2 = build_graph(adjacency_matrix)

nx.draw( G2,nx.spring_layout(G2),node_size=1, node_color='blue',width=1)
plt.axis()

Drug similarity matrix

In [None]:
# Missing entries count as zero similarity; any strictly positive similarity becomes an edge.
a = pd.read_csv(path+'Similarity_Matrix_Drugs.txt', sep=" ", names=drugs).fillna(0).to_numpy()
adj = (a > 0).astype(int)

In [None]:
adj

In [None]:

adjacency_matrix = sparse.csr_matrix(adj)

In [None]:

# networkx 3.0 removed from_scipy_sparse_matrix in favour of from_scipy_sparse_array;
# use the replacement when present and fall back to the old name on older releases.
build_graph = getattr(nx, 'from_scipy_sparse_array', None) or nx.from_scipy_sparse_matrix
G2 = build_graph(adjacency_matrix)


In [None]:

nx.draw( G2,nx.spring_layout(G2),node_size=1, node_color='blue',width=0.01)
plt.axis()

In [None]:
nx.draw( G2,nx.random_layout(G2),node_size=1, node_color='blue',width=0.01)
plt.axis()

In [None]:
# The protein similarity file must not be labelled with the drug names. Use positional
# labels, as the other protein matrices in this notebook do (`proteins` itself contains repeats).
# Assumes one column per entry of `proteins` - TODO confirm against the file.
a = pd.read_csv(path+'Similarity_Matrix_Proteins.txt', sep=" ", names=[str(i) for i in range(len(proteins))]).replace(np.nan,0).to_numpy()
# Any strictly positive similarity becomes an edge.
adj = np.where(a > 0 , 1, 0)
adjacency_matrix = sparse.csr_matrix(adj)
# networkx 3.0 removed from_scipy_sparse_matrix in favour of from_scipy_sparse_array;
# use the replacement when present and fall back to the old name on older releases.
build_graph = getattr(nx, 'from_scipy_sparse_array', None) or nx.from_scipy_sparse_matrix
G2 = build_graph(adjacency_matrix)

In [None]:
nx.draw(G2,node_size=10, node_color='lightblue', linewidths=1)
plt.axis()

In [None]:
se = list(pd.read_csv(path+'se.txt', sep="\n", header=None)[0])
df_drug_se = pd.read_csv(path+'mat_drug_se.txt', sep=" ", names=se)
df_drug_se.index = drugs
df_drug_se.head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Transpose so that each side effect is a sample and each drug a feature.
df = df_drug_se.T
# Standardise every feature, then project onto the first two principal components.
x = StandardScaler().fit_transform(df.values)
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)
principalDf.head()

In [None]:
principalDf.plot.scatter(x='principal component 1',  y='principal component 2')

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Transpose so that each disease is a sample and each drug a feature.
df = df_drug_disease.T
# Standardise every feature, then project onto the first two principal components.
x = StandardScaler().fit_transform(df.values)
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)
principalDf.head()

In [None]:
principalDf.plot.scatter(x='principal component 1',  y='principal component 2')


In [None]:
pca.explained_variance_ratio_

In [None]:
from sklearn.decomposition import PCA

# Transpose so that each disease is a sample and each protein a feature.
df = df_protein_disease.T
# Standardise every feature, then project onto the first two principal components.
x = StandardScaler().fit_transform(df.values)
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)
principalDf.head()

In [None]:
principalDf.plot.scatter(x='principal component 1',  y='principal component 2')


In [None]:
pca.explained_variance_ratio_

In [None]:

from sklearn.decomposition import PCA

# Each drug is a sample and each protein a feature.
# NOTE(review): unlike the other PCA cells, this frame is not transposed - confirm that is intended.
df = df_drug_protein
# Standardise every feature, then project onto the first two principal components.
x = StandardScaler().fit_transform(df.values)
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)
principalDf.head()

In [None]:
principalDf.plot.scatter(x='principal component 1',  y='principal component 2')


In [None]:
pca.explained_variance_ratio_