<a href="https://colab.research.google.com/github/unique-subedi/gene-expression/blob/main/Vinod_gene_expression_master.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import datetime
import time
import math
import numpy as np
from numpy import linalg as LA
import pandas as pd
import urllib.request
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.distributions.empirical_distribution import ECDF

In [2]:
!pip install qnorm
import qnorm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting qnorm
  Downloading qnorm-0.8.1-py3-none-any.whl (15 kB)
Installing collected packages: qnorm
Successfully installed qnorm-0.8.1


In [23]:
!pip install pyreadr
import pyreadr
urllib.request.urlretrieve("https://raw.githubusercontent.com/unique-subedi/gene-expression/main/data/brain.rda", "brain.rda")
brain = pyreadr.read_r("brain.rda")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
# Pull the three objects stored in brain.rda out of the loaded mapping, each wrapped
# in a DataFrame: expression values, gene annotations and sample annotations.
expression, genes, samples = (
    pd.DataFrame(brain[table_name])
    for table_name in ("expression", "genes", "samples")
)

In [33]:
# Brain-region labels, spelled exactly as they occur in samples.region.
ACC, CER, DLPFC = (
    "A.C. cortex",
    "cerebellum",
    "D.L.P.F. cortex",
)


# Compute Bacterial Genes

In [25]:
# Select the "bacterial" control probes: probe ids that contain "AFFX" but neither
# "HUM" nor "hum", and that have neither a chromosome nor a gene symbol annotated.

# Blank the chrom annotation of this one probe first so the null filter below keeps it
# (presumably it carries a non-null chrom value in the raw table; verify).
genes.at["AFFX-BioDn-3_at", "chrom"] = float("Nan")

is_control_probe = [
    ("AFFX" in probe_id) and ("HUM" not in probe_id) and ("hum" not in probe_id)
    for probe_id in genes.index
]
genes_control = genes[is_control_probe]

unannotated = genes_control["chrom"].isnull() & genes_control["sym"].isnull()
genes_bac_index = genes_control[unannotated].index
print(genes_bac_index)


Index(['AFFX-BioB-3_at', 'AFFX-BioB-3_st', 'AFFX-BioB-5_at', 'AFFX-BioB-5_st',
       'AFFX-BioB-M_at', 'AFFX-BioB-M_st', 'AFFX-BioC-3_at', 'AFFX-BioC-3_st',
       'AFFX-BioC-5_at', 'AFFX-BioC-5_st', 'AFFX-BioDn-3_at',
       'AFFX-BioDn-3_st', 'AFFX-BioDn-5_at', 'AFFX-BioDn-5_st',
       'AFFX-CreX-3_at', 'AFFX-CreX-3_st', 'AFFX-CreX-5_at', 'AFFX-CreX-5_st',
       'AFFX-DapX-3_at', 'AFFX-DapX-5_at', 'AFFX-DapX-M_at', 'AFFX-LysX-3_at',
       'AFFX-LysX-5_at', 'AFFX-LysX-M_at', 'AFFX-M27830_3_at',
       'AFFX-M27830_5_at', 'AFFX-M27830_M_at', 'AFFX-MurFAS_at',
       'AFFX-MurIL10_at', 'AFFX-MurIL2_at', 'AFFX-MurIL4_at', 'AFFX-PheX-3_at',
       'AFFX-PheX-5_at', 'AFFX-PheX-M_at', 'AFFX-ThrX-3_at', 'AFFX-ThrX-5_at',
       'AFFX-ThrX-M_at', 'AFFX-TrpnX-3_at', 'AFFX-TrpnX-5_at',
       'AFFX-TrpnX-M_at', 'AFFX-YEL002c/WBP1_at', 'AFFX-YEL018w/_at',
       'AFFX-YEL021w/URA3_at', 'AFFX-YEL024w/RIP1_at'],
      dtype='object', name='rownames')


# Normalize Data

In [50]:
# Probe ids annotated to chromosome Y (not used by the rest of this cell).
genes_Y_crom = genes.loc[genes.chrom == "Y"].index

# Row-wise (per-sample) mean and standard deviation over the bacterial control probes.
bac_expression = expression[genes_bac_index]
mean_bac_exp = bac_expression.mean(axis=1)
std_bac_exp = bac_expression.std(axis=1)

# Variant 1: centre and scale every row by its own control-probe mean / std.
expression_norm = expression.sub(mean_bac_exp, axis=0).div(std_bac_exp, axis=0)

# Variant 2: quantile normalization through the qnorm package.
expression_qnorm = qnorm.quantile_normalize(expression, axis=0)

# Variant 3: replace each value by the empirical CDF of its own row evaluated at it.
# Works on a copy, so `expression` itself is left untouched here.
expression_ecdf = expression.copy()
for row_pos in range(expression.shape[0]):
  row_values = expression_ecdf.iloc[row_pos, :].values
  expression_ecdf.iloc[row_pos, :] = ECDF(row_values)(row_values)

In [58]:
def compute_top_genes_tissue(data, tissue, n_top=20):
  """Rank probes by the male-vs-female KS p-value within one brain region.

  For every column of `data`, a two-sample Kolmogorov-Smirnov test compares the
  values of the male samples with those of the female samples of `tissue`; the
  `n_top` probes with the smallest p-values are reported.

  Args:
    data: DataFrame of expression values whose rows are labelled like the global
      `samples` table and whose columns are probe ids present in the global
      `genes` table.
    tissue: value of `samples.region` selecting the region to analyse.
    n_top: number of best-ranked probes to report. Defaults to 20, the value that
      used to be hard-coded.

  Returns:
    Tuple `(op_chrom, op_genes, top_genes, count)`: the `chrom` and `sym`
    annotations of the reported probes, the probe ids themselves in ascending
    p-value order, and the number of reported probes whose `chrom` is "X", "Y"
    or "X Y".

  Raises:
    ValueError: if the region has no male or no female samples.
  """
  sex_chromosomes = ("X", "Y", "X Y")

  # Row labels of each group come straight from the sample annotations.
  in_tissue = samples.region == tissue
  male_idx = samples.index[in_tissue & (samples.sex == "male")]
  female_idx = samples.index[in_tissue & (samples.sex == "female")]
  if len(male_idx) == 0 or len(female_idx) == 0:
    raise ValueError(f"region {tissue!r} needs at least one male and one female sample")

  # Slice the two groups once; the per-gene loop then only picks columns.
  male_expr = data.loc[male_idx]
  female_expr = data.loc[female_idx]

  p_values = [
    (stats.ks_2samp(male_expr[gene], female_expr[gene]).pvalue, gene)
    for gene in data.columns
  ]

  # sorted() is stable, so probes with tied p-values keep their column order.
  sorted_pval = sorted(p_values, key=lambda pair: pair[0])
  top_genes = [gene for _, gene in sorted_pval[:n_top]]

  top_annotations = genes.loc[top_genes]
  op_chrom = top_annotations['chrom'].values
  op_genes = top_annotations['sym'].values
  # Missing chromosome annotations (NaN) never match and are simply not counted.
  count = sum(chrom in sex_chromosomes for chrom in op_chrom)
  return op_chrom, op_genes, top_genes, count

In [60]:
# KS screen on the quantile-normalized data for the A.C. cortex region. Despite its
# name, `top10genes` receives the function's full list (20 probes by default).
op_chrom, op_genes, top10genes, cnt = compute_top_genes_tissue(
    data=expression_qnorm,
    tissue=ACC,
)
print(cnt, op_chrom, op_genes, top10genes)

8 ['Y', 'Y', 'Y', 'Y', 'X', ..., '21', 'Y', '10', NaN, '12']
Length: 20
Categories (39, object): ['1', '10', '10ak*', '10ptp*', ..., 'X', 'X Y', 'Y', 'na'] ['UTY', 'KDM5D', 'DDX3Y', 'RPS4Y1', 'XIST', ..., 'SIM2', 'EIF1AY', 'IFIT5', NaN, 'MSI1']
Length: 20
Categories (8783, object): ['AADAC', 'AAK1', 'AAMP', 'AANAT', ..., 'ZYX', 'ZZEF1', 'ZZZ3', 'psiTPTE22'] ['34477_at', '37583_at', '38355_at', '41214_at', '38446_at', '40382_at', '35885_at', '36413_at', '33956_at', '35476_at', '34420_at', '841_at', '34207_at', '36228_at', '37309_at', '39608_at', '40097_at', '1046_at', '31331_at', '31395_i_at']


In [8]:
# Per-region KS screen on the quantile-normalized data: for every brain region, rank
# all probes by the two-sample KS p-value between male and female samples, then print
# how many of the N_TOP best probes sit on a sex chromosome, followed by their
# chromosome and symbol annotations.
SEX_CHROMS = ("X", "Y", "X Y")
N_TOP = 10
express_norm_samp = pd.concat([expression_qnorm, samples], axis=1)

for tissue in express_norm_samp["region"].unique():
  print(tissue)
  in_tissue = express_norm_samp.region == tissue
  df_male = express_norm_samp[in_tissue & (express_norm_samp.sex == "male")]
  df_female = express_norm_samp[in_tissue & (express_norm_samp.sex == "female")]

  # Slice each group's rows once per region instead of once per gene.
  male_expr = expression_qnorm.loc[df_male.index]
  female_expr = expression_qnorm.loc[df_female.index]

  p_values = [
    (stats.ks_2samp(male_expr[gene], female_expr[gene]).pvalue, gene)
    for gene in expression_qnorm.columns
  ]

  # sorted() is stable, so probes with tied p-values keep their column order.
  sorted_pval = sorted(p_values, key=lambda tup: tup[0])
  top10genes = [gene for _, gene in sorted_pval[:N_TOP]]

  top_annotations = genes.loc[top10genes]
  op_chrom = top_annotations['chrom'].values
  op_genes = top_annotations['sym'].values
  # Missing chromosome annotations (NaN) never match and are not counted.
  count = sum(chrom in SEX_CHROMS for chrom in op_chrom)
  print(count)
  print(op_chrom, op_genes)
  print("")





A.C. cortex
7
['Y', 'Y', 'Y', 'X', 'Y', 'Y', NaN, 'Y', '1', '12']
Categories (39, object): ['1', '10', '10ak*', '10ptp*', ..., 'X', 'X Y', 'Y', 'na'] ['DDX3Y', 'RPS4Y1', 'UTY', 'XIST', 'USP9Y', 'KDM5D', NaN, 'EIF1AY', 'KCNH1', 'SNRNP35']
Categories (8783, object): ['AADAC', 'AAK1', 'AAMP', 'AANAT', ..., 'ZYX', 'ZZEF1', 'ZZZ3', 'psiTPTE22']

cerebellum
7
['Y', 'Y', 'Y', 'X', 'Y', '6', 'Y', 'Y', '20', '10']
Categories (39, object): ['1', '10', '10ak*', '10ptp*', ..., 'X', 'X Y', 'Y', 'na'] ['DDX3Y', 'RPS4Y1', 'KDM5D', 'XIST', 'UTY', 'CD24', 'TTTY15', 'USP9Y', 'HNF4A', 'DUSP5']
Categories (8783, object): ['AADAC', 'AAK1', 'AAMP', 'AANAT', ..., 'ZYX', 'ZZEF1', 'ZZZ3', 'psiTPTE22']

D.L.P.F. cortex
8
['Y', 'Y', 'Y', 'X', 'Y', 'Y', '10', '3', 'Y', 'X']
Categories (39, object): ['1', '10', '10ak*', '10ptp*', ..., 'X', 'X Y', 'Y', 'na'] ['DDX3Y', 'RPS4Y1', 'USP9Y', 'XIST', 'UTY', 'KDM5D', 'HNRNPF', 'PPP1R2', 'TTTY15', NaN]
Categories (8783, object): ['AADAC', 'AAK1', 'AAMP', 'AANAT', ..., 'ZYX

In [133]:
# Replace every row of `expression` by the empirical CDF of that row evaluated at its
# own values.
# NOTE(review): this overwrites `expression` in place, so every cell that reads
# `expression` afterwards (or is re-run afterwards) sees ECDF values instead of the
# loaded data. The normalization cell already builds the same transform on a copy as
# `expression_ecdf`; confirm whether the overwrite is intended.
for row_pos in range(expression.shape[0]):
  row_values = expression.iloc[row_pos, :].values
  expression.iloc[row_pos, :] = ECDF(row_values)(row_values)

In [142]:
# Per-region KS screen on the ECDF-transformed data: for every brain region, rank all
# probes by the two-sample KS p-value between male and female samples, then print how
# many of the N_TOP best probes sit on a sex chromosome, followed by their chromosome
# and symbol annotations.
#
# Reads `expression_ecdf` (built on a copy in the normalization cell) rather than
# `expression`: it holds the same row-wise ECDF values that the preceding cell writes
# into `expression`, and using it keeps this screen independent of that overwrite.
SEX_CHROMS = ("X", "Y", "X Y")
N_TOP = 10
express_norm_samp = pd.concat([expression_ecdf, samples], axis=1)

for tissue in express_norm_samp["region"].unique():
  print(tissue)
  in_tissue = express_norm_samp.region == tissue
  df_male = express_norm_samp[in_tissue & (express_norm_samp.sex == "male")]
  df_female = express_norm_samp[in_tissue & (express_norm_samp.sex == "female")]

  # Slice each group's rows once per region instead of once per gene.
  male_expr = expression_ecdf.loc[df_male.index]
  female_expr = expression_ecdf.loc[df_female.index]

  p_values = [
    (stats.ks_2samp(male_expr[gene], female_expr[gene]).pvalue, gene)
    for gene in expression_ecdf.columns
  ]

  # sorted() is stable, so probes with tied p-values keep their column order.
  sorted_pval = sorted(p_values, key=lambda tup: tup[0])
  top10genes = [gene for _, gene in sorted_pval[:N_TOP]]

  top_annotations = genes.loc[top10genes]
  op_chrom = top_annotations['chrom'].values
  op_genes = top_annotations['sym'].values
  # Missing chromosome annotations (NaN) never match and are not counted.
  count = sum(chrom in SEX_CHROMS for chrom in op_chrom)
  print(count)
  print(op_chrom, op_genes)
  print("")





A.C. cortex
7
['Y', 'Y', 'Y', 'Y', 'X', '8', 'Y', 'X', '8', '10']
Categories (39, object): ['1', '10', '10ak*', '10ptp*', ..., 'X', 'X Y', 'Y', 'na'] ['UTY', 'KDM5D', 'DDX3Y', 'RPS4Y1', 'XIST', 'CRH', 'USP9Y', 'P2RY10', 'LY96', 'SH3PXD2A']
Categories (8783, object): ['AADAC', 'AAK1', 'AAMP', 'AANAT', ..., 'ZYX', 'ZZEF1', 'ZZZ3', 'psiTPTE22']

cerebellum
8
['Y', 'Y', 'Y', 'X', 'Y', 'Y', 'Y', '6', 'X', '8']
Categories (39, object): ['1', '10', '10ak*', '10ptp*', ..., 'X', 'X Y', 'Y', 'na'] ['DDX3Y', 'RPS4Y1', 'KDM5D', 'XIST', 'TTTY15', 'UTY', 'USP9Y', 'CD24', 'PLXNB3', 'YWHAZ']
Categories (8783, object): ['AADAC', 'AAK1', 'AAMP', 'AANAT', ..., 'ZYX', 'ZZEF1', 'ZZZ3', 'psiTPTE22']

D.L.P.F. cortex
9
['Y', 'Y', 'Y', 'Y', 'Y', 'X', 'Y', 'Y', 'X', '9']
Categories (39, object): ['1', '10', '10ak*', '10ptp*', ..., 'X', 'X Y', 'Y', 'na'] ['KDM5D', 'DDX3Y', 'RPS4Y1', 'NLGN4Y', 'TTTY15', 'XIST', 'UTY', 'USP9Y', NaN, 'NR4A3']
Categories (8783, object): ['AADAC', 'AAK1', 'AAMP', 'AANAT', ..., 'ZYX'

# OLS

expression

In [None]:
# One OLS model per annotated gene:
#   expression[gene] ~ const + one-hot(sample covariates)
# The covariates are every column of `samples` except "patient". The dropped dummy
# columns (sex_female, chip.version_v2, region_cerebellum, lab_Davis) form the
# reference group, so "sex_male" is the male-vs-female shift given the other covariates.
# A gene is kept when the (1 - SEX_CI_ALPHA) confidence interval of its "sex_male"
# coefficient excludes zero.
# NOTE(review): an earlier cell overwrites `expression` in place with row-wise ECDF
# values; confirm which version of the data this model is meant to see.
SEX_CI_ALPHA = 0.001

# Genes with both a chromosome and a symbol annotation.
human_genes = genes[genes.chrom.notnull() & genes.sym.notnull()].index

# The design matrix depends only on `samples`, so build it once rather than per gene.
# dtype=float: recent pandas versions emit boolean dummies, which together with the
# float constant column become an object array that statsmodels rejects.
samples_tmp = samples.drop(columns=["patient"])
one_hot_encoding = pd.get_dummies(samples_tmp, dtype=float)
one_hot_encoding = one_hot_encoding.drop(columns=["sex_female", 'chip.version_v2', "region_cerebellum", "lab_Davis"])
X = sm.add_constant(one_hot_encoding)

# Entries: (probe id, |sex_male coefficient|, CI lower bound, CI upper bound).
coef_gene = []
for gene in human_genes:
  model = sm.OLS(expression[gene], X)
  results = model.fit()
  # Compute the interval once and read both bounds from the same row.
  lcb, ucb = results.conf_int(alpha=SEX_CI_ALPHA).loc["sex_male"]
  if not lcb < 0 < ucb:
    coef_gene.append((gene, np.abs(results.params["sex_male"]), lcb, ucb))


In [None]:
  # Order the retained genes by the absolute sex_male coefficient (tuple field 1) and
  # show the ten largest (fewer if fewer genes were retained), followed by their
  # chromosome and symbol annotations.
  sorted_coef_gene = sorted(coef_gene, key=lambda entry: entry[1])
  strongest_hits = sorted_coef_gene[-10:]
  print(strongest_hits)
  top10genes = [entry[0] for entry in strongest_hits]
  top_annotations = genes.loc[top10genes]
  op_chrom = top_annotations['chrom'].values
  op_genes = top_annotations['sym'].values

  print(op_chrom, op_genes)

[('38446_at', 0.32211163562325, -0.6392439071327769, -0.004979364113723184), ('32052_at', 0.6618093936031971, -1.2418658692260938, -0.08175291798030049), ('38355_at', 0.8080392266143466, 0.40988997154483925, 1.206188481683854), ('41214_at', 1.198685352333169, 0.8018963739565279, 1.5954743307098103)]
['X', '11', 'Y', 'Y']
Categories (39, object): ['1', '10', '10ak*', '10ptp*', ..., 'X', 'X Y', 'Y', 'na'] ['XIST', 'HBB', 'DDX3Y', 'RPS4Y1']
Categories (8783, object): ['AADAC', 'AAK1', 'AAMP', 'AANAT', ..., 'ZYX', 'ZZEF1', 'ZZZ3', 'psiTPTE22']


expression_norm

In [None]:
# One OLS model per annotated gene:
#   expression[gene] ~ const + one-hot(sample covariates)
# The covariates are every column of `samples` except "patient". The dropped dummy
# columns (sex_female, chip.version_v2, region_cerebellum, lab_Davis) form the
# reference group. Each gene is scored by |lab_Michigan - lab_Irvine|, the absolute
# difference between the two lab coefficients; no significance filter is applied.
# NOTE(review): the stored interval is for the lab_Michigan coefficient alone (i.e.
# relative to the dropped lab_Davis level), not for the Michigan-minus-Irvine
# difference used for ranking. Confirm which one is wanted.
# NOTE(review): the heading above this cell says "expression_norm" but the model is fit
# on `expression`, which an earlier cell overwrites in place with row-wise ECDF values.
# Confirm the intended input.
LAB_CI_ALPHA = 0.05

# Genes with both a chromosome and a symbol annotation.
human_genes = genes[genes.chrom.notnull() & genes.sym.notnull()].index

# The design matrix depends only on `samples`, so build it once rather than per gene.
# dtype=float: recent pandas versions emit boolean dummies, which together with the
# float constant column become an object array that statsmodels rejects.
samples_tmp = samples.drop(columns=["patient"])
one_hot_encoding = pd.get_dummies(samples_tmp, dtype=float)
one_hot_encoding = one_hot_encoding.drop(columns=["sex_female", 'chip.version_v2', "region_cerebellum", "lab_Davis"])
X = sm.add_constant(one_hot_encoding)

# Entries: (probe id, |lab_Michigan - lab_Irvine|, lab_Michigan CI lower, CI upper).
coef_gene = []
for gene in human_genes:
  model = sm.OLS(expression[gene], X)
  results = model.fit()
  # Compute the interval once and read both bounds from the same row.
  lcb, ucb = results.conf_int(alpha=LAB_CI_ALPHA).loc["lab_Michigan"]
  lab_gap = np.abs(results.params["lab_Michigan"] - results.params["lab_Irvine"])
  coef_gene.append((gene, lab_gap, lcb, ucb))


In [None]:
  # Order the per-gene entries by their score (tuple field 1, ascending) and show the
  # ten entries with the largest score.
  sorted_coef_gene = list(coef_gene)
  sorted_coef_gene.sort(key=lambda entry: entry[1])
  print(sorted_coef_gene[-10:])

[('39332_at', 4.47904464885951, -2.861892868285674, -1.869963654207997), ('38032_at', 4.480369541215368, -3.0167140310342293, -1.9652224158591367), ('38711_at', 4.480434385659045, -3.931502010098435, -2.89440882356684), ('38406_f_at', 4.480728639317509, -2.356090694292533, -1.4555582590719984), ('36410_f_at', 4.502172403652644, -2.2511984979069717, -1.4052621836064687), ('38708_at', 4.590756672580022, -3.6366487312536155, -2.5655383131606224), ('36285_at', 4.620910422292189, -3.0267875594923352, -1.8557433924323363), ('35778_at', 4.633202311285724, -3.7011445550498294, -2.67724605636883), ('38308_g_at', 4.640368895846539, -2.868998385275601, -1.575558789557934), ('37760_at', 4.673930828643323, -3.185903396188331, -2.29050951082583)]
