# Structure expression data

Tong Shu Li

We explore the expression data in GSE77110.

In [1]:
import pandas as pd
import sys

In [2]:
# Make the directory three levels up importable so the `src` package used
# below can be found. The guard keeps a re-run of this cell from appending
# the same entry to sys.path a second time.
# NOTE(review): the path is relative, so it is resolved against the kernel's
# working directory — confirm the notebook is always launched from its own folder.
if "../../.." not in sys.path:
    sys.path.append("../../..")

In [3]:
from src.stats import calc_percentile

---

## Read the table of expression values

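The GSE series matrix file wraps the expression table in metadata lines (the first 64 lines and the last line of the file). These lines all start with `!`, so we skip them by treating `!` as the comment character when reading the file.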

In [4]:
# Tab-separated GEO series matrix. Lines starting with "!" are treated as
# comments and ignored, which leaves only the header row and the expression values.
data = pd.read_csv(
    "GSE77110_series_matrix.txt",
    sep="\t",
    comment="!",
)

In [5]:
# (rows, columns) of the wide table that was just read.
data.shape

(22625, 16)

In [6]:
# Preview of the wide layout: one probe (ID_REF) per row, one GSM sample per column.
data.head()

Unnamed: 0,ID_REF,GSM2044469,GSM2044470,GSM2044471,GSM2044472,GSM2044473,GSM2044474,GSM2044475,GSM2044476,GSM2044477,GSM2044478,GSM2044479,GSM2044480,GSM2044481,GSM2044482,GSM2044483
0,171720_x_at,8.717144,8.759925,8.810304,8.873956,8.713713,8.69833,8.54298,8.874033,9.085419,8.777119,9.186312,9.337125,9.161829,9.165603,8.992359
1,171721_x_at,11.016367,11.073513,11.116418,11.322379,10.970184,10.874218,10.704798,10.915562,10.947203,10.999002,10.948462,10.655318,10.873164,11.075366,11.214671
2,171722_x_at,11.86311,11.484897,11.436445,11.523513,11.63118,11.313651,11.397194,11.944652,11.712431,11.438866,11.922669,12.105895,11.782277,12.045137,12.21793
3,171723_x_at,14.5019,14.12001,14.095455,14.057207,14.03278,13.895247,14.078284,13.804691,13.707833,13.826551,13.495122,13.526148,14.02669,13.164302,13.281513
4,171724_x_at,10.306531,10.582328,10.456459,10.337638,10.012204,9.818688,10.002994,10.036317,10.079002,9.987183,10.817489,10.312446,10.961027,11.083455,10.897401


In [7]:
# Number of missing values in each column.
data.isna().sum()

ID_REF        0
GSM2044469    0
GSM2044470    0
GSM2044471    0
GSM2044472    0
GSM2044473    0
GSM2044474    0
GSM2044475    0
GSM2044476    0
GSM2044477    0
GSM2044478    0
GSM2044479    0
GSM2044480    0
GSM2044481    0
GSM2044482    0
GSM2044483    0
dtype: int64

According to the information on GEO, the values given here are log2-transformed expression intensities that have already been RMA-normalized.

## Reshape the expression dataframe

We need to restructure the data to long format.

In [8]:
# Wide -> long: each (probe, sample) pair becomes its own row. The sample
# column and the value column are named directly by melt; only the probe
# identifier still needs renaming.
data = pd.melt(
    data,
    id_vars=["ID_REF"],
    var_name="geo_id",
    value_name="log2_exp",
).rename(columns={"ID_REF": "probe_id"})

In [9]:
# (rows, columns) of the long-format table.
data.shape

(339375, 3)

In [10]:
# Preview of the long layout: probe_id, geo_id, log2_exp.
data.head()

Unnamed: 0,probe_id,geo_id,log2_exp
0,171720_x_at,GSM2044469,8.717144
1,171721_x_at,GSM2044469,11.016367
2,171722_x_at,GSM2044469,11.86311
3,171723_x_at,GSM2044469,14.5019
4,171724_x_at,GSM2044469,10.306531


In [11]:
# Store the two identifier columns as categoricals; both already exist,
# so assign() replaces them in place without changing the column order.
data = data.assign(
    probe_id=data["probe_id"].astype("category"),
    geo_id=data["geo_id"].astype("category"),
)

## Add metadata

In [12]:
# Tab-separated sample annotations, one row per GEO sample.
metadata = pd.read_csv(
    "sample_metadata.tsv",
    sep="\t",
)

In [13]:
# Display the full sample table (sample name, GEO accession, diet, age in days).
metadata

Unnamed: 0,sample,geo_accession,diet,days_old
0,N2_AL_AD2,GSM2044469,AL,2
1,N2_AL_AD4,GSM2044470,AL,4
2,N2_CR_AD4,GSM2044471,CR,4
3,N2_IF_AD4,GSM2044472,IF,4
4,N2_AL_AD6,GSM2044473,AL,6
5,N2_CR_AD6,GSM2044474,CR,6
6,N2_IF_AD6,GSM2044475,IF,6
7,N2_AL_AD8,GSM2044476,AL,8
8,N2_CR_AD8,GSM2044477,CR,8
9,N2_IF_AD8,GSM2044478,IF,8


### Check that the GSM numbers match

In [14]:
# True when no sample id appears in only one of the two tables.
set(data["geo_id"]).symmetric_difference(metadata["geo_accession"]) == set()

True

## Merge in the metadata

In [15]:
# Guard the join: an inner merge silently drops expression rows whose sample
# has no metadata, and silently duplicates them if an accession is listed twice.
assert metadata["geo_accession"].is_unique, "duplicate geo_accession values in sample metadata"
assert data["geo_id"].isin(metadata["geo_accession"]).all(), "some expression samples have no metadata row"

# Attach sample, diet and days_old to every measurement.
data = pd.merge(data, metadata, how = "inner", left_on = "geo_id", right_on = "geo_accession")
# After the join geo_accession is identical to geo_id, so drop it.
data = data.drop("geo_accession", axis = 1)

# (Re)apply the categorical dtype to the identifier and label columns.
# NOTE(review): the merge may not preserve the categorical dtype set earlier
# on the key columns, hence probe_id and geo_id are cast again — confirm.
for col in ["probe_id", "geo_id", "sample", "diet"]:
    data[col] = data[col].astype('category')

In [16]:
# (rows, columns) after the metadata join; the row count should match the pre-merge table.
data.shape

(339375, 6)

In [17]:
# Preview: each measurement now carries its sample name, diet and age.
data.head()

Unnamed: 0,probe_id,geo_id,log2_exp,sample,diet,days_old
0,171720_x_at,GSM2044469,8.717144,N2_AL_AD2,AL,2
1,171721_x_at,GSM2044469,11.016367,N2_AL_AD2,AL,2
2,171722_x_at,GSM2044469,11.86311,N2_AL_AD2,AL,2
3,171723_x_at,GSM2044469,14.5019,N2_AL_AD2,AL,2
4,171724_x_at,GSM2044469,10.306531,N2_AL_AD2,AL,2


## Read the probe id mapping file

In [18]:
# Tab-separated lookup from probe id to gene identifiers.
mapping = pd.read_csv(
    "GPL200_id_mapping.tsv",
    sep="\t",
)

In [19]:
# (rows, columns) of the probe mapping table.
mapping.shape

(22625, 5)

In [20]:
# Preview of the identifier columns available for each probe.
mapping.head()

Unnamed: 0,probe_id,entrez_id,wormbase_id,ensembl_id,other_id
0,171720_x_at,174997,,WBGene00013011,AV179929
1,171721_x_at,172609,,WBGene00011344,6767
2,171722_x_at,176907,,WBGene00018934,AV189310
3,171723_x_at,180646,CE26817,WBGene00006928,CEC7564
4,171724_x_at,172353,CE11778,WBGene00000386,AV178012


## Annotate genes in the data

In [21]:
# Left join: every expression row is kept, including probes with no annotation.
# A probe_id listed more than once in the mapping would duplicate its
# measurements, so confirm the row count is unchanged before replacing `data`.
annotated = pd.merge(data, mapping, how = "left", on = "probe_id")
assert len(annotated) == len(data), "probe_id is not unique in the mapping; the merge duplicated rows"
data = annotated

In [22]:
# (rows, columns) after annotation; the row count should be unchanged by the left join.
data.shape

(339375, 10)

In [23]:
# Preview: the gene identifier columns from the mapping are now attached.
data.head()

Unnamed: 0,probe_id,geo_id,log2_exp,sample,diet,days_old,entrez_id,wormbase_id,ensembl_id,other_id
0,171720_x_at,GSM2044469,8.717144,N2_AL_AD2,AL,2,174997,,WBGene00013011,AV179929
1,171721_x_at,GSM2044469,11.016367,N2_AL_AD2,AL,2,172609,,WBGene00011344,6767
2,171722_x_at,GSM2044469,11.86311,N2_AL_AD2,AL,2,176907,,WBGene00018934,AV189310
3,171723_x_at,GSM2044469,14.5019,N2_AL_AD2,AL,2,180646,CE26817,WBGene00006928,CEC7564
4,171724_x_at,GSM2044469,10.306531,N2_AL_AD2,AL,2,172353,CE11778,WBGene00000386,AV178012


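## Calculate log fold expression

For each probe, the log fold expression (`lfe`) is its log2 expression in a sample minus its log2 expression in the reference sample (diet `AL`, 2 days old).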

In [24]:
# Reference expression per probe: rows with diet "AL" at 2 days old,
# keeping only the probe id and its value (renamed to mark it as the reference).
ref = (
    data.loc[
        (data["diet"] == "AL") & (data["days_old"] == 2),
        ["probe_id", "log2_exp"],
    ]
    .rename(columns={"log2_exp": "ref_log2_exp"})
)

In [25]:
# Add the percentile of each probe's reference expression via the project helper.
# NOTE(review): calc_percentile is defined in src.stats; the displayed values
# look like 0-100 percentile ranks — confirm against its implementation.
ref = ref.assign(exp_percentile=calc_percentile(ref["ref_log2_exp"]))

In [26]:
# Preview the reference table: probe_id, ref_log2_exp, exp_percentile.
ref.head()

Unnamed: 0,probe_id,ref_log2_exp,exp_percentile
0,171720_x_at,8.717144,70.514917
1,171721_x_at,11.016367,91.973481
2,171722_x_at,11.86311,96.296133
3,171723_x_at,14.5019,99.98674
4,171724_x_at,10.306531,86.205525


In [27]:
# Each probe must have exactly one reference value: a repeated probe_id in
# `ref` (e.g. more than one sample matching the reference condition) would
# silently duplicate every measurement of that probe in the join below.
assert ref["probe_id"].is_unique, "reference table has more than one row per probe_id"

# Attach the reference expression and its percentile to every measurement.
data = pd.merge(data, ref, how = "left", on = "probe_id")

In [28]:
# Difference between a sample's log2 expression and the reference log2 expression.
data["lfe"] = data["log2_exp"].sub(data["ref_log2_exp"])

In [29]:
# Preview: rows from the reference sample itself have lfe == 0.
data.head()

Unnamed: 0,probe_id,geo_id,log2_exp,sample,diet,days_old,entrez_id,wormbase_id,ensembl_id,other_id,ref_log2_exp,exp_percentile,lfe
0,171720_x_at,GSM2044469,8.717144,N2_AL_AD2,AL,2,174997,,WBGene00013011,AV179929,8.717144,70.514917,0.0
1,171721_x_at,GSM2044469,11.016367,N2_AL_AD2,AL,2,172609,,WBGene00011344,6767,11.016367,91.973481,0.0
2,171722_x_at,GSM2044469,11.86311,N2_AL_AD2,AL,2,176907,,WBGene00018934,AV189310,11.86311,96.296133,0.0
3,171723_x_at,GSM2044469,14.5019,N2_AL_AD2,AL,2,180646,CE26817,WBGene00006928,CEC7564,14.5019,99.98674,0.0
4,171724_x_at,GSM2044469,10.306531,N2_AL_AD2,AL,2,172353,CE11778,WBGene00000386,AV178012,10.306531,86.205525,0.0


In [30]:
# (rows, columns) of the final table that is written to disk.
data.shape

(339375, 13)

## Save to file

In [31]:
# Write the annotated long-format table as TSV, leaving out the row index.
data.to_csv(
    "annot_GSE77110.tsv",
    sep="\t",
    index=False,
)