# Clean GSE77110 data

Tong Shu Li

We will reshape the expression value matrix to long form and add in relevant metadata about each sample. Conversion of probe ids to other ids is not done here due to complexities with probe mappings.

In [1]:
import pandas as pd
import sys

In [2]:
# Add the directory two levels up (relative to the working directory) to the
# module search path so the `src` package imported in the next cell can be
# found — presumably this is the project root; confirm against the repo layout.
sys.path.append("../..")

In [3]:
from src.geo import parse_series_matrix

## Read GSE77110 series matrix

In [4]:
# Parse the GEO series matrix file. The helper returns three objects:
# `samples` (one row per GEO sample, previewed below) and `exp` (long-form
# expression values, previewed below) are used in this notebook; `series`
# is not inspected here.
series, samples, exp = parse_series_matrix("GSE77110_series_matrix.txt")

In [5]:
# Dimensions of the sample table as (rows, columns).
samples.shape

(15, 36)

In [6]:
# Preview the first two rows of the sample table.
samples.head(2)

Unnamed: 0,title,geo_accession,status,submission_date,last_update_date,type,channel_count,source_name_ch1,organism_ch1,characteristics_ch1,...,contact_email,contact_laboratory,contact_department,contact_institute,contact_address,contact_city,contact_zip/postal_code,contact_country,supplementary_file,data_row_count
0,N2_AL_AD2,GSM2044469,Public on Jan 22 2016,Jan 21 2016,Jan 22 2016,RNA,1,"N2 worms under AL condition, on adult day 2",Caenorhabditis elegans,strain: N2,...,jackie.jdhan@gmail.com,Jing-Dong J. Han's Lab,CAS-MPG Partner Institute for Computational Bi...,"Shanghai Institutes for Biological Sciences, C...",Yueyang Road 320,Shanghai,200031,China,ftp://ftp.ncbi.nlm.nih.gov/pub/geo/DATA/supple...,22625
1,N2_AL_AD4,GSM2044470,Public on Jan 22 2016,Jan 21 2016,Jan 22 2016,RNA,1,"N2 worms under AL condition, on adult day 4",Caenorhabditis elegans,strain: N2,...,jackie.jdhan@gmail.com,Jing-Dong J. Han's Lab,CAS-MPG Partner Institute for Computational Bi...,"Shanghai Institutes for Biological Sciences, C...",Yueyang Road 320,Shanghai,200031,China,ftp://ftp.ncbi.nlm.nih.gov/pub/geo/DATA/supple...,22625


In [7]:
# Dimensions of the long-form expression table as (rows, columns).
exp.shape

(339375, 3)

In [8]:
# Preview the first rows of the long-form expression table.
exp.head()

Unnamed: 0,probe_id,geo_id,log2_exp
0,171720_x_at,GSM2044469,8.717144
1,171721_x_at,GSM2044469,11.016367
2,171722_x_at,GSM2044469,11.86311
3,171723_x_at,GSM2044469,14.5019
4,171724_x_at,GSM2044469,10.306531


In [9]:
# Sanity check: count rows per sample, then tally how many samples share each
# row count. A single entry in the result means every sample has the same
# number of rows.
exp.groupby("geo_id").size().value_counts()

22625    15
dtype: int64

## Extract sample metadata

Determine the diet and age of each sample from its title. The experimenters do not appear to have collected multiple biological replicates for each condition.

In [10]:
# Build a per-sample metadata table from the GEO sample titles. The titles
# shown in this notebook follow the pattern "<strain>_<diet>_AD<days>"
# (e.g. "N2_AL_AD2").
meta = (
    samples[["title", "geo_accession"]]
        .assign(
            # Diet code is the token immediately before "_AD<days>". Matching
            # on the title's structure rather than fixed character positions
            # keeps this correct when the strain prefix or the diet code is
            # not exactly two characters; titles that do not fit the pattern
            # yield NaN instead of an arbitrary substring.
            diet = lambda df:
                df["title"].str.extract(r'_([A-Za-z]+)_AD\d+', expand = False),

            # Adult age in days: the digits following "AD" in the title,
            # converted from string to a numeric dtype.
            days_old = lambda df:
                pd.to_numeric(df["title"].str.extract(r'AD(\d+)', expand = False))
        )
        .rename(columns = {"title": "sample"})
)

In [11]:
# Display the derived metadata table to check the parsed diet and age values.
meta

Unnamed: 0,sample,geo_accession,days_old,diet
0,N2_AL_AD2,GSM2044469,2,AL
1,N2_AL_AD4,GSM2044470,4,AL
2,N2_CR_AD4,GSM2044471,4,CR
3,N2_IF_AD4,GSM2044472,4,IF
4,N2_AL_AD6,GSM2044473,6,AL
5,N2_CR_AD6,GSM2044474,6,CR
6,N2_IF_AD6,GSM2044475,6,IF
7,N2_AL_AD8,GSM2044476,8,AL
8,N2_CR_AD8,GSM2044477,8,CR
9,N2_IF_AD8,GSM2044478,8,IF


### Save metadata to file

In [12]:
# Write the metadata as a tab-separated file in the working directory,
# omitting the DataFrame index.
meta.to_csv("sample_metadata.tsv", sep = '\t', index = False)

## Add metadata to the expression values

According to the information on GEO, the values given here are already log2-transformed expression intensities produced by RMA normalization.

In [13]:
# Attach sample metadata to every expression row by matching GEO accessions.
# NOTE: this rebinds `exp`, so the cell is not idempotent — re-running it
# without first re-running the cell that loads `exp` merges the metadata a
# second time and produces suffixed duplicate columns.
exp = (
    exp.merge(
        meta, how = "left",
        left_on = "geo_id", right_on = "geo_accession",
        # Each expression row must match at most one metadata row; raise
        # instead of silently multiplying rows if an accession is repeated
        # in `meta`.
        validate = "many_to_one"
    )
    # For matched rows the right-hand key repeats `geo_id`, so drop it.
    .drop("geo_accession", axis = 1)
)

In [14]:
# Dimensions after the merge. With a left join, the row count equals the
# pre-merge count as long as each geo_id matched at most one metadata row.
exp.shape

(339375, 6)

In [15]:
# Preview the first rows of the annotated expression table.
exp.head()

Unnamed: 0,probe_id,geo_id,log2_exp,sample,days_old,diet
0,171720_x_at,GSM2044469,8.717144,N2_AL_AD2,2,AL
1,171721_x_at,GSM2044469,11.016367,N2_AL_AD2,2,AL
2,171722_x_at,GSM2044469,11.86311,N2_AL_AD2,2,AL
3,171723_x_at,GSM2044469,14.5019,N2_AL_AD2,2,AL
4,171724_x_at,GSM2044469,10.306531,N2_AL_AD2,2,AL


In [16]:
# Sanity check: count distinct probe ids per sample, then tally how many
# samples share each count. A single entry means every sample has the same
# number of unique probes.
exp.groupby("geo_id")["probe_id"].nunique().value_counts()

22625    15
Name: probe_id, dtype: int64

Each of the 15 samples has a total of 22625 unique probes.

## Save to file

In [17]:
# Write the annotated long-form expression table as a tab-separated file in
# the working directory, omitting the DataFrame index.
exp.to_csv("annot_GSE77110.tsv", sep = '\t', index = False)