# Clean and prepare GSE21784 raw data

Tong Shu Li

In [1]:
import pandas as pd
import sys

In [2]:
# Add the directory two levels above the working directory to the import
# search path. The next cell imports from the project-local `src` package,
# which presumably lives there -- the path is relative, so this only works
# when the notebook is run from its own directory.
sys.path.append("../..")

In [3]:
from src.geo import parse_series_matrix

---

## Read series matrix

In [4]:
# Parse the GEO series-matrix file into three objects. As used below,
# `samples` is a DataFrame with one row per GEO sample (GSM accession) and
# `exp` is a DataFrame of expression values with one row per probe (ID_REF)
# and one column per sample. `series` is not used again in this notebook;
# presumably it holds the series-level header fields -- confirm in src/geo.
series, samples, exp = parse_series_matrix("GSE21784_series_matrix.txt")

In [5]:
# Sanity check: expect 9 samples (3 ages x 3 biological replicates).
samples.shape

(9, 37)

In [6]:
samples.head(2)

Unnamed: 0,title,geo_accession,status,submission_date,last_update_date,type,channel_count,source_name_ch1,organism_ch1,characteristics_ch1,...,contact_laboratory,contact_department,contact_institute,contact_address,contact_city,contact_state,contact_zip/postal_code,contact_country,supplementary_file,data_row_count
0,"L4 larvae, biological rep1",GSM542652,Public on May 11 2011,May 11 2010,May 11 2011,RNA,1,C. elegans L4 larvae,Caenorhabditis elegans,strain: Bristol N2,...,Dennis Kim,Biology,MIT,77 Massachusetts Ave. (68-440D),Cambridge,MA,2139,USA,ftp://ftp.ncbi.nlm.nih.gov/pub/geo/DATA/supple...,22625
1,"L4 larvae, biological rep2",GSM542653,Public on May 11 2011,May 11 2010,May 11 2011,RNA,1,C. elegans L4 larvae,Caenorhabditis elegans,strain: Bristol N2,...,Dennis Kim,Biology,MIT,77 Massachusetts Ave. (68-440D),Cambridge,MA,2139,USA,ftp://ftp.ncbi.nlm.nih.gov/pub/geo/DATA/supple...,22625


There are three biological replicates at each of three ages (L4, day 6, and day 15), for nine samples in total. We will extract the age and replicate number from each sample title and keep them as metadata.

## Extract age and replicate number

For simplicity I will treat L4 worms as day 0 adults so that we can do linear regression later. This is probably not correct, but is an assumption I will make for now.

In [7]:
# Build one metadata row per sample by parsing the GEO sample title, which
# looks like "L4 larvae, biological rep1" or "day 15 adults, biological rep3".
#
# Keyword order below is age, days_old, replicate so that the column order is
# the same whether pandas sorts assigned columns alphabetically (old
# releases) or keeps keyword order (current releases).
meta = (
    samples[["title", "geo_accession"]]
        .assign(
            # Everything before the first comma, e.g. "day 6 adults".
            age = lambda df: df["title"].str.split(",").str.get(0),

            # Age in days, read from the "day N" prefix. L4 larvae have no
            # day number; they are treated as day 0 (a simplifying
            # assumption so that age can be used as a numeric regressor).
            # The L4 rule is tied to the "L4" prefix rather than to the
            # digit 4, so a genuine "day 4" sample would stay at 4.
            days_old = lambda df:
                pd.to_numeric(
                    df["title"]
                        .str.extract(r'^day (\d+)', expand = False)
                        .where(~df["title"].str.startswith("L4", na = False), "0")
                ),

            # Trailing replicate number; \d+ so that rep10 and above parse
            # as a whole number rather than as their last digit.
            replicate = lambda df:
                pd.to_numeric(df["title"].str.extract(r'(\d+)$', expand = False))
        )
        .drop("title", axis = 1)
)

In [8]:
meta

Unnamed: 0,geo_accession,age,days_old,replicate
0,GSM542652,L4 larvae,0,1
1,GSM542653,L4 larvae,0,2
2,GSM542654,L4 larvae,0,3
3,GSM542655,day 6 adults,6,1
4,GSM542656,day 6 adults,6,2
5,GSM542657,day 6 adults,6,3
6,GSM542658,day 15 adults,15,1
7,GSM542659,day 15 adults,15,2
8,GSM542660,day 15 adults,15,3


### Save metadata to file

In [9]:
# Write the per-sample metadata as a tab-separated file (without the index
# column) into the current working directory.
meta.to_csv("sample_metadata.tsv", sep = '\t', index = False)

## Reshape expression values

In [10]:
exp.head()

Unnamed: 0,ID_REF,GSM542652,GSM542653,GSM542654,GSM542655,GSM542656,GSM542657,GSM542658,GSM542659,GSM542660
0,171720_x_at,8.71,8.49,8.8,8.63,9.01,8.94,8.54,8.63,8.01
1,171721_x_at,9.91,9.9,9.93,10.51,10.42,10.53,10.48,10.24,10.34
2,171722_x_at,11.28,11.21,11.1,10.1,10.26,10.66,11.1,11.39,11.17
3,171723_x_at,12.97,12.96,12.83,13.26,13.27,13.26,12.17,11.52,11.75
4,171724_x_at,8.26,10.04,9.34,10.53,10.75,10.71,10.17,9.98,10.58


In [11]:
# Count missing values per column; all zeros means every probe has a value
# in every sample, so nothing needs filtering before the reshape.
exp.isnull().sum()

ID_REF       0
GSM542652    0
GSM542653    0
GSM542654    0
GSM542655    0
GSM542656    0
GSM542657    0
GSM542658    0
GSM542659    0
GSM542660    0
dtype: int64

In [12]:
# Go from wide (one column per GSM sample) to long (one row per
# probe/sample pair). The two melted columns are named directly in the
# melt call; only the probe identifier column is renamed afterwards.
long = pd.melt(
    exp,
    id_vars = ["ID_REF"],
    var_name = "geo_id",
    value_name = "log2_exp"
).rename(columns = {"ID_REF": "probe_id"})

In [13]:
# Expect 22,625 probes x 9 samples = 203,625 rows.
long.shape

(203625, 3)

In [14]:
long.head()

Unnamed: 0,probe_id,geo_id,log2_exp
0,171720_x_at,GSM542652,8.71
1,171721_x_at,GSM542652,9.91
2,171722_x_at,GSM542652,11.28
3,171723_x_at,GSM542652,12.97
4,171724_x_at,GSM542652,8.26


## Merge in the metadata

In [15]:
# Attach age, days_old and replicate to every expression row. The metadata
# key is renamed to `geo_id` so the join uses a single shared key and leaves
# no duplicate accession column behind. A left join keeps every expression
# row, in its existing order.
#
# NOTE: this cell overwrites `long`. Re-running it without first re-running
# the melt cell would merge the metadata a second time.
long = long.merge(
    meta.rename(columns = {"geo_accession": "geo_id"}),
    how = "left",
    on = "geo_id"
)

In [16]:
long.head()

Unnamed: 0,probe_id,geo_id,log2_exp,age,days_old,replicate
0,171720_x_at,GSM542652,8.71,L4 larvae,0,1
1,171721_x_at,GSM542652,9.91,L4 larvae,0,1
2,171722_x_at,GSM542652,11.28,L4 larvae,0,1
3,171723_x_at,GSM542652,12.97,L4 larvae,0,1
4,171724_x_at,GSM542652,8.26,L4 larvae,0,1


## Read the probe-to-gene mapping

In [17]:
# Load the probe-to-gene identifier table (tab-separated). The file name
# indicates it is for the GPL200 platform; the path is relative to the
# notebook's working directory.
mapping = pd.read_csv("../GPL_maps/GPL200_id_mapping.tsv", sep = '\t')

In [18]:
# Expect one row per probe: 22,625, matching data_row_count in `samples`.
mapping.shape

(22625, 5)

In [19]:
mapping.head()

Unnamed: 0,probe_id,entrez_id,wormbase_id,ensembl_id,other_id
0,171720_x_at,174997,,WBGene00013011,AV179929
1,171721_x_at,172609,,WBGene00011344,6767
2,171722_x_at,176907,,WBGene00018934,AV189310
3,171723_x_at,180646,CE26817,WBGene00006928,CEC7564
4,171724_x_at,172353,CE11778,WBGene00000386,AV178012


### Dropping unknown probes

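Current versions of WormBase use "WBGene" identifiers (as of 2016-05-23); in the mapping file these are stored in the `ensembl_id` column. We will standardize on the Entrez Gene and WBGene identifiers: the file's original `wormbase_id` and `other_id` columns are discarded, `ensembl_id` is renamed to `wormbase_id`, and probes that have neither identifier are ignored.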

In other words, a probe is dropped only if it has neither an Entrez ID nor a WormBase ID; a probe with at least one of the two is kept.

In [20]:
# Keep only the identifiers used downstream:
#   * drop the file's original `wormbase_id` and `other_id` columns,
#   * rename `ensembl_id` (the WBGene identifiers) to `wormbase_id`,
#   * drop probes that have neither an Entrez nor a WBGene identifier.
# The last rule is written against the two identifier columns explicitly
# rather than as a non-null count over the whole row, so it does not depend
# on how many other columns the frame has or on probe_id being non-null.
#
# NOTE: this cell overwrites `mapping`; re-run the read_csv cell first if
# this cell needs to be executed again.
mapping = (
    mapping.drop(["wormbase_id", "other_id"], axis = 1)
        .rename(columns = {"ensembl_id": "wormbase_id"})
        .dropna(axis = 0, how = "all", subset = ["entrez_id", "wormbase_id"])
)

In [21]:
# Rows left after dropping probes with no identifier (22,625 before).
mapping.shape

(22031, 3)

In [22]:
mapping.head()

Unnamed: 0,probe_id,entrez_id,wormbase_id
0,171720_x_at,174997,WBGene00013011
1,171721_x_at,172609,WBGene00011344
2,171722_x_at,176907,WBGene00018934
3,171723_x_at,180646,WBGene00006928
4,171724_x_at,172353,WBGene00000386


In [23]:
# Tabulate which identifiers are missing: isnull() turns every cell into a
# boolean, then rows are counted for each (wormbase_id missing, entrez_id
# missing) combination. A (True, True) row would mean a probe with neither
# identifier survived the filter above.
mapping.isnull().groupby(["wormbase_id", "entrez_id"]).size().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
wormbase_id,entrez_id,Unnamed: 2_level_1
False,False,21818
False,True,90
True,False,123


All 22,031 remaining probes have an Entrez ID, a WormBase ID, or both. We dropped a total of 594 probes (22,625 − 22,031) that had neither identifier.

## Annotate genes

In [24]:
# Right join: the result is driven by `mapping`, so probes that were dropped
# for lacking identifiers fall out of the expression table here, and every
# remaining row gains its entrez_id and wormbase_id.
exp_vals = long.merge(mapping, on = "probe_id", how = "right")

In [25]:
# Expect 22,031 annotated probes x 9 samples = 198,279 rows.
exp_vals.shape

(198279, 8)

In [26]:
exp_vals.head()

Unnamed: 0,probe_id,geo_id,log2_exp,age,days_old,replicate,entrez_id,wormbase_id
0,171720_x_at,GSM542652,8.71,L4 larvae,0,1,174997,WBGene00013011
1,171720_x_at,GSM542653,8.49,L4 larvae,0,2,174997,WBGene00013011
2,171720_x_at,GSM542654,8.8,L4 larvae,0,3,174997,WBGene00013011
3,171720_x_at,GSM542655,8.63,day 6 adults,6,1,174997,WBGene00013011
4,171720_x_at,GSM542656,9.01,day 6 adults,6,2,174997,WBGene00013011


In [27]:
# Number of distinct probes in the annotated table; it should match the row
# count of `mapping`, i.e. every annotated probe is present.
exp_vals["probe_id"].nunique()

22031

## Save to file

In [28]:
# Write the annotated long-format expression table as a tab-separated file
# (without the index column) into the current working directory.
exp_vals.to_csv("annot_GSE21784.tsv", sep = '\t', index = False)