In [1]:
library(data.table)
library(tidyr)

“package ‘tidyr’ was built under R version 3.4.4”

In [2]:
rat_genes <- read.delim2("ftp://ftp.rgd.mcw.edu/pub/data_release/GENES_RAT.txt", header = T, comment.char = "#", stringsAsFactors = F, na.strings = "", quote = "\"")
setDT(rat_genes)

In [3]:
colnames(rat_genes)

In [4]:
rat_genes[1:10,UNIPROT_ID]

In [5]:
rat_genes <- rat_genes[!is.na(UNIPROT_ID)]
rat_genes[1:10,UNIPROT_ID]

### Get rid of all columns except RGD_ID and UniProt

In [6]:
rat_genes <- rat_genes[, c("GENE_RGD_ID","UNIPROT_ID"), with=FALSE]

In [7]:
nrow(rat_genes)

In [9]:
head(rat_genes)

GENE_RGD_ID,UNIPROT_ID
69417,Q9EPH1
621583,Q9JI93
619834,F1LNL0;Q923K9
727913,A0A4Z3
1308701,D3ZNS8
1565709,D3ZS19


## Separate rows with multiple UniProts

In [10]:
rat_genes <- separate_rows(rat_genes, UNIPROT_ID, sep = ";", convert = T)
nrow(rat_genes)

In [11]:
nrow(rat_genes)

In [12]:
head(rat_genes)

GENE_RGD_ID,UNIPROT_ID
69417,Q9EPH1
621583,Q9JI93
619834,F1LNL0
619834,Q923K9
727913,A0A4Z3
1308701,D3ZNS8


## Connect to TCRD to get rat nhprotein_ids and corresponding UniProt accessions

In [13]:
library(RMySQL, quietly = T)
DBHOST <- 'localhost'
DBNAME <- 'tcrd6'
DBUSER <- 'smathias'

dbconn <- dbConnect(MySQL(), host=DBHOST, dbname=DBNAME, user=DBUSER)

“package ‘DBI’ was built under R version 3.4.4”

In [14]:
sql <- "SELECT id AS nhprotein_id, uniprot FROM nhprotein WHERE taxid = 10116"
prots <- dbGetQuery(dbconn, sql)
dbDisconnect(dbconn)
rm(dbconn)
setDT(prots) # convert data.frame to data.table

In [15]:
head(prots)

nhprotein_id,uniprot
85188,P18757
85189,Q6AY71
85190,Q6P6R2
85191,Q9WTV1
85192,P13233
85193,Q66HG3


In [16]:
nrow(prots)

## Join rat_genes to prots on UniProt

In [17]:
nhprot2rgd <- merge(rat_genes, prots, by.x = "UNIPROT_ID", by.y = "uniprot")
head(nhprot2rgd)

UNIPROT_ID,GENE_RGD_ID,nhprotein_id
A0A023GRW5,1598370,88228
A0A023IKK2,1595922,111266
A0A023ILR5,1595814,95455
A0A023IMI6,3426,112319
A0A059NZR0,1359325,91660
A0A059NZV6,1305117,91665


## Remove UniProt column

In [18]:
nhprot2rgd[, UNIPROT_ID := NULL]
head(nhprot2rgd)

GENE_RGD_ID,nhprotein_id
1598370,88228
1595922,111266
1595814,95455
3426,112319
1359325,91660
1305117,91665


In [19]:
nrow(nhprot2rgd)

In [20]:
OUTPUT_FILE <- '/Users/smathias/TCRD/data/RGD/nhprotein2rgd.tsv'
if(file.exists(OUTPUT_FILE)) {
  file.remove(OUTPUT_FILE)
}
fwrite(nhprot2rgd, file = OUTPUT_FILE, sep = "\t", col.names = T, row.names = F, na = "None", quote = T)

In [None]:
GZIP_FILE <- '/Users/smathias/TCRD/data/RGD/nhprotein2rgd.tsv.gz'
if(file.exists(GZIP_FILE)) {
  file.remove(GZIP_FILE)
}
system(sprintf("gzip -9v %s", OUTPUT_FILE))

## Get and Process QTLs

In [21]:
qtl <- fread("ftp://ftp.rgd.mcw.edu/pub/data_release/QTLS_RAT.txt", header = T, sep = "\t", na.strings = "", quote = "\"", skip = 70, verbose = T, col.names = c("QTL_RGD_ID","SPECIES","QTL_SYMBOL","QTL_NAME","CHROMOSOME_FROM_REF","LOD","P_VALUE","VARIANCE","FLANK_1_RGD_ID","FLANK_1_SYMBOL","FLANK_2_RGD_ID","FLANK_2_SYMBOL","PEAK_RGD_ID","PEAK_MARKER_SYMBOL","TRAIT_NAME","MEASUREMENT_TYPE","(UNUSED)","PHENOTYPES","ASSOCIATED_DISEASES","CURATED_REF_RGD_ID","CURATED_REF_PUBMED_ID","CANDIDATE_GENE_RGD_IDS","CANDIDATE_GENE_SYMBOLS","INHERITANCE_TYPE","RELATED_QTLS","UNUSED","5.0_MAP_POS_CHR","5.0_MAP_POS_START","5.0_MAP_POS_STOP","5.0_MAP_POS_METHOD","3.4_MAP_POS_CHR","3.4_MAP_POS_START","3.4_MAP_POS_STOP","3.4_MAP_POS_METHOD","CROSS_TYPE","CROSS_PAIR","STRAIN_RGD_ID1","STRAIN_RGD_ID2","STRAIN_RGD_SYMBOL1","STRAIN_RGD_SYMBOL2","6.0_MAP_POS_CHR","6.0_MAP_POS_START","6.0_MAP_POS_STOP","6.0_MAP_POS_METHOD","STRAIN_RGD_ID3","STRAIN_RGD_SYMBOL3","SSTRAIN"))
nrow(qtl)

Input contains no \n. Taking this to be a filename to open
File opened, filesize is 0.001025 GB.
Memory mapping ... ok
Detected eol as \n only (no \r afterwards), the UNIX and Mac standard.
Positioned on line 71 after skip or autostart
This line isn't blank and skip>0 so we're done
Using supplied sep '\t' ... found ok
Detected 47 columns. Longest stretch was from line 72 to line 100
Starting data input on line 72 (either column names or first row of data). First 10 characters: 61326	rat	
The line before starting line 72 is non-empty and will be ignored (it has too few or too many items to be column names or data): QTL_RGD_ID	SPECIES	QTL_SYMBOL	QTL_NAME	CHROMOSOME_FROM_REF	LOD	P_VALUE	VARIANCE	FLANK_1_RGD_ID	FLANK_1_SYMBOL	FLANK_2_RGD_ID	FLANK_2_SYMBOL	PEAK_RGD_ID	PEAK_MARKER_SYMBOL	TRAIT_NAME	MEASUREMENT_TYPE	(UNUSED)	PHENOTYPES	ASSOCIATED_DISEASES	CURATED_REF_RGD_ID	CURATED_REF_PUBMED_ID	CANDIDATE_GENE_RGD_IDS	CANDIDATE_GENE_SYMBOLS	INHERITANCE_TYPE	RELATED_QTLS	(UNUSED)	5.0_MAP_POS_C

## Separate gene ids, symbols and phenotypes

In [22]:
qtl <- qtl[!is.na(CANDIDATE_GENE_RGD_IDS)]
nrow(qtl)

In [23]:
qtl[1:5, CANDIDATE_GENE_RGD_IDS]

In [24]:
qtl[1:5, CANDIDATE_GENE_SYMBOLS]

In [25]:
qtl <- separate_rows(qtl, CANDIDATE_GENE_RGD_IDS, CANDIDATE_GENE_SYMBOLS, sep = ";", convert = T)
nrow(qtl)

In [26]:
qtl[1:5, PHENOTYPES]

In [27]:
qtl <- separate_rows(qtl, PHENOTYPES, sep = ";")
nrow(qtl)

In [28]:
qtl <- qtl[, .(QTL_RGD_ID, QTL_SYMBOL, QTL_NAME, LOD, P_VALUE, TRAIT_NAME, MEASUREMENT_TYPE, ASSOCIATED_DISEASES, CANDIDATE_GENE_RGD_IDS, PHENOTYPES)]
head(qtl)

QTL_RGD_ID,QTL_SYMBOL,QTL_NAME,LOD,P_VALUE,TRAIT_NAME,MEASUREMENT_TYPE,ASSOCIATED_DISEASES,CANDIDATE_GENE_RGD_IDS,PHENOTYPES
61332,Eau3,Experimental allergic uveoretinitis QTL 3,,0.004,uvea integrity trait (VT:0010551),experimental autoimmune uveitis score (CMO:0001504),Experimental Autoimmune Uveitis,2836,abnormal uvea morphology
61332,Eau3,Experimental allergic uveoretinitis QTL 3,,0.004,uvea integrity trait (VT:0010551),experimental autoimmune uveitis score (CMO:0001504),Experimental Autoimmune Uveitis,2836,eye inflammation
61332,Eau3,Experimental allergic uveoretinitis QTL 3,,0.004,uvea integrity trait (VT:0010551),experimental autoimmune uveitis score (CMO:0001504),Experimental Autoimmune Uveitis,3395,abnormal uvea morphology
61332,Eau3,Experimental allergic uveoretinitis QTL 3,,0.004,uvea integrity trait (VT:0010551),experimental autoimmune uveitis score (CMO:0001504),Experimental Autoimmune Uveitis,3395,eye inflammation
61332,Eau3,Experimental allergic uveoretinitis QTL 3,,0.004,uvea integrity trait (VT:0010551),experimental autoimmune uveitis score (CMO:0001504),Experimental Autoimmune Uveitis,3645,abnormal uvea morphology
61332,Eau3,Experimental allergic uveoretinitis QTL 3,,0.004,uvea integrity trait (VT:0010551),experimental autoimmune uveitis score (CMO:0001504),Experimental Autoimmune Uveitis,3645,eye inflammation


In [29]:
qtl <- qtl[CANDIDATE_GENE_RGD_IDS %in% rat_genes$GENE_RGD_ID]
nrow(qtl)

## Join qtl to nhprotein2rgd

In [31]:
rat_qtls <- merge(nhprot2rgd, qtl, by.x = "GENE_RGD_ID", by.y = "CANDIDATE_GENE_RGD_IDS")
head(rat_qtls)

GENE_RGD_ID,nhprotein_id,QTL_RGD_ID,QTL_SYMBOL,QTL_NAME,LOD,P_VALUE,TRAIT_NAME,MEASUREMENT_TYPE,ASSOCIATED_DISEASES,PHENOTYPES
2003,119941,70199,Coreg1,Compensatory renal growth QTL 1,11.8,,kidney mass (VT:0002707),compensatory renal growth score (CMO:0001894),,increased compensatory renal growth
2004,119075,724558,Plsm2,Polydactyly-luxate syndrome (PLS) morphotypes QTL 2,,0.0003,hindlimb integrity trait (VT:0010563),hind foot phalanges count (CMO:0001949),polydactyly,polydactyly
2004,119075,6903353,Bp353,Blood pressure QTL 353,2.8,,arterial blood pressure trait (VT:2000000),diastolic blood pressure (CMO:0000005),hypertension,increased systemic arterial diastolic blood pressure
2004,119075,6903353,Bp353,Blood pressure QTL 353,2.8,,arterial blood pressure trait (VT:2000000),diastolic blood pressure (CMO:0000005),hypertension,salt-sensitive hypertension
2015,119100,1298527,Arunc2,Aerobic running capacity QTL 2,2.9,,exercise endurance trait (VT:0002332),maximum distance run on treadmill (CMO:0001406),,decreased aerobic running capacity
2015,119100,1298527,Arunc2,Aerobic running capacity QTL 2,2.9,,exercise endurance trait (VT:0002332),maximum distance run on treadmill (CMO:0001406),,increased aerobic running capacity


In [32]:
nrow(rat_qtls)

In [34]:
RAT_QTLS_FILE <- '/Users/smathias/TCRD/data/RGD/rat_qtls.tsv'
if(file.exists(RAT_QTLS_FILE)) {
  file.remove(RAT_QTLS_FILE)
}
fwrite(rat_qtls, file = RAT_QTLS_FILE, sep = "\t", col.names = T, row.names = F, quote = T, na = "None")

# Get terms from DO, MP and RDO

In [35]:
rat.do <- fread("ftp://ftp.rgd.mcw.edu/pub/data_release/with_terms/rattus_terms_do", sep = "\t", na.strings = "", skip = 27, verbose = T, quote = "")
rat.do <- rat.do[OBJECT_TYPE == "gene"]
rat.do <- rat.do[, .(RGD_ID, OBJECT_SYMBOL, TERM_ACC_ID, TERM_NAME, QUALIFIER, EVIDENCE)]
rat.do[, ONTOLOGY := "Disease Ontology"]
rat.do <- rat.do[RGD_ID %in% rat_genes$GENE_RGD_ID]
rat.do <- unique(rat.do, by = c("RGD_ID", "TERM_ACC_ID"))
head(rat.do)

Input contains no \n. Taking this to be a filename to open
File opened, filesize is 0.021729 GB.
Memory mapping ... ok
Detected eol as \n only (no \r afterwards), the UNIX and Mac standard.
Positioned on line 28 after skip or autostart
This line isn't blank and skip>0 so we're done
Using supplied sep '\t' ... found ok
Detected 16 columns. Longest stretch was from line 28 to line 57
Starting data input on line 28 (either column names or first row of data). First 10 characters: RGD_ID	OBJ
The line before starting line 28 is non-empty and will be ignored (it has too few or too many items to be column names or data): #16  ORIGINAL_REFERENCE original referenceAll the fields on line 28 are character fields. Treating as the column names.
Count of eol: 118356 (including 1 at the end)
Count of sep: 1775325
nrow = MIN( nsep [1775325] / (ncol [16] -1), neol [118356] - endblanks [1] ) = 118355
Type codes (point  0): 1444444444414044
Type codes (point  1): 1444444444414044
Type codes (point  2): 14

RGD_ID,OBJECT_SYMBOL,TERM_ACC_ID,TERM_NAME,QUALIFIER,EVIDENCE,ONTOLOGY
69651,Tgfbr2,DOID:14004,thoracic aortic aneurysm,,ISS,Disease Ontology
1310949,Sos1,DOID:0060582,Noonan syndrome 4,,ISS,Disease Ontology
620906,Six1,DOID:1612,breast cancer,severity,ISS,Disease Ontology
3075,Mecp2,DOID:1206,Rett syndrome,,ISS,Disease Ontology
2507,Dmd,DOID:0060561,DMD-related dilated cardiomyopathy,,ISS,Disease Ontology
620906,Six1,DOID:3459,breast carcinoma,severity,ISS,Disease Ontology


In [36]:
nrow(rat.do)

In [37]:
RAT_TERMS_FILE <- '/Users/smathias/TCRD/data/RGD/rat_terms.tsv'
if(file.exists(RAT_TERMS_FILE)) {
  file.remove(RAT_TERMS_FILE)
}
fwrite(rat.do, file = RAT_TERMS_FILE, append = file.exists(RAT_TERMS_FILE), col.names = !file.exists(RAT_TERMS_FILE), sep = "\t", row.names = F, quote = T, na = "None")

In [38]:
rat.mp <- fread("ftp://ftp.rgd.mcw.edu/pub/data_release/with_terms/rattus_terms_mp", sep = "\t", na.strings = "", skip = 27, verbose = T, quote = "")
rat.mp <- rat.mp[OBJECT_TYPE == "gene"]
rat.mp <- rat.mp[, .(RGD_ID, OBJECT_SYMBOL, TERM_ACC_ID, TERM_NAME, QUALIFIER, EVIDENCE)]
rat.mp[, ONTOLOGY := "Mammalian Phenotype"]
rat.mp <- rat.mp[RGD_ID %in% rat_genes$GENE_RGD_ID]
rat.mp <- unique(rat.mp, by = c("RGD_ID", "TERM_ACC_ID"))
head(rat.pm)

Input contains no \n. Taking this to be a filename to open
File opened, filesize is 0.000915 GB.
Memory mapping ... ok
Detected eol as \n only (no \r afterwards), the UNIX and Mac standard.
Positioned on line 28 after skip or autostart
This line isn't blank and skip>0 so we're done
Using supplied sep '\t' ... found ok
Detected 16 columns. Longest stretch was from line 28 to line 57
Starting data input on line 28 (either column names or first row of data). First 10 characters: RGD_ID	OBJ
The line before starting line 28 is non-empty and will be ignored (it has too few or too many items to be column names or data): #16  ORIGINAL_REFERENCE original referenceAll the fields on line 28 are character fields. Treating as the column names.
Count of eol: 7009 (including 1 at the end)
Count of sep: 105120
nrow = MIN( nsep [105120] / (ncol [16] -1), neol [7009] - endblanks [1] ) = 7008
Type codes (point  0): 1444444444414040
Type codes (point  1): 1444444444414040
Type codes (point  2): 1444444444

ERROR: Error in head(rat.pm): object 'rat.pm' not found


In [39]:
nrow(rat.mp)

In [40]:
fwrite(rat.mp, file = RAT_TERMS_FILE, append = file.exists(RAT_TERMS_FILE), col.names = !file.exists(RAT_TERMS_FILE), sep = "\t", row.names = F, quote = T, na = "None")

In [41]:
rat.rdo <- fread("ftp://ftp.rgd.mcw.edu/pub/data_release/with_terms/rattus_terms_rdo", sep = "\t", na.strings = "", skip = 27, verbose = T, quote = "")
rat.rdo <- rat.rdo[OBJECT_TYPE == "gene"]
rat.rdo <- rat.rdo[, .(RGD_ID, OBJECT_SYMBOL, TERM_ACC_ID, TERM_NAME, QUALIFIER, EVIDENCE)]
rat.rdo[, ONTOLOGY := "RGD Disease Ontology"]
rat.rdo <- rat.rdo[RGD_ID %in% rat_genes$GENE_RGD_ID]
rat.rdo <- unique(rat.rdo, by = c("RGD_ID", "TERM_ACC_ID"))
head(rat.rdo)

Input contains no \n. Taking this to be a filename to open
File opened, filesize is 0.025776 GB.
Memory mapping ... ok
Detected eol as \n only (no \r afterwards), the UNIX and Mac standard.
Positioned on line 28 after skip or autostart
This line isn't blank and skip>0 so we're done
Using supplied sep '\t' ... found ok
Detected 16 columns. Longest stretch was from line 28 to line 57
Starting data input on line 28 (either column names or first row of data). First 10 characters: RGD_ID	OBJ
The line before starting line 28 is non-empty and will be ignored (it has too few or too many items to be column names or data): #16  ORIGINAL_REFERENCE original referenceAll the fields on line 28 are character fields. Treating as the column names.
Count of eol: 131216 (including 1 at the end)
Count of sep: 1968225
nrow = MIN( nsep [1968225] / (ncol [16] -1), neol [131216] - endblanks [1] ) = 131215
Type codes (point  0): 1444444444414444
Type codes (point  1): 1444444444414444
Type codes (point  2): 14

RGD_ID,OBJECT_SYMBOL,TERM_ACC_ID,TERM_NAME,QUALIFIER,EVIDENCE,ONTOLOGY
708540,Cxcl6,RDO:0001650,"Scleroderma, Diffuse",,ISS,RGD Disease Ontology
621528,Cxcr3,RDO:0001051,Myasthenia Gravis,,ISS,RGD Disease Ontology
1590342,Adssl1,RDO:0005352,"Diabetes Mellitus, Experimental",,IEP,RGD Disease Ontology
3177,Ngfr,RDO:0006438,Pulmonary Fibrosis,,ISS,RGD Disease Ontology
3177,Ngfr,RDO:0006542,Sciatic Neuropathy,,ISS,RGD Disease Ontology
2097,Alox5ap,RDO:0004932,Asthma,,ISS,RGD Disease Ontology


In [42]:
nrow(rat.rdo)

In [43]:
fwrite(rat.rdo, file = RAT_TERMS_FILE, append = file.exists(RAT_TERMS_FILE), col.names = !file.exists(RAT_TERMS_FILE), sep = "\t", row.names = F, quote = T, na = "None")