# WS_ch01C.ipynb
# WESmith 04/27/23
# INTERFACING WITH R VIA rpy2
## (see book code in Chapter01/Interfacing_R.py)
## WS created this notebook to follow along with code from the book
## 'Bioinformatics with Python Cookbook' by Tiago Antao
### Each recipe will have its own notebook, suffixed by A, B, etc.

In [None]:
import os
import pandas as pd
from IPython.display import Image
import utils as ws

In [None]:
import rpy2.robjects as robjects
import rpy2.robjects.lib.ggplot2 as ggplot2
from   rpy2.robjects.functions import SignatureTranslatedFunction
from   rpy2.robjects import pandas2ri
from   rpy2.robjects.conversion import localconverter

In [None]:
# Get the data: the download took about 5 minutes (64 MB) and is saved on disk as sequence.index
#!wget -nd http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/20130502.phase3.sequence.index -O sequence.index
# WS: I moved sequence.index to .../ch01/data/

In [None]:
# Directory and file name of the sequence index, relative to this notebook.
data_dir, file = 'data', 'sequence.index'

In [None]:
# Look up R's read.delim by name so it can be called like a Python function.
read_delim = robjects.r['read.delim']

In [None]:
# Load the sequence index into an R data frame; the first line is the header
# and text columns are kept as plain strings rather than factors.
index_path = os.path.join(data_dir, file)
seq_data = read_delim(index_path, header=True, stringsAsFactors=False)

In [None]:
# Inspect the loaded R data frame with the `attrs` helper from the local
# `utils` module (imported as `ws`); presumably it lists the object's
# attributes — verify against utils.attrs.
ws.attrs(seq_data)

In [None]:
# Report the data frame's dimensions, then its column names.
n_cols, n_rows = seq_data.ncol, seq_data.nrow
print('This data frame has %d columns and %d rows' % (n_cols, n_rows))
print(seq_data.colnames)

In [None]:
# Same column count, this time obtained by calling R's ncol(); the result is
# indexed like a vector, so take its first element.
column_total = robjects.r.ncol(seq_data)[0]
print('number of columns: {}'.format(column_total))

In [None]:
# Fetch two base-R functions by name for use as Python callables.
as_integer = robjects.r['as.integer']
match = robjects.r['match']

In [None]:
# match() returns a vector; its first element is the position of READ_COUNT
# among the column names.
my_col = match('READ_COUNT', seq_data.colnames)[0]
# The position from match() is shifted by one to index the column from Python.
read_count_class = seq_data[my_col - 1].rclass[0]
print('Type of read count before as.integer: %s' % read_count_class)

In [None]:
# Replace the READ_COUNT column with its integer version and confirm the new
# R class.
read_count_idx = my_col - 1
seq_data[read_count_idx] = as_integer(seq_data[read_count_idx])
print('Type of read count after as.integer: %s' % seq_data[read_count_idx].rclass[0])

In [None]:
# Convert the BASE_COUNT column to integers in the same way.
base_count_pos = match('BASE_COUNT', seq_data.colnames)  # vector returned
my_col = base_count_pos[0]
seq_data[my_col - 1] = as_integer(seq_data[my_col - 1])

In [None]:
# Upper-case the CENTER_NAME column, then bind the data frame to the R
# variable `seq.data` so later R snippets can refer to it by name.
my_col = match('CENTER_NAME', seq_data.colnames)[0]
center_names = seq_data[my_col - 1]
seq_data[my_col - 1] = robjects.r.toupper(center_names)
robjects.r.assign('seq.data', seq_data)
robjects.r('print(c("Column names in R: ",colnames(seq.data)))')

In [None]:
# Remove all withdrawn sequences: keep only the rows of the R-side `seq.data`
# whose WITHDRAWN column equals 0.
# NOTE(review): this rebinds `seq.data` inside R only; the Python variable
# `seq_data` presumably still refers to the unfiltered frame — confirm before
# reusing it.
robjects.r('seq.data <- seq.data[seq.data$WITHDRAWN==0, ]')

In [None]:
# Shorten the R-side data frame to the twelve columns used from here on.
# The adjacent literals below are joined into a single R statement.
robjects.r(
    "seq.data <- seq.data[, c("
    "'STUDY_ID', 'STUDY_NAME', 'CENTER_NAME', "
    "'SAMPLE_ID', 'SAMPLE_NAME', 'POPULATION', "
    "'INSTRUMENT_PLATFORM', 'LIBRARY_LAYOUT', 'PAIRED_FASTQ', "
    "'READ_COUNT', 'BASE_COUNT', 'ANALYSIS_GROUP'"
    ")]"
)

In [None]:
# Population as factor: convert the POPULATION column of the R-side
# `seq.data` to an R factor, in place within R.
robjects.r('seq.data$POPULATION <- as.factor(seq.data$POPULATION)')

In [None]:
# Bar chart of the number of index entries per CENTER_NAME, written to a PNG.
# NOTE(review): this plots the Python-side `seq_data`, not the `seq.data`
# variable that the R snippets filter and shorten — confirm that is intended.
out_image = os.path.join(data_dir, 'out1.png') # WS

# theme() is called with underscore keywords that must reach R as dotted
# names. Both axis keywords are mapped explicitly; previously only
# axis_text_x was, so axis_text_y presumably relied on rpy2 translating it
# automatically.
ggplot2.theme = SignatureTranslatedFunction(
    ggplot2.theme,
    init_prm_translate={'axis_text_x': 'axis.text.x',
                        'axis_text_y': 'axis.text.y'})

bar = (ggplot2.ggplot(seq_data)
       + ggplot2.geom_bar()
       + ggplot2.aes_string(x='CENTER_NAME')
       + ggplot2.theme(
           axis_text_x=ggplot2.element_text(angle=90, hjust=1, size=40),
           axis_text_y=ggplot2.element_text(size=40),
           text=ggplot2.element_text(size=40)))

dev_off = robjects.r('dev.off')
robjects.r.png(out_image, width=16, height=9, units="in", res=600)
try:
    bar.plot()
finally:
    # Close the PNG device even when plotting raises, so a failed cell does
    # not leave an open device behind.
    dev_off()

In [None]:
# Display the PNG at `out_image` inline as the cell's output.
Image(filename=out_image)

In [None]:
# Get Yoruba (YRI) and CEU rows whose BASE_COUNT is below 2E9 and READ_COUNT
# is below 3E7, then fetch the resulting R data frame into Python.
# The adjacent literals below are joined into a single R statement.
robjects.r(
    'yri_ceu <- seq.data[seq.data$POPULATION %in% c("YRI", "CEU")'
    ' & seq.data$BASE_COUNT < 2E9'
    ' & seq.data$READ_COUNT < 3E7, ]'
)
yri_ceu = robjects.r('yri_ceu')

In [None]:
# Scatter plot of READ_COUNT against BASE_COUNT for the YRI/CEU subset, with
# point shape set by POPULATION and colour by ANALYSIS_GROUP, written to a PNG.
out_image = os.path.join(data_dir, 'out2.png') # WS

scatter = (ggplot2.ggplot(yri_ceu)
           + ggplot2.aes_string(x='BASE_COUNT', y='READ_COUNT',
                                shape='factor(POPULATION)',
                                col='factor(ANALYSIS_GROUP)')
           + ggplot2.geom_point())

dev_off = robjects.r('dev.off')
robjects.r.png(out_image, width=16, height=9, units="in", res=600)
try:
    scatter.plot()
finally:
    # Close the PNG device even when plotting raises, so a failed cell does
    # not leave an open device behind.
    dev_off()

In [None]:
# Display the PNG at `out_image` inline as the cell's output.
Image(filename=out_image)

In [None]:
# Round trip through pandas: convert the R data frame to a DataFrame, drop
# one column, convert back, and publish the result to R as `no.paired`.
pandas_rules = robjects.default_converter + pandas2ri.converter

with localconverter(pandas_rules):
    pd_yri_ceu = robjects.conversion.rpy2py(yri_ceu)

del pd_yri_ceu['PAIRED_FASTQ']

# py2rpy is used here in place of the older `pandas2ri.py2ri(pd_yri_ceu)` call.
with localconverter(pandas_rules):
    no_paired = robjects.conversion.py2rpy(pd_yri_ceu)

robjects.r.assign('no.paired', no_paired)
robjects.r("print(colnames(no.paired))")