In [None]:
from pathlib import Path
import pandas as pd 
import numpy as np
from plotnine import * 


In [None]:
# Collect the per-program result tables; files whose stem is listed in
# `unwanted_files` are skipped by name.
folder = Path('../smallTseq/nya_gentools/results/')
unwanted_files = ['coldata', 'de_genes']
file_list = sorted(
    path
    for path in folder.iterdir()
    if path.suffix == '.csv' and path.stem not in unwanted_files
)
names = [path.stem for path in file_list]

# One frame per CSV, tagged with the file stem as its `program` label.
df_list = [pd.read_csv(path).assign(program=path.stem) for path in file_list]
total_df = pd.concat(df_list)
# Keep only the part of each sample name that precedes the first underscore.
total_df['name'] = [sample.split('_')[0] for sample in total_df['name']]

# 'raw' rows: per-sample column maxima, with `pre_filtering` kept as the value.
raw = (
    total_df
    .groupby('name')
    .max()
    .reset_index()
    .drop(columns='post_filtering')
    .rename(columns={'pre_filtering': 'value'})
    .assign(program='raw')
)

# Per-program rows: `post_filtering` becomes the value.
total_df = (
    total_df
    .drop(columns='pre_filtering')
    .rename(columns={'post_filtering': 'value'})
)

plot = pd.concat([total_df, raw], ignore_index=True)



In [None]:
# Order the programs by their median read count. This is done with pandas so
# the cell only relies on names imported at the top of the notebook
# (`mutate`, `fct_reorder` and `f` are not imported there).
program_order = (plot
    .groupby('program')['value']
    .median()
    .sort_values()
    .index)
plot_ordered = plot.assign(
    program=pd.Categorical(plot['program'], categories=program_order))

fig = (plot_ordered >>
       ggplot(aes('program', 'value', fill='program')) +
       geom_col() + facet_wrap('name') +
       coord_flip() +
       theme_seaborn() +
       theme(axis_text_x = element_text(angle = 90)) +
       labs(x='',
           y='Number of reads',
           title='Reads filtered by each program'))
    





In [None]:
def plot_col_all(config: str) -> None:
    '''Create and save a faceted bar plot of read counts per processing step.

    Reads every CSV in the results folder (except ``coldata`` and
    ``de_genes``), labels each table with its file stem as ``program`` and
    adds one ``raw`` row per sample whose value is the largest
    ``pre_filtering`` count seen for that sample. The combined table is
    written to ``concatenated_processing_file.csv`` and the figure to
    ``processed_files_all.pdf``, both inside the results folder.

    Parameters
    ----------
    config : str
        Passed unchanged to ``CreateFolders``; the object it returns must
        expose a ``results`` directory path.

    Raises
    ------
    ValueError
        If the results folder contains no usable CSV files.
    '''

    folder = CreateFolders(config)
    unwanted_files = ['coldata', 'de_genes']
    file_list = sorted(
        file for file in folder.results.iterdir()
        if file.suffix == '.csv' and file.stem not in unwanted_files)
    if not file_list:
        # pd.concat would raise the same exception type, but with a message
        # that does not point at the missing input files.
        raise ValueError(f'No result CSV files found in {folder.results}')

    df_list = [pd.read_csv(file).assign(program=file.stem) for file in file_list]
    total_df = pd.concat(df_list)
    # Keep only the part of each sample name that precedes the first underscore.
    total_df['name'] = [x.split('_')[0] for x in total_df['name']]

    # 'raw' rows: per-sample column maxima, with pre_filtering kept as the value.
    raw = (total_df
        .groupby('name')
        .max()
        .reset_index()
        .assign(program='raw')
        .drop('post_filtering', axis=1)
        .rename({'pre_filtering': 'value'}, axis=1))

    # Per-program rows: post_filtering becomes the value.
    total_df = (total_df
        .drop('pre_filtering', axis=1)
        .rename({'post_filtering': 'value'}, axis=1))

    plot = pd.concat([total_df, raw], ignore_index=True)

    # Order programs by median read count using pandas only, so the function
    # does not depend on mutate/fct_reorder/f being available.
    program_order = plot.groupby('program')['value'].median().sort_values().index
    plot_ordered = plot.assign(
        program=pd.Categorical(plot['program'], categories=program_order))

    fig = (plot_ordered >>
       ggplot(aes('program', 'value', fill='program')) +
       geom_col() + facet_wrap('name') +
       coord_flip() +
       theme_seaborn() +
       theme(axis_text_x = element_text(angle = 90)) +
       labs(x='',
           y='Number of reads',
           title='Reads filtered by each program'))

    # Write the table first so it is available even if rendering the figure fails.
    plot_save_name = folder.results / 'concatenated_processing_file.csv'
    plot.to_csv(plot_save_name, index=False)

    # The original computed this path but never wrote the figure to it.
    fig_save_name = folder.results / 'processed_files_all.pdf'
    fig.save(fig_save_name)
    
    


In [None]:


# Bar plot of read counts per program, one facet per sample with free scales.
# Programs are ordered inside the aesthetic expression; the legend title and
# axis_text_y are blanked.
fig = plot >> (
    ggplot(aes(x='reorder(program, value)', y='value', fill='program'))
    + geom_col()
    + facet_wrap('name', scales='free')
    + coord_flip()
    + theme_seaborn()
    + labs(x='', y='Number of reads', title='Reads filtered by each program')
    + theme(
        axis_text_x=element_text(angle=90),
        axis_text_y=element_blank(),
        legend_title=element_blank(),
    )
)

fig

In [None]:
import re 

log = '../SRR3495859_umi_tools_extract.log'
# A descriptive handle name keeps this scratch cell from rebinding short
# notebook-level names such as `f`.
with open(log, 'r') as log_handle:
    log_file = log_handle.read()

# Capture the digit/comma groups that follow each summary label in the log.
pattern_pre = re.compile(r'Total reads processed: +([\d,]+)')
pattern_post = re.compile(r'Reads written \(passing filters\): +([\d,]+)')

match_pre = pattern_pre.findall(log_file)
match_post = pattern_post.findall(log_file)

# Only the last expression of a cell is displayed, so show both results together.
match_pre, match_post

In [None]:
# Scratch check: taking the text before the first underscore leaves a name
# without any underscore unchanged.
test = 'SRRR88888'
test.partition('_')[0]

In [1]:
from programs import FeatureCountsCommando

In [8]:
import yaml

# Read the YAML config as raw text (it is not parsed here) and display it.
# A descriptive handle name avoids rebinding short notebook-level names such as `f`.
with open('../test_yaml.yaml', 'r') as yaml_handle:
    file = yaml_handle.read()

file

'### Setup ###\n\n# Enter the directory containing the raw reads and a directory name that you want to use as a working directory\n\nraw_reads: data_smallseq_10_files\nworking_directory: gentools_smallseq_10_files\nthreads: 6\n\n### Preprocessing ###  \n\n# The default configuration is optimized to work with miRNA data. Especially libraries prepared with the \n# Small-seq protocol. \n\numi_tools_extract:\n- input: raw\n- mode: extract\n- extract-method: regex\n- bc-pattern: (?P<discard_1>.*)(?P<umi_1>[ACT]{8}CA)\n\n# For trimmed only enter Y for yes and N for no if you want cutadapt to only keep reads that have been filtered.\n\ncutadapt:\n- input: umi_tools_extract\n- adapter: TGGAATTCTCGGGTGCCAAGG\n- minimum-length: 18\n- maximum-length: 41\n- error-rate: 0.1\n- overlap: 1\n- trimmed_only?: N\n\n# enter folder and prefix of index name, e.g. index/human_genome/hg38\n\nbowtie2:\n- input: cutadapt\n- k: 100\n- local: very-sensitive-local\n- x: /bowtie2_index/genome/GRCh38\n\n\numi_tools