In [2]:
import polars as pl
from os.path import join

In [3]:
# Sample indices 0-9; one coverage-definition TSV path is built per index.
numbers = list(range(10))
# Pass the directory and the file name as separate components so that join()
# actually assembles the path (with a single pre-joined argument it is a no-op).
coverage_defs = [join('coverage_definitions', f'coverage{i}.tsv') for i in numbers]

In [4]:
# For every coverage file, sum the second column over the rows whose first
# column contains 'Otu'. The files are header-less TSVs, so polars names the
# columns column_1, column_2, ...
total_coverages = []
for cov_file in coverage_defs:
    # The per-file table gets its own name: `df` is the name the later cells use
    # for the per-sample summary table, and rebinding it here would silently
    # replace that table whenever this cell is re-run.
    otu_coverage = pl.read_csv(cov_file, separator='\t', has_header=False).filter(
        pl.col('column_1').str.contains('Otu')
    )
    total_coverages.append(otu_coverage['column_2'].sum())

In [5]:
# Summary table: one row per sample index, paired with the total computed for
# that sample's coverage file.
summary_columns = {'sample': numbers, 'total_coverage': total_coverages}
df = pl.DataFrame(summary_columns)
df

sample,total_coverage
i64,f64
0,1657.79682
1,1650.28354
2,1686.9465
3,1828.4964
4,1634.22324
5,1737.54842
6,1772.31032
7,1694.80384
8,1790.228
9,1597.09387


In [6]:
# Share, in percent, that a coverage of 0.35 amounts to within each sample's
# total coverage. The alias is applied as the final step so the output column
# name does not depend on how a name propagates through the multiplication.
df = df.with_columns((0.35 / pl.col('total_coverage') * 100).alias('relabund0.35'))
df

sample,total_coverage,relabund0.35
i64,f64,f64
0,1657.79682,0.021112
1,1650.28354,0.021208
2,1686.9465,0.020748
3,1828.4964,0.019141
4,1634.22324,0.021417
5,1737.54842,0.020143
6,1772.31032,0.019748
7,1694.80384,0.020651
8,1790.228,0.019551
9,1597.09387,0.021915


In [7]:
# Mean of the 'relabund0.35' column across all samples, shown as a one-row frame.
df.select(pl.mean('relabund0.35'))

relabund0.35
f64
0.020563


# How many OTU entries have zero coverage in each coverage file?

In [9]:
# Count, per coverage file, the rows whose first column contains 'Otu' and whose
# second column is exactly zero.
num_zeroes_list = []
for cov_file in coverage_defs:
    # Use a dedicated name for the per-file table: rebinding `df` here would
    # overwrite the per-sample summary table built in the earlier cells, so
    # re-running those cells' queries afterwards would hit the wrong frame.
    otu_coverage = pl.read_csv(cov_file, separator='\t', has_header=False).filter(
        pl.col('column_1').str.contains('Otu')
    )
    num_zeroes = len(otu_coverage.filter(pl.col('column_2') == 0))
    num_zeroes_list.append(num_zeroes)
# Display the per-file counts together with their arithmetic mean.
num_zeroes_list, sum(num_zeroes_list)/len(num_zeroes_list)

([388, 327, 348, 172, 349, 273, 267, 344, 245, 363], 307.6)