In [2]:
import os
import os.path
import glob

In [224]:
#inputs
work_dir = os.path.abspath("/mnt/projects/zilov/metagenomes/usearch/cancer_16s_run_2/")
reads = os.path.abspath("/mnt/projects/zilov/data/quiime_metagenome/v16trim_R/*_R1_*.fastq")

common_out = work_dir + "/common_out/"
uparse_out = work_dir + "/uparse_out/"
unoise_out = work_dir + "/unoise_out/"

threads = "100"

dirs_to_create = [work_dir ,common_out, uparse_out, unoise_out]
for folder in dirs_to_create:
    if not os.path.exists(folder):
        os.mkdir(folder)
        
tool = "usearch"

In [225]:
#common_script is consist of merging reads, filtering and uniques definitions

#1) script for merging
merge_in = reads
merge_out = common_out + "merged.fq"
merge_command = f"{tool} -fastq_mergepairs {merge_in} -fastq_maxdiffs 14 -fastq_pctid 10 -fastqout {merge_out}"
print(merge_command)
#2) script for filtering
filt_in = merge_out
filt_out = common_out + "filtered.fasta"
filt_command = f"{tool} -fastq_filter {filt_in} -fastq_maxee 1.0 -fastaout {filt_out}"
print(filt_command)
# if not run because of file size, use split_merged_file function
#3) script for uniques search
unique_in = filt_out
unique_out = common_out + "uniques.fasta"
unique_command = f"{tool} -fastx_uniques {unique_in} -fastaout {unique_out} -relabel Uniq -sizeout"
print(unique_command)
# Than two ways OTU - Uparse, ASV - Unoise

usearch -fastq_mergepairs /mnt/projects/zilov/data/quiime_metagenome/v16trim_R/*_R1_*.fastq -fastq_maxdiffs 14 -fastq_pctid 10 -fastqout /mnt/projects/zilov/metagenomes/usearch/cancer_16s_run_2/common_out/merged.fq
usearch -fastq_filter /mnt/projects/zilov/metagenomes/usearch/cancer_16s_run_2/common_out/merged.fq -fastq_maxee 1.0 -fastaout /mnt/projects/zilov/metagenomes/usearch/cancer_16s_run_2/common_out/filtered.fasta
usearch -fastx_uniques /mnt/projects/zilov/metagenomes/usearch/cancer_16s_run_2/common_out/filtered.fasta -fastaout /mnt/projects/zilov/metagenomes/usearch/cancer_16s_run_2/common_out/uniques.fasta -relabel Uniq -sizeout


In [237]:
# uparse pipeline 
uparse_dir = work_dir + "/uparse_out/"
print(uparse_dir)
#1) Build OTU clusters

otu_cluster_in = unique_out
otu_cluster_out = uparse_dir + 'otus.fa'
otu_cluster_command = f"{tool} -cluster_otus {otu_cluster_in} -minsize 2 -otus {otu_cluster_out} -relabel Otu"
print(f"\n{otu_cluster_command}")

#2) Build OTU table (that is better to do with xargs, if you got big dataset)

otu_table_in = otu_cluster_out
otu_table_out = uparse_dir + "otu_table.txt"
otu_map_out = uparse_dir + "map.txt"
otu_table_command = f"{tool} -otutab {merge_out} -otus {otu_cluster_out} -otutabout {otu_table_out} -mapout {otu_map_out}"
print(f"\n{otu_table_command}")

# at that point I've got problem with merged.fq file again, I've splited it and made an OTU table for each part
# I will merge OTUs in 4 step
    
# 2.1) Usearch global
otu_tab_global = "/mnt/projects/zilov/metagenomes/usearch/cancer_16s/uparse_out/otutable_global.txt"
command = f"{tool} -usearch_global {merge_out} -db {otu_cluster_out} -strand plus -id 0.97 -otutabout {otu_tab_global}"
print(f"\n{command}")
    
# 4) Merge few OTU tables (helpful when you have big files) - if doesnt work - use merge_and_sort_otu_tables function

otu_merge_in = glob.glob(uparse_dir + "otu_table_0*.txt")
otu_merge_in.sort()
otu_merge_in = ",".join(otu_merge_in)
otu_merge_out = uparse_dir + "otu_table_full.txt"
otu_merge_command = f"{tool} -otutab_merge {otu_merge_in} -output {otu_merge_out} -threads {threads}"
print(f"\n{otu_merge_command}")

# 5) Sort rare samples - if doesnt work look at 4 step

otu_sort_in = uparse_dir + "otu_table_merged.txt"
otu_sort_out = uparse_dir + "otutab_sort_5k.txt"
otu_sort_command = f"{tool} -otutab_rare {otu_sort_in} -sample_size 5000 -output {otu_sort_out}"
print(f"\n{otu_sort_command}")


# 6) alpha-diversity

alpha_in = uparse_dir + "otu_table_merged.txt"
alpha_out = uparse_dir + "alpha_gini_simpson.txt"
alpha_div_otu_command = f"{tool} -alpha_div {alpha_in} -output {alpha_out} -metrics simpson"
print(f"\n{alpha_div_otu_command}")

/mnt/projects/zilov/metagenomes/usearch/cancer_16s_run_2/uparse_out/

usearch -cluster_otus /mnt/projects/zilov/metagenomes/usearch/cancer_16s_run_2/common_out/uniques.fasta -minsize 2 -otus /mnt/projects/zilov/metagenomes/usearch/cancer_16s_run_2/uparse_out/otus.fa -relabel Otu

usearch -otutab /mnt/projects/zilov/metagenomes/usearch/cancer_16s_run_2/common_out/merged.fq -otus /mnt/projects/zilov/metagenomes/usearch/cancer_16s_run_2/uparse_out/otus.fa -otutabout /mnt/projects/zilov/metagenomes/usearch/cancer_16s_run_2/uparse_out/otu_table.txt -mapout /mnt/projects/zilov/metagenomes/usearch/cancer_16s_run_2/uparse_out/map.txt

usearch -usearch_global /mnt/projects/zilov/metagenomes/usearch/cancer_16s_run_2/common_out/merged.fq -db /mnt/projects/zilov/metagenomes/usearch/cancer_16s_run_2/uparse_out/otus.fa -strand plus -id 0.97 -otutabout /mnt/projects/zilov/metagenomes/usearch/cancer_16s/uparse_out/otutable_global.txt

usearch -otutab_merge  -output /mnt/projects/zilov/metagenomes/usea

In [126]:
# unoise pipeline
unoise_in = unique_out
unoise_out = '/mnt/projects/zilov/metagenomes/usearch/cancer_16s/unoise_out/zotus.fa'
unoise_command = f"usearch -unoise3 {unoise_in} -zotus {unoise_out}"
print(unoise_command)

usearch -unoise3 /mnt/projects/zilov/metagenomes/usearch/cancer_16s/common_out/uniques.fasta -zotus /mnt/projects/zilov/metagenomes/usearch/cancer_16s/unoise_out/zotus.fa


In [251]:
# SPLIT BIG FILES IN FEW ()
# count number of lines in your merged file with (less ./merged.fq | wc -l)
#inputs
def split_files(file_path, lines_in, file_format = "fastq", split_number = 3):
    
    n = 4
    if file_format == "fasta":
        n = 2

    reads_number = lines_in / n # 4 because of fastq file structure, use 2 for fasta

    #found out split number
    if n == 2:
        if reads_number % split_number != 0:
            lines_in += 1
    else:
        while reads_number % split_number != 0:
            print(split_number)
            split_number+=1
    
    print(f"Will split file in {split_number} files")

    line_split_number = int(lines_in / split_number)
    
    prefix = file_path.split(".")[0] + "_"

    command = f"split -d -l{line_split_number} {file_path} {prefix}"
    print("Insert this command in command line:", command)
    print(".\n"*5)
    print("After spliting you can merge files with the help of cat command >>>")
    print(".")
    command = "cat ./file_00.fasta ./file_01.fasta ./file_02.fasta > ./file.fasta"
    print(command)

In [195]:
# Merge few OTUs tables
otu_table_list = glob.glob(uparse_dir + "otu_*0*.txt")
otu_table_list.sort()

otu_dir = {}

def merge_and_sort_otu_tables(directory, prefix, output_prefix, sort_number = 5000):
    """
    Desctiption: merges few Otu tables:
    directory - path to the folder with OTU tables to merge (should end with "/")
    prefix - prefix of tables in otu*0*.txt format
    output_file - prefix to output file
    """
    otu_table_list = glob.glob(directory + prefix)
    for table in otu_table_list:
        with open(table) as fh:
            for line in fh:
                if line.startswith("#"):
                    continue
                line = line.split()
                otu_key = line[0]
                otu_value = line[1]
                if otu_key in otu_dir.keys():
                    otu_dir[otu_key] += otu_value
                else:
                    otu_dir[otu_key] = otu_value
    
    with open(directory + output_prefix, "w") as fw:
        fw.write("#OTU ID	M04266")
        for key, value in otu_dir.items():
            if int(value) > sort_number:
                fw.write("\n" + key + "\t" + value)
            
merge_otu_tables(uparse_dir, "otu_*0*.txt", "otu_table_merged.txt")

In [None]:
otu_dir_1 = {}
otu_dir_2 = {}
otu_dir_3 = {}

In [180]:
with open(otu_table_list[0]) as fh:
    for line in fh:
        if line.startswith("#"):
            continue
        line = line.split()
        otu_key = line[0]
        otu_value = int(line[1])
        otu_dir_1[otu_key] = otu_value

1444
797
2202
4443
4443


In [253]:
# split merged_file
split_files("/mnt/projects/zilov/metagenomes/usearch/cancer_16s_run_2/common_out/merged.fq", 38363976)

Will split file in 3 files
Insert this command in command line: split -d -l12787992 /mnt/projects/zilov/metagenomes/usearch/cancer_16s_run_2/common_out/merged.fq /mnt/projects/zilov/metagenomes/usearch/cancer_16s_run_2/common_out/merged_
.
.
.
.
.

After spliting you can merge files with the help of cat command >>>
.
cat ./file_00.fasta ./file_01.fasta ./file_02.fasta > ./file.fasta


In [119]:
#split filtered file
split_files("/mnt/projects/zilov/metagenomes/usearch/cancer_16s/common_out/filtered.fasta", 38363976, "fastq", 2)

Will split file in 2 files
Insert this command in command line: split -d -l22966267 file file_
.
.
.
.
.

After spliting you can merge files with the help of cat command >>>
.
cat ./file_00.fasta ././file_01.fasta ./file_02.fasta > ./file.fasta


'split -d -l22966267.0 ./merged.fq ./merged_'

In [44]:
# merge(assembly) of 16S rRNA Illumina reads, options - https://drive5.com/usearch/manual/merge_options.html
#command = f"usearch -fastq_mergepairs {forward_read}"
# print(command)

# если слилось меньше 70% ридов, меняем дефолтные параметры
# -fastq_maxdiffs - как много ошибок может быть в риде (дефолт 5), увеличиваем на 1, проверяем как меняется процент
# в какой=то момент он перестанет расти, это значение оставляем
output_dir_merge = work_dir + "/merged"
if not os.path.exists(output_dir_merge): 
    os.mkdir(output_dir_merge)

command_new = f"usearch -fastq_mergepairs {reads_dir}/*_R1.fastq -fastq_maxdiffs 14 -fastq_pctid 10 -fastqout {output_dir_merge}/merged.fq "
print(command_new)

usearch -fastq_mergepairs /mnt/projects/zilov/data/quiime_metagenome/v16trim/*_R1.fastq -fastq_maxdiffs 14 -fastq_pctid 10 -fastqout /mnt/projects/zilov/metagenomes/usearch/cancer_16s/merged/merged.fq 


In [20]:
# basic statistics
output_dir = work_dir + "/reads_stats"
for f_read in forward_reads:
    in_file = f_read
    out_file = output_dir + "/" + f_read.split("/")[-1].split(".")[0]
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    command = f"usearch -fastx_info {in_file} > {out_file}.stat"
    print(command)

usearch -fastx_info /mnt/projects/zilov/data/quiime_metagenome/v16trim/10b_1.fastq > /mnt/projects/zilov/metagenomes/usearch/cancer_16s/reads_stats/10b_1.stat
usearch -fastx_info /mnt/projects/zilov/data/quiime_metagenome/v16trim/11b_1.fastq > /mnt/projects/zilov/metagenomes/usearch/cancer_16s/reads_stats/11b_1.stat
usearch -fastx_info /mnt/projects/zilov/data/quiime_metagenome/v16trim/12b_1.fastq > /mnt/projects/zilov/metagenomes/usearch/cancer_16s/reads_stats/12b_1.stat
usearch -fastx_info /mnt/projects/zilov/data/quiime_metagenome/v16trim/13b_1.fastq > /mnt/projects/zilov/metagenomes/usearch/cancer_16s/reads_stats/13b_1.stat
usearch -fastx_info /mnt/projects/zilov/data/quiime_metagenome/v16trim/14b_1.fastq > /mnt/projects/zilov/metagenomes/usearch/cancer_16s/reads_stats/14b_1.stat
usearch -fastx_info /mnt/projects/zilov/data/quiime_metagenome/v16trim/15b_1.fastq > /mnt/projects/zilov/metagenomes/usearch/cancer_16s/reads_stats/15b_1.stat
usearch -fastx_info /mnt/projects/zilov/data/q

In [246]:
reads_dir = '/mnt/projects/zilov/data/quiime_metagenome/v16trim_R/'
forward_reads = glob.glob("/mnt/projects/zilov/data/quiime_metagenome/v16trim/*_1.fastq")
reverse_reads = glob.glob("/mnt/projects/zilov/data/quiime_metagenome/v16trim/*_2.fastq")

forward_reads.sort()
reverse_reads.sort()

In [249]:
for i in range(len(forward_reads)):
    mv_file = reverse_reads[i]
    prefix = "human_" + mv_file.split("/")[-1].split("_")[0] + "_R2_001.fastq"
    command = f"cp {mv_file} {reads_dir + prefix}"
    print(command)
    os.system(command)

cp /mnt/projects/zilov/data/quiime_metagenome/v16trim/10b_2.fastq /mnt/projects/zilov/data/quiime_metagenome/v16trim_R/human_10b_R2_001.fastq
cp /mnt/projects/zilov/data/quiime_metagenome/v16trim/11b_2.fastq /mnt/projects/zilov/data/quiime_metagenome/v16trim_R/human_11b_R2_001.fastq
cp /mnt/projects/zilov/data/quiime_metagenome/v16trim/12b_2.fastq /mnt/projects/zilov/data/quiime_metagenome/v16trim_R/human_12b_R2_001.fastq
cp /mnt/projects/zilov/data/quiime_metagenome/v16trim/13b_2.fastq /mnt/projects/zilov/data/quiime_metagenome/v16trim_R/human_13b_R2_001.fastq
cp /mnt/projects/zilov/data/quiime_metagenome/v16trim/14b_2.fastq /mnt/projects/zilov/data/quiime_metagenome/v16trim_R/human_14b_R2_001.fastq
cp /mnt/projects/zilov/data/quiime_metagenome/v16trim/15b_2.fastq /mnt/projects/zilov/data/quiime_metagenome/v16trim_R/human_15b_R2_001.fastq
cp /mnt/projects/zilov/data/quiime_metagenome/v16trim/16b_2.fastq /mnt/projects/zilov/data/quiime_metagenome/v16trim_R/human_16b_R2_001.fastq
cp /mn

cp /mnt/projects/zilov/data/quiime_metagenome/v16trim/70b_2.fastq /mnt/projects/zilov/data/quiime_metagenome/v16trim_R/human_70b_R2_001.fastq
cp /mnt/projects/zilov/data/quiime_metagenome/v16trim/72_2.fastq /mnt/projects/zilov/data/quiime_metagenome/v16trim_R/human_72_R2_001.fastq
cp /mnt/projects/zilov/data/quiime_metagenome/v16trim/73_2.fastq /mnt/projects/zilov/data/quiime_metagenome/v16trim_R/human_73_R2_001.fastq
cp /mnt/projects/zilov/data/quiime_metagenome/v16trim/75_2.fastq /mnt/projects/zilov/data/quiime_metagenome/v16trim_R/human_75_R2_001.fastq
cp /mnt/projects/zilov/data/quiime_metagenome/v16trim/76_2.fastq /mnt/projects/zilov/data/quiime_metagenome/v16trim_R/human_76_R2_001.fastq
cp /mnt/projects/zilov/data/quiime_metagenome/v16trim/77_2.fastq /mnt/projects/zilov/data/quiime_metagenome/v16trim_R/human_77_R2_001.fastq
cp /mnt/projects/zilov/data/quiime_metagenome/v16trim/78_2.fastq /mnt/projects/zilov/data/quiime_metagenome/v16trim_R/human_78_R2_001.fastq
cp /mnt/projects/z