In [1]:
### Declare variables here ###
ws   = '/beegfs/work/graaf20/Rstc2020'               # working space for this project; the job scripts below `cd` into it
CR   = 'Pooled'                                      # directory for clean reads, scanned below to build the sample list (do not change unless needed)
db   = '/beegfs/work/graaf20/Databases/SqueezeMeta'  # SqueezeMeta database path; NOTE(review): not referenced by any visible cell - confirm it is still needed
name = 'MAGs/'                                       # name for the directory to output files; NOTE(review): no visible cell reads this global
sts  = ['RSP', 'RF', 'FL', 'FR']                     # directory names looked up as '<st>/bins' and '<st>/19.<st>_SqueezeMeta_CAZy.bintable'

### Do not change from here ###
import os
import pandas as pd

# binac function that will submit the job to server
def binac(string,name,queue):
    """Write a job script to disk, submit it with ``qsub`` and delete the script.

    Parameters
    ----------
    string : text written verbatim into the script file (the cells in this
        notebook pass PBS job templates already filled in with ``%``).
    name : base name of the script; the file is created as ``<name>.sh``
        relative to the current working directory.
    queue : queue name handed to ``qsub -q``.

    Nothing is returned; the output of the shell commands (e.g. whatever
    ``qsub`` prints) shows up in the cell output.
    """
    script = "%s.sh"%name
    # the context manager closes the file before it is handed to qsub
    with open(script, "w") as text_file:
        text_file.write(string)
    !chmod +x $script
    !qsub -q $queue $script
    # the local script is deleted right after the submission attempt
    !rm $script

# Create list of file indexes to access it later
clean = !rm -r */.ipynb_checkpoints
samples = []
if os.path.exists(CR): 
    samples = [f.rsplit('_', 1)[0] for f in os.listdir(CR) if any(['fq' in f,'fastq' in f])]
    samples = sorted(list(set(samples)))
print('Number of clean samples: ', len(samples))
if samples != []:
    print('List of clean samples:')
    print(*samples)

Number of clean samples:  52
List of clean samples:
100_FR_d13_Trt4_HP1 102_FR_d13_Trt4_HP2 104_FR_d13_Trt5_HP1 106_FR_d13_Trt5_HP2 108_FR_d13_Trt6_HP1 110_FR_d13_Trt6_HP2 11_FR_d13_Trt1_HP2 123_FL_d13_Trt1_HP1 124_FL_d13_Trt1_HP2 125_FL_d13_Trt2_HP1 126_FL_d13_Trt2_HP2 127_FL_d13_Trt3_HP1 128_FL_d13_Trt3_HP2 129_FL_d13_Trt4_HP1 130_FL_d13_Trt4_HP2 131_FL_d13_Trt5_HP1 132_FL_d13_Trt5_HP2 133_FL_d13_Trt6_HP1 134_FL_d13_Trt6_HP2 13_FR_d13_Trt2_HP1 15_FR_d13_Trt2_HP2 17_FR_d13_Trt3_HP1 19_FR_d13_Trt3_HP2 1_RSP_d0_not_appl_C1 21_FR_d13_Trt4_HP1 23_FR_d13_Trt4_HP2 25_FR_d13_Trt5_HP1 27_FR_d13_Trt5_HP2 29_FR_d13_Trt6_HP1 31_FR_d13_Trt6_HP2 44_FL_d13_Trt1_HP1 45_FL_d13_Trt1_HP2 46_FL_d13_Trt2_HP1 47_FL_d13_Trt2_HP2 48_FL_d13_Trt3_HP1 49_FL_d13_Trt3_HP2 4_RF_d0_not_appl_C1 50_FL_d13_Trt4_HP1 51_FL_d13_Trt4_HP2 52_FL_d13_Trt5_HP1 53_FL_d13_Trt5_HP2 54_FL_d13_Trt6_HP1 55_FL_d13_Trt6_HP2 80_RSP_d0_not_appl_C1 83_RF_d0_not_appl_C1 88_FR_d13_Trt1_HP1 90_FR_d13_Trt1_HP2 92_FR_d13_Trt2_HP1 94_FR_d13_

# Unzip archive with reads

In [7]:
# Submit one extraction job per zip archive found in the working space.
# Upload the zip file to ws first, then run this cell.
string = '''
#PBS -l nodes=1:ppn=4
#PBS -l walltime=48:00:00
#PBS -l mem=8gb
#PBS -S /bin/bash
source $HOME/miniconda3/etc/profile.d/conda.sh
conda activate unzip
cd %s
bsdtar -xf %s
'''
for z in os.listdir(ws):
    # names containing '.sh' are skipped even when they also contain '.zip'
    is_archive = '.zip' in z and '.sh' not in z
    if is_archive:
        binac(string % (ws, z), 'bsdtar_unzip_' + z, 'short')
        print(z)

10945432
SM_RSTC2020.zip


# Collection of fasta files

In [16]:
# Create a new folder for collected bins

!mkdir -p Bins

#Collect bins
for st in sts:
    bins = f'{st}/bins'
    if not os.path.exists(bins):
        print('No bins!!!')
        continue
    for fa in os.listdir(bins):
        if not fa.endswith('.fa'): 
            continue
        inp = f'{bins}/{fa}'
        out = f'Bins/{st}_{fa}'
        
        !cp $inp $out

# Bins clustering

### Extract completeness and contamination from SqueezeMeta runs

In [2]:
# Collect completeness / contamination of every bin from each project's
# bintable into one frame, with bin names prefixed by the project directory.
columns = ['Bin Id', 'Completeness', 'Contamination']
records = []
for st in sts:
    inp = f'{st}/19.{st}_SqueezeMeta_CAZy.bintable'
    if not os.path.exists(inp):
        print(st, 'No bintab!!!')
        continue
    # the first line of the file is skipped; the first column becomes the index
    tab = pd.read_csv(inp, sep='\t', index_col=0, skiprows=1)
    records.extend(
        (f'{st}_{ind}', completeness, contamination)
        for ind, completeness, contamination
        in zip(tab.index, tab['Completeness'], tab['Contamination'])
    )
# Built once from the collected rows (instead of growing the frame row by
# row); explicit columns keep the filters below valid when nothing was found.
compl = pd.DataFrame(records, columns=columns)

# Bins without a completeness value are kept aside so a later cell can
# delete their FASTA files from Bins/
nans = compl[compl['Completeness'].isna()].copy()
compl = compl[compl['Completeness'].notna()].copy()
display(compl)

# Three-column table with header: passed to mOTUlize.py via --checkm
compl.to_csv('mOTUlizer.tsv', index=False, sep='\t')
# Two-column table without header: passed to SuperPang.py via --checkm
compl = compl[['Bin Id', 'Completeness']]
compl.to_csv('SuperPang.tsv', index=False, header=False, sep='\t')

Unnamed: 0,Bin Id,Completeness,Contamination
0,RSP_maxbin.022.fasta.contigs,66.13,15.99
1,RSP_maxbin.018.fasta.contigs,59.96,41.62
2,RSP_maxbin.009.fasta.contigs,53.21,14.66
3,RSP_maxbin.006.fasta.contigs,40.17,5.27
4,RSP_maxbin.007.fasta.contigs,34.62,3.93
...,...,...,...
755,FR_maxbin.111.fasta_sub.contigs,16.25,2.97
756,FR_maxbin.120.fasta_sub.contigs,15.46,5.59
757,FR_maxbin.058.fasta.contigs,13.64,0.00
758,FR_maxbin.056.fasta_sub.contigs,11.34,0.00


In [5]:
for mag in nans['Bin Id']:
    file = f'Bins/{mag}.fa'
    
    !rm $file

### mOTUlizer to get clusters
conda create -n mOTUlizer -c bioconda motulizer -y

In [10]:
# PBS job template; the two %s slots are filled with (ws, out) below.
# The job changes into ws and runs mOTUlize.py on every Bins/*.fa, using
# mOTUlizer.tsv (written by an earlier cell) as the --checkm table.
string = '''
#PBS -l nodes=1:ppn=6
#PBS -l walltime=36:00:00
#PBS -l mem=20gb
#PBS -S /bin/bash
source $HOME/miniconda3/etc/profile.d/conda.sh
export TMPDIR=$TMPDIR

### Declare variables ###
ws=%s
out=%s

### Run mOTUlizer ###
conda activate mOTUlizer
cd $ws
mOTUlize.py --fnas Bins/*.fa -o $out --threads 6 --checkm mOTUlizer.tsv --MC 70
'''

# output file name handed to `-o`; resolved relative to ws because the job does `cd $ws` first
out = 'mOTUs.tsv'

# submit to the 'short' queue under the script name mOTUs.sh
binac(string % (ws, out), 'mOTUs', 'short')


10945568


### SuperPang on clusters to get single MAG/mOTU sequence

In [39]:
string = '''
#PBS -l nodes=1:ppn=6
#PBS -l walltime=48:00:00
#PBS -l mem=16gb
#PBS -S /bin/bash
source $HOME/miniconda3/etc/profile.d/conda.sh
export TMPDIR=$TMPDIR

### Declare variables ###
ws=%s
mags=%s
out=%s


### Run SuperPang ###
conda activate SuperPang
cd $ws

SuperPang.py --fasta $mags --checkm SuperPang.tsv --output-dir $out --threads 6 --force-overwrite
rm $mags
'''
    
!mkdir -p SuperPang
    
compl = pd.read_csv('SuperPang.tsv', sep='\t', index_col=0, names=['Completeness'])
motus = pd.read_csv('mOTUs.tsv', sep='\t', index_col=0, skiprows=5)

for motu in motus.index:
    if motu != 'mOTU_026': #motus.index.tolist()[0]:
        continue
    mags = motus.loc[motu, 'MAGs']
    mags = [f'Bins/{mag}.fa' for mag in mags.split(';') if mag in compl.index]
        
    file = f'SuperPang/{motu}_mags.txt'
    with open(file, mode='wt', encoding='utf-8') as txt:
        txt.write('\n'.join(mags))
    out = f'SuperPang/{motu}'
        
    binac(string%(ws, file, out), f'{motu}_SuperPang', 'short')
    

10946193


### Check if all mOTUs are processed by SuperPang, combine them to one big fasta

In [48]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.81-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: biopython
Successfully installed biopython-1.81


In [49]:
from Bio import SeqIO

# Concatenate the SuperPang assembly of every mOTU into one FASTA file.
# Each record is renamed to '<mOTU>_<record id up to the first "=">'.
with open('big_ass.fa', 'w') as big_ass:
    for motu in os.listdir('SuperPang'):
        # only mOTU output directories; this also skips the '<mOTU>_mags.txt'
        # input lists that live in the same folder
        if not motu.startswith('mOTU_') or not os.path.isdir(f'SuperPang/{motu}'):
            continue
        ass = f'SuperPang/{motu}/assembly.fasta'
        if not os.path.exists(ass):
            print(f'There is no fasta file for {motu}!!!')
            # report and move on: parsing the missing file would raise and
            # leave big_ass.fa half written
            continue
        for record in SeqIO.parse(ass, 'fasta'):
            rec = record.id.split('=')[0]
            big_ass.write(f'>{motu}_{rec}\n{record.seq}\n')
 

In [79]:
!mkdir MAGs

for motu in os.listdir('SuperPang'):
    if not motu.startswith('mOTU_'):
        continue
    inp = f'SuperPang/{motu}/assembly.fasta'
    out = f'MAGs/{motu}_assembly.fasta'
    if not os.path.exists(inp):
        print(f'There is no fasta file for {motu}!!!')
        
    !cp $inp $out


mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory ‘MAGs’: File exists
mkdir: cannot create directory 

In [81]:

# Pack the MAGs folder (per-mOTU assemblies copied by the previous cell) into MAGs.zip
!zip -r MAGs.zip MAGs

  adding: MAGs/ (stored 0%)
  adding: MAGs/mOTU_153_assembly.fasta (deflated 71%)
  adding: MAGs/mOTU_171_assembly.fasta (deflated 71%)
  adding: MAGs/mOTU_064_assembly.fasta (deflated 71%)
  adding: MAGs/mOTU_067_assembly.fasta (deflated 71%)
  adding: MAGs/mOTU_118_assembly.fasta (deflated 71%)
  adding: MAGs/mOTU_266_assembly.fasta (deflated 71%)
  adding: MAGs/mOTU_167_assembly.fasta (deflated 71%)
  adding: MAGs/mOTU_208_assembly.fasta (deflated 71%)
  adding: MAGs/mOTU_023_assembly.fasta (deflated 71%)
  adding: MAGs/mOTU_044_assembly.fasta (deflated 72%)
  adding: MAGs/mOTU_200_assembly.fasta (deflated 72%)
  adding: MAGs/mOTU_166_assembly.fasta (deflated 71%)
  adding: MAGs/mOTU_133_assembly.fasta (deflated 71%)
  adding: MAGs/mOTU_127_assembly.fasta (deflated 71%)
  adding: MAGs/mOTU_082_assembly.fasta (deflated 71%)
  adding: MAGs/mOTU_173_assembly.fasta (deflated 71%)
  adding: MAGs/mOTU_004_assembly.fasta (deflated 72%)
  adding: MAGs/mOTU_149_assembly.fasta (deflated 72%)


### SqueezeMeta to reannotate MAGs

In [90]:
# PBS job template; the %s slots are filled with
# (ws, project, samples file, reads directory, external assembly).
string = '''
#PBS -l nodes=1:ppn=8
#PBS -l walltime=29:00:00:00
#PBS -l mem=164gb
#PBS -S /bin/bash
source $HOME/miniconda3/etc/profile.d/conda.sh
export TMPDIR=$TMPDIR

conda activate SqueezeMeta

ws=%s
project=%s
samples=%s
reads=%s
ass=%s

cd $ws

SqueezeMeta.pl -m coassembly -p $project -s $samples -f $reads -extassembly \
$ass -t 8 --norename -test 13
'''

project = 'Annotated_MAGs'
# The sample list was built from CR, so the reads directory passed to the job
# must be the same one (previously a separate hard-coded 'Pooled').
reads = CR

# One row per read file: sample id (text before the first '_'), file name and
# pair label. Rows are collected first and the frame is built once.
rows = []
for f in samples:
    for r in range(1, 3):
        rows.append((f.split('_')[0], f'{f}_{r}.fastq.gz', f'pair{r}'))
sampl_df = pd.DataFrame(rows, columns=['sampleID', 'sampleName', 'pair'])

# written without header or index, tab separated
samplesDF = 'samples.tsv'
sampl_df.to_csv(samplesDF, sep='\t', header=False, index=False)
big_ass = 'big_ass.fa'
binac(string % (ws, project, samplesDF, reads, big_ass), 'SM_reannotate_MAGs' , 'smp')

10948186


In [78]:
# PBS job template; the two %s slots are filled with (ws, project) below.
# The job changes into ws and runs SqueezeMeta's restart.pl on the project.
string = '''
#PBS -l nodes=1:ppn=6
#PBS -l walltime=10:00:00:00
#PBS -l mem=164gb
#PBS -S /bin/bash
source $HOME/miniconda3/etc/profile.d/conda.sh
export TMPDIR=$TMPDIR

conda activate SqueezeMeta

ws=%s
project=%s

cd $ws
restart.pl $project
'''

# same project name as used when the SqueezeMeta job was submitted
project = 'Annotated_MAGs'
# submit to the 'smp' queue under the script name SM_reannotate_MAGs_restart.sh
binac(string % (ws, project), 'SM_reannotate_MAGs_restart', 'smp')

10908105


# Add mOTUs as bins

In [3]:
from Bio import SeqIO

!mkdir -p Annotated_MAGs/results/bins

for motu in os.listdir('SuperPang'):
    if not motu.startswith('mOTU_'):
        continue
    ass = f'SuperPang/{motu}/assembly.fasta'
    out = f'Annotated_MAGs/results/bins/{motu}.fa'
    if not os.path.exists(ass):
        print(f'There is no fasta file for {motu}!!!')
    with open(out, 'w') as fa:
        for record in SeqIO.parse(ass, 'fasta'):
            rec = record.id.split('=')[0]
            fa.write(f'>{motu}_{rec}\n{record.seq}\n')

# Restart SqueezeMeta

In [4]:
# PBS job template; the two %s slots are filled with (ws, project) below.
# The job changes into ws and calls restart.pl with `-step 16`
# (presumably resuming the pipeline at that step - confirm against the SqueezeMeta docs).
string = '''
#PBS -l nodes=1:ppn=6
#PBS -l walltime=6:00:00:00
#PBS -l mem=120gb
#PBS -S /bin/bash
source $HOME/miniconda3/etc/profile.d/conda.sh
export TMPDIR=$TMPDIR

conda activate SqueezeMeta

ws=%s
project=%s

cd $ws
restart.pl $project -step 16
'''

# project whose results/bins folder was filled by the previous cell
project = 'Annotated_MAGs'
# submit to the 'smp' queue under the script name SM_reannotate_MAGs_restart_16.sh
binac(string % (ws, project), 'SM_reannotate_MAGs_restart_16', 'smp')

10953453


## Export tables

In [13]:
# PBS job template; the two %s slots are filled with (ws, project) below.
# The job changes into ws and runs sqm2tables.py with the project directory
# and '<project>/ext_tables/Tables' as its two arguments.
string = '''
#PBS -l nodes=1:ppn=2
#PBS -l walltime=04:00:00
#PBS -l mem=100gb
#PBS -S /bin/bash
source $HOME/miniconda3/etc/profile.d/conda.sh
export TMPDIR=$TMPDIR

conda activate SqueezeMeta

ws=%s
project=%s

cd $ws
sqm2tables.py $project $project/ext_tables/Tables
'''

project = 'Annotated_MAGs'
# submit to the 'long' queue under the script name SM_tables.sh
binac(string % (ws, project), 'SM_tables', 'long')

10954218


In [16]:
!zip -r SM_MAGs_Rusitec.zip Annotated_MAGs/ext_tables/Tables Annotated_MAGs/results

  adding: Annotated_MAGs/ext_tables/Tables/ (stored 0%)
  adding: Annotated_MAGs/ext_tables/Tables/Annotated_MAGs.contig.tax.allfilter.tsv (deflated 97%)
  adding: Annotated_MAGs/ext_tables/Tables/Annotated_MAGs.family.prokfilter.abund.tsv (deflated 62%)
  adding: Annotated_MAGs/ext_tables/Tables/Annotated_MAGs.genus.prokfilter.abund.tsv (deflated 64%)
  adding: Annotated_MAGs/ext_tables/Tables/Annotated_MAGs.order.nofilter.abund.tsv (deflated 61%)
  adding: Annotated_MAGs/ext_tables/Tables/.ipynb_checkpoints/ (stored 0%)
  adding: Annotated_MAGs/ext_tables/Tables/.ipynb_checkpoints/Annotated_MAGs.bin.tax-checkpoint.tsv (deflated 93%)
  adding: Annotated_MAGs/ext_tables/Tables/.ipynb_checkpoints/Annotated_MAGs.COG.cov-checkpoint.tsv (deflated 66%)
  adding: Annotated_MAGs/ext_tables/Tables/.ipynb_checkpoints/Annotated_MAGs.species.allfilter.abund-checkpoint.tsv (deflated 66%)
  adding: Annotated_MAGs/ext_tables/Tables/.ipynb_checkpoints/Annotated_MAGs.KO.abund-checkpoint.tsv (deflated 

In [15]:
# List the queue entries of one user; the user name is hard-coded, adjust it for another account
!qstat -u ho_graaf20


mgmt02: 
                                                                                  Req'd       Req'd       Elap
Job ID                  Username    Queue    Jobname          SessID  NDS   TSK   Memory      Time    S   Time
----------------------- ----------- -------- ---------------- ------ ----- ------ --------- --------- - ---------
10954217                ho_graaf20  long     SM_tables.sh       4823     1      2     100gb  04:00:00 C       -- 
10954218                ho_graaf20  long     SM_tables.sh        --      1      2     100gb  04:00:00 Q       -- 


In [83]:
# Cancel two jobs by id; the ids belong to a past session and must be replaced before re-running
!qdel 10946196 10947590

In [143]:
# Number of entries currently in the collected-bins folder
bins = 'Bins/'
bin_entries = os.listdir(bins)
len(bin_entries)

112

In [151]:
# Make the helper script executable; NOTE(review): Bins/SuperPang_temp.py is not created by any visible cell
!chmod +x Bins/SuperPang_temp.py

In [172]:
# Delete every file in the current directory whose name starts with 'mOTU_'
# (presumably leftovers of the '<mOTU>_SuperPang' jobs - check before running)
!rm mOTU_*

In [174]:
# Recursively delete Bins/SuperPang; NOTE(review): no visible cell creates this folder
!rm -r Bins/SuperPang

In [58]:
# Move every file matching '*.sh.*' (presumably the job output files named after the submitted scripts) into Reports/
!mkdir -p Reports
!mv *.sh.* Reports/

# Clean projects to save some space on Binac

In [19]:
import os

pro = 'Sequential_run'
for p in os.listdir(pro):
    if not p.startswith('LH1_'):
        continue
    !rm -r $pro/$p/$p/temp $pro/$p/$p/data

In [17]:
import os

# Remove the temp and data folders of the Annotated_MAGs project to free space
pro = 'Annotated_MAGs'
!rm -r $pro/temp $pro/data

In [20]:
import os

pro = 'Annotated_MAGs'

# NOTE(review): `p` is not assigned in this cell. It only exists if the
# 'Sequential_run' cleanup loop ran earlier in the same kernel, and then holds
# the last entry that loop visited. On a fresh kernel this line cannot expand
# `$p` as intended; confirm which path was meant to be removed here.
!rm -r $pro/$p/$p/temp $pro/$p/$p/data