## Imports

In [1]:
import logging
import os
import posixpath
import sys

import paramiko
from dotenv import load_dotenv
from scp import SCPClient

load_dotenv('../secrets.env')

# Custom functions
python_dir_path = os.path.join('..', 'scripts', 'python')
sys.path.append(python_dir_path)
from compute_tree import run_raxmlng_check, generate_slurm_script, submit_sbatch

## Variables

In [2]:
# Local project layout, expressed relative to this notebook's directory.
raw_data = os.path.join(os.pardir, 'raw_data')
slurm_scripts_dir = os.path.join(os.pardir, 'scripts', 'slurm')

# The PR2-subset alignments and trees live in parallel sub-folders of `raw_data`.
subset_align_dir, subset_reftree_dir = (
    os.path.join(raw_data, category, 'pr2_subset')
    for category in ('reference_alignments', 'reference_trees')
)

# Create reference tree using *raxml-ng*

## Alignment validation using RAxML-ng

Before going into computationally intensive phylogenetic analyses, it's important to ensure that our initial data – the alignment – is in good shape. For this, we can use the RAxML *--check* function.
The *--check* function in RAxML provides a preliminary examination of the alignment data. It checks for potential issues, such as
1. Data Consistency: Ensures that the alignment doesn't have any unexpected or inconsistent data formats that could disrupt subsequent analysis.
2. Gap Handling: Identifies if there are any all-gap columns which might be problematic for tree calculations.
3. Model Compatibility: Verifies that the chosen substitution model is appropriate for the alignment data.



In [None]:
# Validate the masked reference alignment before any tree computation.

# Which Gblocks masking level to check, and the paths derived from it
alignment_masking = 'Moderate'  # one of: Loose, Moderate, Strict
alignment_name = f'reference_alignment_gblocks{alignment_masking}.fasta'
output_dir_path = os.path.join(subset_reftree_dir, f'gblocks{alignment_masking}')
ref_alignment_path = os.path.join(subset_align_dir, alignment_name)

# Options handed to the project helper; presumably it wraps `raxml-ng --check`
# (see scripts/python/compute_tree.py to confirm).
check_options = {
    'ref_alignment': ref_alignment_path,
    'output_dir': output_dir_path,
    'model': 'GTR+G',
    'prefix': "T1",
    'log_file': 'log_file.txt',
}
run_raxmlng_check(**check_options)

## Create slurm script

In [9]:
# Build the SLURM batch script that will compute the reference tree on the cluster.

# Inputs. The two *_path values below are locations on the cluster, so they are
# joined with POSIX separators regardless of the local operating system.
masking_level = 'Moderate'
ref_align_cheops_path = posixpath.join('pacbio_diversity_assessment', 'alignments', 'pr2_subset', f'reference_alignment_gblocks{masking_level}.fasta')
ref_trees_cheaops_path = posixpath.join('pacbio_diversity_assessment', 'reference_trees', 'pr2_subset', f'gblocks{masking_level}')
model = 'GTR+FO+G4m'
ncores = 6
slurm_script_name = 'compute_tree.slurm'

# EMAIL is read from the environment (populated from ../secrets.env by load_dotenv).
slurm_input = {'email': os.getenv('EMAIL'),
               'ref_alignment': ref_align_cheops_path,
               'otu_dir': ref_trees_cheaops_path,
               'model': model,
               'ncores': ncores,
               'prefix': f'T2_{masking_level}'}

# Render the script text from the inputs above (nothing is prompted interactively)
script_content = generate_slurm_script(slurm_input)

# Save the generated script to a file. The target folder is created if needed,
# and newline='\n' keeps Unix line endings because the script runs on the cluster.
os.makedirs(slurm_scripts_dir, exist_ok=True)
script_path = os.path.join(slurm_scripts_dir, slurm_script_name)
with open(script_path, 'w', newline='\n') as file:
    file.write(script_content)

print(f"Script saved to {script_path}")

Script saved to ../scripts/slurm/compute_tree.slurm


## Computing a phylogenetic tree at the CHEOPS cluster

In the next step, we can compute a phylogenetic tree from the validated reference alignment.

This will be computationally heavy, considering we have several hundreds of taxa and sites in the alignment. Therefore, we will run the computation on the university 'CHEOPS' cluster.

There are multiple steps involved:

1. Create an SSH connection with the cluster
2. Upload the reference alignment and slurm script to the cluster.
3. Run the slurm script on the cluster
4. Retrieving data
5. Close connection

### Create SSH connection

In [14]:
# Create an SSH client instance
ssh_client = paramiko.SSHClient()

# Automatically add the server's host key (this is needed since it's your first time connecting)
ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

# Connect using your SSH key and username
ssh_client.connect('cheops.rrz.uni-koeln.de', username=os.getenv('CHEOPS_USERNAME'), key_filename=os.getenv('ID_RSA_PATH'))

In [11]:
# Report whether the SSH session still has a live transport
transport = ssh_client.get_transport()
connection_alive = bool(transport) and transport.is_active()
print("SSH connection is active!" if connection_alive else "SSH connection is not active.")

SSH connection is active!


## Uploading files to the cluster

In [12]:
# Upload the masked reference alignment and the SLURM script to the cluster.

# Local source files
alignment_masking = 'Moderate' # Loose, Moderate, or Strict
alignment_name = f'reference_alignment_gblocks{alignment_masking}.fasta'
local_reference_alignment_path = os.path.join(subset_align_dir, alignment_name)
local_slurm_script_path = os.path.join(slurm_scripts_dir, 'compute_tree.slurm')

# Remote destinations: always POSIX paths, independent of the local OS.
# NOTE(review): the home directory is hard-coded to user 'lrajter' while the login
# name comes from CHEOPS_USERNAME; keep the two in sync.
remote_reference_alignment_path = posixpath.join('/', 'home', 'lrajter', 'pacbio_diversity_assessment', 'alignments', 'pr2_subset', alignment_name)
remote_slurm_script_path = posixpath.join('/', 'home', 'lrajter', 'pacbio_diversity_assessment', 'slurm_scripts', 'compute_tree.slurm')

# Reuse the SSH session opened in the "Create SSH connection" section instead of
# opening a second one (which would leave the first connection dangling).
# The context manager closes the SFTP channel even if a transfer fails.
with ssh_client.open_sftp() as sftp:
    sftp.put(local_reference_alignment_path, remote_reference_alignment_path)
    sftp.put(local_slurm_script_path, remote_slurm_script_path)

### Run the slurm script on the cluster

In [15]:
# Submit the uploaded SLURM script over the open SSH session.
masking_level = 'Moderate'

# Path of the script on the cluster: built with POSIX separators regardless of the local OS
slurm_script_path = posixpath.join('/', 'home', 'lrajter', 'pacbio_diversity_assessment', 'slurm_scripts', 'compute_tree.slurm')

# Local file handed to submit_sbatch for the job id (presumably written by the
# helper; a later cell reads job_id.txt back from this folder).
ref_trees_path = os.path.join(raw_data, 'reference_trees', 'pr2_subset', f'gblocks{masking_level}')
job_id_file_path = os.path.join(ref_trees_path, 'job_id.txt')

submit_sbatch(ssh_client, slurm_script_path, job_id_file_path)

Output: Submitted batch job 18262093


'18262093'

### Close the SSH connection

In [None]:
# Close the SSH connection used for upload and submission; the status-check
# cells below open a fresh connection of their own.
ssh_client.close()

### Check the status of the SLURM job remotely via SSH

In [16]:
# Re-establish SSH Connection
ssh_client = paramiko.SSHClient()
ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh_client.connect(os.getenv('CHEOPS_CLUSTER'), username=os.getenv('CHEOPS_USERNAME'), key_filename=os.getenv('ID_RSA_PATH'))

In [17]:
# Retrieve the job number stored locally in job_id.txt
masking_level = 'Moderate'
job_id_file = os.path.join(subset_reftree_dir, f'gblocks{masking_level}', 'job_id.txt')

with open(job_id_file, 'r') as infile:
    # Drop blank lines and surrounding whitespace: the id is interpolated into
    # remote shell commands later, so a trailing newline must not travel with it.
    job_id_lines = [line.strip() for line in infile if line.strip()]

# Fail with a clear message instead of a NameError further down
if not job_id_lines:
    raise ValueError(f"No job id found in {job_id_file}")

# Keep the last entry, as the original line-by-line loop did
job_id = job_id_lines[-1]

print(job_id)

18262093


In [18]:
# Ask the scheduler queue for the job's current state.
# Relies on `job_id` from the previous cell and on the open `ssh_client`.
command = f'squeue -j {job_id}'
stdin, stdout, stderr = ssh_client.exec_command(command)

# Drain stdout first, then stderr, and tidy both into plain text
output, error = (stream.read().decode().strip() for stream in (stdout, stderr))

# Only show the streams that actually produced something
for label, text in (("Output:", output), ("Error:", error)):
    if text:
        print(label, text)

Output: JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
          18262093   smp-rh7 compute_  lrajter PD       0:00      1 (Resources)


In [19]:
# Get More Detailed Information:
command = f'sstat -j {job_id}'
stdin, stdout, stderr = ssh_client.exec_command(command)

output = stdout.read().decode().strip()
error = stderr.read().decode().strip()

if output:
    print("Output:", output)
if error:
    print("Error:", error)

Output: JobID         MaxVMSize  MaxVMSizeNode  MaxVMSizeTask  AveVMSize     MaxRSS MaxRSSNode MaxRSSTask     AveRSS MaxPages MaxPagesNode   MaxPagesTask   AvePages     MinCPU MinCPUNode MinCPUTask     AveCPU   NTasks AveCPUFreq ReqCPUFreqMin ReqCPUFreqMax ReqCPUFreqGov ConsumedEnergy  MaxDiskRead MaxDiskReadNode MaxDiskReadTask  AveDiskRead MaxDiskWrite MaxDiskWriteNode MaxDiskWriteTask AveDiskWrite TRESUsageInAve TRESUsageInMax TRESUsageInMaxNode TRESUsageInMaxTask TRESUsageInMin TRESUsageInMinNode TRESUsageInMinTask TRESUsageInTot TRESUsageOutAve TRESUsageOutMax TRESUsageOutMaxNode TRESUsageOutMaxTask TRESUsageOutMin TRESUsageOutMinNode TRESUsageOutMinTask TRESUsageOutTot 
------------ ---------- -------------- -------------- ---------- ---------- ---------- ---------- ---------- -------- ------------ -------------- ---------- ---------- ---------- ---------- ---------- -------- ---------- ------------- ------------- ------------- -------------- ------------ --------------- --------

In [20]:
# Get Control Information:
command = f'scontrol show job {job_id}'
stdin, stdout, stderr = ssh_client.exec_command(command)

output = stdout.read().decode().strip()
error = stderr.read().decode().strip()

if output:
    print("Output:", output)
if error:
    print("Error:", error)

Output: JobId=18262093 JobName=compute_tree.slurm
   UserId=lrajter(405397) GroupId=uniuser(666) MCS_label=N/A
   Priority=7144 Nice=0 Account=unikoeln QOS=normal
   JobState=PENDING Reason=Resources Dependency=(null)
   Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
   RunTime=00:00:00 TimeLimit=10-00:00:00 TimeMin=N/A
   SubmitTime=2023-10-27T20:23:05 EligibleTime=2023-10-27T20:23:05
   AccrueTime=2023-10-27T20:23:05
   StartTime=2023-10-28T14:39:59 EndTime=2023-11-07T13:39:59 Deadline=N/A
   SuspendTime=None SecsPreSuspend=0 LastSchedEval=2023-10-27T20:25:47 Scheduler=Backfill:*
   Partition=smp-rh7 AllocNode:Sid=cheops0:10131
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList= SchedNodeList=cheops11708
   NumNodes=1-1 NumCPUs=6 NumTasks=1 CPUs/Task=6 ReqB:S:C:T=0:0:*:*
   TRES=cpu=6,mem=46G,node=1,billing=6
   Socks/Node=* NtasksPerN:B:S:C=1:0:*:* CoreSpec=*
   MinCPUsNode=6 MinMemoryNode=46G MinTmpDiskNode=0
   Features=(null) DelayBoot=00:00:00
   OverSubscribe=OK Conti

In [21]:
# Close the SSH connection opened for the status queries; the download section
# below re-establishes its own connection.
ssh_client.close()

## Transferring the output files back to the local machine

### Re-establish SSH Connection

In [22]:
# Connection settings are read from environment variables
connection_settings = {
    'hostname': os.getenv('CHEOPS_CLUSTER'),
    'username': os.getenv('CHEOPS_USERNAME'),
    'key_filename': os.getenv('ID_RSA_PATH'),
}

# New SSH session used to fetch the results
ssh_client = paramiko.SSHClient()
ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh_client.connect(**connection_settings)

### Open an SFTP Session

In [23]:
# Start an SFTP session using the connected SSH client.
# `sftp` is used by the download cell and closed in the final cell.
sftp = ssh_client.open_sftp()

### Download the Files

In [24]:
# Copy every entry of the remote reference-tree folder into the matching local folder.

masking_level = 'Moderate'

# Remote side: POSIX separators regardless of the local OS
remote_ref_tree_dir = posixpath.join('/', 'home', 'lrajter', 'pacbio_diversity_assessment', 'reference_trees', 'pr2_subset', f'gblocks{masking_level}')

# Local side: native separators; create the folder if this is the first download
local_ref_tree_dir = os.path.join(raw_data, 'reference_trees', 'pr2_subset', f'gblocks{masking_level}')
os.makedirs(local_ref_tree_dir, exist_ok=True)

# Download each listed entry under the same file name
for filename in sftp.listdir(remote_ref_tree_dir):
    sftp.get(posixpath.join(remote_ref_tree_dir, filename), os.path.join(local_ref_tree_dir, filename))

### Close connection

In [25]:
# Close the SFTP channel first, then the SSH connection that carries it.
sftp.close()
ssh_client.close()