In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import glob
import scipy.io
import os 
import anndata 

class DataProcessor:
    """Convert matrix/barcodes/features file triplets into ``.h5ad`` files.

    ``process_directory`` scans the immediate sub-directories of
    ``root_directory``; every sub-directory that holds a complete triplet is
    written out as ``<subdir>_output.h5ad`` inside that same sub-directory.
    """

    def __init__(self, root_directory):
        """Remember the directory whose sub-directories will be scanned."""
        self.root_directory = root_directory

    def assemble_h5ad(self, barcodes_file, features_file, matrix_file, output_file):
        """Build an AnnData object from one file triplet and write it to disk.

        Args:
            barcodes_file: Header-less file read with ``pandas.read_csv``; its
                first column becomes the ``obs`` index, named ``'barcode'``.
            features_file: Header-less, tab-separated file; its columns become
                the ``var`` columns, named ``'0'``, ``'1'``, ...
            matrix_file: Matrix Market file read with ``scipy.io.mmread``. It
                is transposed before use (presumably stored as
                features x barcodes -- confirm against the data source).
            output_file: Destination path of the ``.h5ad`` file.
        """
        # Read barcode, features, and matrix files
        barcodes = pd.read_csv(barcodes_file, header=None, index_col=0, names=['barcode'])
        features = pd.read_csv(features_file, sep='\t', header=None)
        # Integer column labels are turned into strings before the frames are
        # handed to AnnData.
        features.columns = features.columns.astype(str)
        barcodes.columns = barcodes.columns.astype(str)
        matrix = scipy.io.mmread(matrix_file).T.tocsc()

        # Create AnnData object
        adata = anndata.AnnData(X=matrix, obs=barcodes, var=features)

        # Cast every annotation column to str before writing
        adata.var = adata.var.astype(str)
        adata.obs = adata.obs.astype(str)

        # Write h5ad file
        adata.write_h5ad(output_file)

    @staticmethod
    def _first_match(filenames, substrings):
        """Return the first name containing any of ``substrings``, else None."""
        for name in filenames:
            if any(part in name for part in substrings):
                return name
        return None

    def process_directory(self):
        """Convert every complete sample sub-directory of ``root_directory``.

        Entries that are not directories are ignored. A sub-directory without
        a matrix file, or without a barcodes/features companion file, is
        reported and skipped instead of aborting the whole run.
        """
        file_counter = 0
        # Sorted so that the processing order (and the log) is reproducible.
        for subdir in sorted(os.listdir(self.root_directory)):
            subdirectory_path = os.path.join(self.root_directory, subdir)
            print(subdirectory_path)
            if not os.path.isdir(subdirectory_path):
                continue

            # Sorted so the choice between several candidates is deterministic.
            # Earlier .h5ad outputs are left out so a re-run can never mistake
            # one for an input file.
            filenames = sorted(
                f for f in os.listdir(subdirectory_path) if not f.endswith('.h5ad')
            )

            matrix_name = next(
                (f for f in filenames if f.endswith(('matrix.mtx.gz', 'matrix.mtx'))),
                None,
            )
            if matrix_name is None:
                print('No matrix file, skipping this directory')
                continue

            # 'barcode' also matches 'barcodes', so one substring is enough.
            barcodes_name = self._first_match(filenames, ('barcode',))
            features_name = self._first_match(filenames, ('feature', 'genes'))
            if barcodes_name is None or features_name is None:
                print('No barcodes or features file, skipping this directory')
                continue

            matrix_file = os.path.join(subdirectory_path, matrix_name)
            barcodes_file = os.path.join(subdirectory_path, barcodes_name)
            features_file = os.path.join(subdirectory_path, features_name)
            # Define the output file path
            output_file = os.path.join(subdirectory_path, f'{subdir}_output.h5ad')

            # Call the assemble_h5ad function
            self.assemble_h5ad(barcodes_file, features_file, matrix_file, output_file)
            file_counter += 1
            print('Saving file at:', output_file)
        print(f"Successfully processed {file_counter} directories.")



In [None]:
# Root of the Lee raw data. process_directory() scans its immediate
# sub-directories for matrix/barcodes/features files.
root_directory = '/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee'

# Bind the processor to that root directory
data_processor = DataProcessor(root_directory)

# Write one <subdir>_output.h5ad per sub-directory that contains a matrix file
data_processor.process_directory()

In [None]:
main_directory = '/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele'
# Loop through the main directory: for Steele the files sit one level deeper,
# so every sub-directory of main_directory is handed to DataProcessor as its
# own root. Sorted so the processing order (and the log) is reproducible.
for subdir in sorted(os.listdir(main_directory)):
    root_directory = os.path.join(main_directory, subdir)

    # Plain guard clause instead of raising an exception only to catch it
    # a few lines further down; the reported message is unchanged.
    if not os.path.isdir(root_directory):
        print(f"Error processing {root_directory}: {root_directory} is not a directory")
        continue

    try:
        # Create an instance of DataProcessor
        data_processor = DataProcessor(root_directory)

        # Call the process_directory method
        data_processor.process_directory()

    except Exception as e:
        # Best effort: report the failure and carry on with the next entry.
        print(f"Error processing {root_directory}: {e}")

In [None]:
# Move the files for Simeone into one directory per sample, for consistency
# with the other datasets.
import shutil

directory_path = '/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone'

# A sample key is everything before the second underscore of a name that
# starts with 'GSM'. Names without any underscore carry no key and are
# ignored; an explicit length check replaces the former bare `except`.
files = []
for entry in os.listdir(directory_path):
    tokens = entry.split('_')
    if entry.startswith('GSM') and len(tokens) >= 2:
        files.append(tokens[0] + '_' + tokens[1])
# Built once, after the loop, so it is defined even when the listing is empty.
files_unique = sorted(set(files))

for sample in files_unique:
    directory = os.path.join(directory_path, sample)
    os.makedirs(directory, exist_ok=True)

# Single pass over the listing. Each file goes to the folder whose key equals
# the file's own key, so 'GSM1_P10_...' can no longer be captured by the
# shorter key 'GSM1_P1' the way a plain prefix test allowed.
for entry in sorted(os.listdir(directory_path)):
    file_path = os.path.join(directory_path, entry)
    tokens = entry.split('_')
    if not (entry.startswith('GSM') and len(tokens) >= 2 and os.path.isfile(file_path)):
        continue
    new_directory = os.path.join(directory_path, tokens[0] + '_' + tokens[1])
    shutil.move(file_path, new_directory)
    print(f"Moved {file_path} to {new_directory}")

In [None]:
# Root of the Simeone raw data. process_directory() scans its immediate
# sub-directories for matrix/barcodes/features files.
root_directory = '/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone'

# Bind the processor to that root directory
data_processor = DataProcessor(root_directory)

# Write one <subdir>_output.h5ad per sub-directory that contains a matrix file
data_processor.process_directory()

In [None]:
import shutil

# Move the files for Caronni into one directory per sample.
directory_path = '/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni'

# A sample key is everything before the second underscore of a name that
# starts with 'GSM'. Names without any underscore carry no key and are
# ignored; an explicit length check replaces the former bare `except`.
files = []
for entry in os.listdir(directory_path):
    tokens = entry.split('_')
    if entry.startswith('GSM') and len(tokens) >= 2:
        files.append(tokens[0] + '_' + tokens[1])
# Built once, after the loop, so it is defined even when the listing is empty.
files_unique = sorted(set(files))

for sample in files_unique:
    directory = os.path.join(directory_path, sample)
    os.makedirs(directory, exist_ok=True)

# Single pass over the listing. Each file goes to the folder whose key equals
# the file's own key, so 'GSM1_P10_...' can no longer be captured by the
# shorter key 'GSM1_P1' the way a plain prefix test allowed.
for entry in sorted(os.listdir(directory_path)):
    file_path = os.path.join(directory_path, entry)
    tokens = entry.split('_')
    if not (entry.startswith('GSM') and len(tokens) >= 2 and os.path.isfile(file_path)):
        continue
    new_directory = os.path.join(directory_path, tokens[0] + '_' + tokens[1])
    shutil.move(file_path, new_directory)
    print(f"Moved {file_path} to {new_directory}")

In [None]:
# Print the current top-level entries of `directory_path`. The listing does not
# depend on the sample key, so it is printed once rather than once per entry
# of `files_unique` (the former outer loop never used its loop variable).
for file in sorted(os.listdir(directory_path)):
    file_path = os.path.join(directory_path, file)
    print(file_path)

In [None]:
# Root of the Caronni raw data. process_directory() scans its immediate
# sub-directories for matrix/barcodes/features files.
root_directory = '/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni/'

# Bind the processor to that root directory
data_processor = DataProcessor(root_directory)

# Write one <subdir>_output.h5ad per sub-directory that contains a matrix file
data_processor.process_directory()

In [None]:
import shutil

# Move the files for Zhang into one directory per sample.
directory_path = '/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang'

# A sample key is everything before the second underscore of a name that
# starts with 'GSM'. Names without any underscore carry no key and are
# ignored; an explicit length check replaces the former bare `except`.
files = []
for entry in os.listdir(directory_path):
    tokens = entry.split('_')
    if entry.startswith('GSM') and len(tokens) >= 2:
        files.append(tokens[0] + '_' + tokens[1])
# Built once, after the loop, so it is defined even when the listing is empty.
files_unique = sorted(set(files))

for sample in files_unique:
    directory = os.path.join(directory_path, sample)
    os.makedirs(directory, exist_ok=True)

# Single pass over the listing. Each file goes to the folder whose key equals
# the file's own key, so 'GSM1_P10_...' can no longer be captured by the
# shorter key 'GSM1_P1' the way a plain prefix test allowed.
for entry in sorted(os.listdir(directory_path)):
    file_path = os.path.join(directory_path, entry)
    tokens = entry.split('_')
    if not (entry.startswith('GSM') and len(tokens) >= 2 and os.path.isfile(file_path)):
        continue
    new_directory = os.path.join(directory_path, tokens[0] + '_' + tokens[1])
    shutil.move(file_path, new_directory)
    print(f"Moved {file_path} to {new_directory}")

In [None]:
# Root of the Zhang raw data. process_directory() scans its immediate
# sub-directories for matrix/barcodes/features files.
root_directory = '/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang/'

# Bind the processor to that root directory
data_processor = DataProcessor(root_directory)

# Write one <subdir>_output.h5ad per sub-directory that contains a matrix file
data_processor.process_directory()

In [None]:
# Root of the Lin raw data (note: a different filesystem from the cells above).
# process_directory() scans its immediate sub-directories for
# matrix/barcodes/features files.
root_directory = '/lustre/groups/ml01/workspace/shrey.parikh/PDAC/raw_data/Lin'

# Bind the processor to that root directory
data_processor = DataProcessor(root_directory)

# Write one <subdir>_output.h5ad per sub-directory that contains a matrix file
data_processor.process_directory()

In [None]:
# Load the Schlesinger table. Despite the .csv extension it is parsed as
# tab-separated, with the first column used as the row index.
schlesinger = pd.read_csv('/lustre/groups/ml01/workspace/shrey.parikh/PDAC/raw_data/Schlesinger/GSM4293555_Human.csv', sep='\t',  index_col=0)

In [None]:
# Transpose so that the rows can serve as observations and the columns as
# variables when the AnnData object is built further down.
# NOTE(review): rebinding the same name makes this cell non-idempotent --
# running it a second time flips the table back.
schlesinger = schlesinger.T

In [None]:
from scipy import sparse

# Row labels of `schlesinger` become the observation index and its column
# labels the variable index; neither axis carries extra annotation columns.
schlesinger_obs = pd.DataFrame(index=schlesinger.index)
schlesinger_var = pd.DataFrame(index=schlesinger.columns)

# Store the dense table in CSR form before wrapping it in AnnData.
sparse_matrix = sparse.csr_matrix(schlesinger.values)

adata = sc.AnnData(
    X=sparse_matrix,
    obs=schlesinger_obs,
    var=schlesinger_var,
)
adata.write('/lustre/groups/ml01/workspace/shrey.parikh/PDAC/raw_data/Schlesinger/GSM4293555_Human.h5ad')