In [3]:
import tarfile
import scanpy as sc
import scipy.io
import pandas as pd
import numpy as np

path = "extracted_data/GSM2396856_dc_3hr_genenames.csv.gz"

genes = pd.read_csv(path, header=None)
genes.rename(columns={1: "gene_name"}, inplace=True)
genes = genes.iloc[1:]
gene_names = genes["gene_name"]
print(gene_names)

1                ENSMUSG00000033845_Mrpl15
2                ENSMUSG00000025903_Lypla1
3                 ENSMUSG00000033813_Tcea1
4                 ENSMUSG00000002459_Rgs20
5               ENSMUSG00000033793_Atp6v1h
                       ...                
17771        ENSMUSG00000094799_AC125149.4
17772        ENSMUSG00000079808_AC168977.1
17773              ENSMUSG00000095041_PISD
17774             ENSMUSG00000063897_DHRSX
17775    ENSMUSG00000095742_CAAA01147332.1
Name: gene_name, Length: 17775, dtype: object


In [None]:
import os
import glob
import scanpy as sc
import scipy.io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

data_folder = "extracted_data/"
output_folder = "converted_data/"

if not os.path.exists(output_folder):
    os.makedirs(output_folder)
    print(f"Created directory: {output_folder}")

mtx_files = glob.glob(os.path.join(data_folder, "*.mtx.txt.gz"))

for mtx_file in mtx_files:
    try:
       
        base_name = os.path.basename(mtx_file).split(".mtx")[0]

     
        genes_path = os.path.join(data_folder, f"{base_name}_genenames.csv.gz")
        cells_path = os.path.join(data_folder, f"{base_name}_cellnames.csv.gz")

        matrix = scipy.io.mmread(mtx_file).tocsc()  

        # Load gene names
        genes = pd.read_csv(genes_path, header=None)
        genes.rename(columns={1: "gene_name"}, inplace=True)

        # Load cell barcodes
        cells = pd.read_csv(cells_path, header=None)
        cells.rename(columns={0: "cell_barcode"}, inplace=True)

        # Ensure gene count matches matrix rows
        if genes.shape[0] != matrix.shape[0]:
            genes = genes.iloc[:matrix.shape[0]]  

        # Convert to AnnData format
        adata = sc.AnnData(X=matrix.T)  
        adata.var["gene_name"] = genes["gene_name"].values  
        adata.obs["cell_barcode"] = cells["cell_barcode"].values  
        # Save to h5ad format
        h5ad_path = os.path.join(output_folder, f"{base_name}.h5ad")
        adata.write(h5ad_path)
        print(f"Saved h5ad file: {h5ad_path}")

    except Exception as e:
        print(f"Error processing {mtx_file}: {e}")

print("All datasets processed successfully!")

# Load one h5ad file for visualization
h5ad_file = glob.glob(os.path.join(output_folder, "*.h5ad"))[0]  

# Check basic dataset information
print(adata)



KeyboardInterrupt: 

In [None]:
mtx_files

['extracted_data\\GSM2396856_dc_3hr.mtx.txt.gz',
 'extracted_data\\GSM2396857_dc_0hr.mtx.txt.gz',
 'extracted_data\\GSM2396858_k562_tfs_7.mtx.txt.gz',
 'extracted_data\\GSM2396859_k562_tfs_13.mtx.txt.gz',
 'extracted_data\\GSM2396860_k562_tfs_highmoi.mtx.txt.gz',
 'extracted_data\\GSM2396861_k562_ccycle.mtx.txt.gz']

In [None]:
import os
import glob
import scanpy as sc
import scipy.io
import pandas as pd
import numpy as np

# Define folders
data_folder = "extracted_data/"
output_folder = "converted_data/"

# Ensure output directory exists
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
    print(f"✅ Created directory: {output_folder}")

# Get all `.mtx` files (expression matrices)
mtx_files = glob.glob(os.path.join(data_folder, "*.mtx.txt.gz"))

# Process each dataset
for mtx_file in mtx_files:
    try:
       
        base_name = os.path.basename(mtx_file).split(".mtx")[0]

        # Find all associated metadata files
        metadata_files = glob.glob(os.path.join(data_folder, f"{base_name}_*.csv.gz"))

        # Load sparse gene expression matrix
        matrix = scipy.io.mmread(mtx_file).tocsc()

        # Dictionary to store metadata tables
        metadata_dict = {}

        # Load metadata dynamically and check lengths
        for meta_file in metadata_files:
            meta_df = pd.read_csv(meta_file)

            # Check number of rows
            meta_length = meta_df.shape[0]

            # Store metadata in dictionary
            metadata_dict[meta_length] = metadata_dict.get(meta_length, []) + [meta_df]

        # Merge metadata based on row counts
        genes_metadata = pd.concat(metadata_dict.get(matrix.shape[0], []), axis=1) if matrix.shape[0] in metadata_dict else None
        cells_metadata = pd.concat(metadata_dict.get(matrix.shape[1], []), axis=1) if matrix.shape[1] in metadata_dict else None

        # Convert to AnnData format
        adata = sc.AnnData(X=matrix.T)  
        
        if genes_metadata is not None:
            adata.var = genes_metadata
        if cells_metadata is not None:
            adata.obs = cells_metadata

        # Save to h5ad format
        h5ad_path = os.path.join(output_folder, f"{base_name}.h5ad")
        adata.write(h5ad_path)
        print(f"Saved h5ad file: {h5ad_path}")

    except Exception as e:
        print(f"Error processing {mtx_file}: {e}")

print("All datasets processed successfully!")



✅ Saved h5ad file: converted_data/GSM2396856_dc_3hr.h5ad
✅ Saved h5ad file: converted_data/GSM2396857_dc_0hr.h5ad
✅ Saved h5ad file: converted_data/GSM2396858_k562_tfs_7.h5ad
✅ Saved h5ad file: converted_data/GSM2396859_k562_tfs_13.h5ad
✅ Saved h5ad file: converted_data/GSM2396860_k562_tfs_highmoi.h5ad
✅ Saved h5ad file: converted_data/GSM2396861_k562_ccycle.h5ad
✅ All datasets processed successfully!


In [None]:
adata.obs

Unnamed: 0.1,Unnamed: 0,0
0,0,AAACATACATGTGC_cc7d_D2
1,1,AAACATACATTCCT_cc7d_D2
2,2,AAACATACCCCGTT_cc7d_D2
3,3,AAACATACGATAGA_cc7d_D2
4,4,AAACATACGTACGT_cc7d_D2
...,...,...
25966,25966,TTTGCATGGCCATA_cc7d_C1
25967,25967,TTTGCATGGCTAAC_cc7d_C1
25968,25968,TTTGCATGGCTACA_cc7d_C1
25969,25969,TTTGCATGTAGAGA_cc7d_C1


In [None]:
import scanpy as sc

# Define the file path
file_path = r"C:\Users\W\Desktop\Assignments\Data Science\project\repreduce 1\data_visual\converted_data\GSM2396856_dc_3hr.h5ad"

# Load the data
adata = sc.read_h5ad(file_path)

# Print dataset summary
print(adata.X.toarray())
print(adata.obs)



[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 1. 0.]]
       Unnamed: 0                          0
0               0  AAACATACACGTAC_dc3hLPS_A8
1               1  AAACATACATGTCG_dc3hLPS_A8
2               2  AAACATACCAACTG_dc3hLPS_A8
3               3  AAACATACTCCTTA_dc3hLPS_A8
4               4  AAACATACTCTCCG_dc3hLPS_A8
...           ...                        ...
32772       32772  TTTCTACTGACGAG_dc3hLPS_D9
32773       32773  TTTCTACTTCTTAC_dc3hLPS_D9
32774       32774  TTTGACTGAAAAGC_dc3hLPS_D9
32775       32775  TTTGCATGCGACAT_dc3hLPS_D9
32776       32776  TTTGCATGCTGAGT_dc3hLPS_D9

[32777 rows x 2 columns]


In [None]:
import importlib
from methods.preprocessing import gene_preprocessing
from fuzzywuzzy import process

importlib.reload(methods.preprocessing)  


def merge_perturbed_genes(original_adata, preprocessed_adata, perturbation_names):
    """
    Ensures that missing perturbed genes are added back to the preprocessed AnnData 
    while avoiding duplication of already-existing genes.

    Parameters:
    - original_adata: AnnData (before preprocessing)
    - preprocessed_adata: AnnData (after preprocessing)
    - perturbation_names: List of perturbed gene names (e.g., ['m_Rel_3', 'm_Nfkb1_2'])

    Returns:
    - Merged AnnData containing missing perturbed genes.
    """

    original_adata.var["symbols"] = original_adata.var["0"].str.split("_").str[-1]

    # Extract gene symbols from preprocessed AnnData (genes already kept)
    preprocessed_adata.var["symbols"] = preprocessed_adata.var["0"].str.split("_").str[-1]

    # Clean perturbed gene names (e.g., "m_Rel_3" → "Rel")
    cleaned_perturbation_genes = [gene.split("_")[1] for gene in perturbation_names if gene != "control"]

    # Match cleaned perturbation names to original gene symbols
    matched_genes = {}
    for gene in cleaned_perturbation_genes:
        match, score = process.extractOne(gene, original_adata.var["symbols"].values)
        if score > 85 and gene.lower() in match.lower():
            matched_genes[gene] = match

    print("Matched Genes:", matched_genes)

    # Identify which matched genes are **already in preprocessed_adata**
    existing_genes = preprocessed_adata.var["symbols"].tolist()
    missing_genes = [gene for gene in matched_genes.values() if gene not in existing_genes]

    # Get indices of missing genes in original AnnData
    missing_indices = original_adata.var.index[original_adata.var["symbols"].isin(missing_genes)].tolist()

    if not missing_indices:
        print("No missing perturbed genes to recover. Returning preprocessed AnnData.")
        return preprocessed_adata

    # Extract missing genes' data from original AnnData
    adding_X = original_adata[:, missing_indices].X
    adding_var = original_adata.var.loc[missing_indices]

    # Merge missing genes into preprocessed_adata
    new_X = np.hstack((preprocessed_adata.X, adding_X))  # Append columns to AnnData.X
    new_var = pd.concat([preprocessed_adata.var, adding_var])  # Append rows to AnnData.var

    # Create new merged AnnData
    merged_adata = sc.AnnData(X=new_X, obs=preprocessed_adata.obs.copy(), var=new_var.copy())

    print(f"Recovered {len(missing_indices)} missing perturbed genes and merged them back.")

    return merged_adata





    


perturbation_path = r"C:\Users\W\Desktop\Assignments\Data Science\project\repreduce 1\data_visual\extracted_data\GSM2396856_dc_3hr_cbc_gbc_dict_strict.csv.gz"
df_perturbations = pd.read_csv(perturbation_path, header=None)  # Read compressed CSV
df_perturbations.rename(columns={1: "gene_name"}, inplace=True)
perturbation_names = df_perturbations[0]

adata_prep = gene_preprocessing(adata, num=5000, log_transform=True, normalize=True, select_by="random")
adata_prep.obs
adata_merge = merge_perturbed_genes(adata, adata_prep, perturbation_names)

KeyError: 'methods'

In [None]:
adata_merge.obs

Unnamed: 0.1,Unnamed: 0,0
0,0,AAACATACACGTAC_dc3hLPS_A8
1,1,AAACATACATGTCG_dc3hLPS_A8
2,2,AAACATACCAACTG_dc3hLPS_A8
3,3,AAACATACTCCTTA_dc3hLPS_A8
4,4,AAACATACTCTCCG_dc3hLPS_A8
...,...,...
32772,32772,TTTCTACTGACGAG_dc3hLPS_D9
32773,32773,TTTCTACTTCTTAC_dc3hLPS_D9
32774,32774,TTTGACTGAAAAGC_dc3hLPS_D9
32775,32775,TTTGCATGCGACAT_dc3hLPS_D9


In [None]:
path = r"C:\Users\W\Desktop\Assignments\Data Science\project\repreduce 1\data_visual\extracted_data\GSM2396856_dc_3hr_cbc_gbc_dict_strict.csv.gz"
genes = pd.read_csv(path, header=None)
genes.rename(columns={1: "gene_name"}, inplace=True)
genes = genes.iloc[1:]
# print(genes)
print(genes.iloc[0,1])
genes["cells"] = genes["gene_name"].apply(lambda x: x.split(", "))
print(len(genes.iloc[0,2]))

TAAGCGTGACTCTT_dc3hLPS_D8, TGACGCCTTGGTTG_dc3hLPS_D8, CGACTCTGCTTGTT_dc3hLPS_D8, GACTTTACAAGATG_dc3hLPS_D8, AGCGCCGACTTGGA_dc3hLPS_D8, TCACAACTCCAATG_dc3hLPS_D8, TAACCGGATTCGTT_dc3hLPS_D8, TAAATGTGCTCAAG_dc3hLPS_D8, TGGCAATGACAGCT_dc3hLPS_D8, CTTCACCTTAGACC_dc3hLPS_D8, TGAAATTGTTCATC_dc3hLPS_D8, AGACTGACAGAGTA_dc3hLPS_D8, CGAAGTACCCTCCA_dc3hLPS_D8, AAGATTACAAGATG_dc3hLPS_D8, GTCGCACTAAAGCA_dc3hLPS_D8, CAGACTGATGGTTG_dc3hLPS_D8, TCAGTGGACTACTT_dc3hLPS_D8, TCTGATACCTAGCA_dc3hLPS_D8, ATCCTAACCTTGTT_dc3hLPS_D8, TGTAATGACCATAG_dc3hLPS_D8, CTAGGCCTGAATCC_dc3hLPS_D8, TTATTCCTAGTAGA_dc3hLPS_D8, CTCGAAGATGGATC_dc3hLPS_D8, ATCAGGTGCACACA_dc3hLPS_D8, CATTACACTGGTTG_dc3hLPS_D8, CACTCTCTCCTTTA_dc3hLPS_D8, AGGGACGATGTGGT_dc3hLPS_D8, AGGACACTAAGTAG_dc3hLPS_D8, AGCGAACTTGGTGT_dc3hLPS_D8, ATGTACCTGATAGA_dc3hLPS_D8, ACGATGACCCCGTT_dc3hLPS_D8, TTGCTAACCGTTAG_dc3hLPS_D8, CTTCTAGACATGGT_dc3hLPS_D8, TGAGACACGCTAAC_dc3hLPS_D8, AGAGTCTGGGACTT_dc3hLPS_B9, CCCAACACATAAGG_dc3hLPS_B9, ACCTGGCTGACGAG_dc3hLPS_B9, T

In [None]:
adata.X.toarray

<bound method _cs_matrix.toarray of <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 52199775 stored elements and shape (32777, 17775)>>

In [None]:
import pandas as pd

def merge_perturbations(adata1, perturbation_path):
    """
    Merges perturbation information into adata.obs based on cell barcodes.

    Parameters:
    - adata: AnnData object
    - perturbation_path: Path to the CSV file containing perturbation data

    Returns:
    - adata with an added column "perturbation" in adata.obs
    """

    #  Load the perturbation file
    df_perturbations = pd.read_csv(perturbation_path, header=None)  # Read compressed CSV
    df_perturbations.rename(columns={1: "gene_name"}, inplace=True)
    #  Convert cell barcode lists from string to actual lists
    df_perturbations["Cells"] = df_perturbations["gene_name"].apply(lambda x: x.split(", "))
    
    #  Create a dictionary mapping each cell barcode to its perturbation(s)
    cells = np.array(adata1.obs["0"])
    adata1.obs["perturbation"] = "control"
    for _, row in df_perturbations.iterrows():
        perturbed_gene = row["Cells"]
        target = row.iloc[0]
          # e.g., "m_Irf1_4"
        for item in perturbed_gene:
            if item in cells:
                adata1.obs.loc[adata1.obs["0"] == item, "perturbation"] = target
   
    
    return adata1

print(adata_merge.obs)
# Example Usage
perturbation_path = r"C:\Users\W\Desktop\Assignments\Data Science\project\repreduce 1\data_visual\extracted_data\GSM2396856_dc_3hr_cbc_gbc_dict_strict.csv.gz"
adata_merge = merge_perturbations(adata_merge, perturbation_path)
adata_merge.obs[adata_merge.obs["perturbation"]!="control"]
adata_merge.write("adata_1")

NameError: name 'adata_merge' is not defined

In [None]:
print(adata_merge.var)
print(adata_prep.var)

       Unnamed: 0                                 0    gene_symbol
16760       16760        ENSMUSG00000097888_Gm26682        Gm26682
15760       15760          ENSMUSG00000022867_Usp25          Usp25
9677         9677          ENSMUSG00000039431_Mtmr7          Mtmr7
4384         4384         ENSMUSG00000035212_Leprot         Leprot
14313       14313  ENSMUSG00000021098_4930447C04Rik  4930447C04Rik
...           ...                               ...            ...
14986       14986           ENSMUSG00000042622_Maff           Maff
15812       15812          ENSMUSG00000022952_Runx1          Runx1
15830       15830           ENSMUSG00000022895_Ets2           Ets2
16882       16882           ENSMUSG00000038418_Egr1           Egr1
17251       17251           ENSMUSG00000024927_Rela           Rela

[5024 rows x 3 columns]
       Unnamed: 0                                 0    gene_symbol  \
16760       16760        ENSMUSG00000097888_Gm26682        Gm26682   
15760       15760          ENSM

In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
from rapidfuzz import process

def sketch_perturbation_genes(preprocessed_adata, perturbation_names):
    """
    Extracts information only for the perturbation genes from the processed AnnData object.

    Parameters:
    - preprocessed_adata: AnnData object after preprocessing (5000 most variable genes + perturbed genes).
    - perturbation_names: List of perturbation gene names.

    Returns:
    - perturbation_adata: AnnData object containing only perturbation gene information.
    """
    # Extract gene symbols from the processed AnnData
    preprocessed_adata.var["symbols"] = preprocessed_adata.var["0"].str.split("_").str[-1]

    # Clean perturbation names (e.g., "m_Rel_1" → "Rel")
    cleaned_perturbation_genes = [gene.split("_")[1] for gene in perturbation_names if gene != "control"]

    # Match the cleaned perturbed names with gene symbols in preprocessed AnnData
    matched_genes = {}
    for gene in cleaned_perturbation_genes:
        match, score = process.extractOne(gene, preprocessed_adata.var["symbols"].values)
        if score > 85 and gene.lower() in match.lower():
            matched_genes[gene] = match

    print("✅ Matched Perturbation Genes:", matched_genes)

    # Get indices of the matched perturbation genes
    perturb_indices = preprocessed_adata.var.index[preprocessed_adata.var["symbols"].isin(matched_genes.values())].tolist()

    # Extract the perturbation genes' data from preprocessed AnnData
    perturbation_adata = preprocessed_adata[:, perturb_indices].copy()

    print(f"✅ Extracted {len(perturb_indices)} perturbation genes from AnnData.")

    return perturbation_adata

# Call the function to extract only perturbation genes
perturbation_adata = sketch_perturbation_genes(adata, perturbation_names)



  preprocessed_adata.var["symbols"] = preprocessed_adata.var["0"].str.split("_").str[-1]


ValueError: too many values to unpack (expected 2)

In [None]:
import scanpy as sc
import numpy as np
import scipy.sparse
from fuzzywuzzy import process
import pandas as pd 
import preprocessing
import os
import glob
import scanpy as sc
import scipy.io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from preprocessing import gene_preprocessing, merge_perturbed_genes, merge_perturbed_genes_same_size, sketch_perturbation_genes
class data_init():

    def __init__(self, data_folder, output_folder = None, perturbation_names = None):
        self.data_folder = data_folder
        self.output_folder = output_folder

    def base_adata(self, return_cell_type = True):
        multi_files = glob.glob(os.path.join(self.data_folder, "*.mtx.txt.gz"))
        for one_file in multi_files:
            try:
                base_name = os.path.basename(one_file).split(".mtx")[0]  # GMS123456_dc_3hr
                genes_path = os.path.join(self.data_folder, f"{base_name}_genenames.csv.gz")
                cells_path = os.path.join(self.data_folder, f"{base_name}_cellnames.csv.gz")
                matrix = scipy.io.mmread(one_file).tocsc()  
                genes = pd.read_csv(genes_path, header=None)
                genes.rename(columns={1: "gene_name"}, inplace=True)
                cells = pd.read_csv(cells_path, header=None)
                cells.rename(columns={0: "cell_barcode"}, inplace=True)

                if genes.shape[0] != matrix.shape[0]:
                    genes = genes.iloc[:matrix.shape[0]] 

                adata = sc.AnnData(X=matrix.T)  
                adata.var["gene_name"] = genes["gene_name"].values 
                adata.obs["cell_barcode"] = cells["cell_barcode"].values  

                if self.output_folder != None:
                    h5ad_path = os.path.join(self.output_folder, f"{base_name}.h5ad")
                    adata.write(h5ad_path)
                    print(f"Saved h5ad file: {h5ad_path}")
                    
            except Exception as e:
                print(f"Error processing {one_file}: {e}")
        
        if return_cell_type == True:
            cell_type = base_name.split("_")[1]  
            return cell_type

    def adata_show(self, data_path):
        adata = sc.read_h5ad(data_path)
        return adata
    
    def add_single_perturbation(self, adata, perturbation_path):

        df_perturbations = pd.read_csv(perturbation_path, header=None)  # Read compressed CSV
        df_perturbations.rename(columns={1: "gene_name"}, inplace=True)
        perturbation_names = df_perturbations[0]

        adata_prep = gene_preprocessing(adata, num=5000, log_transform=True, normalize=True, select_by="random")
        adata_prep.obs
        adata_merge = merge_perturbed_genes(adata, adata_prep, perturbation_names)

        return adata_merge
    
    def add_perturbations(self, adata1, perturbation_path):
        """
        Merges perturbation information into adata.obs based on cell barcodes.

        Parameters:
        - adata: AnnData object
        - perturbation_path: Path to the CSV file containing perturbation data

        Returns:
        - adata with an added column "perturbation" in adata.obs
        """

        # Load the perturbation file
        df_perturbations = pd.read_csv(perturbation_path, header=None)  # Read compressed CSV
        df_perturbations.rename(columns={1: "gene_name"}, inplace=True)
        #  Convert cell barcode lists from string to actual lists
        df_perturbations["Cells"] = df_perturbations["gene_name"].apply(lambda x: x.split(", "))
        
        #  Create a dictionary mapping each cell barcode to its perturbation(s)
        cells = np.array(adata1.obs["0"])
        adata1.obs["condition"] = "control"
        for _, row in df_perturbations.iterrows():
            perturbed_gene = row["Cells"]
            target = row.iloc[0]
            # e.g., "m_Irf1_4"
            for item in perturbed_gene:
                if item in cells:
                    adata1.obs.loc[adata1.obs["0"] == item, "condition"] = target
        
        return adata1
    


    def prep(self, file_path, perturbation_path):

        cell_name = self.base_adata()
        adata = self.adata_show(file_path)
        adata1 = self.add_single_perturbation(adata, perturbation_path)

        adata1.obs["cell_type"] = cell_name
        adata1.obs.index = adata1.obs["0"]
        adata1 = self.add_perturbations(adata1, perturbation_path)
        del adata1.obs["Unnamed: 0"]
        adata1.var["gene_names"] = adata1.var["symbols"]
        del adata1.var["Unnamed: 0"]
        del adata1.var["0"]
        del adata1.var["symbols"]
        # print("AnnData.X",adata1.X)
        # print("AnnData.obs",adata1.obs)
        # print("AnnData.var",adata1.var)
        return adata1




    data_folder = r"C:\Users\W\Desktop\Assignments\Data Science\project\repreduce 1\data_visual\extracted_data/"
    output_folder = r"C:\Users\W\Desktop\Assignments\Data Science\project\repreduce 1\data_visual\converted_data/"
    model = data_init(data_folder, output_folder = output_folder)
    cell_name = model.base_adata()
    file_path = r"C:\Users\W\Desktop\Assignments\Data Science\project\repreduce 1\data_visual\converted_data\GSM2396856_dc_3hr.h5ad"
    adata = model.adata_show(file_path)
    
    perturbation_path = r"C:\Users\W\Desktop\Assignments\Data Science\project\repreduce 1\data_visual\extracted_data\GSM2396856_dc_3hr_cbc_gbc_dict_strict.csv.gz"
    adata1 = model.add_single_perturbation(adata, perturbation_path)
    adata1.obs["cell_type"] = cell_name
    adata1.obs.index = adata1.obs["0"]
    adata1 = model.add_perturbations(adata1, perturbation_path)
    del adata1.obs["Unnamed: 0"]
    adata1.var["gene_names"] = adata1.var["symbols"]
    del adata1.var["Unnamed: 0"]
    del adata1.var["0"]
    del adata1.var["symbols"]

    

    print("AnnData.X",adata1.X)
    print("AnnData.obs",adata1.obs)
    print("AnnData.var",adata1.var)

    
    data_folder = r"C:\Users\W\Desktop\Assignments\Data Science\project\repreduce 1\data_visual\extracted2/"
    output_folder = r"C:\Users\W\Desktop\Assignments\Data Science\project\repreduce 1\data_visual\converted_data/"
    file_path = r"C:\Users\W\Desktop\Assignments\Data Science\project\repreduce 1\data_visual\converted_data\GSM2396858_k562_tfs_7.h5ad"
    perturbation_path = r"C:\Users\W\Desktop\Assignments\Data Science\project\repreduce 1\data_visual\extracted2\GSM2396858_k562_tfs_7_cbc_gbc_dict.csv.gz"
    
    model2 = data_init(data_folder, output_folder = output_folder)
    adata2 = model2.prep(file_path, perturbation_path)
    print(adata2.X.shape, adata1.X.shape)

    adata1.write("adata1.h5ad")
    adata2.write("adata2.h5ad")


    print("AnnData.X",merged_adata.X)
    print("AnnData.obs",merged_adata.obs)
    print("AnnData.var",merged_adata.var)


ImportError: cannot import name 'merge_perturbed_genes_same_size' from 'preprocessing' (c:\Users\W\Desktop\Assignments\Data Science\project\repreduce 1\data_visual\preprocessing.py)