In [1]:
# import all libraries you need here
import os
import zipfile

import pandas as pd
import numpy as np

# Step 0: Download the training data

In [2]:
# Root directory that holds the "train_data" and "test_data" folders read by the cells below.
path_data = "./data/"

In [3]:
# Bulk expression table of the training patients; the "level_0" column becomes the row index.
bulk_pancreas = (
    pd.read_csv(os.path.join(path_data, "train_data", "pancreas_bulk_train.csv"))
    .set_index("level_0")
)

In [4]:
# Rows of the bulk table are reported as genes, columns as patients.
n_bulk_genes, n_bulk_patients = bulk_pancreas.shape
print(f"Number of patients in the train dataset {n_bulk_patients}")
print(f"Number of genes in the dataset {n_bulk_genes}")

Number of patients in the train dataset 4
Number of genes in the dataset 25453


In [5]:
# Single-cell expression table of the training set; the unnamed first CSV column becomes the row index.
sc_pancreas = (
    pd.read_csv(os.path.join(path_data, "train_data", "pancreas_sc_train.csv"))
    .set_index("Unnamed: 0")
)

In [6]:
# The bulk and single-cell training tables must have the same number of rows.
assert len(bulk_pancreas) == len(sc_pancreas)
print(f"Number of cells in the train dataset {len(sc_pancreas.columns)}")

Number of cells in the train dataset 978


In [7]:
# Single-cell expression table of the test set, indexed the same way as the training table.
sc_pancreas_test = (
    pd.read_csv(os.path.join(path_data, "test_data", "pancreas_sc_test.csv"))
    .set_index("Unnamed: 0")
)

In [8]:
# One column per cell in the test single-cell table.
n_test_cells = len(sc_pancreas_test.columns)
print(f"Number of cells in the test dataset {n_test_cells}")

Number of cells in the test dataset 789


In [9]:
# Metadata of the training cells, indexed by the "Source Name" column.
sc_pancreas_metadata = (
    pd.read_csv(os.path.join(path_data, "train_data", "pancreas_sc_metadata_train.csv"))
    .set_index("Source Name")
)

In [10]:
# Count the metadata rows that belong to each training patient.
train_samples = sc_pancreas_metadata["Sample"]
for patient in train_samples.unique():
    n_cells = (train_samples == patient).sum()
    print(f"Number of cells for {patient} is {n_cells}")

Number of cells for patient1 is 249
Number of cells for patient3 is 219
Number of cells for patient2 is 234
Number of cells for patient4 is 276


In [11]:
# For each disease label, count how many distinct training patients carry it.
# NOTE: `dis` and `df` stay bound to the last iteration's values after the loop.
for dis in sc_pancreas_metadata["Disease"].unique():
    df = sc_pancreas_metadata.loc[sc_pancreas_metadata["Disease"] == dis]
    n_patients_with_dis = df["Sample"].nunique()
    print(f"There are {n_patients_with_dis} train patients with {dis}")

There are 2 train patients with type II diabetes mellitus
There are 2 train patients with normal


In [12]:
# Summarise the cell-type annotation over the whole training metadata.
# `df` is deliberately not used here: after the disease loop it only holds the
# rows of the last disease group, so it would describe a subset, not the dataset.
train_celltypes = sc_pancreas_metadata["Celltype"]
print(f"There are {train_celltypes.nunique()} different cell types in the dataset")
print(f"The different cells types are {train_celltypes.unique()}")

There are 11 different cell types in the dataset
The different cells types are ['delta cell' 'alpha cell' 'beta cell' 'PSC cell' 'endothelial cell'
 'gamma cell' 'co-expression cell' 'ductal cell' 'epsilon cell'
 'unclassified endocrine cell' 'acinar cell']


In [13]:
# Metadata of the test cells, indexed by the "Source Name" column.
sc_pancreas_metadata_test = (
    pd.read_csv(
        os.path.join(path_data, "test_data", "pancreas_sc_metadata_test_wocelltype.csv")
    )
    .set_index("Source Name")
)

In [14]:
# Count the metadata rows that belong to each test patient.
test_samples = sc_pancreas_metadata_test["Sample"]
for patient in test_samples.unique():
    n_cells = (test_samples == patient).sum()
    print(f"Number of cells for {patient} is {n_cells}")

Number of cells for patient5 is 203
Number of cells for patient7 is 284
Number of cells for patient6 is 302


In [15]:
# For each disease label, count how many distinct test patients carry it.
# NOTE: `dis` and `df` stay bound to the last iteration's values after the loop.
for dis in sc_pancreas_metadata_test["Disease"].unique():
    df = sc_pancreas_metadata_test.loc[sc_pancreas_metadata_test["Disease"] == dis]
    n_patients_with_dis = df["Sample"].nunique()
    print(f"There are {n_patients_with_dis} test patients with {dis}")

There are 1 test patients with normal
There are 2 test patients with type II diabetes mellitus


__What do we have now?__

- bulk_pancreas: pd dataframe, train data. Bulk expression counts.
- sc_pancreas: pd dataframe, train data. Single-cell expression counts.
- sc_pancreas_test: pd dataframe, test data. Single-cell expression counts.
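- sc_pancreas_metadata: pd dataframe, train data. Per-cell metadata (one row per cell).
- sc_pancreas_metadata_test: pd dataframe, test data. Per-cell metadata (one row per cell).
- Metadata columns used in this notebook:
    - sc_pancreas_metadata.Disease: either 'type II diabetes mellitus' or 'normal'.
    - sc_pancreas_metadata.Celltype: string of the cell type (read from the train metadata).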

# Step 1: Perform the imputation

In [None]:
# Imputation
# TODO

# Step 2: Perform clustering


In [None]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics


In [None]:
# Batch Correction? Imputation? Feature Selection?
# TODO: replace the placeholder below with the preprocessed expression matrix.
# PCA.fit_transform treats rows as samples, so X should be laid out as
# (cells x features); this cell fails until X is set.
X = None

# Feature Selection
# https://github.com/ScialdoneLab/CIARA_python --> Cluster Independent Algorithm for the identification of Rare cell types
# https://triku.readthedocs.io/en/latest/ -->


# Other preprocessing?
# Normalization
# log-transform

# Dimension Reduction
# random_state pins the randomized SVD solver that PCA may select for large
# inputs, so the embedding (and every score below) is reproducible across runs.
pca = PCA(n_components=20, random_state=0)
X_dimred = pca.fit_transform(X)

# Gini-Clustering, hierarchical Clustering?
# Clustering: ------------------
# Fit one average-linkage model per candidate k and record its Davies-Bouldin
# index (lower is better) so the candidates can be compared afterwards.
# Alternatives still to try: linkage='ward', and a silhouette score as the metric.
clusterings = []
scores = []
for k in range(2, 10):
    print(f'Experimental k: {k}')
    # Euclidean distance is the estimator's default, so the distance argument is
    # omitted: its keyword was renamed from `affinity` to `metric` in
    # scikit-learn 1.2, and passing `affinity` fails on 1.4 and later.
    cluster = AgglomerativeClustering(
        n_clusters=k,
        distance_threshold=None,
        linkage='average',
        compute_distances=True,
    )
    cluster.fit(X_dimred)
    clusterings.append(cluster)
    scores.append(metrics.davies_bouldin_score(X_dimred, cluster.labels_))


# Step 3: Predict on the test data

In [None]:
# Goals:
#   (1) Impute and transform to pseudo-bulk
#   (2) Clustering performance on the scRNA-seq data.

# Step 4: Save the required files

In [None]:
# bulkified should be a DataFrame containing the "bulkified" version of the imputed data
# bulkified.columns = ["index", "patient5", "patient6", "patient7"]
# bulkified["index"] = sc_pancreas_test.index

In [None]:
# Directory in which the submission archive is created (placeholder: point it at a real folder).
results_path = "path/to/your/results"
# File name of the submission zip that is created inside results_path.
archive_name = "LastName_FirstName_Project2.zip" # TODO

In [None]:
# Compare as plain lists: an element-wise `Index == list` comparison raises
# ValueError (instead of failing the assertion) when the column count differs.
expected_bulk_columns = ["index", "patient5", "patient6", "patient7"]
assert list(bulkified.columns) == expected_bulk_columns, (
    f"bulkified.columns must be {expected_bulk_columns}, got {list(bulkified.columns)}"
)

In [None]:
# Compare as plain lists so that a length mismatch fails the assertion cleanly
# instead of raising ValueError from the element-wise pandas comparison.
assert list(bulkified["index"]) == list(sc_pancreas_test.index), (
    'bulkified["index"] must equal sc_pancreas_test.index element by element'
)

In [None]:
# cluster_labels should be a DataFrame containing the cluster labels for each cell
# cluster_labels.columns = ["index", "cluster"]
# cluster_labels["index"] = sc_pancreas_test.columns

In [None]:
# Compare as plain lists: an element-wise `Index == list` comparison raises
# ValueError (instead of failing the assertion) when the column count differs.
expected_cluster_columns = ["index", "cluster"]
assert list(cluster_labels.columns) == expected_cluster_columns, (
    f"cluster_labels.columns must be {expected_cluster_columns}, "
    f"got {list(cluster_labels.columns)}"
)

In [None]:
# Compare as plain lists so that a length mismatch fails the assertion cleanly
# instead of raising ValueError from the element-wise pandas comparison.
assert list(cluster_labels["index"]) == list(sc_pancreas_test.columns), (
    'cluster_labels["index"] must equal sc_pancreas_test.columns element by element'
)

In [None]:
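# PCA should be a DataFrame containing the coordinates of each cell in the PCA-transformed space for the first 50 PCs
# NOTE: this name shadows the sklearn `PCA` class imported earlier, so fit the PCA model before assigning the DataFrame.
# PCA.columns = ["index", "PC1", "PC2", ..., "PC50"]
# PCA["index"] = sc_pancreas_test.columns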

In [None]:
# Compare as plain lists: an element-wise `Index == list` comparison raises
# ValueError (instead of failing the assertion) when the column count differs.
expected_pca_columns = ["index"] + [f"PC{i}" for i in range(1, 51)]
assert list(PCA.columns) == expected_pca_columns, (
    'PCA.columns must be ["index", "PC1", ..., "PC50"], '
    f"got {len(PCA.columns)} columns starting with {list(PCA.columns[:3])}"
)

In [None]:
# Compare as plain lists so that a length mismatch fails the assertion cleanly
# instead of raising ValueError from the element-wise pandas comparison.
assert list(PCA["index"]) == list(sc_pancreas_test.columns), (
    'PCA["index"] must equal sc_pancreas_test.columns element by element'
)

In [None]:
# Resolve all three result frames BEFORE the archive is created: the archive is
# opened in exclusive mode "x", so if a frame were missing only after the file
# exists, the leftover stub zip would make every re-run fail with FileExistsError.
archive_members = (
    ("imputed_bulkified.csv", bulkified),
    ("cluster_membership.csv", cluster_labels),
    ("PCA.csv", PCA),
)

# Mode "x" refuses to overwrite an existing archive; delete or rename a previous
# submission before re-running this cell.
with zipfile.ZipFile(os.path.join(results_path, archive_name), "x") as zf:
    for member_name, frame in archive_members:
        with zf.open(member_name, "w") as buffer:
            frame.to_csv(buffer)
# No explicit zf.close() is needed: leaving the `with` block closes the archive.

In [None]:
# Re-open the archive read-only and check that all three deliverables are present.
# The context manager releases the file handle as soon as the listing is read.
expected_members = {"imputed_bulkified.csv", "cluster_membership.csv", "PCA.csv"}
with zipfile.ZipFile(os.path.join(results_path, archive_name)) as archive:
    missing_members = expected_members - set(archive.namelist())
assert not missing_members, f"Archive is missing: {sorted(missing_members)}"