In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm, Normalize

# from matplotlib import rcParam
import seaborn as sb
import os
import sys

sys.setrecursionlimit(1000000)
from scipy.stats import spearmanr
from scipy.spatial.distance import cdist
import pathlib
import scipy as sp
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from allensdk.core.mouse_connectivity_cache import MouseConnectivityCache

mcc = MouseConnectivityCache()
from scipy.stats import pearsonr
import itertools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import umap
import hdbscan
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
import scanpy as sc
import anndata as ad
from preprocessing_sequencing import preprocess_sequences as ps

In [None]:
# Load the normalised barcode matrices for the two brains (FIAA45.6a, FIAA45.6d).
# NOTE(review): unpickling can execute arbitrary code - only load files written
# by the trusted preprocessing pipeline.
# NOTE(review): presumably one row per barcode and one column per sample
# (RT primer), judging by how the frames are used below - confirm.
FIAA456a = pd.read_pickle(
    "/camp/lab/znamenskiyp/home/shared/projects/turnerb_A1_MAPseq/FIAA45.6a/Sequencing/barcode_matrix_normalised.pkl"
)
FIAA456d = pd.read_pickle(
    "/camp/lab/znamenskiyp/home/shared/projects/turnerb_A1_MAPseq/FIAA45.6d/Sequencing/barcode_matrix_normalised.pkl"
)

In [None]:
# Per-brain preprocessing parameters; the "rois_to_combine" entry is read
# further down when pooled ROIs are summed into a single tube column.
FIAA456a_parameters = ps.load_parameters(
    directory="/camp/lab/znamenskiyp/home/shared/projects/turnerb_A1_MAPseq/FIAA45.6a/Sequencing"
)
FIAA456d_parameters = ps.load_parameters(
    directory="/camp/lab/znamenskiyp/home/shared/projects/turnerb_A1_MAPseq/FIAA45.6d/Sequencing"
)

In [None]:
# How dominant is each row's top sample relative to its runner-up?
# A pseudocount of 1 is added to both so the ratio stays finite when the
# second-largest value is 0.
filtered_barcodes_FIAA456a = pd.DataFrame(
    {
        "highest": FIAA456a.max(axis=1) + 1,
        "second": FIAA456a.apply(
            lambda counts: counts.nlargest(2).values[-1], axis=1
        )
        + 1,
    }
)
filtered_barcodes_FIAA456d = pd.DataFrame(
    {
        "highest": FIAA456d.max(axis=1) + 1,
        "second": FIAA456d.apply(
            lambda counts: counts.nlargest(2).values[-1], axis=1
        )
        + 1,
    }
)

# log10 fold difference between the largest and second-largest count
filtered_barcodes_FIAA456a["relative_to_max"] = np.log10(
    filtered_barcodes_FIAA456a["highest"].div(filtered_barcodes_FIAA456a["second"])
)
filtered_barcodes_FIAA456d["relative_to_max"] = np.log10(
    filtered_barcodes_FIAA456d["highest"].div(filtered_barcodes_FIAA456d["second"])
)

In [None]:
# Overlay both brains' log10(max / 2nd max) distributions. Each observation is
# weighted by 1/N so bar heights are fractions rather than raw counts.
histogram_inputs = (
    (
        filtered_barcodes_FIAA456a["relative_to_max"],
        {"color": "Black", "label": "FIAA45.6a"},
    ),
    (filtered_barcodes_FIAA456d["relative_to_max"], {"label": "FIAA45.6d"}),
)
for log_ratio, series_style in histogram_inputs:
    plt.hist(
        log_ratio,
        bins=150,
        alpha=0.5,
        weights=np.full(log_ratio.shape, 1.0 / log_ratio.size),
        **series_style,
    )

plt.title("Distribution of barcode umi counts in samples relative to maximum")
plt.xlabel("log10 max/2nd max barcode count")
plt.ylabel("Frequency")

# Candidate soma thresholds (fold enrichment of the top sample over the runner-up)
for fold_enrichment, dash_style in ((70, "dashed"), (10, "-.")):
    plt.axvline(
        x=np.log10(fold_enrichment),
        linestyle=dash_style,
        color="Black",
        alpha=0.5,
        label=f"soma threshold ={fold_enrichment}",
    )
plt.legend()

In [None]:
# Third-largest value per row (+1 pseudocount, matching "highest" and "second")
for barcode_counts, barcode_summary in (
    (FIAA456a, filtered_barcodes_FIAA456a),
    (FIAA456d, filtered_barcodes_FIAA456d),
):
    barcode_summary["third"] = (
        barcode_counts.apply(lambda counts: counts.nlargest(3).values[-1], axis=1)
        + 1
    )

In [None]:
fig, ax = plt.subplots(2, 2, figsize=(10, 10))
plt.subplots_adjust(hspace=0.3)
fig.suptitle("Barcode umi counts comparisons")

# Reference lines with slope 1 in log-log space: y = x + log10(fold enrichment)
slope = 1
intercept = np.log10(70)
intercept_2 = np.log10(10)
x_vals = np.array((0, 5))
y_vals = intercept + slope * x_vals
y_vals_2 = intercept_2 + slope * x_vals

# How each summary column is worded in the axis labels
rank_words = {"highest": "first", "second": "second", "third": "third"}

# (axes, summary table, x column, y column, marker alpha, title)
panels = (
    (
        ax[0, 0],
        filtered_barcodes_FIAA456a,
        "second",
        "highest",
        0.1,
        "FIAA45.6a first vs second max",
    ),
    (
        ax[0, 1],
        filtered_barcodes_FIAA456a,
        "third",
        "second",
        0.1,
        "FIAA45.6a second vs third max counts",
    ),
    (
        ax[1, 0],
        filtered_barcodes_FIAA456d,
        "second",
        "highest",
        0.05,
        "FIAA45.6d first vs second max",
    ),
    (
        ax[1, 1],
        filtered_barcodes_FIAA456d,
        "third",
        "second",
        0.05,
        "FIAA45.6d second vs third max counts",
    ),
)

for panel, barcode_summary, x_col, y_col, marker_alpha, panel_title in panels:
    panel.scatter(
        x=np.log10(barcode_summary[x_col]),
        y=np.log10(barcode_summary[y_col]),
        s=1,
        alpha=marker_alpha,
    )
    # Only the x10 line is drawn; y_vals (x70) is kept for reference but unused
    panel.plot(
        x_vals, y_vals_2, "--", c="grey", label="x10 enrichment of soma", alpha=0.3
    )
    panel.set_title(panel_title)
    panel.set_xlabel(f"log10({rank_words[x_col]} max counts)")
    panel.set_ylabel(f"log10({rank_words[y_col]} max counts)")
    panel.set_ylim([0, 5])
    panel.set_xlim([0, 4])
    panel.legend()

In [None]:
# Keep a row only when its largest (count + 1) exceeds the second-largest
# (count + 1) by more than 10-fold, i.e. relative_to_max > log10(10).
soma_thresh = np.log10(10)

# .copy(): these frames are renamed / extended in place further down, so make
# them independent of FIAA456a / FIAA456d to avoid SettingWithCopyWarning and
# any ambiguity about whether the raw matrices are modified.
filtered_soma_FIAA456a = FIAA456a[
    filtered_barcodes_FIAA456a["relative_to_max"] > soma_thresh
].copy()
filtered_soma_FIAA456d = FIAA456d[
    filtered_barcodes_FIAA456d["relative_to_max"] > soma_thresh
].copy()

FIAA45.6a analysis

In [None]:
# Convert the barcode dataframe columns from RT primer (sample) ids to tube numbers.
# index_col replaces the former in-place set_index, so re-running this cell no
# longer fails with a KeyError on the already-consumed "sample" column.
RT_to_sample_FIAA456a = pd.read_csv(
    "/camp/lab/znamenskiyp/home/shared/projects/turnerb_A1_MAPseq/FIAA45.6a/Sequencing/RT_to_sample_FIAA45.6a.csv",
    index_col="sample",
)
mapping_FIAA456a = RT_to_sample_FIAA456a["tube"].to_dict()

# Rebind instead of renaming in place: the frame originates from a boolean
# filter of FIAA456a, where in-place edits raise SettingWithCopyWarning.
# NOTE(review): still not safe to run twice if tube numbers overlap sample ids.
filtered_soma_FIAA456a = filtered_soma_FIAA456a.rename(columns=mapping_FIAA456a)

Where several ROIs were pooled into a single sample, sum their counts into one column

In [None]:
# Tube 0 is a placeholder (non-existent tubes added so the RT-to-sample table
# has no gaps); remove that column.
filtered_soma_FIAA456a = filtered_soma_FIAA456a.drop(columns=[0])

In [None]:
# For every pooled group: write the summed counts of all member tubes into the
# group's tube column, then drop the member columns that are now redundant.
roi_groups_FIAA456a = FIAA456a_parameters["rois_to_combine"]
for tube_to_group in roi_groups_FIAA456a:
    member_tubes = roi_groups_FIAA456a[tube_to_group]
    filtered_soma_FIAA456a[tube_to_group] = filtered_soma_FIAA456a[
        member_tubes
    ].sum(axis=1)
    drop_list = [member for member in member_tubes if member != tube_to_group]
    filtered_soma_FIAA456a.drop(columns=drop_list, inplace=True)

In [None]:
# Persist the soma-thresholded, tube-indexed FIAA45.6a matrix.
filtered_soma_FIAA456a.to_pickle(
    "/camp/lab/znamenskiyp/home/shared/projects/turnerb_A1_MAPseq/FIAA45.6a/Sequencing/barcode_matrix_soma_thresholded.pkl"
)

In [None]:
# Tube numbers treated as A1 source sites in FIAA45.6a
A1_FIAA456a = [83, 84, 85, 86, 97, 98, 99, 100, 101, 102, 103, 104]

# Keep only rows whose maximum count falls in one of the A1 tubes
source_in_A1_FIAA456a = filtered_soma_FIAA456a.idxmax(axis=1).isin(A1_FIAA456a)
filtered_barcodes_A1_source_FIAA456a = filtered_soma_FIAA456a.loc[
    source_in_A1_FIAA456a
]

In [None]:
# Render matplotlib figures inline in the notebook output.
# NOTE(review): plotting cells already appear above this line; if the magic is
# needed at all it belongs next to the imports.
%matplotlib inline

In [None]:
# A row counts as projecting outside its source sample when it is non-zero in
# more than one tube.
multi_tube_FIAA456a = filtered_barcodes_A1_source_FIAA456a.astype(bool).sum(axis=1) > 1
n_A1_source_FIAA456a = len(filtered_barcodes_A1_source_FIAA456a)
n_projecting_FIAA456a = len(filtered_barcodes_A1_source_FIAA456a[multi_tube_FIAA456a])

labels = (
    f"All A1 source  \n neurons ({n_A1_source_FIAA456a}/{len(filtered_soma_FIAA456a)})",
    f"Projecting outside A1  \nsource sample ({n_projecting_FIAA456a})",
)
# NOTE(review): the second group is a subset of the first, so the two wedges
# are not disjoint parts of a whole - confirm a pie chart is what is intended.
sizes = [n_A1_source_FIAA456a, n_projecting_FIAA456a]

fig, ax = plt.subplots()
ax.pie(sizes, labels=labels, colors=["gray", "skyblue"])

In [None]:
# FIAA45.6a tube numbers assigned to ipsilateral cortex. Written once and
# copied into `regions_to_add` below so the two can never drift apart.
ipsi_cortex = [
    3, 4, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 21, 22, 23,
    28, 29, 30, 31, 32, 33, 34, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
    57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 78, 79, 80, 81, 82, 87,
    93, 94, 95, 96, 105, 112, 113, 114, 115, 116, 117, 118, 119, 120,
    129, 131, 132, 133, 134, 135, 136, 144, 145, 146, 148,
    151, 152, 153, 154, 155, 156, 157, 161, 162, 163, 164,
    166, 167, 168, 169, 171, 172, 173, 174, 175, 176,
]

# Target region -> FIAA45.6a tube numbers belonging to that region
regions_to_add = {
    "caudal_striatum_samples": [121, 122, 130, 142],
    "dorsal_striatum_samples": [150, 158, 165, 170],
    "thalamus": [71, 88, 106, 107, 123, 124, 125, 149, 137, 138, 140, 141],
    "contra_cortex": [
        5, 14, 27, 39, 54, 72, 73, 89, 90, 109, 110, 111, 127, 128, 139,
    ],
    "IC": [1, 2, 6],
    "SC": [7, 25, 26, 37, 38, 51, 52, 68, 69],
    "pons": [13, 35, 53, 70],
    # independent copy, so editing one list does not silently change the other
    "ipsi_cortex": list(ipsi_cortex),
    "olfactory_bulb": [177, 178, 179, 180],
}

In [None]:
# Per-region totals for every A1-source row: sum the columns of all tubes
# assigned to the region, silently skipping tubes that are not columns of the
# matrix.
summed_data = {
    area: filtered_barcodes_A1_source_FIAA456a[
        [
            tube
            for tube in region_tubes
            if tube in filtered_barcodes_A1_source_FIAA456a.columns
        ]
    ].sum(axis=1)
    for area, region_tubes in regions_to_add.items()
}

# One column per region, then discard rows that are zero in every region
df_result = pd.DataFrame(summed_data)
df_result = df_result.loc[df_result.ne(0).any(axis=1)]

In [None]:
# Hierarchically clustered heatmap of the per-region totals (FIAA45.6a).
# standard_scale=0 rescales along axis 0 of the frame before plotting and
# norm=LogNorm() maps colours on a log scale.
# NOTE(review): standard scaling yields exact zeros, which a log colour scale
# cannot represent - confirm that combination is intended.
sb.clustermap(
    df_result,
    metric="euclidean",
    standard_scale=0,
    norm=LogNorm(),
    cmap="Greys",
    figsize=(6, 10),
    xticklabels=1,
    yticklabels=False,
    cbar_pos=(1.01, 0.5, 0.02, 0.18),
)

In [None]:
# Rows with a non-zero caudal-striatum total
striatum_projecting = df_result.loc[df_result["caudal_striatum_samples"].gt(0)]

In [None]:
# Display the caudal-striatum-projecting subset for inspection.
striatum_projecting

In [None]:
# Same clustered heatmap as for all regions, restricted to rows with a
# non-zero caudal-striatum total.
has_caudal_striatum = df_result["caudal_striatum_samples"].gt(0)
striatum_projecting = df_result.loc[has_caudal_striatum]
sb.clustermap(
    striatum_projecting,
    figsize=(6, 10),
    cmap="Greys",
    norm=LogNorm(),
    metric="euclidean",
    standard_scale=0,
    xticklabels=1,
    yticklabels=False,
    cbar_pos=(1.01, 0.5, 0.02, 0.18),
)

Now look at FIAA45.6d

In [None]:
# Convert the barcode dataframe columns from RT primer (sample) ids to tube numbers.
# index_col replaces the former in-place set_index, so re-running this cell no
# longer fails with a KeyError on the already-consumed "sample" column.
RT_to_sample_FIAA456d = pd.read_csv(
    "/camp/lab/znamenskiyp/home/shared/projects/turnerb_A1_MAPseq/FIAA45.6d/Sequencing/RT_to_sample_FIAA45.6d.csv",
    index_col="sample",
)
mapping = RT_to_sample_FIAA456d["tube"].to_dict()

# Rebind instead of renaming in place: the frame originates from a boolean
# filter of FIAA456d, where in-place edits raise SettingWithCopyWarning.
# NOTE(review): still not safe to run twice if tube numbers overlap sample ids.
filtered_soma_FIAA456d = filtered_soma_FIAA456d.rename(columns=mapping)

In [None]:
# Tube 0 is a placeholder (non-existent tubes added so the RT-to-sample table
# has no gaps); remove that column.
filtered_soma_FIAA456d = filtered_soma_FIAA456d.drop(columns=[0])

Where several ROIs were pooled into a single sample, sum their counts into one column

In [None]:
# In FIAA45.6d one sample has separate ROIs that were pooled under a single RT
# primer, so some tubes listed in "rois_to_combine" have no column of their
# own. Groups with any missing tube are reported and left untouched.
roi_groups_FIAA456d = FIAA456d_parameters["rois_to_combine"]
for tube_to_group in roi_groups_FIAA456d:
    member_tubes = roi_groups_FIAA456d[tube_to_group]
    missing_tubes = [
        tube for tube in member_tubes if tube not in filtered_soma_FIAA456d.columns
    ]
    if missing_tubes:
        # The previous message ("not all tubes not there") was a double
        # negative; say which tubes are absent instead.
        print(
            f"skipping ROI group {member_tubes}: tube(s) {missing_tubes} are not columns of the barcode matrix"
        )
        continue

    # Sum all member tubes into the group's column ...
    filtered_soma_FIAA456d[tube_to_group] = filtered_soma_FIAA456d[
        member_tubes
    ].sum(axis=1)
    # ... and drop the members that are now redundant (rebinding rather than
    # inplace=True, which warns on frames derived from a filter).
    drop_list = [tube for tube in member_tubes if tube != tube_to_group]
    filtered_soma_FIAA456d = filtered_soma_FIAA456d.drop(columns=drop_list)

In [None]:
# Persist the soma-thresholded, tube-indexed FIAA45.6d matrix.
filtered_soma_FIAA456d.to_pickle(
    "/camp/lab/znamenskiyp/home/shared/projects/turnerb_A1_MAPseq/FIAA45.6d/Sequencing/barcode_matrix_soma_thresholded.pkl"
)

In [None]:
# Tube numbers treated as A1 source sites in FIAA45.6d
A1 = [57, 58, 59, 60, 61, 79, 80, 81, 82, 83, 84, 99, 100, 101, 102, 103, 104, 105, 106]

# Keep only rows whose maximum count falls in one of the A1 tubes
source_in_A1_FIAA456d = filtered_soma_FIAA456d.idxmax(axis=1).isin(A1)
filtered_barcodes_A1_source = filtered_soma_FIAA456d.loc[source_in_A1_FIAA456d]
# Independent copy under a brain-specific name
FIAA456d_A1_source = filtered_barcodes_A1_source.copy()

In [None]:
# A row counts as projecting outside its source sample when it is non-zero in
# more than one tube.
multi_tube_FIAA456d = filtered_barcodes_A1_source.astype(bool).sum(axis=1) > 1
n_A1_source_FIAA456d = len(filtered_barcodes_A1_source)
n_projecting_FIAA456d = len(filtered_barcodes_A1_source[multi_tube_FIAA456d])

labels = (
    f"All A1 source  \n neurons ({n_A1_source_FIAA456d}/{len(filtered_soma_FIAA456d)})",
    f"Projecting outside A1  \nsource sample ({n_projecting_FIAA456d})",
)
# NOTE(review): the second group is a subset of the first, so the two wedges
# are not disjoint parts of a whole - confirm a pie chart is what is intended.
sizes = [n_A1_source_FIAA456d, n_projecting_FIAA456d]

fig, ax = plt.subplots()
ax.pie(sizes, labels=labels, colors=["gray", "skyblue"])

In [None]:
# FIAA45.6d region definitions.
#
# The cortical tube lists are built FIRST. Previously `regions_to_add` was
# filled in one cell and the FIAA45.6d `ipsi_cortex` was only computed in the
# following cell, so on a top-to-bottom run "ipsi_cortex" silently received the
# FIAA45.6a tube list left over from the earlier section (which overlaps the
# FIAA45.6d A1 and contralateral tubes).

# Every FIAA45.6d tube taken from cortex (either hemisphere, A1 included)
all_cortex = [
    4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20,
    25, 26, 27, 28, 29, 30, 31, 32, 33, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
    51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 66,
    70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88,
    93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
    109, 110, 111, 112, 113, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
    127, 128, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
    147, 148, 149, 150, 151, 153, 154, 155, 156, 157, 158, 159,
    161, 162, 163, 164, 165, 167, 168, 169, 170, 172, 173, 174, 175,
    177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
]
# Cortical tubes from the contralateral hemisphere
contra_cortex = [
    13, 25, 37, 51, 70, 71, 72, 93, 94, 95,
    118, 119, 120, 121, 132, 133, 134, 135, 147, 148,
]
# Ipsilateral cortex = cortex minus contralateral tubes minus the A1 source
# tubes (`A1` is defined in the A1-source cell above).
ipsi_cortex = [item for item in all_cortex if item not in contra_cortex]
ipsi_cortex = [item for item in ipsi_cortex if item not in A1]

# Target region -> FIAA45.6d tube numbers belonging to that region
regions_to_add = {
    "caudal_striatum_samples": [129, 144, 152, 160],
    "dorsal_striatum_samples": [166, 171, 176],
    "thalamus": [
        49, 64, 65, 67, 89, 90, 91, 92, 114, 115, 116, 130, 131, 145, 146,
    ],
    "contra_cortex": list(contra_cortex),
    "IC": [1, 2, 3, 22],
    "SC": [21, 23, 34, 35, 47, 48, 69],
    "pons": [12, 24, 36, 50],
    "ipsi_cortex": ipsi_cortex,
    "olfactory_bulb": [187, 188, 189, 190],
}

In [None]:
# Per-region totals for every A1-source row: sum the columns of all tubes
# assigned to the region, silently skipping tubes that are not columns of the
# matrix.
summed_data = {
    area: filtered_barcodes_A1_source[
        [
            tube
            for tube in region_tubes
            if tube in filtered_barcodes_A1_source.columns
        ]
    ].sum(axis=1)
    for area, region_tubes in regions_to_add.items()
}

# One column per region, then discard rows that are zero in every region
df_result = pd.DataFrame(summed_data)
df_result = df_result.loc[df_result.ne(0).any(axis=1)]

In [None]:
# Hierarchically clustered heatmap of the per-region totals (FIAA45.6d).
# standard_scale=0 rescales along axis 0 of the frame before plotting and
# norm=LogNorm() maps colours on a log scale.
# NOTE(review): standard scaling yields exact zeros, which a log colour scale
# cannot represent - confirm that combination is intended.
sb.clustermap(
    df_result,
    metric="euclidean",
    standard_scale=0,
    norm=LogNorm(),
    cmap="Greys",
    figsize=(6, 10),
    xticklabels=1,
    yticklabels=False,
    cbar_pos=(1.01, 0.5, 0.02, 0.18),
)

In [None]:
# Same clustered heatmap as for all regions, restricted to rows with a
# non-zero caudal-striatum total.
has_caudal_striatum = df_result["caudal_striatum_samples"].gt(0)
striatum_projecting = df_result.loc[has_caudal_striatum]
sb.clustermap(
    striatum_projecting,
    figsize=(6, 10),
    cmap="Greys",
    norm=LogNorm(),
    metric="euclidean",
    standard_scale=0,
    xticklabels=1,
    yticklabels=False,
    cbar_pos=(1.01, 0.5, 0.02, 0.18),
)

Look at correlation to Allen Anterograde Tracing

In [None]:
# NOTE(review): `download_allen` and the three `finalpix_expt_*` frames are not
# referenced by any later cell in this notebook - confirm they are still needed.
# The pickles are read from paths relative to the current working directory.
download_allen = pathlib.Path(
    "/camp/lab/znamenskiyp/home/shared/projects/turnerb_MAPseq/A1_MAPseq/Allen_Connectivity"
)
finalpix_expt_a = pd.read_pickle("mouse_connectivity/finalpix_expt_a.pkl")
finalpix_expt_b = pd.read_pickle("mouse_connectivity/finalpix_expt_b.pkl")
finalpix_expt_c = pd.read_pickle("mouse_connectivity/finalpix_expt_c.pkl")
# allen anterograde tracing datasets with more than 75% injection site AUDp
experiment_id_a = 120491896  # AUDp
experiment_id_b = 116903230  # AUDp, AUDpo, AUDd, AUDv
experiment_id_c = 100149109  # AUDp and AUDd
# injection volumes to normalise to (mm3)
expt_a_inj_vol = 0.097
expt_b_inj_vol = 0.114
expt_c_inj_vol = 0.073
# get projection density for each anterograde tracing expt: values are sum of projecting pixels per voxel.
# `mcc` is the MouseConnectivityCache created with default settings in the import cell.
expt_a, pd_a_info = mcc.get_projection_density(experiment_id_a)
expt_b, pd_b_info = mcc.get_projection_density(experiment_id_b)
expt_c, pd_c_info = mcc.get_projection_density(experiment_id_c)

In [None]:
# Normalise each experiment's projection density by its injection volume
# (no averaging happens here; the three experiments are averaged per ROI later).
expt_a_normalised = expt_a / expt_a_inj_vol
expt_b_normalised = expt_b / expt_b_inj_vol
expt_c_normalised = expt_c / expt_c_inj_vol

# Load the pre-computed 3D ROI label volumes (the "_25" files) that are used
# below as boolean masks on the Allen projection-density arrays.
# NOTE(review): presumably these were downsampled to the Allen grid (25 um) -
# confirm their shape matches expt_a.
ROI_3D_FIAA456a = np.load(
    "/camp/lab/znamenskiyp/home/shared/projects/turnerb_A1_MAPseq/FIAA45.6a/LCM/ROI_3D_25.npy"
)
ROI_3D_FIAA456d = np.load(
    "/camp/lab/znamenskiyp/home/shared/projects/turnerb_A1_MAPseq/FIAA45.6d/LCM/ROI_3D_25.npy"
)

In [None]:
# For every tube column of the A1-source matrix, sum the normalised Allen
# projection density inside that tube's ROI (one value per tracing experiment)
# next to the total MAPseq counts found in the tube.
#
# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and growing a frame row by row is
# quadratic. This also gives the columns numeric dtypes instead of object.
allen_records_FIAA456a = []
for tube in filtered_barcodes_A1_source_FIAA456a.columns:
    roi_mask = ROI_3D_FIAA456a == tube  # voxels labelled with this tube
    allen_records_FIAA456a.append(
        {
            "Sample": tube,
            "Allen_expt_a": expt_a_normalised[roi_mask].sum(),
            "Allen_expt_b": expt_b_normalised[roi_mask].sum(),
            "Allen_expt_c": expt_c_normalised[roi_mask].sum(),
            "MAPseq_counts": filtered_barcodes_A1_source_FIAA456a[tube].sum(),
        }
    )
FIAA456a_allen_comp = pd.DataFrame(
    allen_records_FIAA456a,
    columns=["Sample", "Allen_expt_a", "Allen_expt_b", "Allen_expt_c", "MAPseq_counts"],
)

In [None]:
# Summary statistics across the three Allen experiments, per ROI.
# "mean", "std" and "MAPseq_counts_numeric" are unchanged: log10(value + 1).
#
# The confidence interval is now computed on the raw scale and logged once.
# Previously it (a) combined the already-logged mean/std and took log10 again,
# and (b) divided by sqrt(number of ROI rows) although the standard deviation
# is taken across the 3 experiments.
allen_columns = ["Allen_expt_a", "Allen_expt_b", "Allen_expt_c"]
allen_raw = FIAA456a_allen_comp[allen_columns].astype(float)
allen_raw_mean = allen_raw.mean(axis=1)
allen_raw_std = allen_raw.std(axis=1)
# 1.96 * standard error of the mean, with n = number of experiments
ci_half_width = 1.96 * allen_raw_std / np.sqrt(len(allen_columns))

FIAA456a_allen_comp["mean"] = np.log10(allen_raw_mean + 1)
FIAA456a_allen_comp["std"] = np.log10(allen_raw_std + 1)
FIAA456a_allen_comp["conf_interval_upper"] = np.log10(
    allen_raw_mean + ci_half_width + 1
)
# Projection strength cannot be negative, so the lower bound is floored at 0
# before the log transform.
FIAA456a_allen_comp["conf_interval_lower"] = np.log10(
    (allen_raw_mean - ci_half_width).clip(lower=0) + 1
)
FIAA456a_allen_comp["MAPseq_counts_numeric"] = np.log10(
    pd.to_numeric(FIAA456a_allen_comp["MAPseq_counts"], errors="coerce") + 1
)

In [None]:
# Per-tube bioanalyzer table for FIAA45.6a; it is joined on its "tube" column below.
RIN_info_FIAA456a = pd.read_csv(
    "/camp/lab/znamenskiyp/home/shared/projects/turnerb_A1_MAPseq/FIAA45.6a/Sequencing/tube_bioanalyzer_FIAA45.6a.csv"
)

In [None]:
# Attach the bioanalyzer columns to each ROI row (left join keeps ROIs that
# have no bioanalyzer entry), then drop the duplicate key column.
FIAA456a_allen_comp = FIAA456a_allen_comp.merge(
    RIN_info_FIAA456a,
    how="left",
    left_on="Sample",
    right_on="tube",
).drop(columns="tube")

In [None]:
from decimal import Decimal

# Scatter of log10 MAPseq counts against log10 mean Allen projection strength,
# coloured by bioanalyzer result, with a linear fit drawn on the same axes.
# NOTE(review): the hue column here is "bioanalyzer_results" whereas the
# FIAA45.6d plot uses "bioanalyzer_result" - confirm each matches its CSV header.
corr_axes = sb.scatterplot(
    data=FIAA456a_allen_comp,
    x="mean",
    y="MAPseq_counts_numeric",
    hue="bioanalyzer_results",
    palette="ch:s=.25,rot=-.25",
)
sb.regplot(
    data=FIAA456a_allen_comp,
    x="mean",
    y="MAPseq_counts_numeric",
    scatter=False,
    ax=corr_axes,
)
corr_axes.set_xlabel("log 10 (Allen projection strength) (AU))")
corr_axes.set_ylabel("log 10 (MAPseq barcode counts)")

# Pearson correlation between the two log-transformed quantities
corr, p = pearsonr(
    FIAA456a_allen_comp["MAPseq_counts_numeric"], FIAA456a_allen_comp["mean"]
)
corr_axes.set_title(
    f"FIAA45.6a LCM ROI MAPseq barcode counts to anterograde tracing \n (r = {np.round(corr, 3)}, p = {Decimal(p):.2E})"
)

Now do the same for FIAA45.6d

In [None]:
# For every tube column of the FIAA45.6d A1-source matrix, sum the normalised
# Allen projection density inside that tube's ROI (one value per tracing
# experiment) next to the total MAPseq counts found in the tube.
#
# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and growing a frame row by row is
# quadratic. This also gives the columns numeric dtypes instead of object.
allen_records_FIAA456d = []
for tube in filtered_barcodes_A1_source.columns:
    roi_mask = ROI_3D_FIAA456d == tube  # voxels labelled with this tube
    allen_records_FIAA456d.append(
        {
            "Sample": tube,
            "Allen_expt_a": expt_a_normalised[roi_mask].sum(),
            "Allen_expt_b": expt_b_normalised[roi_mask].sum(),
            "Allen_expt_c": expt_c_normalised[roi_mask].sum(),
            "MAPseq_counts": filtered_barcodes_A1_source[tube].sum(),
        }
    )
FIAA456d_allen_comp = pd.DataFrame(
    allen_records_FIAA456d,
    columns=["Sample", "Allen_expt_a", "Allen_expt_b", "Allen_expt_c", "MAPseq_counts"],
)

# log10(value + 1) of the across-experiment mean and of the MAPseq totals
FIAA456d_allen_comp["mean"] = np.log10(
    FIAA456d_allen_comp[["Allen_expt_a", "Allen_expt_b", "Allen_expt_c"]].mean(axis=1)
    + 1
)
FIAA456d_allen_comp["MAPseq_counts_numeric"] = np.log10(
    pd.to_numeric(FIAA456d_allen_comp["MAPseq_counts"], errors="coerce") + 1
)

In [None]:
# Per-tube bioanalyzer table for FIAA45.6d
RIN_info_FIAA456d = pd.read_csv(
    "/camp/lab/znamenskiyp/home/shared/projects/turnerb_A1_MAPseq/FIAA45.6d/Sequencing/tube_bioanalyzer_FIAA45.6d.csv"
)
# Left join keeps ROIs without a bioanalyzer entry. Unlike the FIAA45.6a
# version, the "tube" key column is kept in the result.
FIAA456d_allen_comp = FIAA456d_allen_comp.merge(
    RIN_info_FIAA456d,
    how="left",
    left_on="Sample",
    right_on="tube",
)

In [None]:
# Scatter of log10 MAPseq counts against log10 mean Allen projection strength,
# coloured by bioanalyzer result, with a linear fit drawn on the same axes.
# `Decimal` comes from the import in the FIAA45.6a plotting cell above.
# NOTE(review): the hue column here is "bioanalyzer_result" whereas the
# FIAA45.6a plot uses "bioanalyzer_results" - confirm each matches its CSV header.
corr_axes = sb.scatterplot(
    data=FIAA456d_allen_comp,
    x="mean",
    y="MAPseq_counts_numeric",
    hue="bioanalyzer_result",
    palette="ch:s=.25,rot=-.25",
)
sb.regplot(
    data=FIAA456d_allen_comp,
    x="mean",
    y="MAPseq_counts_numeric",
    scatter=False,
    ax=corr_axes,
)
corr_axes.set_xlabel("log 10 (Allen projection strength) (AU))")
corr_axes.set_ylabel("log 10 (MAPseq barcode counts)")

# Pearson correlation between the two log-transformed quantities
corr, p = pearsonr(
    FIAA456d_allen_comp["MAPseq_counts_numeric"], FIAA456d_allen_comp["mean"]
)
corr_axes.set_title(
    f"FIAA45.6d LCM ROI MAPseq barcode counts to anterograde tracing \n (r = {np.round(corr, 3)}, p = {Decimal(p):.2E})"
)

For each LCM sample, find the LCM sample in the other brain that most closely matches

In [None]:
def _roi_centroids(roi_volume):
    """Return (labels, centroids) for every label > 0 in a 3D ROI label volume.

    Each centroid is the mean voxel index of the voxels carrying that label.
    """
    labels = [label for label in np.unique(roi_volume) if label > 0]
    centres = [np.argwhere(roi_volume == label).mean(axis=0) for label in labels]
    return labels, centres


tubes_FIAA456a, centroids_FIAA456a = _roi_centroids(ROI_3D_FIAA456a)
ROI_coords = pd.DataFrame({"Tube": tubes_FIAA456a, "Centroid": centroids_FIAA456a})

tubes, centroids = _roi_centroids(ROI_3D_FIAA456d)
ROI_coords_FIAA456d = pd.DataFrame({"Tube": tubes, "Centroid": centroids})

# Every (FIAA45.6d ROI, FIAA45.6a ROI) pair, FIAA45.6d varying slowest
combinations = list(
    itertools.product(ROI_coords_FIAA456d["Centroid"], ROI_coords["Centroid"])
)
tube_comp = list(itertools.product(ROI_coords_FIAA456d["Tube"], ROI_coords["Tube"]))

# Euclidean distance between the two centroids of each pair (voxel units)
distances = [
    np.linalg.norm(centroid_d - centroid_a) for centroid_d, centroid_a in combinations
]
tube_A = [pair[0] for pair in tube_comp]  # FIAA45.6d tube
tube_B = [pair[1] for pair in tube_comp]  # FIAA45.6a tube
distances_rois = pd.DataFrame(
    {"Tube A": tube_A, "Tube_B": tube_B, "Distance": distances}
)

# For each FIAA45.6d tube keep the row of its closest FIAA45.6a tube
closest_rows = distances_rois.groupby("Tube A")["Distance"].idxmin()
df_min_distance = distances_rois.loc[closest_rows]

In [None]:
# Load the FIAA45.6a ROI label volume stored as ROI_3D.npy (a different file
# from the ROI_3D_25.npy volume loaded earlier).
ROI_3D = np.load(
    "/camp/lab/znamenskiyp/home/shared/projects/turnerb_A1_MAPseq/FIAA45.6a/LCM/ROI_3D.npy"
)

In [None]:
# Number of voxels labelled as tube 86
np.count_nonzero(ROI_3D == 86)

In [None]:
# Centroid distances are measured in voxel indices of the ROI volumes loaded
# from ROI_3D_25.npy. Those volumes are used as masks on the Allen projection
# density (25 um grid by default), so one voxel is 25 um - the previous factor
# of 10 under-reported the distances on an axis labelled "um".
# NOTE(review): confirm the voxel size of ROI_3D_25.npy.
VOXEL_SIZE_UM = 25
plt.hist(df_min_distance["Distance"] * VOXEL_SIZE_UM, color="steelblue", bins=15)
plt.ylabel("Frequency")
plt.xlabel("Euclidean Distance (um)")
plt.title("Distribution of nearest distances between ROIs on different brains")

In [None]:
# Renumber rows 0..n-1; the previous index is kept as a new "index" column.
# NOTE(review): not idempotent - running this cell again inserts another
# column holding the previous index.
df_min_distance = df_min_distance.reset_index()

In [None]:
# Inspect which labels occur in the FIAA45.6a ROI volume.
np.unique(ROI_3D_FIAA456a)

In [None]:
# Total counts in each matched pair of tubes: "Tube A" is the FIAA45.6d tube,
# "Tube_B" its nearest FIAA45.6a tube.
df_min_distance["FIAA456d_counts"] = "NA"
df_min_distance["FIAA456a_counts"] = "NA"
for i, r in df_min_distance.iterrows():
    # Single .loc[row, column] assignment. The former chained form
    # df[col].loc[i] = value may write to a temporary copy (and does nothing
    # under pandas copy-on-write).
    df_min_distance.loc[i, "FIAA456d_counts"] = filtered_soma_FIAA456d[
        r["Tube A"]
    ].sum()
    df_min_distance.loc[i, "FIAA456a_counts"] = filtered_soma_FIAA456a[
        r["Tube_B"]
    ].sum()

In [None]:
# Spot check: total counts in the FIAA45.6d column labelled 9.
filtered_soma_FIAA456d[9].sum()

In [None]:
# Spot check: total counts in the FIAA45.6d column labelled 6.
filtered_soma_FIAA456d[6].sum()

In [None]:
# Spot check that relies on `r`, the last row left behind by the iterrows loop
# two cells above - it fails on a fresh kernel if that cell has not been run.
filtered_soma_FIAA456d[r["Tube A"]].sum()

In [None]:
# Preview the (FIAA45.6d tube, FIAA45.6a tube) pairs; the full list has one
# entry per ROI pair and floods the output.
tube_comp[:10]

In [None]:
# Preview the pairwise distances; the full collection has one entry per ROI
# pair and floods the output.
distances[:10]

In [None]:
# Euclidean distance for every centroid pair in `combinations`
distances = [
    np.linalg.norm(centroid_pair[0] - centroid_pair[1])
    for centroid_pair in combinations
]

In [None]:
# Pairwise centroid distances between the two brains using scipy's cdist.
#
# cdist needs plain (n_rois, 3) coordinate arrays; it was previously handed the
# ROI_coords DataFrames, whose "Centroid" column holds array objects. The ROI
# labels now come from the "Tube" columns instead of the undefined
# roi_names_brain1 / roi_names_brain2.
centroid_array_FIAA456a = np.array(ROI_coords["Centroid"].tolist())
centroid_array_FIAA456d = np.array(ROI_coords_FIAA456d["Centroid"].tolist())
distances = cdist(centroid_array_FIAA456a, centroid_array_FIAA456d, "euclidean")

# Long-format table matching distances.flatten() (row-major): the FIAA45.6a
# tube varies slowest, the FIAA45.6d tube fastest.
data = {
    "ROI Brain 1": np.repeat(ROI_coords["Tube"].to_numpy(), len(ROI_coords_FIAA456d)),
    "ROI Brain 2": np.tile(ROI_coords_FIAA456d["Tube"].to_numpy(), len(ROI_coords)),
    "Distance": distances.flatten(),
}

In [None]:
# NOTE(review): this cell reads `roi_volumes_brain1`, `roi_volumes_brain2`,
# `roi_names_brain1` and `roi_names_brain2`, none of which is defined anywhere
# in the visible notebook, so it raises NameError on a fresh run. It looks like
# a generic template for the centroid-distance computation - define the inputs
# or remove the cell.

# Centroid (mean voxel index of the non-zero voxels) of each ROI volume
centroids_brain1 = np.array(
    [np.mean(np.argwhere(roi_volume), axis=0) for roi_volume in roi_volumes_brain1]
)
centroids_brain2 = np.array(
    [np.mean(np.argwhere(roi_volume), axis=0) for roi_volume in roi_volumes_brain2]
)

# Calculate Euclidean distances between centroids of ROIs in different brains
distances = cdist(centroids_brain1, centroids_brain2, "euclidean")

# Create a DataFrame to store the distances and corresponding ROI names
data = {
    "ROI Brain 1": np.repeat(roi_names_brain1, len(roi_names_brain2)),
    "ROI Brain 2": np.tile(roi_names_brain2, len(roi_names_brain1)),
    "Distance": distances.flatten(),
}

In [None]:
# Rebuild the FIAA45.6a Allen-vs-MAPseq table (one row per tube column).
#
# Fixes relative to the previous version of this cell:
# * "Allen_expt_c" was read from expt_a_normalised (copy-paste slip) and so
#   duplicated experiment a; it now uses expt_c_normalised.
# * `FIAA456a_A1_source` and `downsampled_array_FIAA456a` are never defined in
#   this notebook; the A1-source matrix and ROI volume defined above
#   (`filtered_barcodes_A1_source_FIAA456a`, `ROI_3D_FIAA456a`) are used.
#   NOTE(review): confirm these are the intended inputs.
# * DataFrame.append (removed in pandas 2.0) is replaced by a list of records.
allen_records_FIAA456a = []
for tube in filtered_barcodes_A1_source_FIAA456a.columns:
    roi_mask = ROI_3D_FIAA456a == tube  # voxels labelled with this tube
    allen_records_FIAA456a.append(
        {
            "Sample": tube,
            "Allen_expt_a": expt_a_normalised[roi_mask].sum(),
            "Allen_expt_b": expt_b_normalised[roi_mask].sum(),
            "Allen_expt_c": expt_c_normalised[roi_mask].sum(),
            "MAPseq_counts": filtered_barcodes_A1_source_FIAA456a[tube].sum(),
        }
    )
FIAA456a_allen_comp = pd.DataFrame(
    allen_records_FIAA456a,
    columns=["Sample", "Allen_expt_a", "Allen_expt_b", "Allen_expt_c", "MAPseq_counts"],
)

In [None]:
# Summary statistics across the three Allen experiments, per ROI.
# "mean", "std" and "MAPseq_counts_numeric" are unchanged: log10(value + 1).
#
# The confidence interval is now computed on the raw scale and logged once.
# Previously it (a) combined the already-logged mean/std and took log10 again,
# and (b) divided by sqrt(number of ROI rows) although the standard deviation
# is taken across the 3 experiments.
allen_columns = ["Allen_expt_a", "Allen_expt_b", "Allen_expt_c"]
allen_raw = FIAA456a_allen_comp[allen_columns].astype(float)
allen_raw_mean = allen_raw.mean(axis=1)
allen_raw_std = allen_raw.std(axis=1)
# 1.96 * standard error of the mean, with n = number of experiments
ci_half_width = 1.96 * allen_raw_std / np.sqrt(len(allen_columns))

FIAA456a_allen_comp["mean"] = np.log10(allen_raw_mean + 1)
FIAA456a_allen_comp["std"] = np.log10(allen_raw_std + 1)
FIAA456a_allen_comp["conf_interval_upper"] = np.log10(
    allen_raw_mean + ci_half_width + 1
)
# Projection strength cannot be negative, so the lower bound is floored at 0
# before the log transform.
FIAA456a_allen_comp["conf_interval_lower"] = np.log10(
    (allen_raw_mean - ci_half_width).clip(lower=0) + 1
)
FIAA456a_allen_comp["MAPseq_counts_numeric"] = np.log10(
    pd.to_numeric(FIAA456a_allen_comp["MAPseq_counts"], errors="coerce") + 1
)

In [None]:
# Quick look: log10 MAPseq counts (x) against log10 mean Allen projection (y)
mapseq_log_counts = FIAA456a_allen_comp["MAPseq_counts_numeric"]
allen_log_mean = FIAA456a_allen_comp["mean"]
plt.scatter(x=mapseq_log_counts, y=allen_log_mean)

In [None]:
# Rebuild the FIAA45.6d Allen-vs-MAPseq table (one row per tube column).
#
# Fixes relative to the previous version of this cell:
# * "Allen_expt_c" was read from expt_a_normalised (copy-paste slip) and so
#   duplicated experiment a; it now uses expt_c_normalised.
# * `downsampled_array_FIAA456d` is never defined in this notebook; the ROI
#   volume loaded above (`ROI_3D_FIAA456d`) is used.
#   NOTE(review): confirm this is the intended volume.
# * DataFrame.append (removed in pandas 2.0) is replaced by a list of records.
#
# The explicit `for tube in ...` loop is kept because a later cell reads the
# leftover value of `tube`.
allen_records_FIAA456d = []
for tube in FIAA456d_A1_source.columns:
    roi_mask = ROI_3D_FIAA456d == tube  # voxels labelled with this tube
    allen_records_FIAA456d.append(
        {
            "Sample": tube,
            "Allen_expt_a": expt_a_normalised[roi_mask].sum(),
            "Allen_expt_b": expt_b_normalised[roi_mask].sum(),
            "Allen_expt_c": expt_c_normalised[roi_mask].sum(),
            "MAPseq_counts": FIAA456d_A1_source[tube].sum(),
        }
    )
FIAA456d_allen_comp = pd.DataFrame(
    allen_records_FIAA456d,
    columns=["Sample", "Allen_expt_a", "Allen_expt_b", "Allen_expt_c", "MAPseq_counts"],
)

In [None]:
# Summary statistics across the three Allen experiments, per ROI.
# "mean", "std" and "MAPseq_counts_numeric" are unchanged: log10(value + 1).
#
# The confidence interval is now computed on the raw scale and logged once.
# Previously it (a) combined the already-logged mean/std and took log10 again,
# and (b) divided by sqrt(number of ROI rows) although the standard deviation
# is taken across the 3 experiments.
allen_columns = ["Allen_expt_a", "Allen_expt_b", "Allen_expt_c"]
allen_raw = FIAA456d_allen_comp[allen_columns].astype(float)
allen_raw_mean = allen_raw.mean(axis=1)
allen_raw_std = allen_raw.std(axis=1)
# 1.96 * standard error of the mean, with n = number of experiments
ci_half_width = 1.96 * allen_raw_std / np.sqrt(len(allen_columns))

FIAA456d_allen_comp["mean"] = np.log10(allen_raw_mean + 1)
FIAA456d_allen_comp["std"] = np.log10(allen_raw_std + 1)
FIAA456d_allen_comp["conf_interval_upper"] = np.log10(
    allen_raw_mean + ci_half_width + 1
)
# Projection strength cannot be negative, so the lower bound is floored at 0
# before the log transform.
FIAA456d_allen_comp["conf_interval_lower"] = np.log10(
    (allen_raw_mean - ci_half_width).clip(lower=0) + 1
)
FIAA456d_allen_comp["MAPseq_counts_numeric"] = np.log10(
    pd.to_numeric(FIAA456d_allen_comp["MAPseq_counts"], errors="coerce") + 1
)

In [None]:
# Spot check that relies on `tube`, whatever value the most recently executed
# `for tube in ...` loop left behind.
FIAA456d_A1_source[tube].sum()

In [None]:
# Display the FIAA45.6d A1-source matrix for inspection.
FIAA456d_A1_source

In [None]:
# Quick look: log10 MAPseq counts (x) against log10 mean Allen projection (y)
mapseq_log_counts = FIAA456d_allen_comp["MAPseq_counts_numeric"]
allen_log_mean = FIAA456d_allen_comp["mean"]
plt.scatter(x=mapseq_log_counts, y=allen_log_mean)

In [None]:
# ROIs whose log10(MAPseq counts + 1) exceeds 5
FIAA456d_allen_comp.loc[FIAA456d_allen_comp["MAPseq_counts_numeric"].gt(5)]