<a href="https://colab.research.google.com/github/tb-harris/neuroscience-2024/blob/main/Figuring_Out_the_Feature_Extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mount Google Drive, then install the feature extractor library:

In [1]:
# Mount Google Drive at /content/drive so the /content/drive/Shareddrives/... paths used below resolve
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install neuron_morphology

In [3]:
import sys
sys.path.insert(0, "../")

from io import StringIO
import copy
import matplotlib.pyplot as plt

import neuron_morphology.swc_io as swcio
from neuron_morphology.morphology import Morphology
from neuron_morphology.swc_io import morphology_from_swc
from neuron_morphology.feature_extractor.data import Data
from neuron_morphology.feature_extractor.feature_extractor import FeatureExtractor
from neuron_morphology.features.default_features import default_features
from neuron_morphology.constants import (
    SOMA, AXON, BASAL_DENDRITE, APICAL_DENDRITE
)

import json
import numpy as np
import neuron_morphology.feature_extractor.feature_writer as fw

2024-07-15 13:50:23,645 numexpr.utils INFO     NumExpr defaulting to 2 threads.


Load in cell metadata and manifest file containing reconstruction file names

In [4]:
import pandas as pd

# Cell metadata table (CSV stored on the shared drive)
path = '/content/drive/Shareddrives/Lisman Laboratory/Data 2023/Allen Brain Institute Data/20200711_patchseq_metadata_mouse.csv'
metadata = pd.read_csv(path)
# File manifest: later cells read its "file_type", "archive_uri", "file_name"
# and "cell_specimen_id" columns to find and label each reconstruction
path1 = '/content/drive/Shareddrives/Lisman Laboratory/Data 2023/Allen Brain Institute Data/2021-09-13_mouse_file_manifest.csv'
manifest = pd.read_csv(path1)

Get the URL to download each of our 573 neuron reconstruction files:

In [5]:
# Keep only the manifest rows that describe transformed SWC reconstructions
is_transformed_swc = manifest["file_type"] == "transformed_swc"
swc_urls = manifest.loc[is_transformed_swc]

# Download address of every selected reconstruction, as a NumPy array
archive_url = swc_urls["archive_uri"].values

Download the reconstruction files:

In [None]:
#5m 10s
for url in archive_url:
  !wget {url}

In [7]:
# File names of the reconstructions listed in the manifest
swc_path = swc_urls["file_name"].values

import os
# Resolve every file name against the current working directory
swc_paths = list(map(os.path.abspath, swc_urls["file_name"].values.tolist()))

# Parse each SWC file into a morphology object, keeping the manifest order
morphologies = [morphology_from_swc(swc_file) for swc_file in swc_paths]

Copy the files into Google Drive:

In [19]:
# Copy every .swc file in the current working directory into the shared-drive reconstructions folder
!cp -r ./*.swc "/content/drive/Shareddrives/Lisman Laboratory/Lisman 2024/Neuro/Data/raw_data/reconstructions/"

### Step 1: Import our features

Find our relevant feature(s) from the [documentation](https://neuron-morphology.readthedocs.io/en/latest/autoapi/neuron_morphology/features/index.html) and import the correct library.

In [8]:
from neuron_morphology.features.path import max_path_distance
from neuron_morphology.features.branching.bifurcations import mean_bifurcation_angle_local

### Step 2: Register our features
Register the features we want to use. For each new feature, add another `specialize()` call that passes the feature function and the constant `NEURITE_SPECIALIZATIONS` (which indicates that we want the feature for all neurites -- you can narrow this down by passing a different constant).

In [9]:
from neuron_morphology.feature_extractor.marked_feature import specialize
from neuron_morphology.feature_extractor.feature_specialization import NEURITE_SPECIALIZATIONS

# Wrap each chosen feature function with the neurite specializations
selected_features = [
    specialize(feature, NEURITE_SPECIALIZATIONS)
    for feature in (max_path_distance, mean_bifurcation_angle_local)
]

# The extractor used by the rest of the notebook
fe = FeatureExtractor()
fe.register_features(selected_features)

<neuron_morphology.feature_extractor.feature_extractor.FeatureExtractor at 0x7ca26d1966e0>

### Step 3: Extract features

Run the two cells below to create a dataframe with our features of interest.

In [10]:
from neuron_morphology.feature_extractor.utilities import unnest

# Extract the features from a single neuron morphology object
def extract_features(neuron_morphology):
  """Run the registered features on one morphology and return a flat dict.

  Parameters
  ----------
  neuron_morphology
      A morphology object, such as one returned by morphology_from_swc().

  Returns
  -------
  dict
      Feature results keyed by dot-notation name (for example
      "axon.max_path_distance"), suitable as one row of a DataFrame.
  """
  data = Data(neuron_morphology)

  # Uses the notebook-level extractor `fe` and whatever features are registered on it
  feature_extraction_run = fe.extract(data)

  # unnest() returns a new, flattened dictionary instead of modifying its
  # argument, so the return value has to be used; calling it and discarding
  # the result would leave any nested feature results un-flattened.
  return unnest(feature_extraction_run.results)

Create a data frame by running the *extract_features()* function on each neuron morphology.

In [None]:
# One result dictionary per morphology, in the same order as the rows of swc_urls
feature_rows = [extract_features(neuron) for neuron in morphologies]

# Label each row with its integer cell specimen id
features = pd.DataFrame(
    feature_rows,
    index=swc_urls["cell_specimen_id"].astype(int),
)

In [12]:
# Display the extracted feature table (one row per cell_specimen_id)
features

Unnamed: 0_level_0,basal_dendrite.max_path_distance,axon.max_path_distance,all_neurites.max_path_distance,dendrite.max_path_distance,basal_dendrite.mean_bifurcation_angle_local,axon.mean_bifurcation_angle_local,all_neurites.mean_bifurcation_angle_local,dendrite.mean_bifurcation_angle_local,apical_dendrite.max_path_distance,apical_dendrite.mean_bifurcation_angle_local
cell_specimen_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
601506507,523.700528,794.445695,794.445695,523.700528,1.312409,1.485740,1.453871,1.312409,,
601790961,325.760216,1061.473816,1061.473816,325.760216,1.223044,1.480208,1.450232,1.223044,,
601803754,281.814132,861.607466,861.607466,281.814132,1.675803,1.634313,1.637428,1.675803,,
601808698,262.570371,864.646669,864.646669,262.570371,1.399844,1.493160,1.486658,1.399844,,
601810307,273.631704,923.251790,923.251790,273.631704,1.178422,1.487785,1.463036,1.178422,,
...,...,...,...,...,...,...,...,...,...,...
992386952,322.624669,764.231479,764.231479,322.624669,1.378766,1.407236,1.404152,1.378766,,
992830261,307.323853,826.281806,826.281806,307.323853,1.335673,1.492918,1.462526,1.335673,,
993243528,311.198809,618.332210,618.332210,311.198809,1.334276,1.457965,1.422864,1.334276,,
993245688,289.194411,1155.124808,1155.124808,289.194411,1.425500,1.405364,1.407533,1.425500,,


### Step 4: Save data

Save your features data to Drive:

In [15]:
# Destination on the shared drive for the extracted feature table
features_file = '/content/drive/Shareddrives/Lisman Laboratory/Lisman 2024/Neuro/Data/student data/Mr_Harris_path_bifurcation.hdf5'

# Store the table in the HDF5 file under the key "features"
features.to_hdf(features_file, key='features')

### Step 5: Loading in your data

Now that we've saved our data to Drive, we can load it in any notebook (so we won't need to run the code again):

**Change the file name at the end of the path to have your name and a description of the data:**

In [16]:
# Read back the feature table from the same file path that the Step 4 cell wrote to
cell_data_angle = pd.read_hdf('/content/drive/Shareddrives/Lisman Laboratory/Lisman 2024/Neuro/Data/student data/Mr_Harris_path_bifurcation.hdf5')

In [17]:
# Display the table loaded from Drive
cell_data_angle

Unnamed: 0_level_0,basal_dendrite.max_path_distance,axon.max_path_distance,all_neurites.max_path_distance,dendrite.max_path_distance,basal_dendrite.mean_bifurcation_angle_local,axon.mean_bifurcation_angle_local,all_neurites.mean_bifurcation_angle_local,dendrite.mean_bifurcation_angle_local,apical_dendrite.max_path_distance,apical_dendrite.mean_bifurcation_angle_local
cell_specimen_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
601506507,523.700528,794.445695,794.445695,523.700528,1.312409,1.485740,1.453871,1.312409,,
601790961,325.760216,1061.473816,1061.473816,325.760216,1.223044,1.480208,1.450232,1.223044,,
601803754,281.814132,861.607466,861.607466,281.814132,1.675803,1.634313,1.637428,1.675803,,
601808698,262.570371,864.646669,864.646669,262.570371,1.399844,1.493160,1.486658,1.399844,,
601810307,273.631704,923.251790,923.251790,273.631704,1.178422,1.487785,1.463036,1.178422,,
...,...,...,...,...,...,...,...,...,...,...
992386952,322.624669,764.231479,764.231479,322.624669,1.378766,1.407236,1.404152,1.378766,,
992830261,307.323853,826.281806,826.281806,307.323853,1.335673,1.492918,1.462526,1.335673,,
993243528,311.198809,618.332210,618.332210,311.198809,1.334276,1.457965,1.422864,1.334276,,
993245688,289.194411,1155.124808,1155.124808,289.194411,1.425500,1.405364,1.407533,1.425500,,


### Step 6: Combine with other dataframes


We can also combine our dataframe with existing dataframes -- for example, our dataframe that has all the genes and other morph features:

Let's load in our existing gene+morph dataframe (and drop every column whose value is zero for all cells):

In [20]:
# Read in our morph + genetic data
cell_data = pd.read_hdf('/content/drive/Shareddrives/Lisman Laboratory/Lisman 2024/Neuro/Data/processed data/genes_morph.hdf5.lz4')

# Gets all gene cols where the value for every cell is 0.
zero_genes = cell_data.columns[(cell_data == 0).all()]
# gene_data[zero_genes].sum().sum() # Double check these columns are 0

# Removes all of the columns in zero_genes from our dataframe
cell_data = cell_data.drop(zero_genes, axis="columns")

Let's combine this with our dataframe:

In [23]:
# Creates a new dataframe that combines cell_data with our newly calculated features.
# The join matches rows on the index of both frames; how='inner' keeps only the
# cells that are present in both.
# NOTE(review): `features` exists only if Steps 1-3 were run in this session;
# cell_data_angle (loaded in Step 5 from the file that `features` was saved to)
# could be joined instead when starting from a fresh kernel -- confirm intent.
cell_data_new = cell_data.join(features, how='inner')

In [24]:
# Display the combined gene + morphology table
cell_data_new

Unnamed: 0_level_0,0610005C13Rik,0610006L08Rik,0610007P14Rik,0610009B22Rik,0610009E02Rik,0610009L18Rik,0610009O20Rik,0610010B08Rik,0610010F05Rik,0610010K14Rik,...,basal_dendrite.max_path_distance,axon.max_path_distance,all_neurites.max_path_distance,dendrite.max_path_distance,basal_dendrite.mean_bifurcation_angle_local,axon.mean_bifurcation_angle_local,all_neurites.mean_bifurcation_angle_local,dendrite.mean_bifurcation_angle_local,apical_dendrite.max_path_distance,apical_dendrite.mean_bifurcation_angle_local
cell_specimen_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
601506507,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,15.216454,0.000000,...,523.700528,794.445695,794.445695,523.700528,1.312409,1.485740,1.453871,1.312409,,
601790961,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,325.760216,1061.473816,1061.473816,325.760216,1.223044,1.480208,1.450232,1.223044,,
601803754,0.0,0.0,0.000000,69.658825,0.000000,0.000000,0.000000,0.000000,1.698996,0.000000,...,281.814132,861.607466,861.607466,281.814132,1.675803,1.634313,1.637428,1.675803,,
601808698,0.0,0.0,152.239614,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,20.674515,...,262.570371,864.646669,864.646669,262.570371,1.399844,1.493160,1.486658,1.399844,,
601810307,0.0,0.0,165.312294,0.000000,0.000000,0.000000,0.000000,0.000000,25.931340,19.448505,...,273.631704,923.251790,923.251790,273.631704,1.178422,1.487785,1.463036,1.178422,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
963055521,0.0,0.0,46.237643,46.237643,18.495057,0.000000,46.237643,0.000000,231.188215,0.000000,...,210.006799,41.847068,353.113628,353.113628,1.170458,,1.202542,1.202542,353.113628,1.242266
963063283,0.0,0.0,57.018453,15.550487,5.183496,0.000000,5.183496,0.000000,114.036906,0.000000,...,183.198619,166.483139,400.715579,400.715579,1.418904,,1.347300,1.347300,400.715579,1.230131
992386952,0.0,0.0,131.578123,0.000000,0.000000,0.000000,0.000000,0.000000,106.515623,0.000000,...,322.624669,764.231479,764.231479,322.624669,1.378766,1.407236,1.404152,1.378766,,
992268582,0.0,0.0,0.000000,33.372182,0.000000,0.000000,2.567091,2.567091,77.012728,0.000000,...,289.480802,673.558855,673.558855,289.480802,1.087030,1.505030,1.464578,1.087030,,


This adds our new morphological features to our columns:

In [28]:
# Show the last 34 column labels, which include the newly joined
# max_path_distance and mean_bifurcation_angle_local features
cell_data_new.columns[-34:]

Index(['axon.total_length', 'all_neurites.total_length',
       'basal_dendrite.total_length', 'dendrite.total_length',
       'dendrite.total_surface_area', 'axon.total_surface_area',
       'all_neurites.total_surface_area', 'basal_dendrite.total_surface_area',
       'dendrite.total_volume', 'axon.total_volume',
       'all_neurites.total_volume', 'basal_dendrite.total_volume',
       'dendrite.num_tips', 'axon.num_tips', 'all_neurites.num_tips',
       'basal_dendrite.num_tips', 'dendrite.num_nodes', 'axon.num_nodes',
       'all_neurites.num_nodes', 'basal_dendrite.num_nodes',
       'dendrite.num_branches', 'axon.num_branches',
       'all_neurites.num_branches', 'basal_dendrite.num_branches',
       'basal_dendrite.max_path_distance', 'axon.max_path_distance',
       'all_neurites.max_path_distance', 'dendrite.max_path_distance',
       'basal_dendrite.mean_bifurcation_angle_local',
       'axon.mean_bifurcation_angle_local',
       'all_neurites.mean_bifurcation_angle_local',
 

We can save this data to our Drive. **Change the file name (stuff after the last /) to have your name and a description of the overall data:**

In [29]:
# Destination on the shared drive for the combined gene + morphology table
combined_file = '/content/drive/Shareddrives/Lisman Laboratory/Lisman 2024/Neuro/Data/student data/Mr_Harris_gene_morph_with_path.hdf5'

# Store the table in the HDF5 file under the key "cell_data"
cell_data_new.to_hdf(combined_file, key='cell_data')

Now, in any future notebook, we can just directly load in the dataframe that we created:

In [31]:
# Reload the combined table from the file saved in the previous cell, then display it
my_data = pd.read_hdf('/content/drive/Shareddrives/Lisman Laboratory/Lisman 2024/Neuro/Data/student data/Mr_Harris_gene_morph_with_path.hdf5')
my_data

Unnamed: 0_level_0,0610005C13Rik,0610006L08Rik,0610007P14Rik,0610009B22Rik,0610009E02Rik,0610009L18Rik,0610009O20Rik,0610010B08Rik,0610010F05Rik,0610010K14Rik,...,basal_dendrite.max_path_distance,axon.max_path_distance,all_neurites.max_path_distance,dendrite.max_path_distance,basal_dendrite.mean_bifurcation_angle_local,axon.mean_bifurcation_angle_local,all_neurites.mean_bifurcation_angle_local,dendrite.mean_bifurcation_angle_local,apical_dendrite.max_path_distance,apical_dendrite.mean_bifurcation_angle_local
cell_specimen_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
601506507,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,15.216454,0.000000,...,523.700528,794.445695,794.445695,523.700528,1.312409,1.485740,1.453871,1.312409,,
601790961,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,325.760216,1061.473816,1061.473816,325.760216,1.223044,1.480208,1.450232,1.223044,,
601803754,0.0,0.0,0.000000,69.658825,0.000000,0.000000,0.000000,0.000000,1.698996,0.000000,...,281.814132,861.607466,861.607466,281.814132,1.675803,1.634313,1.637428,1.675803,,
601808698,0.0,0.0,152.239614,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,20.674515,...,262.570371,864.646669,864.646669,262.570371,1.399844,1.493160,1.486658,1.399844,,
601810307,0.0,0.0,165.312294,0.000000,0.000000,0.000000,0.000000,0.000000,25.931340,19.448505,...,273.631704,923.251790,923.251790,273.631704,1.178422,1.487785,1.463036,1.178422,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
963055521,0.0,0.0,46.237643,46.237643,18.495057,0.000000,46.237643,0.000000,231.188215,0.000000,...,210.006799,41.847068,353.113628,353.113628,1.170458,,1.202542,1.202542,353.113628,1.242266
963063283,0.0,0.0,57.018453,15.550487,5.183496,0.000000,5.183496,0.000000,114.036906,0.000000,...,183.198619,166.483139,400.715579,400.715579,1.418904,,1.347300,1.347300,400.715579,1.230131
992386952,0.0,0.0,131.578123,0.000000,0.000000,0.000000,0.000000,0.000000,106.515623,0.000000,...,322.624669,764.231479,764.231479,322.624669,1.378766,1.407236,1.404152,1.378766,,
992268582,0.0,0.0,0.000000,33.372182,0.000000,0.000000,2.567091,2.567091,77.012728,0.000000,...,289.480802,673.558855,673.558855,289.480802,1.087030,1.505030,1.464578,1.087030,,
