In [26]:
"""
Install required packages using pip.

Packages:
- pandas
- seaborn
- matplotlib
- scikit-learn
- numpy
- openpyxl
- pipenv

Usage: %pip install pandas seaborn matplotlib scikit-learn numpy openpyxl pipenv
"""
%pip install pandas seaborn matplotlib scikit-learn numpy openpyxl pipenv umap umap-learn pycombat

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:

"""
Load the installed packages. The `# type: ignore` comments tell static type checkers to skip these lines (for example when a package ships no type stubs); they have no effect at runtime.
"""
import pandas as pd # type: ignore
import seaborn as sns # type: ignore
import matplotlib.pyplot as plt # type: ignore
import numpy as np # type: ignore
import os # type: ignore
import re # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.decomposition import PCA # type: ignore
from sklearn.experimental import enable_iterative_imputer  # type: ignore
from sklearn.impute import IterativeImputer  # type: ignore
from sklearn.preprocessing import StandardScaler # type: ignore
from pycombat import pycombat # type: ignore

In [28]:
"""
Load the master workbook and extract the feature columns used in this analysis.
"""

# Workbook location and columns of interest, defined once so the path and the
# feature list cannot drift apart. Two headers ('Resting membrane potential '
# and 'Maximum firing ') end with a space; the strings must match exactly.
DATA_FILE = "data/01. Master_Latest data_Control clones_LP.xlsx"
FEATURE_COLUMNS = [
    'Batch number',
    'Genotype Neuron',
    'Div calculated',
    'Culture treatment',
    'Cultured by',
    'Capacitance',
    'Input Resistance',
    'Resting membrane potential ',
    'Maximum firing ',
    'Rheobase',
    'EPSC freq',
]
# An earlier revision used the same list with 'Calculated input resistance'
# added and 'Cultured by' left out.

print("Loading data...")
# Parse the workbook a single time; the feature table is a selection of it.
data = pd.read_excel(DATA_FILE, engine='openpyxl')

# Fail loudly when a requested column is absent (read_excel's `usecols` would
# also have raised in that case).
missing_feature_columns = [name for name in FEATURE_COLUMNS if name not in data.columns]
if missing_feature_columns:
    raise ValueError(f"Columns not found in {DATA_FILE}: {missing_feature_columns}")

# Keep the columns in sheet order, which is what `usecols` returns.
data_extracted_features = data.loc[:, [name for name in data.columns if name in FEATURE_COLUMNS]].copy()

print("\n")
print(data_extracted_features.dtypes)
print(data_extracted_features)

Loading data...


Batch number                     int64
Genotype Neuron                 object
Div calculated                   int64
Culture treatment               object
Cultured by                     object
Capacitance                    float64
Input Resistance               float64
Resting membrane potential       int64
Maximum firing                 float64
Rheobase                       float64
EPSC freq                       object
dtype: object
     Batch number Genotype Neuron  Div calculated   Culture treatment  \
0               1         TSC12.3              33                none   
1               1         TSC12.3              33                none   
2               1         TSC12.3              33                none   
3               1         TSC12.3              33                none   
4               1         TSC12.3              33                none   
..            ...             ...             ...                 ...   
683             3           

In [29]:
"""
Impute missing values in the numeric feature columns with scikit-learn's
IterativeImputer (default estimator) and export every imputed value to a CSV
table for quality control.

Inputs : `data_extracted_features` (feature table loaded earlier in the notebook).
Outputs: `data_extracted_features_imputed` (numeric columns imputed, object
         columns re-attached), `imputed_df` (the QC table), and a CSV file
         under `MICE_OUTPUT_DIR`.
"""

# Imputer settings. They are also encoded in the exported file name, so they are
# defined once here instead of being repeated by hand.
MICE_MIN_VALUE = 0             # lower bound for every imputed value
MICE_SAMPLE_POSTERIOR = False
MICE_MAX_ITER = 100
MICE_TOL = 0.001
MICE_OUTPUT_DIR = 'run_1/qc/tables/MICE_imputation'

# Open points (translated from the original Dutch notes):
# - min_value: should eventually be 0
# - tol (tolerance): explain from the algorithm/paper
# - sample_posterior: explain from the algorithm/paper
# - max_iter: try higher values (100, 1000) and compare the imputed values

# Work on a copy so the loaded table stays untouched.
data_extracted_features_impute = data_extracted_features.copy()

# 'EPSC freq' entries containing '>' are treated as missing. Casting to str
# first keeps this working whether the column was read as object (mixed text
# and numbers) or as a purely numeric column.
mask = data_extracted_features_impute['EPSC freq'].astype(str).str.contains(">", regex=False)
data_extracted_features_impute.loc[mask, 'EPSC freq'] = np.nan

# Convert 'EPSC freq' to numeric (raises if any other non-numeric text remains).
data_extracted_features_impute['EPSC freq'] = pd.to_numeric(data_extracted_features_impute['EPSC freq'])

# Initialize the IterativeImputer.
imputer = IterativeImputer(
    random_state=0,
    min_value=MICE_MIN_VALUE,
    sample_posterior=MICE_SAMPLE_POSTERIOR,
    max_iter=MICE_MAX_ITER,
    tol=MICE_TOL,
)

# The imputer only accepts numeric input: set the object (categorical) columns
# aside and add them back after imputation.
object_columns = data_extracted_features_impute.select_dtypes(include=['object']).columns
data_extracted_features_impute_copy = data_extracted_features_impute.drop(columns=object_columns)

# Impute missing values. The original index is kept so that re-attaching the
# object columns below aligns row by row even if the index is not 0..n-1.
data_extracted_features_imputed = pd.DataFrame(
    imputer.fit_transform(data_extracted_features_impute_copy),
    columns=data_extracted_features_impute_copy.columns,
    index=data_extracted_features_impute_copy.index,
)

# Add back the object columns (they end up after the numeric columns).
for column in object_columns:
    data_extracted_features_imputed[column] = data_extracted_features_impute[column]

# Collect the imputed values for the QC export.
dict_imputed_values = {
    'Feature': [],
    'Index': [],
    'Original Value': [],
    'Imputed Value': []
}

# Report the imputed values per numeric column (object columns are never imputed).
for column in data_extracted_features_impute_copy.columns:
    missing_mask_column = data_extracted_features_impute[column].isnull()
    # True where a value was missing before and is present after imputation.
    imputed_values_mask_column = missing_mask_column & data_extracted_features_imputed[column].notnull()
    if not imputed_values_mask_column.any():
        continue

    imputed_values_column = data_extracted_features_impute.loc[imputed_values_mask_column, column]
    imputed_value = data_extracted_features_imputed.loc[imputed_values_mask_column, column]

    # The +2 offset is kept from the original code; presumably it maps the
    # 0-based DataFrame index to the spreadsheet row (header row + 1-based
    # numbering) -- confirm against the workbook.
    print(f"Imputed values for {column}:")
    formatted_output = "\n".join(
        f"Index: {index + 2}\nOriginal Value: {original} --> Imputed value: {imputed}"
        for index, original, imputed in zip(imputed_values_column.index, imputed_values_column, imputed_value)
    )
    print(formatted_output + "\n")

    # Every selected original value is missing by construction, hence 'nan'.
    dict_imputed_values['Feature'].extend([column] * len(imputed_value))
    dict_imputed_values['Index'].extend((imputed_value.index + 2).tolist())
    dict_imputed_values['Original Value'].extend(['nan'] * len(imputed_value))
    dict_imputed_values['Imputed Value'].extend(imputed_value.tolist())

# Export the imputed values to CSV. The directory is created unconditionally so
# the export also works when no value had to be imputed.
os.makedirs(MICE_OUTPUT_DIR, exist_ok=True)
imputed_df = pd.DataFrame(dict_imputed_values)
imputed_df.to_csv(
    f"{MICE_OUTPUT_DIR}/MICE_imputed_values_{MICE_MIN_VALUE}_{str(MICE_SAMPLE_POSTERIOR).upper()}_{MICE_MAX_ITER}.csv",
    index=False,
)

Index: 651
Original Value: nan --> Imputed value: 1466.167696462265

Index: 632
Original Value: nan --> Imputed value: 2.2005210848869
Index: 637
Original Value: nan --> Imputed value: 2.395739636908261

Index: 452
Original Value: nan --> Imputed value: 31.001103513574673

Index: 27
Original Value: nan --> Imputed value: 0.5369298251648332
Index: 49
Original Value: nan --> Imputed value: 0.1377523495720544
Index: 52
Original Value: nan --> Imputed value: 0.0
Index: 56
Original Value: nan --> Imputed value: 0.96060651305173
Index: 57
Original Value: nan --> Imputed value: 0.015885209623630492
Index: 60
Original Value: nan --> Imputed value: 0.8281488505263931
Index: 61
Original Value: nan --> Imputed value: 0.34442872867283947
Index: 62
Original Value: nan --> Imputed value: 0.5615120228356675
Index: 64
Original Value: nan --> Imputed value: 0.06725870090869646
Index: 66
Original Value: nan --> Imputed value: 0.3188403830959624
Index: 73
Original Value: nan --> Imputed value: 0.72646562

In [30]:
from sklearn.linear_model import LinearRegression

# Impute missing values in the numeric feature columns with IterativeImputer
# driven by a LinearRegression estimator, and export every imputed value to a
# CSV table for quality control.
#
# Inputs : `data_extracted_features` (feature table loaded earlier in the notebook).
# Outputs: `data_extracted_features_imputed_glm`, `imputed_df_glm`, and a CSV
#          file under `GLM_OUTPUT_DIR`.

# Imputer settings; they are also encoded in the exported file name.
GLM_MIN_VALUE = 0              # lower bound for every imputed value
GLM_SAMPLE_POSTERIOR = False   # same as the IterativeImputer default
GLM_MAX_ITER = 100
GLM_OUTPUT_DIR = 'run_1/qc/tables/GLM_imputation'

# Work on a copy so the loaded table stays untouched.
data_extracted_features_impute_glm = data_extracted_features.copy()

# 'EPSC freq' entries containing '>' are treated as missing. Casting to str
# first keeps this working whether the column was read as object or numeric.
mask_glm = data_extracted_features_impute_glm['EPSC freq'].astype(str).str.contains(">", regex=False)
data_extracted_features_impute_glm.loc[mask_glm, 'EPSC freq'] = np.nan

# Convert 'EPSC freq' to numeric; errors='coerce' turns any other unexpected
# text into NaN, which is then imputed as well.
data_extracted_features_impute_glm['EPSC freq'] = pd.to_numeric(data_extracted_features_impute_glm['EPSC freq'], errors='coerce')

# The imputer only accepts numeric input: set the object columns aside.
object_columns_glm = data_extracted_features_impute_glm.select_dtypes(include=['object']).columns
data_extracted_features_impute_glm_copy = data_extracted_features_impute_glm.drop(columns=object_columns_glm)

# Set up the imputer with a LinearRegression estimator.
imputer_glm = IterativeImputer(
    estimator=LinearRegression(),
    min_value=GLM_MIN_VALUE,
    sample_posterior=GLM_SAMPLE_POSTERIOR,
    random_state=0,
    max_iter=GLM_MAX_ITER,
)

# Fit and transform. The original index is kept so that re-attaching the object
# columns below aligns row by row even if the index is not 0..n-1.
data_extracted_features_imputed_glm = pd.DataFrame(
    imputer_glm.fit_transform(data_extracted_features_impute_glm_copy),
    columns=data_extracted_features_impute_glm_copy.columns,
    index=data_extracted_features_impute_glm_copy.index,
)

# Add back the object columns (they end up after the numeric columns).
for column in object_columns_glm:
    data_extracted_features_imputed_glm[column] = data_extracted_features_impute_glm[column]

# Collect the imputed values for the QC export.
dict_imputed_glm_values = {
    'Feature': [],
    'Index': [],
    'Original Value': [],
    'Imputed Value': []
}

# Report the imputed values per numeric column.
for column_glm in data_extracted_features_impute_glm_copy.columns:
    missing_mask_column_glm = data_extracted_features_impute_glm_copy[column_glm].isnull()
    # True where a value was missing before and is present after imputation.
    imputed_values_mask_column_glm = missing_mask_column_glm & data_extracted_features_imputed_glm[column_glm].notnull()
    if not imputed_values_mask_column_glm.any():
        continue

    imputed_values_column_glm = data_extracted_features_impute_glm_copy.loc[imputed_values_mask_column_glm, column_glm]
    imputed_value_glm = data_extracted_features_imputed_glm.loc[imputed_values_mask_column_glm, column_glm]

    # The +2 offset is kept from the original code; presumably it maps the
    # 0-based DataFrame index to the spreadsheet row (header row + 1-based
    # numbering) -- confirm against the workbook.
    print(f"Imputed values for {column_glm}:")
    formatted_output = "\n".join(
        f"Index: {index_glm + 2}\nOriginal Value: {original_glm} --> Imputed value: {imputed_glm}"
        for index_glm, original_glm, imputed_glm in zip(imputed_values_column_glm.index, imputed_values_column_glm, imputed_value_glm)
    )
    print(formatted_output + "\n")

    # The row labels come from this cell's own selection. The previous code
    # indexed `imputed_values_mask_column`, a leftover variable from another
    # cell, which made this cell fail on a fresh kernel.
    indices_glm = imputed_value_glm.index
    dict_imputed_glm_values['Feature'].extend([column_glm] * len(indices_glm))
    dict_imputed_glm_values['Index'].extend((indices_glm + 2).tolist())
    # Every selected original value is missing by construction, hence 'nan'.
    dict_imputed_glm_values['Original Value'].extend(['nan'] * len(indices_glm))
    dict_imputed_glm_values['Imputed Value'].extend(imputed_value_glm.tolist())

# Export the imputed values to CSV. The directory is created unconditionally so
# the export also works when no value had to be imputed.
os.makedirs(GLM_OUTPUT_DIR, exist_ok=True)
imputed_df_glm = pd.DataFrame(dict_imputed_glm_values)
imputed_df_glm.to_csv(
    f"{GLM_OUTPUT_DIR}/GLM_imputed_values_{GLM_MIN_VALUE}_{str(GLM_SAMPLE_POSTERIOR).upper()}_{GLM_MAX_ITER}.csv",
    index=False,
)

Imputed values for Input Resistance:
Index: 651
Original Value: nan --> Imputed value: 1459.2659726497807

Imputed values for Maximum firing :
Index: 632
Original Value: nan --> Imputed value: 2.2294232161907663
Index: 637
Original Value: nan --> Imputed value: 2.438648690197947

Imputed values for Rheobase:
Index: 452
Original Value: nan --> Imputed value: 30.53588848201106

Imputed values for EPSC freq:
Index: 27
Original Value: nan --> Imputed value: 0.46250544450208564
Index: 49
Original Value: nan --> Imputed value: 0.05891104191168961
Index: 52
Original Value: nan --> Imputed value: 0.0
Index: 56
Original Value: nan --> Imputed value: 0.9527539411550435
Index: 57
Original Value: nan --> Imputed value: 0.0
Index: 60
Original Value: nan --> Imputed value: 0.7497141200283882
Index: 61
Original Value: nan --> Imputed value: 0.29000422448053176
Index: 62
Original Value: nan --> Imputed value: 0.5072489168092112
Index: 64
Original Value: nan --> Imputed value: 0.006377806020746557
Inde

## Linear Regression

In [1]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler 
from sklearn import preprocessing 
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import StratifiedShuffleSplit

# Linear regression of 'Div calculated' on five electrophysiological features,
# evaluated on 5 stratified shuffle splits.
#
# NOTE: this cell needs `data_extracted_features_imputed`, which is created by
# the imputation cell. Run that cell first; on a fresh kernel this cell fails
# with a NameError.

LR_FEATURE_COLUMNS = ['Capacitance', 'Resting membrane potential ', 'Maximum firing ', 'Input Resistance', 'Rheobase']
LR_TARGET_COLUMNS = ['Div calculated']

data_extracted_features_imputed_LR = data_extracted_features_imputed.copy()

X = data_extracted_features_imputed_LR[LR_FEATURE_COLUMNS]
y = data_extracted_features_imputed_LR[LR_TARGET_COLUMNS]

# Separate scalers for features and target, so fitting the target scaler does
# not overwrite the feature statistics.
scaler = StandardScaler()
target_scaler = StandardScaler()
X = scaler.fit_transform(X)
y = target_scaler.fit_transform(y)

# random_state makes the folds reproducible between runs.
sss = StratifiedShuffleSplit(n_splits=5, random_state=0)
regr = linear_model.LinearRegression()
all_r2s = []
best_r2 = {}

# Reference line drawn in every plot: y = slope * x + b with b = 0 and the
# slope set to the fold's R^2 (see `y_plot` below).
x = np.linspace(-2, 2, 111)
b = 0

for i, (train_index, test_index) in enumerate(sss.split(X, y)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)

    # Keep R^2 as a float: comparing formatted strings orders negative values
    # incorrectly.
    r2 = r2_score(y_test, y_pred)

    print(f'Fold {i}:')
    # The coefficients
    print("Coefficients: \n", regr.coef_)
    # The mean squared error
    print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
    # The coefficient of determination: 1 is perfect prediction
    print("Coefficient of determination: %.2f" % r2)

    y_test = y_test.flatten()
    y_pred = y_pred.flatten()

    y_plot = r2  # slope of the reference line
    all_r2s.append(r2)
    best_r2[i] = [r2, y_test, y_pred, x, y_plot, b]

    plt.scatter(y_test, y_pred, color="black")
    plt.xlabel('True values')
    plt.ylabel('Predicted values')
    plt.title(f'Linear Regression DIV fold {i}: True vs. Predicted values')
    plt.plot(x, y_plot * x + b, linestyle='solid', linewidth=3)
    plt.xticks(())
    plt.yticks(())
    plt.show()

# Summary: R^2 per fold and the best fold. The fold is selected on the numeric
# R^2 only, so ties never fall through to comparing the stored arrays.
print(["%.2f" % fold_r2 for fold_r2 in all_r2s])
best_fold = max(best_r2, key=lambda fold: best_r2[fold][0])
print(f"Best fold: {best_fold} (R^2 = {best_r2[best_fold][0]:.2f})")

NameError: name 'data_extracted_features_imputed' is not defined