<a href="https://colab.research.google.com/github/taryaksama/data-science/blob/master/task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A. Setup

In [None]:
#import packages
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Clone GitHub repository and make the clone the working directory, so that
# the relative paths used by the following cells resolve inside it.
# NOTE(review): presumably meant to be run once per fresh session — re-running
# it from inside the clone would clone a nested copy and `cd` into it; confirm.
!git clone https://github.com/taryaksama/data-science/
%cd data-science

# B. Experimental plan

Build a DataFrame describing the experimental plan, with one row per experiment and the following fields:
- n_exp
- date
- strain
- replica
- file_address

In [None]:
# Working directory that contains one sub-folder per experiment
path = '.'

# Experiment folders are the sub-directories whose name starts with six digits
folder_list = list(filter(
    lambda name: os.path.isdir(os.path.join(path, name)) and name[:6].isdigit(),
    os.listdir(path),
))

# One row per experiment folder, stored in the 'folderpath' column
exp = pd.DataFrame(folder_list, columns=['folderpath'])

In [None]:
# Folder names are sliced by position: characters 0-5 are stored as the date
# and characters 7-9 as the strain.
exp_date = [name[:6] for name in exp['folderpath']]
exp_strain = [name[7:10] for name in exp['folderpath']]

exp['date'] = exp_date
exp['strain'] = exp_strain

# 1. Histogram Area

## Associated function

In [None]:
# Bin edges shared by the histograms of this notebook:
# - raw areas: integer edges 0, 1, ..., 99 (99 bins of width 1)
# - normalised areas: 11 evenly spaced edges on [0, 1] (10 bins of width 0.1)
bin_edges = list(range(100))
bin_edges_norm = list(np.linspace(0,1,11))

def get_area_hist(path, filename, n_strain=0, n_replica=0):
  """
  Compute the density histograms of the areas stored in one CSV file.

  The CSV is read with pandas and its four columns are renamed to
  'cellid', 'area', 'area_mean' and 'area_norm'. The 'area' column is
  binned with `bin_edges` and the 'area_norm' column with `bin_edges_norm`,
  both as probability densities. Values outside the bin range are ignored.

  Parameters
  ----------
  path : str
      Directory containing the file (with or without a trailing separator).
  filename : str
      Name of the CSV file inside `path`.
  n_strain : int, optional
      Unused; kept so existing calls keep working.
  n_replica : int, optional
      Unused; kept so existing calls keep working.

  Returns
  -------
  counts : numpy.ndarray
      Density histogram of 'area' (one value per bin of `bin_edges`).
  counts_norm : numpy.ndarray
      Density histogram of 'area_norm' (one value per bin of `bin_edges_norm`).
  counts_std : float
      Standard deviation of `counts`.
  counts_norm_std : float
      Standard deviation of `counts_norm`.

  Raises
  ------
  ValueError
      If the CSV does not have exactly four columns (raised by pandas when
      the column names are assigned).
  """
  df = pd.read_csv(os.path.join(path, filename))
  df.columns = ['cellid', 'area', 'area_mean', 'area_norm']

  # np.histogram yields the same densities as plt.hist without creating
  # (and then having to close) a throwaway figure on every call.
  counts, _ = np.histogram(df['area'], bins=bin_edges, density=True)
  counts_norm, _ = np.histogram(df['area_norm'], bins=bin_edges_norm, density=True)

  # NOTE(review): these are the standard deviations of the bin densities, not
  # of the area values themselves — confirm this is the intended statistic.
  return counts, counts_norm, np.std(counts), np.std(counts_norm)

## Loop for all experiments

In [None]:
# get histogram from all experiments
# Results are collected first and then written column by column. This keeps the
# two std columns numeric (pre-filling them with empty lists forced an object
# dtype), leaves the working-directory variable `path` untouched, and does not
# rely on the index labels of `exp` being 0..n-1.
area_filename = 'frame_0_voronoi_areas_microns.csv'

area_results = [
  get_area_hist('./' + folder + '/tessellation/', area_filename)
  for folder in exp['folderpath']
]

# get_area_hist returns (counts, counts_norm, std(counts), std(counts_norm))
exp['area_hist'] = [result[0] for result in area_results]
exp['area_hist_norm'] = [result[1] for result in area_results]
exp['area_hist_std'] = [float(result[2]) for result in area_results]
exp['area_hist_norm_std'] = [float(result[3]) for result in area_results]

# 2. Surface Coverage

In [None]:
# get surface coverage from all experiments
# The value is read from the second column of each experiment's
# 'surface_coverage/surface_coverage_and_density.csv'.
coverage_filename = 'surface_coverage_and_density.csv'

coverage_values = []
for folder in exp['folderpath']:
  coverage_table = pd.read_csv('./' + folder + '/surface_coverage/' + coverage_filename)
  coverage_column = coverage_table.iloc[:, 1]

  # float(Series) is deprecated in recent pandas and only ever worked for a
  # single-row file: make that requirement explicit and extract the scalar.
  if len(coverage_column) != 1:
    raise ValueError(
      'expected exactly one row in ' + coverage_filename + ' of ' + folder
      + ', found ' + str(len(coverage_column))
    )
  coverage_values.append(float(coverage_column.iloc[0]))

# Stored as a float column (the former placeholder of range objects forced an
# object dtype)
exp['surface_coverage'] = pd.Series(coverage_values, index=exp.index, dtype=float)

In [None]:
# Display the experiment table (one row per experiment folder)
exp

# C. Plots

## Area histograms

In [None]:
# Column of `exp` that is drawn on the y axis
hist_column = 'area_hist' # change depending on column

# One row per experiment: x holds the left bin edges, y the histogram values
data_plot = pd.DataFrame([], columns=['x', 'y'])
data_plot['x'] = [bin_edges[:-1] for _ in exp.index]
data_plot['y'] = exp[hist_column]

In [None]:
# Plot histogram
# One curve per experiment, labelled '<strain>_<date>'. Labels are attached to
# each line directly instead of being matched to the lines by position.
fig, ax = plt.subplots()
curve_labels = exp['strain'] + '_' + exp['date']

for x_vals, y_vals, curve_label in zip(data_plot['x'], data_plot['y'], curve_labels):
  ax.plot(x_vals, y_vals, label=curve_label)

# Axis names assume data_plot['y'] holds the 'area_hist' densities; adjust
# them when another column is plotted.
ax.set_xlabel('Area')
ax.set_ylabel('Probability density')
ax.legend()
plt.show()

## STD vs. surface coverage

In [None]:
## for each strain
# One marker series per strain. Rows are selected with a boolean mask for both
# axes (no reliance on index alignment to blank out the other strains), and the
# `data_plot` frame used by the histogram cells is left untouched.
fig, ax = plt.subplots()

for strain in exp['strain'].unique():
  strain_mask = exp['strain'] == strain
  ax.plot(
    exp.loc[strain_mask, 'surface_coverage'].astype(float),
    exp.loc[strain_mask, 'area_hist_std'].astype(float),
    marker='o', linestyle='none', label=strain,
  )

ax.legend()
ax.set_xlabel('Surface coverage')
# NOTE(review): 'area_hist_std' is the std of the histogram densities — confirm
# that this axis label describes the intended quantity.
ax.set_ylabel('Area Standard deviation')
plt.show()

In [None]:
# Box plot with the individual experiments overlaid, one figure per quantity:
# first the surface coverage, then std(voronoi)
for value_column in ('surface_coverage', 'area_hist_std'):
  sns.boxplot(exp, x=value_column, y='strain', hue='strain', palette="hls")
  sns.stripplot(exp, x=value_column, y='strain', size=4, color=".3")
  plt.show()

## per surface coverage

In [None]:
# Surface coverage categories: (0, 25] -> Low, (25, 60] -> Medium, (60, 100] -> High
bins = [0, 25, 60, 100]
bins_label = ['Low', 'Medium', 'High']

# Sorted copy of the experiment table (sort_values returns a new frame),
# extended with:
# - the left bin edges of both histograms, repeated on every row
# - the surface coverage category
# - the combined '<strain>_<category>' label
data2plot = (
  exp
  .sort_values(by=['strain', 'surface_coverage'], axis=0, ascending=True)
  .assign(
    bin_edges=lambda frame: [bin_edges[:-1] for _ in range(len(frame))],
    bin_edges_norm=lambda frame: [bin_edges_norm[:-1] for _ in range(len(frame))],
    surface_coverage_bin=lambda frame: pd.cut(frame['surface_coverage'], bins=bins, labels=bins_label),
    strain_surface_coverage=lambda frame: frame['strain'] + '_' + frame['surface_coverage_bin'].astype(str),
  )
)

data2plot

1. Plot all replicates in gray + mean
- area
- area_norm

one figure per strain, per surface coverage

=> can be a FacetGrid (x=bin, y=counts)
X = surface coverage category
Y = strain

In [None]:
# Custom function to draw each histogram and an overlay of the mean curve
def plot_records_and_mean(x, y, *, xlim=(0, 40), **kwargs):
  """
  Plots individual records in gray and overlays their mean curve in red.

  Draws on the current matplotlib axis (``plt.gca()``). When there is a
  single record, it is drawn directly as the (thicker) red curve.

  Parameters
  ----------
  x : list or pandas Series
      One sequence of x-coordinates per record.
  y : list or pandas Series
      One sequence of y-coordinates per record, in the same order as ``x``.
  xlim : tuple or None, optional
      Limits applied to the x-axis; ``None`` leaves the axis untouched.
      Defaults to ``(0, 40)``.
  **kwargs : dict
      Additional keyword arguments passed by seaborn's FacetGrid. They are
      accepted but not used.

  Returns
  -------
  None
      The function modifies the current matplotlib axis to add the plots.
  """

  ax = plt.gca()

  if len(x)==1: # CASE 1: ONLY 1 DATA
    # list() accepts both a plain list and a pandas Series
    ax.plot(list(x)[0], list(y)[0], color='red', linewidth=2, label="Mean Curve")

  else: #CASE 2: MORE THAN 1 DATA
    # y-values of all records, grouped by x-coordinate
    y_by_x = {}

    for x_vals, y_vals in zip(x, y):
      # Plot individual records
      ax.plot(x_vals, y_vals, linestyle='-', color='gray', alpha=0.5)

      # Within one record only the first occurrence of an x-coordinate
      # contributes to the mean
      seen = set()
      for pt, val in zip(x_vals, y_vals):
        if pt not in seen:
          seen.add(pt)
          y_by_x.setdefault(pt, []).append(val)

    # Calculate mean y-values for each unique x
    unique_x = sorted(y_by_x)
    mean_y = [np.mean(y_by_x[pt]) for pt in unique_x]

    # Plot the mean curve in red
    ax.plot(unique_x, mean_y, color='red', linewidth=1, label="Mean Curve")

  # set x-axis limits (same for single- and multi-record axes)
  if xlim is not None:
    ax.set_xlim(xlim)

In [None]:
# One facet per '<strain>_<surface coverage category>' label, three facets per
# row, with both axes shared between facets
facet_options = dict(col='strain_surface_coverage', col_wrap=3, sharex=True, sharey=True)
g = sns.FacetGrid(data2plot, **facet_options)

# Draw the replicate curves and their mean in every facet
g.map(plot_records_and_mean, 'bin_edges', 'area_hist')

2. Mean histogram with strains combined
- area
- area_norm

one figure per surface coverage

3. boxplot

- y = STD(area)
- x = strain with 3 sub box (one for each surface coverage)

In [None]:
# Box plot of 'area_hist_std' for every '<strain>_<surface coverage category>'
# label, with the individual experiments overlaid as points
box_mapping = dict(x='strain_surface_coverage', y='area_hist_std')
sns.boxplot(data=data2plot, **box_mapping)
sns.stripplot(data=data2plot, size=4, color=".3", **box_mapping)