<a href="https://colab.research.google.com/github/taryaksama/data-science/blob/master/task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A. Setup

In [None]:
#import packages
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Clone the GitHub repository, then move the working directory into the checkout
# so that the relative paths used by the following cells resolve inside it.
# NOTE(review): re-running this cell in a live kernel repeats both the clone and the
# `%cd` from inside the checkout — presumably meant to run once per fresh session; confirm.
!git clone https://github.com/taryaksama/data-science/
%cd data-science

# B. Experimental plan

Provide a DataFrame describing the experimental plan:
- n_exp
- date
- strain
- replica
- file_address

In [None]:
path = '.' #address of working directory

# Create a DataFrame with one row per experiment folder found in `path`.
# A folder is treated as an experiment when its name starts with six digits
# (the first six characters are sliced as the date further down).
# os.listdir returns entries in arbitrary order, so the names are sorted to make
# the row order reproducible from one run (and one machine) to the next.
folder_list = sorted(
    f for f in os.listdir(path)
    if os.path.isdir(os.path.join(path, f)) and len(f) >= 6 and f[:6].isdigit()
)
exp = pd.DataFrame(folder_list, columns=['folderpath'])

In [None]:
# Folder names carry the metadata positionally:
# characters 0-5 are read as the date, characters 7-9 as the strain.
exp_date = [folder_name[:6] for folder_name in exp.folderpath]
exp_strain = [folder_name[7:10] for folder_name in exp.folderpath]

exp['date'], exp['strain'] = exp_date, exp_strain

# 1. Histogram Area

## Associated function

In [None]:
# Bin edges shared by every experiment so that histograms are directly comparable:
# integer edges 0..99 for the 'area' column, eleven evenly spaced edges over [0, 1]
# for the 'area_norm' column.
bin_edges = list(range(100))
bin_edges_norm = list(np.linspace(0,1,11))

def get_area_hist(path, filename, n_strain=0, n_replica=0):
  """Compute density histograms of the 'area' and 'area_norm' columns of one CSV file.

  The file is read with pandas and its columns are renamed, in order, to
  'cellid', 'area', 'area_mean', 'area_norm' (so it must have exactly four columns).

  Parameters
  ----------
  path : str
      Directory containing the file (with or without a trailing separator).
  filename : str
      Name of the CSV file inside `path`.
  n_strain, n_replica : int, optional
      Currently unused; kept so existing calls keep working.

  Returns
  -------
  counts : numpy.ndarray
      Density histogram of 'area' over `bin_edges`.
  counts_norm : numpy.ndarray
      Density histogram of 'area_norm' over `bin_edges_norm`.
  counts_std : float
      Standard deviation of the values in `counts`.
  counts_norm_std : float
      Standard deviation of the values in `counts_norm`.

  Raises
  ------
  ValueError
      Raised by pandas if the file does not have exactly four columns.
  """
  # os.path.join tolerates a missing trailing separator on `path`.
  df = pd.read_csv(os.path.join(path, filename))
  df.columns = ['cellid', 'area', 'area_mean', 'area_norm']

  # np.histogram yields the same numbers plt.hist would return, without drawing on
  # (and then closing) whatever matplotlib figure happens to be current.
  counts, _ = np.histogram(df['area'], bins=bin_edges, density=True)
  counts_norm, _ = np.histogram(df['area_norm'], bins=bin_edges_norm, density=True)

  # NOTE(review): these are the std of the histogram bin values, not of the areas
  # themselves — confirm this is the intended spread metric.
  return counts, counts_norm, np.std(counts), np.std(counts_norm)

## Loop for all experiments

In [None]:
# Get the area histograms (raw and normalised) and their std for all experiments.
# Results are gathered in plain lists and assigned as whole columns afterwards, so:
#  - the two std columns end up as float64 instead of object dtype,
#  - nothing depends on `exp` having a default 0..n-1 index,
#  - the notebook-level `path` variable is left untouched.
area_hists = []
area_hists_norm = []
area_hist_stds = []
area_hist_norm_stds = []

for folder in exp['folderpath']:
  # Trailing '/' kept: the histogram helper may build the file path by concatenation.
  tessellation_dir = './' + folder + '/tessellation/'
  hist, hist_norm, hist_std, hist_norm_std = get_area_hist(
      tessellation_dir, 'frame_0_voronoi_areas_microns.csv')

  area_hists.append(hist)
  area_hists_norm.append(hist_norm)
  area_hist_stds.append(hist_std)
  area_hist_norm_stds.append(hist_norm_std)

# Array-valued columns must stay object dtype (one array per cell).
exp['area_hist'] = pd.Series(area_hists, index=exp.index, dtype=object)
exp['area_hist_norm'] = pd.Series(area_hists_norm, index=exp.index, dtype=object)
exp['area_hist_std'] = pd.Series(area_hist_stds, index=exp.index, dtype=float)
exp['area_hist_norm_std'] = pd.Series(area_hist_norm_stds, index=exp.index, dtype=float)

# 2. Surface Coverage

In [None]:
# Get the surface coverage of every experiment.
# Each experiment folder holds a CSV whose second column carries the coverage value;
# the file is expected to contain exactly one data row.
surface_coverages = []

for folder in exp['folderpath']:
  coverage_file = './' + folder + '/surface_coverage/' + 'surface_coverage_and_density.csv'
  coverage_column = pd.read_csv(coverage_file).iloc[:, 1]

  # float(Series) is deprecated in pandas; check the single-row assumption explicitly
  # and extract the scalar by position instead.
  if len(coverage_column) != 1:
    raise ValueError(
        'expected exactly one row in ' + coverage_file + ', found ' + str(len(coverage_column)))
  surface_coverages.append(float(coverage_column.iloc[0]))

# Assign once as a float column so later numeric operations get a numeric dtype
# rather than an object column.
exp['surface_coverage'] = pd.Series(surface_coverages, index=exp.index, dtype=float)

# C. Plots

## Area histograms

In [None]:
# Curves to draw, one row per experiment:
# 'x' holds the left bin edges, 'y' the histogram values of the chosen column.
left_edges = bin_edges[:-1]
data_plot = pd.DataFrame([], columns=['x', 'y'])
data_plot['x'] = [list(left_edges) for _ in exp.index]
data_plot['y'] = exp['area_hist'] # change depending on column

In [None]:
# Plot histogram: one line per experiment, all on the same axes

for xs, ys in zip(data_plot['x'], data_plot['y']):
  plt.plot(xs, ys)

# Legend entries follow the row order of `exp`: "<strain>_<date>"
curve_labels = exp['strain']+'_'+exp['date']
plt.legend(curve_labels)
plt.show()

In [None]:
# FacetGrid representation

# FacetGrid needs long-format data: one record per (x, y) point,
# tagged with the row of data_plot it came from.
points_per_row = data_plot['x'].str.len()
flat_x = [value for curve in data_plot['x'] for value in curve]
flat_y = [value for curve in data_plot['y'] for value in curve]
expanded_data_plot = pd.DataFrame({
    'row_id': data_plot.index.repeat(points_per_row),
    'x': flat_x,
    'y': flat_y,
})

# One thin, wide facet per source row
pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
g = sns.FacetGrid(expanded_data_plot, row='row_id', aspect=15, height=.5, palette=pal)

# Draw each curve as a line, then fill the area underneath it
g.map(plt.plot, 'x', 'y')
g.map(plt.fill_between, 'x', 'y', alpha=1)

# Negative spacing makes consecutive facets overlap
g.figure.subplots_adjust(hspace=-.25)

# Strip the axes decorations that would clutter the overlapping facets
g.set_titles("")
g.set(yticks=[], ylabel="")
g.despine(bottom=True, left=True)

## STD vs. surface coverage

In [None]:
## for each strain
# Scatter the histogram std against the surface coverage, one marker series per strain.
# Both axes are restricted to the rows of the current strain, and nothing is written
# to `data_plot`, so the histogram cells above can still be re-run afterwards.
strains = exp['strain'].unique()

for strain in strains:
  of_strain = exp['strain'] == strain
  plt.plot(exp.loc[of_strain, 'surface_coverage'].astype(float).to_numpy(),
           exp.loc[of_strain, 'area_hist_std'].astype(float).to_numpy(),
           marker='o', linestyle='none', label=strain)

# One series is drawn per strain, in this order, so the labels line up with the artists.
plt.legend(strains)
plt.xlabel('Surface coverage')
plt.ylabel('Area Standard deviation')
plt.show()

In [None]:
# One figure per metric, grouped by strain: a box plot with the individual
# experiments overlaid as points.
# First the surface coverage, then std(voronoi).
for metric in ('surface_coverage', 'area_hist_std'):
  sns.boxplot(exp, x=metric, y='strain', hue='strain', palette="hls")
  sns.stripplot(exp, x=metric, y='strain', size=4, color=".3")
  plt.show()

## per surface coverage

1. plot all replicates in gray + mean
- area
- area_norm

one figure per strain, per surface coverage

2. mean histogram with strains combined
- area
- area_norm

one figure per surface coverage

3. boxplot

- y = STD(area)
- x = strain with 3 sub-boxes (one for each surface coverage)

In [None]:
# Bin the surface coverage into categories.
# The new columns are written to `exp` itself, so they remain available on `exp`
# to any other cell that reads them.
bins = [0, 25, 60, 100]
bins_label = ['Low', 'Medium', 'High']
exp['surface_coverage_bin'] = pd.cut(
    pd.to_numeric(exp['surface_coverage']),  # make sure pd.cut receives a numeric dtype
    bins=bins,
    labels=bins_label,
    include_lowest=True,  # a coverage of exactly 0 belongs to 'Low' instead of being dropped
)

# Create a column for the combined category "<strain>_<coverage bin>"
exp['strain_surface_coverage'] = exp['strain'] + '_' + exp['surface_coverage_bin'].astype(str)

# Sorted copy used for plotting; the category order is fixed explicitly so that both
# layers place every category at the same position on the x axis.
data2plot = exp.sort_values(by='strain_surface_coverage', axis=0, ascending=True)
category_order = list(data2plot['strain_surface_coverage'].unique())

# Create the boxplot, with the individual experiments overlaid as points.
# Both layers read the same frame; drawing the points from the unsorted `exp`
# could put them on a different category than their box.
sns.boxplot(data=data2plot, x='strain_surface_coverage', y='area_hist_std', order=category_order)
sns.stripplot(data=data2plot, x='strain_surface_coverage', y='area_hist_std', order=category_order, size=4, color=".3")
plt.show()