# gini_calculator.ipynb

Pulls in some histograms - one trustworthy set from Fiji, one deeply cursed set from CellProfiler.

Approximates the Gini coefficient for each set of histograms. Looks for a difference.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Load the 'D02_histo' sheet of the Fiji workbook into a DataFrame.
fiji_df = pd.read_excel('input_folder/gini_fiji.xlsx', sheet_name='D02_histo')

In [None]:
# Discard every row that has a missing value in any column.
fiji_df = fiji_df.dropna()

In [None]:
# Overlay the normalised cumulative sum of every histogram column
# (labels 1..29) against the shared 'bin' axis.
fig, ax = plt.subplots()
bin_axis = fiji_df['bin']
for col in range(1, 30):
    counts = fiji_df[col]
    cumulative_fraction = counts.cumsum() / counts.sum()
    ax.plot(bin_axis, cumulative_fraction, label=col)
# Only the low end of the bin axis is shown.
ax.set_xlim(0, 10)

In [None]:
# Display the histogram column labelled 1 (the first series plotted above).
fiji_df[1]

In [None]:
# Display the 'bin' column, which the plot above uses as its x axis.
fiji_df['bin']

In [None]:
import pandas as pd
import numpy as np

def calculate_gini(df):
    """Approximate the Gini coefficient of a binned (histogram) distribution.

    Each row of *df* is one histogram bin: ``bin_number`` is the value
    ("wealth") assigned to every member of that bin and ``frequency`` is the
    number of members ("population") in it.  The Lorenz curve is built from
    the bins in ascending ``bin_number`` order, starting at (0, 0), and the
    area under it is integrated with the trapezoidal rule.

    Args:
        df: DataFrame with numeric ``bin_number`` and ``frequency`` columns.
            Any other columns are ignored, and *df* itself is not modified.

    Returns:
        ``1 - 2 * (area under the Lorenz curve)``.  ``nan`` is returned when
        the total frequency or the total ``bin_number * frequency`` is zero
        (this includes an empty *df*), because the Lorenz curve cannot be
        normalised in that case.
    """
    ordered = df.sort_values('bin_number')
    values = ordered['bin_number'].to_numpy(dtype=float)
    counts = ordered['frequency'].to_numpy(dtype=float)

    total_population = counts.sum()
    total_wealth = np.dot(values, counts)
    # Without this guard an empty histogram would report a Gini of 1.0,
    # because the curve would consist of the single point (0, 0).
    if total_population == 0 or total_wealth == 0:
        return np.float64(np.nan)

    # Lorenz curve: cumulative population share (x) against cumulative wealth
    # share (y), with the origin prepended as the starting point.
    population_share = np.concatenate(([0.0], np.cumsum(counts) / total_population))
    wealth_share = np.concatenate(([0.0], np.cumsum(values * counts) / total_wealth))

    # Trapezoidal rule written out explicitly, so the result does not depend
    # on np.trapz (deprecated in NumPy 2.0) or np.trapezoid (absent before it).
    segment_widths = np.diff(population_share)
    segment_heights = (wealth_share[1:] + wealth_share[:-1]) / 2.0
    area_under_lorenz = np.sum(segment_widths * segment_heights)

    return 1 - 2 * area_under_lorenz

# Toy five-bin histogram used to exercise calculate_gini.
data = dict(
    bin_number=[1, 2, 3, 4, 5],       # example bin numbers
    frequency=[10, 20, 30, 25, 15],   # example frequencies
)
df = pd.DataFrame(data)

# Compute and report the coefficient for the toy histogram.
gini_coefficient = calculate_gini(df)
print("Gini Coefficient:", gini_coefficient)

In [None]:
# calculate_gini reads the 'frequency' and 'bin_number' columns, so expose
# histogram column 1 and the 'bin' column under those names.
fiji_df['frequency'] = fiji_df[1]
fiji_df['bin_number'] = fiji_df['bin']

In [None]:
# Gini coefficient of each Fiji histogram column (labels 1..29).
ginis = []
for col in range(1, 30):
    # calculate_gini reads the 'frequency' column, so point it at this series.
    fiji_df['frequency'] = fiji_df[col]
    # Compute once and reuse the value for both the printout and the list.
    gini = calculate_gini(fiji_df)
    print("Gini Coefficient:", gini)
    ginis.append(gini)

In [None]:
# Scatter each Gini coefficient against its position in the ginis list.
plt.scatter(range(len(ginis)), ginis)

In [None]:
# Load the per-row histogram measurements from CSV
# (presumably the CellProfiler export mentioned in the notebook header - confirm).
expand_nuclei_df = pd.read_csv('input_folder/gini_expand_nuclei.csv')

In [None]:
# Display the loaded table for inspection.
expand_nuclei_df

In [None]:
# Bin labels 1..50 are paired, in order, with the 50 histogram columns whose
# names carry the suffixes 0..49.
bins = list(range(1, 51))
hist_columns = [f'Histogram_50BinsHistBin{col}_MIRO160mer' for col in range(50)]
for _, row in expand_nuclei_df.iterrows():
    # One row holds one complete 50-bin histogram.
    vals = [row[name] for name in hist_columns]
    hist_df = pd.DataFrame({'bin_number': bins, 'frequency': vals})
    print(calculate_gini(hist_df))

In [None]:
# Keep only the rows whose 'FileName_Hoechst' value starts with this image filename.
filtered = expand_nuclei_df[expand_nuclei_df['FileName_Hoechst'].str.startswith('Plate000_WellE02_Channel405,561,488,640_Seq0003-MaxIP_XY1_405.tif')]

In [None]:
# Display the filtered rows for inspection.
filtered

In [None]:
# Collect the 50-bin histogram and its Gini coefficient for every filtered row.
ginis = []
hists = []
# Bin labels 1..50 are paired, in order, with the columns suffixed 0..49.
bins = list(range(1, 51))
hist_columns = [f'Histogram_50BinsHistBin{col}_MIRO160mer' for col in range(50)]
for _, row in filtered.iterrows():
    vals = [row[name] for name in hist_columns]
    hist_df = pd.DataFrame({'bin_number': bins, 'frequency': vals})
    hists.append(hist_df)
    ginis.append(calculate_gini(hist_df))

In [None]:
# Scatter each Gini coefficient against its position in the ginis list.
plt.scatter(range(len(ginis)), ginis)

In [None]:
import seaborn as sns

# distplot is deprecated (slated for removal in seaborn 0.14); a
# density-normalised histplot with a KDE overlay is its documented replacement.
sns.histplot(ginis, kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
import seaborn as sns

# distplot is deprecated (slated for removal in seaborn 0.14); a
# density-normalised histplot with a KDE overlay is its documented replacement.
sns.histplot(ginis, kde=True, stat="density", kde_kws=dict(cut=3))