# Seaborn

In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Plot aesthetics; see http://seaborn.pydata.org/tutorial/aesthetics.html
sns.set(rc={
    'figure.dpi': 100,
    'savefig.dpi': 100,
})
sns.set_context('notebook')  # 'talk' is an alternative context
sns.set_style('ticks')

In [None]:
# To list the available matplotlib styles (requires `import matplotlib.pyplot as plt` first):
# sorted(plt.style.available)

In [None]:
# Load the pre-filtered table; the file is tab-separated
# (and, judging by its .gz extension, gzip-compressed).
df = pd.read_csv(filepath_or_buffer='data/igc_prefiltered.tsv.gz', sep='\t')

In [None]:
# Preview the first rows of the freshly loaded table.
df.head(n=5)

In [None]:
# Preview the shortened column names; `df` itself is not modified here.
(
    df.columns
    .str.replace('occurence_frequency', 'freq')
    .str.replace('taxonomic_annotation_', '')
    .str.replace('functional_categories', 'cat')
    .str.replace('_annotation', '')
)

¿Por qué en la celda anterior, a pesar de utilizar `str.replace()` sin especificar la opción `regex=False` o `regex=True`, no obtuve ningún **_warning_**?

In [None]:
# Apply the shortened names to the DataFrame. The replacements run in this
# exact order because '_annotation' is a substring of 'taxonomic_annotation_'.
short_names = df.columns
for old_text, new_text in (
    ('occurence_frequency', 'freq'),
    ('taxonomic_annotation_', ''),
    ('functional_categories', 'cat'),
    ('_annotation', ''),
):
    short_names = short_names.str.replace(old_text, new_text)
df.columns = short_names

In [None]:
# Display the current column labels of the DataFrame.
df.columns

In [None]:
# Preview the first rows again to check the column labels.
df.head(n=5)

### Agreguemos algo de ruido

In [None]:
import numpy as np

# Seed the generator so the synthetic columns (and every plot derived from
# them) are identical on each Restart & Run All.
rng = np.random.default_rng(seed=42)
n_rows = len(df)

# Three synthetic columns, one value per row of `df`.
df['freq01'] = rng.normal(90, 0.2, n_rows)             # normal: loc=90, scale=0.2
df['freq02'] = rng.normal(100, 0.6, n_rows)            # normal: loc=100, scale=0.6
df['freq03'] = rng.negative_binomial(1, 0.1, n_rows)   # negative binomial: n=1, p=0.1

## Gráficas de distribución

<img src="./imgs/01.png" align="center"/>

In [None]:
# Preview the first rows of the table before plotting distributions.
df.head(n=5)

In [None]:
# Display the dtype of every column.
df.dtypes

### Violin

In [None]:
# Violin plot of the gene_length distribution.
sns.violinplot(data=df, y='gene_length');

In [None]:
# Violin plot of the sample_freq distribution.
sns.violinplot(data=df, y='sample_freq');

In [None]:
# Violin plot of the freq01 column.
sns.violinplot(data=df, y='freq01');

In [None]:
# Violin plot of the freq02 column.
sns.violinplot(data=df, y='freq02');

In [None]:
# Violin plot of the freq03 column.
sns.violinplot(data=df, y='freq03');

In [None]:
# One gene_length violin per cohort_origin value.
sns.violinplot(data=df, x='cohort_origin', y='gene_length');

In [None]:
# One freq01 violin per cohort_origin value.
sns.violinplot(data=df, x='cohort_origin', y='freq01');

In [None]:
# freq01 violin per cohort, coloured by cohort as well (hue repeats x, and
# dodge=False is passed so the hue levels are not shifted along the x axis).
g = sns.violinplot(data=df, x='cohort_origin', y='freq01',
                   hue='cohort_origin', palette='Set2', dodge=False)

g.set_xlabel('Cohort Origin')
g.set_ylabel('Frequency')
g.set_title('Frequency per cohort origin', loc='left', fontsize=16)
# Anchor the legend's upper-left corner at the axes' top-right corner,
# i.e. draw it outside the plotting area.
g.legend(bbox_to_anchor=(1, 1), loc='upper left', frameon=False);

# To export the figure, save through the Axes' parent Figure
# (an Axes object has no savefig method):
# g.figure.savefig("violinplot.svg")

### Density

In [None]:
# Kernel density estimate of gene_length.
sns.kdeplot(data=df, x='gene_length');

In [None]:
# Kernel density estimate of freq01.
sns.kdeplot(data=df, x='freq01');

In [None]:
# Kernel density estimate of freq02.
sns.kdeplot(data=df, x='freq02');

In [None]:
# Kernel density estimate of freq03.
sns.kdeplot(data=df, x='freq03');

In [None]:
# Overlay the filled density estimates of freq01 (red) and freq02 (blue) on
# the same axes.
# The column is passed as `x=` like the other kdeplot cells: in seaborn >= 0.12
# the first positional parameter of kdeplot is `data`, so a positional column
# name would clash with `data=df`. `fill=True` replaces the deprecated
# `shade=True`.
g = sns.kdeplot(x='freq01', data=df, fill=True, color="r")
g = sns.kdeplot(x='freq02', data=df, fill=True, color="b")

Estamos tratando de estimar la densidad de una función f que describa la aleatoriedad de los datos.

### Histograms

In [None]:
# Histogram of gene_length with seaborn's default binning.
sns.histplot(data=df, x='gene_length');

In [None]:
# Histogram of gene_length using 50 bins.
sns.histplot(data=df, x='gene_length', bins=50);

In [None]:
# Histogram of gene_length using 20 bins.
sns.histplot(data=df, x='gene_length', bins=20);

In [None]:
# Histogram of freq01 using 50 bins.
sns.histplot(data=df, x='freq01', bins=50);

In [None]:
# Histogram of freq03 using 50 bins.
sns.histplot(data=df, x='freq03', bins=50);

In [None]:
# Draw the histograms of freq01 (red) and freq02 (blue) on the same axes.
for column, colour in (('freq01', "r"), ('freq02', "b")):
    g = sns.histplot(data=df, x=column, color=colour)

<img src="./imgs/02.png" align="center"/>

### Líneas

In [None]:
# Keep only the numeric columns of the table.
df.select_dtypes(include='number')

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Line plot of gene_length in the order the rows appear in the table.
gene_lengths = df['gene_length']
plt.plot(gene_lengths);

In [None]:
# Line plot of the distinct gene_length values (unsorted).
distinct_lengths = df['gene_length'].unique()
plt.plot(distinct_lengths);

In [None]:
# Line plot of the distinct gene_length values in ascending order.
distinct_lengths = df['gene_length'].unique()
plt.plot(sorted(distinct_lengths));

In [None]:
# Line plot of 20 randomly drawn gene_length values, one marker per value.
# random_state pins the draw so the figure is the same on every run.
plt.plot(df['gene_length'].sample(20, random_state=42), marker='o');

In [None]:
# Line plot of 20 randomly drawn gene_length values, sorted in ascending order.
# random_state pins the draw so the figure is the same on every run.
plt.plot(sorted(df['gene_length'].sample(20, random_state=42)), marker='o');

In [None]:
# Line plot of the distinct freq01 values in ascending order.
distinct_freq01 = df['freq01'].unique()
plt.plot(sorted(distinct_freq01));

### Scatterplot

In [None]:
# Recall the available columns before building the scatter plots.
df.head(n=5)

In [None]:
# Scatter plot of sample_freq against gene_length for every row.
sns.scatterplot(data=df, x='gene_length', y='sample_freq');

In [None]:
# Same scatter plot, with points coloured by cohort_origin.
sns.scatterplot(data=df, x='gene_length', y='sample_freq',
                hue='cohort_origin');

In [None]:
# Coloured scatter plot restricted to rows with gene_length >= 7500.
genes_min_7500 = df[df['gene_length'] >= 7500]
sns.scatterplot(data=genes_min_7500, x='gene_length', y='sample_freq',
                hue='cohort_origin');

In [None]:
# individual_freq against sample_freq for the first 100 rows only.
first_100_rows = df.head(100)
sns.scatterplot(data=first_100_rows, x='sample_freq', y='individual_freq',
                hue='cohort_origin');

In [None]:
# individual_freq against sample_freq for the whole table.
sns.scatterplot(data=df, x='sample_freq', y='individual_freq',
                hue='cohort_origin');

In [None]:
# First 50 rows with gene_length >= 15000, coloured by cohort, legend hidden.
very_long_genes = df[df['gene_length'] >= 15000].head(50)
sns.scatterplot(data=very_long_genes, x='sample_freq', y='individual_freq',
                hue='cohort_origin', legend=False);

### Heatmap

In [None]:
# Recall the available columns before building the heatmap.
df.head(n=5)

In [None]:
# Preview the three columns that feed the pivot table in this section.
df.loc[:, ['phylum_level', 'kegg_cat', 'sample_freq']].head(n=5)

In [None]:
# Pivot the first 20 rows into a kegg_cat x phylum_level grid of sample_freq;
# combinations with no data are filled with 0.
(
    df.head(20)
      .pivot_table(values="sample_freq",
                   index='kegg_cat',
                   columns='phylum_level',
                   fill_value=0)
)

In [None]:
# Same pivot over the first 100 rows, stored in `data` for the heatmap cells.
first_100_rows = df.head(100)
data = first_100_rows.pivot_table(
    values='sample_freq',
    index='kegg_cat',
    columns='phylum_level',
    fill_value=0,
)

In [None]:
# Heatmap of the table stored in `data`, using the default colour map.
sns.heatmap(data=data);

In [None]:
# Heatmap of the table stored in `data`, using the 'Blues' colour map.
sns.heatmap(data=data, cmap='Blues');

### Barplots

In [None]:
# Recall the available columns before building the bar plots.
df.head(n=5)

In [None]:
# Bar plot of sample_freq per cohort_origin value.
# The trailing semicolon suppresses the Axes repr, matching the other plotting cells.
sns.barplot(x='cohort_origin', y='sample_freq', data=df);

In [None]:
# Bar plot of gene_length per phylum_level, using the first 50 rows only.
first_50_rows = df.head(50)
sns.barplot(data=first_50_rows, x='phylum_level', y='gene_length');

### Categorical data

In [None]:
# genus_level against phylum_level for the first 50 rows: one panel per
# cohort_origin (two panels per row), coloured by cohort_origin as well.
sns.catplot(data=df.head(50),
            x='phylum_level', y='genus_level',
            hue='cohort_origin',
            col='cohort_origin', col_wrap=2);

In [None]:
# gene_length per cohort for the first 5000 rows: one panel per phylum_level
# (four panels per row), coloured by cohort_origin.
sns.catplot(data=df.head(5000),
            x='cohort_origin', y='gene_length',
            hue='cohort_origin',
            col='phylum_level', col_wrap=4);

### Boxplot

In [None]:
# Box plot of freq01 per cohort_origin value.
sns.boxplot(data=df, x='cohort_origin', y='freq01');

In [None]:
# Box plot of gene_length per cohort_origin value.
sns.boxplot(data=df, x='cohort_origin', y='gene_length');

In [None]:
# Box plot of gene_length per phylum_level value.
g = sns.boxplot(data=df, x='phylum_level', y='gene_length')

# Tilt the tick labels by 45 degrees, right-aligned.
# If rotation_mode is "anchor", then alignment occurs before rotation.
tick_labels = g.get_xticklabels()
g.set_xticklabels(tick_labels, rotation=45, ha='right',
                  rotation_mode="anchor");

### Swarmplot

In [None]:
# Swarm plot of gene_length per cohort for rows with gene_length >= 7500,
# coloured by phylum_level.
genes_min_7500 = df[df['gene_length'] >= 7500]
g = sns.swarmplot(data=genes_min_7500,
                  x='cohort_origin', y='gene_length',
                  hue='phylum_level', size=3.5)

# Anchor the legend's upper-left corner at the axes' top-right corner.
g.legend(bbox_to_anchor=(1, 1), loc='upper left', frameon=False);

In [None]:
# Keep rows with 5000 <= gene_length <= 7000 (both bounds inclusive).
data = df[df['gene_length'].between(5000, 7000)]

# Swarm plot of gene_length per cohort, coloured by phylum_level.
g = sns.swarmplot(data=data,
                  x='cohort_origin', y='gene_length',
                  hue='phylum_level', size=3.5)

# Anchor the legend's upper-left corner at the axes' top-right corner.
g.legend(bbox_to_anchor=(1, 1), loc='upper left', frameon=False);

In [None]:
# Keep rows with 5000 <= gene_length <= 7000 (both bounds inclusive).
data = df[df['gene_length'].between(5000, 7000)]

# Swarm plot of gene_length per phylum_level, coloured by cohort_origin.
g = sns.swarmplot(data=data,
                  x='phylum_level', y='gene_length',
                  hue='cohort_origin', size=2.8)

# Tilt the tick labels by 45 degrees, right-aligned.
tick_labels = g.get_xticklabels()
g.set_xticklabels(tick_labels, rotation=45, ha='right',
                  rotation_mode='anchor')
# Anchor the legend's upper-left corner at the axes' top-right corner.
g.legend(bbox_to_anchor=(1, 1), loc='upper left', frameon=False);

Las cosas se pueden complicar un poco, pero al final se pueden obtener gráficos bastante informativos.

In [None]:
# Keep rows with 5000 <= gene_length <= 7000 (both bounds inclusive).
data = df[df['gene_length'].between(5000, 7000)]

# Individual rows as points, coloured by cohort_origin ...
sns.swarmplot(data=data,
              x='phylum_level', y='gene_length',
              hue='cohort_origin', size=2.8)

# ... plus a box plot of the same data, drawn with facecolor 'None' for the boxes.
g = sns.boxplot(data=data, x='phylum_level', y='gene_length',
                boxprops={'facecolor': 'None'})

# Tilt the tick labels by 45 degrees, right-aligned.
tick_labels = g.get_xticklabels()
g.set_xticklabels(tick_labels, rotation=45, ha='right',
                  rotation_mode='anchor')
# Anchor the legend's upper-left corner at the axes' top-right corner.
g.legend(bbox_to_anchor=(1, 1), loc='upper left', frameon=False);