# Calculate mutual information (MI) for each individual songbird
1. Load the song sequence datasets
2. Calculate MI within each bird

In [None]:
import pandas as pd
import numpy as np
from parallelspaper.config.paths import DATA_DIR
from parallelspaper.birdsong_datasets import MI_seqs, compress_seq
from parallelspaper import information_theory as it 
from tqdm.autonotebook import tqdm
from parallelspaper import model_fitting as mf

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

### Load data

In [None]:
# Read the three pickled song-sequence tables that live under DATA_DIR.
# NOTE: read_pickle unpickles arbitrary objects; only load trusted files.
starling_seq_df, CAVI_CATH_seq_df, BF_seq_df = (
    pd.read_pickle(DATA_DIR / pickle_path)
    for pickle_path in (
        'song_seq_df/starling.pickle',
        'song_seq_df/CAVI_CATH.pickle',
        'song_seq_df/BF.pickle',
    )
)

In [None]:
# Stack the three tables row-wise into a single DataFrame. ignore_index is
# not set, so each table keeps its original index labels.
seq_dfs = pd.concat([starling_seq_df, CAVI_CATH_seq_df, BF_seq_df])

In [None]:
# Prepare the combined table in one chained pass:
#   1. add `sequence_lens`, the number of entries in each row's `syllables`
#   2. cast the recording number to a 32-bit integer
#   3. order rows by species, then bird, then recording number
#   4. drop rows whose bird is '?' (unidentified birds; CAVI, CATH)
seq_dfs = (
    seq_dfs
    .assign(
        sequence_lens=[len(syllable_seq) for syllable_seq in seq_dfs.syllables],
        rec_num=seq_dfs.rec_num.values.astype('int32'),
    )
    .sort_values(by=['species', 'bird', 'rec_num'])
    .loc[lambda df: df.bird != '?']
)

In [None]:
# Display the number of rows in the combined sequence table.
len(seq_dfs)

In [None]:
# Preview the first three rows of the combined sequence table.
seq_dfs[:3]

### Calculate MIs within bird

In [None]:
# Compute MI for every individual bird and collect the results in MI_DF.
#
# For each bird up to five analyses are run, each appending one row to MI_DF:
#   * "day"              - sequences concatenated within each recording day
#   * "compress"         - as "day", with compress_seq applied to each sequence
#   * "shuffled_within"  - as "day", entries permuted inside each sequence
#   * "shuffled_between" - as "day", whole sequences permuted within the day
#   * "song"             - individual sequences longer than the bird's median
# Birds with fewer than 150 syllables in total are skipped altogether.
#
# NOTE: the shuffles use the global NumPy RNG and no seed is set in this cell,
# so the shuffled rows are not reproducible between runs.
verbose = False
MI_DF = pd.DataFrame(columns=['indv', 'species', 'type', 'rep', 'MI', 'MI_shuff', 'distances',
                              'MI_var', 'MI_shuff_var', 'n_elements',
                              'exp_results', 'pow_results', 'concat_results',
                              'R2_exp', 'R2_concat', 'R2_power', 'AICc_exp', 'AICc_concat',
                              'AICc_power', 'bestfitmodel'])

# (type_ passed to MI_seqs, message printed when verbose) for the per-day
# analyses, listed in the order their rows are appended.
day_analyses = (
    ("day", 'within day'),
    ("compress", 'compress'),
    ("shuffled_within", 'shuffled within'),
    ("shuffled_between", 'shuffled between'),
)
# Keyword arguments shared by every MI_seqs call.
mi_kwargs = dict(n_jobs=20, verbosity=0, nrep=1, verbose=False)

for species in np.unique(seq_dfs.species):
    species_df = seq_dfs[seq_dfs.species == species].sort_values(
        by=['bird', 'rec_num'])
    print(species)
    for bird in tqdm(np.unique(species_df.bird), leave=False):
        indv = bird
        # analysis by day: distances 1..100
        distances = np.arange(1, 101)
        bird_df = species_df[species_df.bird == bird].sort_values(
            by=['rec_num'])

        # Build one concatenated stream per recording day for each variant.
        day_groups = {type_: [] for type_, _ in day_analyses}
        for day in np.unique(bird_df.day.values):
            day_seqs = bird_df[bird_df.day == day].syllables.values
            day_groups["day"].append(np.concatenate(day_seqs))
            day_groups["compress"].append(
                np.concatenate([compress_seq(seq) for seq in day_seqs]))
            day_groups["shuffled_within"].append(
                np.concatenate([np.random.permutation(seq) for seq in day_seqs]))
            day_groups["shuffled_between"].append(
                np.concatenate(np.random.permutation(day_seqs)))

        # skip if too little data (total length across all days)
        if sum(len(group) for group in day_groups["day"]) < 150:
            continue

        for type_, message in day_analyses:
            if verbose:
                print(message)
            MI_DF.loc[len(MI_DF)] = MI_seqs(
                day_groups[type_], distances, species,
                type_=type_, indv=indv, **mi_kwargs)

        # within song
        if verbose:
            print('within song')
        seqs = bird_df.syllables.values
        median_seq_len = int(np.median([len(seq) for seq in seqs]))
        # Keep only sequences strictly longer than the median length, so
        # every retained sequence is longer than twice the largest distance.
        seqs = [seq for seq in seqs if len(seq) > median_seq_len]
        # skip if not enough data
        if not seqs:
            continue
        if verbose:
            print(species, median_seq_len)
        # Integer distances from 1 up to half the median sequence length.
        song_distances = np.unique(
            np.linspace(1, median_seq_len / 2, num=10000).astype(int))
        # astype(int) truncates, so a median length below 2 would introduce a
        # distance of 0; drop it so distances start at 1 as in the day analysis.
        song_distances = song_distances[song_distances >= 1]
        MI_DF.loc[len(MI_DF)] = MI_seqs(
            seqs, song_distances, species,
            type_="song", indv=indv, **mi_kwargs)

In [None]:
# Save the per-individual MI results table as a pickle under DATA_DIR.
MI_DF.to_pickle(DATA_DIR / 'MI_DF/birdsong/birdsong_MI_DF_individual.pickle')