# Transpose and merge miRNA data
Run notebook to transpose and merge miRNA data in the files:
* `Sg_8_weeks.xlsx`
* `Sg_Sham_8_weeks.xlsx`
* `Sg_16_weeks.xlsx`
* `Sg_Sham_16_weeks.xlsx`

Outputs the `transposed_Sg_miRNA.xlsx` file.

In [29]:
import pandas as pd
import numpy as np

In [30]:
def transpose_df(file_name, cohort=None):
    def create_cohort_key(cohorts, mouse_numbers):
        cohort_keys = [
            c + '_' + n
            for c,n in zip(cohorts, mouse_numbers)
        ]
        return cohort_keys
        
    df = pd.read_excel(file_name) # read data

    # merge miR and accession values into name col
    # and add as first col in datafame
    names = df.miR.str.cat(' (' + df.accession + ')')
    df.insert(0, 'miR_name', names)

    # now drop miR and accession cols
    df = df.drop(columns=['miR', 'accession'])
    
    # flip/transpose df
    tx_df  = df.transpose()

    # set columns to first row
    tx_df = tx_df.rename(columns=tx_df.iloc[0]).iloc[1:] # set colums to first row
    tx_df.index.name = 'Name'
    tx_df = tx_df.reset_index()
    tx_df.Name = tx_df.Name.astype(str)

    # add mouse numbers to each cohort
    mouse_numbers = list(tx_df['Name'].map(lambda x: x.split('-')[-1]))
    tx_df.insert(0, 'mouse_number', mouse_numbers)

    # add sex info: mice 1-5 are male, rest are female
    tx_df['female'] = np.where(tx_df['mouse_number'].astype(int) < 6, 0, 1)
    
    # add name of cohort
    if cohort:
        tx_df.insert(0, 'cohort', cohort)

        # add cohort key
        cohort_keys = create_cohort_key(tx_df.cohort, tx_df.mouse_number)
        tx_df.insert(0, 'cohort_key', cohort_keys)

    return tx_df

In [31]:
file_names = ['../data/Sg_8_weeks.xlsx', '../data/Sg_Sham_8_weeks.xlsx', '../data/Sg_16_weeks.xlsx', '../data/Sg_Sham_16_weeks.xlsx']
cohort_names = ['sg_8_weeks', 'sham_8_weeks', 'sg_16_weeks', 'sham_16_weeks']

In [32]:
list(zip(file_names, cohort_names))

[('../data/Sg_8_weeks.xlsx', 'sg_8_weeks'),
 ('../data/Sg_Sham_8_weeks.xlsx', 'sham_8_weeks'),
 ('../data/Sg_16_weeks.xlsx', 'sg_16_weeks'),
 ('../data/Sg_Sham_16_weeks.xlsx', 'sham_16_weeks')]

In [24]:
# # testing
# for (file_name, cohort_name) in zip(file_names, cohort_names):
#     print(file_name, cohort_name)
#     transpose_df(file_name, cohort_name)

../data/Sg_8_weeks.xlsx sg_8_weeks
../data/Sg_Sham_8_weeks.xlsx sham_8_weeks
../data/Sg_16_weeks.xlsx sg_16_weeks
../data/Sg_Sham_16_weeks.xlsx sham_16_weeks


In [25]:
dfs = [transpose_df(file_name, cohort_name) for (file_name, cohort_name) in zip(file_names, cohort_names)]

In [26]:
final_df = pd.concat(dfs)
len(final_df)

40

In [27]:
final_df.head()

Unnamed: 0,cohort_key,cohort,mouse_number,Name,mcmv-miR-M23-1-3p (MIMAT0005543),mcmv-miR-M23-1-5p (MIMAT0005542),mcmv-miR-M23-2 (MIMAT0005545),mcmv-miR-M44-1 (MIMAT0005546),mcmv-miR-M55-1 (MIMAT0005547),mcmv-miR-M87-1 (MIMAT0005550),...,mmu-miR-883b-5p (MIMAT0004850),mmu-miR-9 (MIMAT0000142),mmu-miR-92a (MIMAT0000539),mmu-miR-92b (MIMAT0004899),mmu-miR-93 (MIMAT0000540),mmu-miR-96 (MIMAT0000541),mmu-miR-98 (MIMAT0000545),mmu-miR-99a (MIMAT0000131),mmu-miR-99b (MIMAT0000132),female
0,sg_8_weeks_10_12,sg_8_weeks,10_12,20220829_run10_GVI-8W-10_12,89,80,22,37,61,33,...,67,140,25,25,245,392,790,5559,364,1
1,sg_8_weeks_1_03,sg_8_weeks,1_03,20220829_run10_GVI-8W-1_03,88,60,18,14,52,23,...,42,71,19,25,151,254,765,3564,264,1
2,sg_8_weeks_2_04,sg_8_weeks,2_04,20220829_run10_GVI-8W-2_04,75,82,12,22,63,30,...,62,143,21,18,174,393,659,5139,274,1
3,sg_8_weeks_3_05,sg_8_weeks,3_05,20220829_run10_GVI-8W-3_05,59,67,15,24,42,42,...,68,169,26,20,166,197,630,3887,250,1
4,sg_8_weeks_4_06,sg_8_weeks,4_06,20220829_run10_GVI-8W-4_06,94,83,20,28,83,35,...,102,116,26,20,218,338,829,5606,356,1


In [28]:
final_df.to_excel('../data/Streptococcus_gordonii/transposed_Sg_miRNA.xlsx', index=False, engine='openpyxl')