In [1]:
import pandas as pd

# DATASET QIITA 550

In [2]:
# Switch the kernel's working directory to the processed-datasets folder.
# Relative paths in the following cells resolve from here until the next %cd.
# NOTE(review): this is an absolute path; it has to be adjusted when the directory layout differs.
%cd /storage/zkarwowska/microbiome-interactions/datasets/processed/

/storage/zkarwowska/microbiome-interactions/datasets/processed


In [3]:
# Feature table: read the class-level taxonomy CSV and transpose it,
# so that the sample names end up as the columns of DF.
# (A previous variant loaded 'raw_data/550_reference.tsv' instead: tab-separated, first row skipped.)
FILE = 'taxonomy/550_class.csv'
DF = pd.read_csv(FILE, index_col = [0]).T

# Metadata: load the table and keep only the rows whose sample_name
# is one of the columns of the feature table.
METADATA = (
    pd.read_csv('qiita_550/raw_data/550_metadata.txt', sep='\t')
    .loc[lambda meta: meta['sample_name'].isin(DF.columns)]
)

In [4]:
# Select the sample names to keep: body_site is 'UBERON:feces', mislabeled is False,
# and sex is 'female' / 'male' respectively. This yields one sample list per sex.
FEMALE_GUT_SAMPLES = METADATA.loc[
    METADATA['body_site'].eq('UBERON:feces')
    & METADATA['mislabeled'].eq(False)
    & METADATA['sex'].eq('female'),
    'sample_name',
]
MALE_GUT_SAMPLES = METADATA.loc[
    METADATA['body_site'].eq('UBERON:feces')
    & METADATA['mislabeled'].eq(False)
    & METADATA['sex'].eq('male'),
    'sample_name',
]

# Restrict the feature table to the selected sample columns.
FEMALE_GUT_DF = DF[FEMALE_GUT_SAMPLES]
MALE_GUT_DF = DF[MALE_GUT_SAMPLES]

In [5]:
# Replace sample names with time steps, separately for each subject.

# Female subject: metadata rows -> {sample_name: days_since_experiment_start} -> relabelled table.
FEMALE_METADATA = METADATA[METADATA['sample_name'].isin(FEMALE_GUT_SAMPLES)]
FEMALE_DICT = {
    sample: day
    for sample, day in zip(
        FEMALE_METADATA['sample_name'],
        FEMALE_METADATA['days_since_experiment_start'],
    )
}
# After the transpose the relabelled samples (time steps) are the rows.
FEMALE_GUT_TIMESTEP = FEMALE_GUT_DF.rename(columns=FEMALE_DICT).transpose()

# Male subject: same steps.
MALE_METADATA = METADATA[METADATA['sample_name'].isin(MALE_GUT_SAMPLES)]
MALE_DICT = {
    sample: day
    for sample, day in zip(
        MALE_METADATA['sample_name'],
        MALE_METADATA['days_since_experiment_start'],
    )
}
MALE_GUT_TIMESTEP = MALE_GUT_DF.rename(columns=MALE_DICT).transpose()

In [6]:
# Order the rows of both tables by their index (the time step).
FEMALE_GUT_TIMESTEP, MALE_GUT_TIMESTEP = (
    timestep_table.sort_index()
    for timestep_table in (FEMALE_GUT_TIMESTEP, MALE_GUT_TIMESTEP)
)

In [7]:
# Keep only the columns that contain at least one non-zero value.
MALE_GUT_TIMESTEP = MALE_GUT_TIMESTEP.loc[
    :, lambda timestep_table: timestep_table.ne(0).any(axis=0)
]
FEMALE_GUT_TIMESTEP = FEMALE_GUT_TIMESTEP.loc[
    :, lambda timestep_table: timestep_table.ne(0).any(axis=0)
]

In [8]:
# Label the row index of both tables as 'timestep' (set in place on the existing index).
for timestep_table in (MALE_GUT_TIMESTEP, FEMALE_GUT_TIMESTEP):
    timestep_table.index.name = 'timestep'

In [9]:
# Write both tables to CSV.
# Each saved table has the time steps as rows and the features as columns.
PATH = 'ready_datasets_assigned_taxonomy/'
for csv_name, timestep_table in (
    ('male_class.csv', MALE_GUT_TIMESTEP),
    ('female_class.csv', FEMALE_GUT_TIMESTEP),
):
    timestep_table.to_csv(PATH + csv_name)

# DATASET QIITA 2202

In [None]:
# Relative directory change: it resolves against the kernel's current working directory.
# NOTE(review): presumably expects the earlier `%cd .../datasets/processed/` cell to have run first — confirm.
%cd qiita_2202/

In [None]:
# Feature table: tab-separated file, first row skipped, first column used as the index.
DF = pd.read_csv('2202_feature_table.tsv', skiprows=[0], sep = '\t', index_col=[0])

# Metadata: load the table and keep only the rows whose sample_id
# is one of the columns of the feature table.
METADATA = (
    pd.read_csv('2202_metadata.tsv', sep='\t')
    .loc[lambda meta: meta['sample_id'].isin(DF.columns)]
)

In [None]:
# Keep the samples where body_site is 'UBERON:feces' and split them by host_subject_id
# into a DONOR A and a DONOR B sub-dataset.
# NOTE(review): no 'mislabeled' filter is applied in this section — confirm that this is intended.

# ---- Donor A ----
DONORA_GUT_SAMPLES = METADATA.loc[
    METADATA['body_site'].eq('UBERON:feces') & METADATA['host_subject_id'].eq('DonorA'),
    'sample_id',
]
DONORA_GUT_DF = DF[DONORA_GUT_SAMPLES]

# Replace sample names with time steps (collection_day) for this subject.
DONORA_METADATA = METADATA[METADATA['sample_id'].isin(DONORA_GUT_SAMPLES)]
DONORA_DICT = {
    sample: day
    for sample, day in zip(DONORA_METADATA['sample_id'], DONORA_METADATA['collection_day'])
}

# Relabel the columns, put the time steps on the rows, sort by time step,
# then keep only the columns that contain at least one non-zero value.
DONORA_GUT_TIMESTEP = (
    DONORA_GUT_DF
    .rename(columns=DONORA_DICT)
    .transpose()
    .sort_index()
    .loc[:, lambda timestep_table: timestep_table.ne(0).any(axis=0)]
)
DONORA_GUT_TIMESTEP.index.name = 'timestep'

# ---- Donor B ---- (same steps)
DONORB_GUT_SAMPLES = METADATA.loc[
    METADATA['body_site'].eq('UBERON:feces') & METADATA['host_subject_id'].eq('DonorB'),
    'sample_id',
]
DONORB_GUT_DF = DF[DONORB_GUT_SAMPLES]

DONORB_METADATA = METADATA[METADATA['sample_id'].isin(DONORB_GUT_SAMPLES)]
DONORB_DICT = {
    sample: day
    for sample, day in zip(DONORB_METADATA['sample_id'], DONORB_METADATA['collection_day'])
}

DONORB_GUT_TIMESTEP = (
    DONORB_GUT_DF
    .rename(columns=DONORB_DICT)
    .transpose()
    .sort_index()
    .loc[:, lambda timestep_table: timestep_table.ne(0).any(axis=0)]
)
DONORB_GUT_TIMESTEP.index.name = 'timestep'

In [None]:
!mkdir assigned_sample_names_data

In [None]:
# Write both donor tables to CSV.
# Each saved table has the time steps as rows and the features as columns.

PATH = 'assigned_sample_names_data/'
for csv_name, timestep_table in (
    ('donorA_assigned_sample_names.csv', DONORA_GUT_TIMESTEP),
    ('donorB_assigned_sample_names.csv', DONORB_GUT_TIMESTEP),
):
    timestep_table.to_csv(PATH + csv_name)