In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import os
from functools import reduce
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats.stats import pearsonr
from scipy import stats
from scipy.stats import ks_2samp
from scipy.stats import entropy

In [3]:
#
# Input data are in individual tab-delimited files, one for each experiment.
# Every file in the raw-data directory whose name ends in ".gbgout" is read
# into its own two-column dataframe: 'Gene' plus a value column named after
# the first two "_"-separated tokens of the file name. The dataframes are
# collected in the list `frames`.
#
# os.listdir() returns entries in arbitrary order, so the listing is sorted
# to make the order of `frames` the same on every run and platform.
#

raw_dir = "rawdata"
frames = []

for file in sorted(os.listdir(raw_dir)):
    if not file.endswith(".gbgout"):
        continue
    filepath = os.path.join(raw_dir, file)
    fs = file.split('_')
    name = "_".join(fs[0:2])
    # The files have no header row, so column names are supplied explicitly.
    frames.append(pd.read_csv(filepath, sep="\t", header=None, names=['Gene', name]))

In [4]:
#
# Fold the list of per-experiment frames into one wide dataframe.
# Frames are merged pairwise, left to right, on the shared "Gene" column;
# the outer join keeps genes that appear in any of the frames.
#

def _outer_merge_on_gene(left, right):
    """Outer-merge two frames on their 'Gene' column."""
    return pd.merge(left, right, on=['Gene'], how='outer')


df = reduce(_outer_merge_on_gene, frames)

In [6]:
#
# Reorder the columns so that their labels are in sorted order
#

ordered_columns = sorted(df.columns)
df = df.reindex(columns=ordered_columns)

In [7]:
#
# Use only genes starting with "Y" which are protein-coding genes.
# na=False treats a missing gene name as "does not match"; without it the
# mask would contain NaN and boolean indexing with .loc would raise.
#

df = df.loc[df['Gene'].str.startswith('Y', na=False)]

In [8]:
#
# Make the 'Gene' column the row index of the dataframe
#

index_column = 'Gene'
df = df.set_index(index_column)

In [9]:
# Write the combined dataframe to a tab-delimited text file

output_path = "Barton_combined_Ygenes.txt"
df.to_csv(output_path, sep='\t')