In [8]:
import pandas as pd
import os
import random
import numpy as np
from sklearn.preprocessing import scale

## Pooling 18 plates into one big file

In [None]:
# Pool the per-plate HDF files found in `path` into one table and save it
# back into the same folder under the key 'pooled'.
path = 'N:/HiTS Projects and Data/Personal/Jake/mcf10a/raw_data_selected_features'
os.chdir(path)
pooled_fn = 'pooled_expr_data_5_features.hdf'
# The pooled output lives in the folder being listed, so skip it: otherwise a
# re-run of this cell would read the previous result back in and duplicate rows.
list_fn = sorted(fn for fn in os.listdir() if fn != pooled_fn)
# Read every file first and concatenate once. DataFrame.append copied the
# growing frame on each iteration and no longer exists in pandas >= 2.0.
plate_frames = [pd.read_hdf(fn) for fn in list_fn]
# pd.concat raises on an empty list; keep the old "empty frame" result instead.
pooled_expr_data = pd.concat(plate_frames) if plate_frames else pd.DataFrame()
pooled_expr_data.to_hdf(pooled_fn, key='pooled')

## Prepare log2 transformed intensity data
### This is used for well-based differential analysis and binned data PCA

In [None]:
# Build the log2 mean-intensity table (nuclear + cytoplasmic) from the pooled data.
# Both paths are relative — assumes the working directory is still the
# raw_data_selected_features folder; TODO confirm before running on a fresh kernel.
pooled_expr_data = pd.read_hdf('pooled_expr_data_5_features.hdf')
feature_meta = pd.read_json('../feature_metadata.json').transpose()

# Select mean intensity features measured in nucleus or cytoplasm that have
# both a marker and a dye assigned.
is_mean_intensity = feature_meta.feature_cat.eq('int') & feature_meta.feature_type.eq('mean')
in_nuc_or_cytoplasm = feature_meta.sublocation.isin(['nuc', 'cytoplasm'])
has_marker_and_dye = feature_meta.marker.ne('none') & feature_meta.dye.ne('none')
valid_cols = feature_meta[is_mean_intensity & in_nuc_or_cytoplasm & has_marker_and_dye]

# Of the 'dna' features keep only those whose order is '4'; everything else stays.
valid_cols = valid_cols[valid_cols.marker.ne('dna') | valid_cols['order'].eq('4')]

# Short column names of the form "<sublocation>_<marker>", indexed by the raw feature name.
valid_cols = valid_cols.sublocation + '_' + valid_cols.marker
int_expr_data = pooled_expr_data[valid_cols.index].copy()
int_expr_data.columns = valid_cols.values

# Missing values become 0, then everything below 1 is raised to 1 so that
# log2 is defined and never negative.
corrected_values = np.maximum(int_expr_data.fillna(0).values, 1)
int_expr_data.loc[:, :] = np.log2(corrected_values)
int_expr_data.to_hdf('intensity_nuc_cytoplasm.hdf', key='meh')

## Prepare normalized, post-QC data with NC ratio and robust scaling
### This is used for per well clustering and unsupervised analysis

In [9]:
# Build the post-QC feature table: floor intensities at 1, add nuclear/cytoplasm
# ratios, shift every original feature to start at 0, log2-transform and
# standardize each column, then save the result.
path = 'N:/HiTS Projects and Data/Personal/Jake/mcf10a'
os.chdir(path)
expr_data = pd.read_hdf('raw_data_selected_features/pooled_expr_data_5_features.hdf')
metadata = pd.read_csv('proper_FFC_metadata.csv',index_col=0)
# QC filter: keep cells not labeled as lost whose mask contains exactly one nucleus
valid_cells = metadata[(metadata.labeled_as_lost=='No')&(metadata.num_nuclei_in_mask==1)].index
metadata = metadata.loc[valid_cells]
expr_data = expr_data.loc[valid_cells]
feature_metadata = pd.read_json('feature_metadata.json').transpose()

# Handling missing data and negative intensity values
expr_data = expr_data.fillna(1) # missing data filled with 1
# Mean-intensity values below 1 (which includes all negatives) are raised to 1
is_mean = feature_metadata.feature_type=='mean'
mean_features = feature_metadata[is_mean].index
int_cols = [x for x in expr_data.columns if x in mean_features]
expr_data[int_cols] = expr_data[int_cols].clip(lower=1)

# add nuclear to cytoplasm ratio
has_marker = is_mean & (feature_metadata.marker!='none')
nucs = feature_metadata[has_marker&(feature_metadata.sublocation=='nuc')].index
cytoplasms = feature_metadata[has_marker&(feature_metadata.sublocation=='cytoplasm')].index
# The two column sets are paired by position, not by label.
# NOTE(review): assumes feature_metadata lists nuclear and cytoplasmic features
# in the same marker order — confirm.
assert len(nucs) == len(cytoplasms), 'nuclear and cytoplasmic feature lists differ in length'
# Divide the underlying arrays: the nuclear and cytoplasmic columns have
# different labels, and pandas >= 2.0 aligns DataFrame inputs to NumPy ufuncs
# on labels, which would turn np.divide(df, df) into an all-NaN frame.
nc_ratio = pd.DataFrame(expr_data[nucs].values / expr_data[cytoplasms].values,
                        index=expr_data.index,
                        columns=[x.replace('mean','NC-ratio') for x in nucs])

# Shift each original feature so its minimum is 0 (the ratios, computed above
# from the unshifted values, are appended afterwards and are not shifted).
expr_data = expr_data-expr_data.min()
expr_data = pd.concat([expr_data,nc_ratio],axis=1)

# Normalize: log2(1 + x), then sklearn's `scale` (zero mean, unit variance per column)
expr_data.loc[:,:] = scale(np.log2(1+expr_data))
# check for infs (`not`/`.all()` instead of `~`, which is only a logical
# negation for NumPy booleans: ~True on a Python bool is -2, i.e. truthy)
assert np.isfinite(expr_data.values).all()
# check for nulls
assert not expr_data.isnull().any().any()
expr_data.to_hdf('postQC_log_normed_scaled_NC_ratio.hdf', key='meh')

## Check cells with negative intensity values and nulls

In [98]:
# Count how many QC-passing cells have a negative value in any mean-intensity feature.
path = 'N:/HiTS Projects and Data/Personal/Jake/mcf10a'
os.chdir(path)
feature_metadata = pd.read_json('feature_metadata.json').transpose()
metadata = pd.read_csv('proper_FFC_metadata.csv', index_col=0)
expr_data = pd.read_hdf('raw_data_selected_features/pooled_expr_data_5_features.hdf')

# QC filter: not labeled as lost, exactly one nucleus in the mask
passes_qc = metadata.labeled_as_lost.eq('No') & metadata.num_nuclei_in_mask.eq(1)
metadata = metadata[passes_qc]
expr_data = expr_data.loc[metadata.index]

# Mean-type features that are actually present in the expression table
mean_features = feature_metadata[feature_metadata.feature_type.eq('mean')].index
int_cols = [col for col in expr_data.columns if col in mean_features]

# True for a cell when at least one of its mean-intensity values is below zero
cells_with_neg_values = expr_data[int_cols].lt(0).any(axis=1)
print(cells_with_neg_values.value_counts())

False    390717
True         95
dtype: int64


In [89]:
# Cells (rows) with a missing value in at least one feature
expr_data.isna().any(axis=1).value_counts()

False    378728
True      12084
dtype: int64

In [92]:
# Features (columns) with a missing value in at least one cell
expr_data.isna().any(axis=0).value_counts()

True     576
False    144
dtype: int64

In [101]:
# Cells (rows) with a negative value in at least one feature of any type
expr_data.lt(0).any(axis=1).value_counts()

True    390812
dtype: int64

## Getting biological replicates

In [17]:
# Average every feature over the QC-passing cells of each
# ligand / time / replicate combination and write the table to CSV.
path = 'N:/HiTS Projects and Data/Personal/Jake/mcf10a'
os.chdir(path)
metadata = pd.read_csv('proper_FFC_metadata.csv', index_col=0)
expr_data = pd.read_hdf('raw_data_selected_features/pooled_expr_data_5_features.hdf')

# QC filter: not labeled as lost, exactly one nucleus in the mask
passes_qc = metadata.labeled_as_lost.eq('No') & metadata.num_nuclei_in_mask.eq(1)
metadata = metadata[passes_qc]
expr_data = expr_data.loc[metadata.index]

# One label per cell of the form "<ligand>_<time>_<replicate>"
time_label = metadata['time'].astype(str)
groups = metadata['ligand'] + '_' + time_label + '_' + metadata['replicate']

# Group rows by the label values and take the per-group mean of each column
cells_in_groups = expr_data.loc[groups.index]
bio_rep_mean = cells_in_groups.groupby(groups.values).mean()
bio_rep_mean.to_csv('properFFC_biological_replicate_mean.csv')

## Add nuclei area to metadata

In [None]:
# Copy the nuclear area measured in each per-field text file into the 'area'
# column of the cell metadata table, then overwrite the metadata CSV.
os.chdir('Z:/sorger/data/IN_Cell_Analyzer_6000/Connor/Fixed MCF10 Common/20x full exp/20180905_Updated/')
metadata_fn = 'N:/HiTS Projects and Data/Personal/Jake/mcf10a/proper_FFC_metadata.csv'
metadata = pd.read_csv(metadata_fn,index_col=0)
# The loop no longer resets `expr_data` to an empty frame: it was never used
# here and only clobbered the notebook-level `expr_data` from other cells.
for plate_id in range(1, 19):
    plate = 'plate' + str(plate_id)
    print(plate)
    # Assumes the data is organized by plate and the results are in each
    # plate's analysisCorrected folder
    path_analysis = os.path.join(plate, 'analysisCorrected')
    txt_files = [x for x in os.listdir(path_analysis) if 'txt' in x]
    for txt_fn in txt_files:
        # File names start with "<well>_<field>_"
        well, field = txt_fn.split('_')[:2]
        cell_name_prefix = '_'.join([str(plate_id), well, field])
        fn = os.path.join(path_analysis, txt_fn)
        _df = pd.read_table(fn)
        # Cell ids are "<plate>_<well>_<field>_<n>", with n the 1-based row
        # position in the file.
        # NOTE(review): presumably this matches how the metadata index was built — confirm.
        _df.index = [cell_name_prefix + '_' +
                     str(x) for x in range(1, 1 + _df.shape[0])]
        metadata.loc[_df.index,'area'] = _df.none_none_none_mor_area_nuc.values
metadata.to_csv(metadata_fn)

plate1
plate2
plate3
plate4
plate5
plate6
plate7
plate8
plate9
plate10
plate11
plate12
plate13
plate14
plate15
plate16
plate17
plate18
