In [113]:
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Silences every Python warning for the rest of the session. This also hides
# library deprecation warnings, so behaviour changes in newer pandas versions
# will not be announced while this filter is active.
warnings.filterwarnings('ignore')

In [114]:
class Config:
    """Central registry of the file-system locations used by this notebook."""

    # CSV inputs read at the start of the notebook.
    CONTACT_FILE: str = "../input/contact_data_2000.csv"
    TRAINING_FILE: str = "../input/train_data_2000.csv"
    TEST_FILE: str = "../input/test_data_2000.csv"

    # Destinations: the processed training table and a model directory.
    PROCESSED_DATA_OUTPUT: str = "../input/processed_data.csv"
    MODEL_OUTPUT: str = "../models/"


# Shared instance that the cells below read their paths from.
config = Config()

In [None]:
# Load the three CSV inputs named in the config: training table, test table
# and the contact records.
train, test, contact = (
    pd.read_csv(csv_path)
    for csv_path in (config.TRAINING_FILE, config.TEST_FILE, config.CONTACT_FILE)
)

In [None]:
# 1 when both ends of a contact carry the same chromosome label, otherwise 0.
contact['intra'] = contact['chr1'].eq(contact['chr2']).astype(int)

In [None]:
# Scaled log2 separation between the two `end` coordinates. Rows flagged as
# intra get the scaled value (never below 1, since log2(gap + 1) >= 0);
# every other row is set to 0.
end_gap = (contact['end2'] - contact['end1']).abs()
scaled_log_gap = (np.log2(end_gap + 1) + 0.1) / 0.1
contact['log_dist'] = np.where(contact['intra'].eq(1), scaled_log_gap, 0)

In [118]:
# Number of rows with a positive log_dist (not referenced again in the cells shown here).
n = contact['log_dist'].gt(0).sum()

# Right-closed bins: (-1, 0] collects the zero-valued rows, followed by
# 50-wide bins up to 300. Values above 300 would fall outside every bin.
bins = [-1, 0, 50, 100, 150, 200, 250, 300]
labels = ['0'] + [f'{lo}-{hi}' for lo, hi in zip(bins[1:-1], bins[2:])]
contact['log_dist_bin'] = pd.cut(contact['log_dist'], bins=bins, labels=labels)
contact.head()

Unnamed: 0,chr1,start1,end1,chr2,start2,end2,cellid,intra,log_dist,log_dist_bin
0,chr13-M,74316813,74316959,chr13-M,72727004,72727154,SCG0088_TTTAACCTCAGCCAAT-1,1,207.004193,200-250
1,chr1-M,79322530,79322563,chr12-M,4538118,4538268,SCG0088_TATAGGTGTCCCGGAA-1,0,0.0,0
2,chr2-M,75633331,75633491,chr8-P,125695812,125695962,SCG0088_CGTTAACAGTACCGCA-1,0,0.0,0
3,chr7-P,136324163,136324313,chr7-P,136352442,136352592,SCG0088_TTTAACCTCAGCCAAT-1,1,148.874945,100-150
4,chr6-M,49253365,49253515,chr6-M,49323546,49323680,SCG0088_CGTTAACAGTACCGCA-1,1,161.984845,150-200


In [119]:
# Per-cell share of contacts falling in each distance bin (rows are normalised
# by their own total).
# observed=False is passed explicitly because log_dist_bin is categorical and
# later cells select the bin columns by name: every bin must appear as a
# column even when no contact falls in it. Relying on the groupby default is
# fragile, since pandas has deprecated that default and the warning filter at
# the top of the notebook hides the notice.
prob_df = (
    contact
    .groupby(['cellid', 'log_dist_bin'], observed=False)
    .size()
    .unstack(fill_value=0)
)
prob_df = prob_df.div(prob_df.sum(axis=1), axis=0)
prob_df

log_dist_bin,0,0-50,50-100,100-150,150-200,200-250,250-300
cellid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
SCG0088_AAAGGACGTTAACGGC-1,0.175143,0.005968,0.158796,0.259211,0.210431,0.125584,0.064868
SCG0088_AAATCCGGTGACATAT-1,0.116243,0.005389,0.224788,0.279061,0.257891,0.074673,0.041955
SCG0088_AACAGCAAGACAGGCG-1,0.115605,0.004902,0.223856,0.348039,0.192402,0.083742,0.031454
SCG0088_AACATCATCAGGTTTA-1,0.091257,0.004867,0.171563,0.356683,0.280028,0.067617,0.027985
SCG0088_AACCTTAAGCTGCACA-1,0.144164,0.009582,0.212979,0.280923,0.216028,0.087108,0.049216
...,...,...,...,...,...,...,...
SCG0093_TTTGACTTCAAGGACA-1,0.149098,0.010446,0.204653,0.280152,0.250712,0.075499,0.029440
SCG0093_TTTGAGTCAATGCGCT-1,0.157895,0.006352,0.250000,0.284936,0.191924,0.069419,0.039474
SCG0093_TTTGCGGAGGATGATG-1,0.159850,0.007052,0.253879,0.280207,0.193230,0.075693,0.030089
SCG0093_TTTGTGAAGGCATGTT-1,0.143485,0.010737,0.252806,0.262567,0.229868,0.076623,0.023914


In [None]:
# Attach the per-cell bin proportions to the training table; the left join
# keeps every training row. Re-running this cell without reloading `train`
# would merge the same columns again and produce suffixed duplicates.
train = pd.merge(train, prob_df, how='left', on='cellid')

In [None]:
# Count of contact rows per cell, joined onto the training table as a feature.
total_contacts = contact.groupby('cellid').size()
total_contacts.name = 'total_contacts'
train = pd.merge(train, total_contacts, how='left', on='cellid')

In [None]:
# Intra vs inter contact proportions per cell.
# The reindex guarantees that both count columns -- 0 (inter) and 1 (intra) --
# exist, so the ratios below do not raise KeyError when the data happens to
# contain only one contact type.
contact_type = (
    contact
    .groupby(['cellid', 'intra'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)
)
# Every cell present here has at least one contact, so the denominator is > 0.
contacts_per_cell = contact_type[0] + contact_type[1]
contact_type['inter_ratio'] = contact_type[0] / contacts_per_cell
contact_type['intra_ratio'] = contact_type[1] / contacts_per_cell
train = train.merge(
    contact_type[['inter_ratio', 'intra_ratio']], on='cellid', how='left')

In [None]:
# Relative proportions within intra-chromosomal contacts only.
# observed=False keeps every distance bin as a column regardless of whether it
# holds any contact, so the set of `intra_*` columns stays fixed. That includes
# `intra_0`, which is always 0 because intra rows have log_dist >= 1 and so
# never land in the '0' bin. The groupby default for categorical keys has been
# deprecated by pandas, so it is spelled out here.
# Cells with no intra contact are absent from `intra_bins` and receive NaN
# from the left join.
intra_contact = contact[contact['intra'] == 1].copy()
intra_bins = intra_contact.groupby(
    ['cellid', 'log_dist_bin'], observed=False).size().unstack(fill_value=0)
intra_bins = intra_bins.div(intra_bins.sum(axis=1), axis=0)
intra_bins.columns = [f'intra_{col}' for col in intra_bins.columns]
train = train.merge(intra_bins, on='cellid', how='left')

In [None]:
# Short-range = the four bins up to a scaled distance of 200; long-range = the
# two bins above 200. Both are sums of the all-contact bin proportions, so the
# '0' bin is excluded from either side.
short_bins = ['0-50', '50-100', '100-150', '150-200']
long_bins = ['200-250', '250-300']
train['short_range_ratio'] = train[short_bins].sum(axis=1)
train['long_range_ratio'] = train[long_bins].sum(axis=1)
# The small constant keeps the quotient finite when a cell has no long-range contacts.
train['short_long_ratio'] = train['short_range_ratio'].div(
    train['long_range_ratio'] + 1e-6)

In [None]:
# Inspect the column set of the training table after all feature merges.
train.columns

Index(['cellid', 'phase', 'order_within_phase', 'order', '0', '0-50', '50-100',
       '100-150', '150-200', '200-250', '250-300', 'total_contacts',
       'inter_ratio', 'intra_ratio', 'intra_0', 'intra_0-50', 'intra_50-100',
       'intra_100-150', 'intra_150-200', 'intra_200-250', 'intra_250-300',
       'short_range_ratio', 'long_range_ratio', 'short_long_ratio'],
      dtype='object')

In [None]:
# Preview the first rows of the feature-augmented training table.
train.head()

Unnamed: 0,cellid,phase,order_within_phase,order,0,0-50,50-100,100-150,150-200,200-250,...,intra_0,intra_0-50,intra_50-100,intra_100-150,intra_150-200,intra_200-250,intra_250-300,short_range_ratio,long_range_ratio,short_long_ratio
0,SCG0088_CTATGAGGTACCGGAT-1,G1,0,0,0.106918,0.004324,0.150943,0.342767,0.223664,0.116352,...,0.0,0.004842,0.169014,0.383803,0.25044,0.130282,0.06162,0.721698,0.171384,4.210985
1,SCG0088_GCTAAGCGTATTGGTG-1,G1,0,0,0.117595,0.009354,0.251225,0.271715,0.198218,0.106904,...,0.0,0.010601,0.284705,0.307925,0.224634,0.121151,0.050984,0.730512,0.151893,4.809353
2,SCG0089_TCCATTGTCTGTAAGC-1,G1,0,0,0.117763,0.005888,0.226202,0.301766,0.236997,0.084396,...,0.0,0.006674,0.256396,0.342047,0.268632,0.095662,0.03059,0.770854,0.111384,6.920643
3,SCG0092_GTTTATCTCATGCTAA-1,G1,0,0,0.103506,0.003362,0.175793,0.32805,0.191162,0.12536,...,0.0,0.00375,0.196089,0.365926,0.213233,0.139834,0.081168,0.698367,0.198127,3.524831
4,SCG0092_AACCGCTCAGCTCATA-1,G1,0,0,0.148919,0.011424,0.24643,0.302326,0.19747,0.064055,...,0.0,0.013423,0.289549,0.355225,0.232023,0.075264,0.034516,0.75765,0.093431,8.109084


In [None]:
# Count missing values per column; the left joins above can leave NaN for
# cells that have no matching row in a feature table.
train.isnull().sum()

cellid                0
phase                 0
order_within_phase    0
order                 0
0                     0
0-50                  0
50-100                0
100-150               0
150-200               0
200-250               0
250-300               0
total_contacts        0
inter_ratio           0
intra_ratio           0
intra_0               0
intra_0-50            0
intra_50-100          0
intra_100-150         0
intra_150-200         0
intra_200-250         0
intra_250-300         0
short_range_ratio     0
long_range_ratio      0
short_long_ratio      0
dtype: int64

In [None]:
# Replace any remaining missing values with 0 across every column.
train = train.fillna(0)

In [112]:
# Persist the feature-augmented training table to the configured path,
# leaving the row index out of the file.
train.to_csv(path_or_buf=config.PROCESSED_DATA_OUTPUT, index=False)