In [3]:
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

warnings.filterwarnings('ignore')

  from pandas.core import (


In [4]:
class Config:
    """Relative locations of the notebook's input files and output targets."""

    # Input CSVs read by the loading cell.
    TRAINING_FILE = "../input/train_data_2000.csv"
    TEST_FILE = "../input/test_data_2000.csv"
    CONTACT_FILE = "../input/contact_data_2000.csv"

    # Destination of the processed feature table written by the last cell.
    PROCESSED_DATA_OUTPUT = "../input/processed_data.csv"

    # Not referenced by any visible cell; presumably a model directory.
    MODEL_OUTPUT = "../models/"


# Shared instance; the cells read paths as `config.<NAME>`.
config = Config()

In [5]:
# Load the three input tables from the paths held in `config`.
train = pd.read_csv(filepath_or_buffer=config.TRAINING_FILE)
contact = pd.read_csv(filepath_or_buffer=config.CONTACT_FILE)
test = pd.read_csv(filepath_or_buffer=config.TEST_FILE)

In [6]:
# 1 when both ends of a contact carry the same chromosome label, else 0.
contact['intra'] = contact['chr1'].eq(contact['chr2']).astype(int)

In [7]:
# Scaled log2 distance between the two fragment ends, kept only for rows
# flagged intra == 1; every other row gets 0.
contact['log_dist'] = np.where(
    contact['intra'].eq(1),
    (np.log2((contact['end2'] - contact['end1']).abs() + 1) + 0.1) / 0.1,
    0,
)

In [8]:
# Number of rows with a positive score; not read by any visible cell.
n = contact['log_dist'].gt(0).sum()

# Edges: (-1, 0] collects the zero scores, then width-50 bins up to 300
# (pd.cut intervals are right-closed by default).
bins = [-1, 0] + list(range(50, 301, 50))
labels = ['0'] + [f'{lo}-{hi}' for lo, hi in zip(bins[1:-1], bins[2:])]
contact['log_dist_bin'] = pd.cut(
    contact['log_dist'], labels=labels, bins=bins)
contact.head()

Unnamed: 0,chr1,start1,end1,chr2,start2,end2,cellid,intra,log_dist,log_dist_bin
0,chr13-M,74316813,74316959,chr13-M,72727004,72727154,SCG0088_TTTAACCTCAGCCAAT-1,1,207.004193,200-250
1,chr1-M,79322530,79322563,chr12-M,4538118,4538268,SCG0088_TATAGGTGTCCCGGAA-1,0,0.0,0
2,chr2-M,75633331,75633491,chr8-P,125695812,125695962,SCG0088_CGTTAACAGTACCGCA-1,0,0.0,0
3,chr7-P,136324163,136324313,chr7-P,136352442,136352592,SCG0088_TTTAACCTCAGCCAAT-1,1,148.874945,100-150
4,chr6-M,49253365,49253515,chr6-M,49323546,49323680,SCG0088_CGTTAACAGTACCGCA-1,1,161.984845,150-200


In [9]:
# Per-cell share of contacts falling in each `log_dist_bin` category.
# `log_dist_bin` is categorical, so `observed=False` is passed explicitly:
# it keeps a column for every bin label even when no row falls in it, which
# the later cells that select bin columns by name rely on. Leaving it implicit
# depends on a pandas default that is deprecated and scheduled to change.
prob_df = contact.groupby(['cellid', 'log_dist_bin'], observed=False
                          ).size().unstack(fill_value=0)
# Normalise each row (cell) so its bin counts sum to 1.
prob_df = prob_df.div(prob_df.sum(axis=1), axis=0)

# Attach the bin shares to the training rows; cells absent from `contact`
# get NaN in the new columns.
train = train.merge(prob_df, on='cellid', how='left')

In [10]:
# Same scaled log2 distance as `log_dist`, but kept only for rows with
# intra != 1; rows with intra == 1 get 0.
contact['log_dist_not_intra'] = np.where(contact['intra'] != 1, (np.log2(
    np.abs(contact['end2'] - contact['end1']) + 1) + 0.1) / 0.1, 0)

# Number of rows with a positive score; not read by any visible cell.
n = (contact['log_dist_not_intra'] > 0).sum()
# Same edges as the intra binning; the "a" suffix keeps the resulting column
# names distinct from the `log_dist_bin` columns already merged into `train`.
bins = [-1, 0, 50, 100, 150, 200, 250, 300]
labels = ['0a', '0-50a', '50-100a', '100-150a',
          '150-200a', '200-250a', '250-300a']
contact['log_dist_bin_ni'] = pd.cut(
    contact['log_dist_not_intra'], bins=bins, labels=labels)

# Per-cell share of contacts in each bin. `observed=False` is explicit so that
# every label keeps its column even when empty, independent of the pandas
# default (which is deprecated and scheduled to change).
prob_df = contact.groupby(['cellid', 'log_dist_bin_ni'], observed=False
                          ).size().unstack(fill_value=0)
prob_df = prob_df.div(prob_df.sum(axis=1), axis=0)

train = train.merge(prob_df, on='cellid', how='left')

In [11]:
# total_contacts = contact.groupby('cellid').size().rename('total_contacts')
# train = train.merge(total_contacts, on='cellid', how='left')

In [12]:
# Per-cell share of inter- (intra == 0) and intra-chromosomal (intra == 1) contacts
contact_type = (
    contact.groupby(['cellid', 'intra'])
    .size()
    .unstack(fill_value=0)
)
contact_type = contact_type.assign(
    inter_ratio=lambda counts: counts[0] / (counts[0] + counts[1]),
    intra_ratio=lambda counts: counts[1] / (counts[0] + counts[1]),
)
train = train.merge(
    contact_type[['inter_ratio', 'intra_ratio']], how='left', on='cellid')

In [13]:
# Relative proportions within intra-chromosomal contacts only
intra_contact = contact[contact['intra'] == 1].copy()
# `observed=False` is explicit so every `log_dist_bin` label keeps a column
# even when no intra row falls in it, independent of the pandas default
# (which is deprecated and scheduled to change).
intra_bins = intra_contact.groupby(
    ['cellid', 'log_dist_bin'], observed=False).size().unstack(fill_value=0)
# Normalise per cell so the intra bin shares sum to 1.
intra_bins = intra_bins.div(intra_bins.sum(axis=1), axis=0)
# Prefix the bin labels to avoid clashing with the all-contact bin columns.
intra_bins.columns = [f'intra_{col}' for col in intra_bins.columns]
# Cells without any intra row are absent from `intra_bins` and get NaN here.
train = train.merge(intra_bins, on='cellid', how='left')

In [14]:
# Display the merged training frame to inspect the columns added so far.
train

Unnamed: 0,cellid,phase,order_within_phase,order,0,0-50,50-100,100-150,150-200,200-250,...,250-300a,inter_ratio,intra_ratio,intra_0,intra_0-50,intra_50-100,intra_100-150,intra_150-200,intra_200-250,intra_250-300
0,SCG0088_CTATGAGGTACCGGAT-1,G1,0,0,0.106918,0.004324,0.150943,0.342767,0.223664,0.116352,...,0.060142,0.106918,0.893082,0.0,0.004842,0.169014,0.383803,0.250440,0.130282,0.061620
1,SCG0088_GCTAAGCGTATTGGTG-1,G1,0,0,0.117595,0.009354,0.251225,0.271715,0.198218,0.106904,...,0.071715,0.117595,0.882405,0.0,0.010601,0.284705,0.307925,0.224634,0.121151,0.050984
2,SCG0089_TCCATTGTCTGTAAGC-1,G1,0,0,0.117763,0.005888,0.226202,0.301766,0.236997,0.084396,...,0.074092,0.117763,0.882237,0.0,0.006674,0.256396,0.342047,0.268632,0.095662,0.030590
3,SCG0092_GTTTATCTCATGCTAA-1,G1,0,0,0.103506,0.003362,0.175793,0.328050,0.191162,0.125360,...,0.057157,0.103506,0.896494,0.0,0.003750,0.196089,0.365926,0.213233,0.139834,0.081168
4,SCG0092_AACCGCTCAGCTCATA-1,G1,0,0,0.148919,0.011424,0.246430,0.302326,0.197470,0.064055,...,0.084863,0.148919,0.851081,0.0,0.013423,0.289549,0.355225,0.232023,0.075264,0.034516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1496,SCG0093_GTCCAGGGTCAGGCAT-1,G2M,11,43,0.137519,0.011958,0.248630,0.271550,0.217738,0.075237,...,0.082711,0.137519,0.862481,0.0,0.013865,0.288273,0.314847,0.252455,0.087233,0.043328
1497,SCG0090_GAGCATGCAAACGCGA-1,G2M,11,43,0.126285,0.005874,0.253059,0.288302,0.245228,0.059716,...,0.078806,0.126285,0.873715,0.0,0.006723,0.289636,0.329972,0.280672,0.068347,0.024650
1498,SCG0092_GCTAGCCAGTTTCCGC-1,G2M,11,43,0.105968,0.004839,0.181290,0.329355,0.254355,0.097258,...,0.064032,0.105968,0.894032,0.0,0.005412,0.202778,0.368393,0.284503,0.108786,0.030128
1499,SCG0092_TAGGGTTTCGCCTAAG-1,G2M,11,43,0.124407,0.008814,0.210169,0.292203,0.254915,0.075932,...,0.073559,0.124407,0.875593,0.0,0.010066,0.240031,0.333720,0.291134,0.086721,0.038328


In [15]:
# Short range = bins below 200 (the '0' bin is left out); long range = bins
# from 200 up. The ratio adds 1e-6 to the denominator to avoid dividing by 0.
train = train.assign(
    short_range_ratio=lambda frame: frame[
        ['0-50', '50-100', '100-150', '150-200']].sum(axis=1),
    long_range_ratio=lambda frame: frame[['200-250', '250-300']].sum(axis=1),
    short_long_ratio=lambda frame: frame['short_range_ratio'].div(
        frame['long_range_ratio'] + 1e-6),
)

In [16]:
# # Disabled: these features probably decrease accuracy.

# chr1_chr2_contacts = contact.groupby(
#     "chr1")["chr2"].size().rename("chr1_chr2_count")
# chr2_chr1_contacts = contact.groupby(
#     "chr2")["chr1"].size().rename("chr2_chr1_count")

# total_contacts = len(contact)

# chr1_chr2_contacts_percentage = (
#     chr1_chr2_contacts / total_contacts * 100).rename("chr1_chr2_percentage")
# chr2_chr1_contacts_percentage = (
#     chr2_chr1_contacts / total_contacts * 100).rename("chr2_chr1_percentage")

# contact = contact.merge(chr1_chr2_contacts_percentage, on="chr1", how="left")
# contact = contact.merge(chr2_chr1_contacts_percentage, on="chr2", how="left")

# cont_per1 = contact.groupby('cellid', as_index=False)[
#     'chr1_chr2_percentage'].count()
# cont_per2 = contact.groupby('cellid', as_index=False)[
#     'chr2_chr1_percentage'].count()


# train = train.merge(cont_per1, on='cellid', how='left')
# train = train.merge(cont_per2, on='cellid', how='left')

In [17]:
# # %% 6. Additional Feature Extraction from Contact Distances
# # Compute aggregated statistics of log_dist per cell:
# # Only consider intra-chromosomal contacts (log_dist > 0)
# from scipy.stats import entropy
# intra_contacts = contact[contact['log_dist'] > 0]

# # Compute mean, median, std, min and max of log_dist for each cell
# agg_stats = intra_contacts.groupby('cellid')['log_dist'].agg(
#     mean_log_dist='mean',
#     std_log_dist='std'
# ).reset_index()

# # Merge aggregated stats into training data
# train = train.merge(agg_stats, on='cellid', how='left')


# # Compute entropy for each cell based on log_dist_bin distribution
# log_dist_bin_distribution = contact.groupby(
#     'cellid')['log_dist_bin'].value_counts(normalize=True).unstack(fill_value=0)
# log_dist_bin_entropy = log_dist_bin_distribution.apply(
#     entropy, axis=1).reset_index()
# log_dist_bin_entropy.columns = ['cellid', 'entropy']

# # Merge entropy into training data
# train = train.merge(log_dist_bin_entropy, on='cellid', how='left')

In [18]:
from scipy.stats import skew, kurtosis

# 1. Origin flags read from the last character of the chromosome label:
#    "M" (maternal) or "P" (paternal)
contact["is_m_chr1"] = contact["chr1"].str.endswith("M").astype(int)
contact["is_m_chr2"] = contact["chr2"].str.endswith("M").astype(int)
contact["is_p_chr1"] = contact["chr1"].str.endswith("P").astype(int)
contact["is_p_chr2"] = contact["chr2"].str.endswith("P").astype(int)

# 2. Length of each of the two fragments of a contact
contact["length1"] = contact["end1"].sub(contact["start1"])
contact["length2"] = contact["end2"].sub(contact["start2"])

# 3. Row counts per chr1 label and per chr2 label over the whole contact
#    table, attached back to every row
chr1_chr2_contacts = (
    contact.groupby("chr1")["chr2"].size().rename("chr1_chr2_count")
)
chr2_chr1_contacts = (
    contact.groupby("chr2")["chr1"].size().rename("chr2_chr1_count")
)
contact = contact.merge(chr1_chr2_contacts, how="left", on="chr1")
contact = contact.merge(chr2_chr1_contacts, how="left", on="chr2")

# 4. Signed and absolute coordinate differences between the two fragments
#    (computed for every row, inter-chromosomal ones included)
contact["start_distance"] = contact["start1"].sub(contact["start2"])
contact["end_distance"] = contact["end1"].sub(contact["end2"])
contact["abs_start_distance"] = contact["start_distance"].abs()
contact["abs_end_distance"] = contact["end_distance"].abs()

# 5. Fragment length asymmetry; 1e-5 keeps the denominator non-zero
contact["length_ratio"] = contact["length1"].div(contact["length2"] + 1e-5)

# 6. Same-chromosome vs different-chromosome indicators
contact["intra_contact"] = contact["chr1"].eq(contact["chr2"]).astype(int)
contact["inter_contact"] = contact["chr1"].ne(contact["chr2"]).astype(int)

# 7. Three-way distance category: below 200e3 -> "short",
#    up to and including 2e6 -> "mid", anything larger -> "long"
threshold_short = 200e3  # 200 thousand nucleotides
threshold_mid = 2e6      # 2 million nucleotides

contact["bin_start"] = np.select(
    [contact["abs_start_distance"] < threshold_short,
     contact["abs_start_distance"] <= threshold_mid],
    ["short", "mid"],
    default="long",
)
contact["bin_end"] = np.select(
    [contact["abs_end_distance"] < threshold_short,
     contact["abs_end_distance"] <= threshold_mid],
    ["short", "mid"],
    default="long",
)

# 8. Origin combination of the two ends: both M, both P, or one of each
contact["mm_contact"] = (
    contact["is_m_chr1"].eq(1) & contact["is_m_chr2"].eq(1)
).astype(int)
contact["pp_contact"] = (
    contact["is_p_chr1"].eq(1) & contact["is_p_chr2"].eq(1)
).astype(int)
contact["mp_contact"] = (
    (contact["is_m_chr1"].eq(1) & contact["is_p_chr2"].eq(1))
    | (contact["is_p_chr1"].eq(1) & contact["is_m_chr2"].eq(1))
).astype(int)

# 9. Count unique chromosome pairs for each cell (sorted alphabetically)


def unique_chr_pairs(df):
    """Count the distinct unordered (chr1, chr2) label pairs in ``df``.

    A pair is unordered: ("a", "b") and ("b", "a") count once. The two labels
    of each row are put in sorted order with vectorised comparisons instead of
    a Python-level sort per row, which is much cheaper on large groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "chr1" and "chr2".

    Returns
    -------
    int
        Number of distinct unordered pairs; 0 for an empty frame.
    """
    first, second = df["chr1"], df["chr2"]
    # True where the row is already in sorted (chr1 <= chr2) order.
    in_order = first <= second
    ordered = pd.DataFrame({
        "lo": first.where(in_order, second),
        "hi": second.where(in_order, first),
    })
    return len(ordered.drop_duplicates())


# Number of distinct unordered chromosome pairs per cell.
# Only the two columns the helper reads are handed to `apply`; this avoids
# operating on the grouping column (deprecated in recent pandas) and keeps the
# per-group frames small.
unique_pairs = contact.groupby("cellid")[["chr1", "chr2"]].apply(
    unique_chr_pairs).rename("unique_chr_pairs")

# 10. Additional distance distribution characteristics: skewness and kurtosis


def compute_skew(series):
    """Return scipy's skewness of ``series``, or 0 when it has at most one value."""
    if len(series) <= 1:
        return 0
    return skew(series)


def compute_kurtosis(series):
    """Return scipy's kurtosis of ``series``, or 0 when it has at most one value."""
    if len(series) <= 1:
        return 0
    return kurtosis(series)


# Per-cell shape statistics of the absolute start/end coordinate differences.
skew_start = (
    contact.groupby("cellid")["abs_start_distance"]
    .apply(compute_skew)
    .rename("skew_abs_start")
)
kurtosis_start = (
    contact.groupby("cellid")["abs_start_distance"]
    .apply(compute_kurtosis)
    .rename("kurtosis_abs_start")
)
skew_end = (
    contact.groupby("cellid")["abs_end_distance"]
    .apply(compute_skew)
    .rename("skew_abs_end")
)
kurtosis_end = (
    contact.groupby("cellid")["abs_end_distance"]
    .apply(compute_kurtosis)
    .rename("kurtosis_abs_end")
)

# 11. One row of summary features per cell. Entries are
#     output column -> (source column, aggregation); their order fixes the
#     column order of the result. NaN results (e.g. std of one row) become 0.
agg_features = contact.groupby("cellid").agg(**{
    # Number of contact rows in the cell
    "total_contacts": ("chr1", "count"),

    # Signed and absolute coordinate differences
    "median_start_distance": ("start_distance", "median"),
    "median_end_distance": ("end_distance", "median"),
    "min_start_distance": ("start_distance", "min"),
    "min_end_distance": ("end_distance", "min"),
    "max_start_distance": ("start_distance", "max"),
    "max_end_distance": ("end_distance", "max"),
    "std_start_distance": ("start_distance", "std"),
    "std_end_distance": ("end_distance", "std"),
    "median_abs_start_distance": ("abs_start_distance", "median"),
    "median_abs_end_distance": ("abs_end_distance", "median"),

    # Fragment lengths and their ratio
    "median_length1": ("length1", "median"),
    "median_length2": ("length2", "median"),
    "mean_length1": ("length1", "mean"),
    "mean_length2": ("length2", "mean"),
    "median_length_ratio": ("length_ratio", "median"),
    "mean_length_ratio": ("length_ratio", "mean"),

    # Counts of M / P labelled ends
    "sum_is_m_chr1": ("is_m_chr1", "sum"),
    "sum_is_m_chr2": ("is_m_chr2", "sum"),
    "sum_is_p_chr1": ("is_p_chr1", "sum"),
    "sum_is_p_chr2": ("is_p_chr2", "sum"),

    # Table-wide chromosome frequencies seen by the cell's contacts
    "mean_chr1_chr2_count": ("chr1_chr2_count", "mean"),
    "std_chr1_chr2_count": ("chr1_chr2_count", "std"),
    "mean_chr2_chr1_count": ("chr2_chr1_count", "mean"),
    "std_chr2_chr1_count": ("chr2_chr1_count", "std"),

    # Same-chromosome vs different-chromosome counts
    "intra_contacts": ("intra_contact", "sum"),
    "inter_contacts": ("inter_contact", "sum"),

    # Distance-category counts based on the start coordinates
    "long_contacts_start": ("bin_start", lambda cats: cats.eq("long").sum()),
    "mid_contacts_start": ("bin_start", lambda cats: cats.eq("mid").sum()),
    "short_contacts_start": ("bin_start", lambda cats: cats.eq("short").sum()),
    # Distance-category counts based on the end coordinates
    "long_contacts_end": ("bin_end", lambda cats: cats.eq("long").sum()),
    "mid_contacts_end": ("bin_end", lambda cats: cats.eq("mid").sum()),
    "short_contacts_end": ("bin_end", lambda cats: cats.eq("short").sum()),

    # Origin-combination counts
    "mm_contacts": ("mm_contact", "sum"),
    "pp_contacts": ("pp_contact", "sum"),
    "mp_contacts": ("mp_contact", "sum"),
}).fillna(0)

# 12. Attach the per-cell pair count and the skew/kurtosis series by index
agg_features = (
    agg_features
    .join(unique_pairs, how="inner")
    .join(skew_start, how="inner")
    .join(kurtosis_start, how="inner")
    .join(skew_end, how="inner")
    .join(kurtosis_end, how="inner")
)

# 13. Counts expressed as a share of the cell's total contacts
#     (1e-5 is added to the denominator throughout)
agg_features["ratio_intra"] = agg_features["intra_contacts"].div(
    agg_features["total_contacts"] + 1e-5)
agg_features["ratio_inter"] = agg_features["inter_contacts"].div(
    agg_features["total_contacts"] + 1e-5)
agg_features["ratio_mm"] = agg_features["mm_contacts"].div(
    agg_features["total_contacts"] + 1e-5)
agg_features["ratio_pp"] = agg_features["pp_contacts"].div(
    agg_features["total_contacts"] + 1e-5)
agg_features["ratio_mp"] = agg_features["mp_contacts"].div(
    agg_features["total_contacts"] + 1e-5)

# Shares of the start-based distance categories
agg_features["ratio_short_start"] = agg_features["short_contacts_start"].div(
    agg_features["total_contacts"] + 1e-5)
agg_features["ratio_mid_start"] = agg_features["mid_contacts_start"].div(
    agg_features["total_contacts"] + 1e-5)
agg_features["ratio_long_start"] = agg_features["long_contacts_start"].div(
    agg_features["total_contacts"] + 1e-5)
# Shares of the end-based distance categories
agg_features["ratio_short_end"] = agg_features["short_contacts_end"].div(
    agg_features["total_contacts"] + 1e-5)
agg_features["ratio_mid_end"] = agg_features["mid_contacts_end"].div(
    agg_features["total_contacts"] + 1e-5)
agg_features["ratio_long_end"] = agg_features["long_contacts_end"].div(
    agg_features["total_contacts"] + 1e-5)
# 15. Share of contacts whose start coordinates differ by less than 50e3
#     (evaluated on every row, inter-chromosomal ones included)
very_short_threshold = 50e3
contact["is_very_short"] = contact["abs_start_distance"].lt(
    very_short_threshold).astype(int)
very_short_agg = (
    contact.groupby("cellid")["is_very_short"]
    .sum()
    .to_frame("very_short_contacts")
)
# The denominator is the per-cell total from `agg_features`, aligned on the
# cellid index.
very_short_agg["ratio_very_short"] = very_short_agg["very_short_contacts"].div(
    agg_features["total_contacts"] + 1e-5)

# 16. Median fragment lengths, separately for intra and inter contacts
intra_contacts = contact.loc[contact["intra_contact"].eq(1)]
inter_contacts = contact.loc[contact["intra_contact"].eq(0)]

intra_medians = (
    intra_contacts.groupby("cellid")[["length1", "length2"]]
    .median()
    .rename(columns={"length1": "median_length1_intra",
                     "length2": "median_length2_intra"})
)
inter_medians = (
    inter_contacts.groupby("cellid")[["length1", "length2"]]
    .median()
    .rename(columns={"length1": "median_length1_inter",
                     "length2": "median_length2_inter"})
)

# Left joins keep every cell of `agg_features`; a cell lacking one contact
# type gets NaN in that type's median columns.
agg_features = (
    agg_features
    .join(intra_medians, how="left")
    .join(inter_medians, how="left")
)
agg_features["ratio_median_length1_intra_inter"] = (
    agg_features["median_length1_intra"].div(
        agg_features["median_length1_inter"] + 1e-5)
)
agg_features["ratio_median_length2_intra_inter"] = (
    agg_features["median_length2_intra"].div(
        agg_features["median_length2_inter"] + 1e-5)
)

# 17. Contact decay exponent


def compute_decay_slope(distances):
    """Fit the slope of log10(bin count) against log10(bin midpoint).

    The distances are histogrammed into 6 bins (7 edges) and a straight line
    is fitted through the non-empty bins in log-log space.

    Binning:
      * max distance + 1 > 100: logarithmic edges from 1e2 to max + 1
        (distances below 100 fall outside the histogram);
      * otherwise: linear edges from 0 to max + 1. Logarithmic edges starting
        at 1e2 would be non-increasing here and ``np.histogram`` would raise,
        so the linear fallback is selected explicitly rather than through
        exception handling (``np.logspace`` itself never raises for this).

    Parameters
    ----------
    distances : pandas.Series
        Non-negative distances for one group.

    Returns
    -------
    float
        The fitted slope, or NaN when there are fewer than 10 distances, the
        maximum is not finite, or fewer than 2 bins are non-empty.
    """
    if len(distances) < 10:
        return np.nan
    upper = distances.max() + 1
    if not np.isfinite(upper):
        return np.nan
    if upper > 100:
        # 7 edges on a logarithmic scale from 1e2 to the maximum distance
        bin_edges = np.logspace(2, np.log10(upper), num=7)
    else:
        bin_edges = np.linspace(0, upper, 7)
    counts, edges = np.histogram(distances, bins=bin_edges)
    mid_points = (edges[:-1] + edges[1:]) / 2
    # Empty bins are dropped because log10(0) is undefined.
    valid = counts > 0
    if valid.sum() < 2:
        return np.nan
    log_mid = np.log10(mid_points[valid])
    log_counts = np.log10(counts[valid])
    slope, _ = np.polyfit(log_mid, log_counts, 1)
    return slope


# Per-cell decay slope of the absolute start-coordinate differences
decay_slope = (
    contact.groupby("cellid")["abs_start_distance"]
    .apply(compute_decay_slope)
    .rename("decay_slope")
)

# 18. M-minus-P count of labelled ends, for chr1 and chr2 separately
agg_features["diff_m_p_chr1"] = agg_features["sum_is_m_chr1"].sub(
    agg_features["sum_is_p_chr1"])
agg_features["diff_m_p_chr2"] = agg_features["sum_is_m_chr2"].sub(
    agg_features["sum_is_p_chr2"])

# 19. Entropy of distribution by distance categories (using bin_start)


def bin_entropy(series):
    """Entropy (natural log) of the value frequencies in ``series``.

    1e-5 is added to each frequency inside the logarithm, so the result is
    slightly below the exact Shannon entropy.
    """
    freqs = series.value_counts(normalize=True)
    weighted_logs = freqs * np.log(freqs + 1e-5)
    return -np.sum(weighted_logs)


# Per-cell entropy of the start- and end-based distance categories
entropy_bin_start = (
    contact.groupby("cellid")["bin_start"]
    .apply(bin_entropy)
    .rename("entropy_bin_start")
)
entropy_bin_end = (
    contact.groupby("cellid")["bin_end"]
    .apply(bin_entropy)
    .rename("entropy_bin_end")
)

# 20. Attach the remaining per-cell features by index, then replace every
#     missing value (including NaN decay slopes) with 0
agg_features = (
    agg_features
    .join(very_short_agg[["ratio_very_short"]], how="left")
    .join(decay_slope, how="left")
    .join(entropy_bin_start, how="left")
    .join(entropy_bin_end, how="left")
    .fillna(0)
)

# Add the per-cell features to the training rows. The matching merge for the
# test frame is left disabled, so `test` carries none of these columns.
train = train.merge(agg_features, how="left", on="cellid")
# test = test.merge(agg_features, on="cellid", how="left")

In [20]:
# List the column names of the training frame after all feature merges.
train.columns

Index(['cellid', 'phase', 'order_within_phase', 'order', '0', '0-50', '50-100',
       '100-150', '150-200', '200-250', '250-300', '0a', '0-50a', '50-100a',
       '100-150a', '150-200a', '200-250a', '250-300a', 'inter_ratio',
       'intra_ratio', 'intra_0', 'intra_0-50', 'intra_50-100', 'intra_100-150',
       'intra_150-200', 'intra_200-250', 'intra_250-300', 'short_range_ratio',
       'long_range_ratio', 'short_long_ratio', 'total_contacts',
       'median_start_distance', 'median_end_distance', 'min_start_distance',
       'min_end_distance', 'max_start_distance', 'max_end_distance',
       'std_start_distance', 'std_end_distance', 'median_abs_start_distance',
       'median_abs_end_distance', 'median_length1', 'median_length2',
       'mean_length1', 'mean_length2', 'median_length_ratio',
       'mean_length_ratio', 'sum_is_m_chr1', 'sum_is_m_chr2', 'sum_is_p_chr1',
       'sum_is_p_chr2', 'mean_chr1_chr2_count', 'std_chr1_chr2_count',
       'mean_chr2_chr1_count', 'std_chr2_ch

In [133]:
# Preview the first rows of the training frame.
# NOTE(review): the execution count (133) is out of sequence and the stored
# output shows columns (chr1_chr2_percentage, mean_log_dist, entropy) that only
# the commented-out cells create - the output looks stale; re-run to refresh.
train.head()

Unnamed: 0,cellid,phase,order_within_phase,order,0,0-50,50-100,100-150,150-200,200-250,...,intra_200-250,intra_250-300,short_range_ratio,long_range_ratio,short_long_ratio,chr1_chr2_percentage,chr2_chr1_percentage,mean_log_dist,std_log_dist,entropy
0,SCG0088_CTATGAGGTACCGGAT-1,G1,0,0,0.106918,0.004324,0.150943,0.342767,0.223664,0.116352,...,0.130282,0.06162,0.721698,0.171384,4.210985,2544,2544,150.4524,54.348113,1.659821
1,SCG0088_GCTAAGCGTATTGGTG-1,G1,0,0,0.117595,0.009354,0.251225,0.271715,0.198218,0.106904,...,0.121151,0.050984,0.730512,0.151893,4.809353,2245,2245,140.85235,58.305005,1.695842
2,SCG0089_TCCATTGTCTGTAAGC-1,G1,0,0,0.117763,0.005888,0.226202,0.301766,0.236997,0.084396,...,0.095662,0.03059,0.770854,0.111384,6.920643,2038,2038,138.707222,51.550218,1.627238
3,SCG0092_GTTTATCTCATGCTAA-1,G1,0,0,0.103506,0.003362,0.175793,0.32805,0.191162,0.12536,...,0.139834,0.081168,0.698367,0.198127,3.524831,4164,4164,150.548704,57.888858,1.692467
4,SCG0092_AACCGCTCAGCTCATA-1,G1,0,0,0.148919,0.011424,0.24643,0.302326,0.19747,0.064055,...,0.075264,0.034516,0.75765,0.093431,8.109084,2451,2451,133.292388,52.504001,1.641489


In [157]:
# Count missing values per column.
# NOTE(review): the execution count (157) is out of sequence and the stored
# output lists width-25 bin columns (0-25, 25-50, ...) that the current
# binning cells do not produce - the output looks stale; re-run to refresh.
train.isnull().sum()

cellid                0
phase                 0
order_within_phase    0
order                 0
0                     0
0-25                  0
25-50                 0
50-75                 0
75-100                0
100-125               0
125-150               0
150-175               0
175-200               0
200-225               0
225-250               0
250-275               0
275-300               0
total_contacts        0
inter_ratio           0
intra_ratio           0
intra_0               0
intra_0-25            0
intra_25-50           0
intra_50-75           0
intra_75-100          0
intra_100-125         0
intra_125-150         0
intra_150-175         0
intra_175-200         0
intra_200-225         0
intra_225-250         0
intra_250-275         0
intra_275-300         0
dtype: int64

In [89]:
# Replace every remaining missing value in the training frame with 0.
train = train.fillna(0)

In [21]:
# (rows, columns) of the training frame after feature engineering.
train.shape

(1501, 94)

In [25]:
from sklearn.feature_selection import SelectKBest

# Feature matrix: every column except the identifier and the phase/order
# columns. Target: the phase label.
X = train.drop(columns=['cellid', 'phase', 'order', 'order_within_phase'])
y = train['phase']

# Keep 20 columns using SelectKBest's default scoring function.
X_new = SelectKBest(k=20).fit_transform(X, y)

# Wrap the selected values in a frame with positional names (the original
# column names are not carried over) and append the phase label.
new_features_df = pd.DataFrame(
    X_new,
    columns=[f'feature_{idx}' for idx in range(X_new.shape[1])],
).assign(phase=y.values)

new_features_df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,phase
0,0.106918,0.342767,0.223664,0.055031,0.106918,0.893082,0.383803,0.25044,0.06162,22556260.0,...,3.068321,10.581094,3.068321,10.581095,0.106918,0.125393,0.125393,0.905115,0.905115,G1
1,0.117595,0.271715,0.198218,0.044989,0.117595,0.882405,0.307925,0.224634,0.050984,24057590.0,...,2.941968,9.377806,2.941967,9.377801,0.117595,0.102895,0.102895,0.8625,0.8625,G1
2,0.117763,0.301766,0.236997,0.026987,0.117763,0.882237,0.342047,0.268632,0.03059,25275480.0,...,3.3006,10.989629,3.3006,10.98963,0.117763,0.134446,0.134446,0.865186,0.865186,G1
3,0.103506,0.32805,0.191162,0.072767,0.103506,0.896494,0.365926,0.213233,0.081168,25278470.0,...,2.973627,10.27193,2.973627,10.271931,0.103506,0.102305,0.102305,0.882389,0.882389,G1
4,0.148919,0.302326,0.19747,0.029376,0.148919,0.851081,0.355225,0.232023,0.034516,25175390.0,...,3.069279,9.863208,3.06928,9.863211,0.148919,0.099143,0.099143,0.827496,0.827496,G1


In [26]:
# Write the selected features plus the phase label to the configured path,
# without the row index.
new_features_df.to_csv(path_or_buf=config.PROCESSED_DATA_OUTPUT, index=False)