# Dataset __Bioresponse__

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the Bioresponse table from a CSV file given by relative path.
data = pd.read_csv('bioresponse.csv')

### EDA

In [3]:
# Quick look at the raw table: first rows, schema summary, summary statistics.
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()
print(data.describe())

print(data.dtypes)

         D1        D2    D3   D4        D5        D6        D7        D8  \
0  0.000000  0.497009  0.10  0.0  0.132956  0.678031  0.273166  0.585445   
1  0.366667  0.606291  0.05  0.0  0.111209  0.803455  0.106105  0.411754   
2  0.033300  0.480124  0.00  0.0  0.209791  0.610350  0.356453  0.517720   
3  0.000000  0.538825  0.00  0.5  0.196344  0.724230  0.235606  0.288764   
4  0.100000  0.517794  0.00  0.0  0.494734  0.781422  0.154361  0.303809   

         D9       D10  ...  D1768  D1769  D1770  D1771  D1772  D1773  D1774  \
0  0.743663  0.243144  ...    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
1  0.836582  0.106480  ...    1.0    1.0    1.0    0.0    1.0    0.0    0.0   
2  0.679051  0.352308  ...    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
3  0.805110  0.208989  ...    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
4  0.812646  0.125177  ...    0.0    0.0    0.0    0.0    0.0    0.0    0.0   

   D1775  D1776  target  
0    0.0    0.0       1  
1    1.0    0.0 

Each row in this data set represents a molecule. The "target" column (the last column in this file) contains experimental data describing an actual biological response: the molecule was seen to elicit this response (1), or not (0). The remaining columns represent molecular descriptors (D1 through D1776); these are calculated properties that can capture some of the characteristics of the molecule - for example size, shape, or elemental constitution.

In [4]:
# Separate the biological response label (Y) from the molecular descriptors (X).
Y = data['target']
X = data.loc[:, data.columns != 'target']

### Data Visualization

In [5]:
# Target value distribution
def target_distribution(data):
    """Draw a bar chart counting the rows in each ``target`` class.

    Args:
        data: DataFrame that contains a ``target`` column.
    """
    _, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x='target', data=data, palette='pastel', ax=ax)
    ax.set_title('Distribution of the target value')
    ax.set_xlabel('Target Variable')
    ax.set_ylabel('Count')
    plt.show()

# Boxplots for the first N descriptors
def firstN_descriptors(data, num):
    """Draw one boxplot per descriptor for the first ``num`` descriptor columns.

    The ``target`` column, if present, is never plotted.

    Args:
        data: DataFrame holding the descriptor columns (and optionally ``target``).
        num: How many leading descriptor columns to show; must be at least 1.

    Raises:
        ValueError: If ``num`` is smaller than 1.
    """
    if num < 1:
        raise ValueError(f"num must be at least 1, got {num}")

    # The previous slice ``iloc[:, 1:num]`` skipped the first descriptor and
    # showed only num-1 columns; select the first ``num`` descriptors instead.
    descriptors = data.drop(columns=['target'], errors='ignore')
    subset = descriptors.iloc[:, :num]

    plt.figure(figsize=(12, 8))
    sns.boxplot(data=subset)
    # Name the range actually plotted rather than a hard-coded "d1-d10".
    plt.title(f'Boxplot of Molecular Descriptors ({subset.columns[0]}-{subset.columns[-1]})')
    plt.xlabel('Descriptor')
    plt.ylabel('Value')
    plt.show()

# Visualisation of the relationship between one molecular descriptor and the target variable
def descriptor_target_relationship(data, idx):
    """Draw boxplots of one descriptor's values, split by ``target`` class.

    Args:
        data: DataFrame that contains a ``target`` column.
        idx: Positional index of the column to plot on the y axis.
    """
    descriptor = data.iloc[:, idx]
    # Label the plot with the selected column; the old fixed text
    # ("first descriptor", "X") was wrong for any idx other than 0.
    descriptor_name = str(descriptor.name)

    plt.figure(figsize=(8, 6))
    sns.boxplot(x=data['target'], y=descriptor, color='lightgreen')
    plt.title(f'Relationship between descriptor {descriptor_name} and Target Variable')
    plt.xlabel('Target Variable')
    plt.ylabel(descriptor_name)
    plt.show()

def heatmaps_corr(data):
    """Plot descriptor-vs-descriptor and descriptor-vs-target correlation heatmaps.

    Args:
        data: DataFrame with the descriptor columns and a ``target`` column.
    """
    X = data.drop(columns=['target'])  # Input features (molecular descriptors)
    Y = data['target']  # Target variable (biological response)

    correlation_X = X.corr()  # Correlation among molecular descriptors
    correlation_Y = X.corrwith(Y)  # Correlation between each descriptor and the target

    # Create the two stacked panels explicitly: the function used to index
    # into an `axes` name that was never defined and failed with NameError.
    _, axes = plt.subplots(nrows=2, ncols=1, figsize=(14, 12))

    # Heatmap for correlation among molecular descriptors
    sns.heatmap(correlation_X, cmap='coolwarm', annot=False, ax=axes[0])
    axes[0].set_title('Correlation Heatmap - Molecular Descriptors')
    axes[0].set_xlabel('Molecular Descriptors')
    axes[0].set_ylabel('Molecular Descriptors')

    # Heatmap for correlation between molecular descriptors and target variable
    # (a single row: one cell per descriptor)
    sns.heatmap(correlation_Y.to_frame().transpose(), cmap='coolwarm', annot=True, fmt=".2f", ax=axes[1])
    axes[1].set_title('Correlation Heatmap - Molecular Descriptors vs. Target Variable')
    axes[1].set_xlabel('Molecular Descriptors')
    axes[1].set_ylabel('Target Variable (Y)')

    plt.tight_layout()
    plt.show()


## Feature selection

#### Merging descriptors with similar correlation

In [6]:
# correlation_threshold = minimum absolute correlation at which two descriptors are merged
def merge_descriptors(data, correlation_threshold = 0.5):
    """Replace pairs of strongly correlated descriptors by their average.

    Descriptor pairs are scanned in column order. The first not-yet-merged
    partner whose absolute correlation reaches ``correlation_threshold`` is
    averaged with the current descriptor into a new column named
    ``"<first>_<second>"``, and both source columns are dropped. Every
    descriptor takes part in at most one merge. A ``target`` column, if
    present, is never merged and is kept unchanged.

    Args:
        data: DataFrame of descriptor columns, optionally with ``target``.
        correlation_threshold: Minimum absolute correlation for a merge.

    Returns:
        A new DataFrame; ``data`` itself is not modified.
    """
    # Keep the label out of the correlation scan so it cannot be averaged
    # into a descriptor and dropped.
    features = data.drop(columns=['target'], errors='ignore')
    correlation_matrix = features.corr()
    names = list(correlation_matrix.columns)
    # Plain array lookups are far cheaper than scalar .iloc in a double loop.
    strength = correlation_matrix.abs().to_numpy()

    # NOTE(review): pairs with a strong *negative* correlation also qualify;
    # averaging those largely cancels their signal - confirm this is intended.
    consumed = set()
    averaged = {}
    # Scan in column order (not via an unordered set) so the same input
    # always yields the same merges.
    for i, first in enumerate(names):
        if first in consumed:
            continue
        for j in range(i + 1, len(names)):
            second = names[j]
            # NaN correlations (e.g. constant columns) compare False here.
            if second not in consumed and strength[i, j] >= correlation_threshold:
                averaged[f'{first}_{second}'] = (data[first] + data[second]) / 2
                consumed.update((first, second))
                break

    # Append all merged columns in one step instead of inserting one by one.
    merged_data = pd.concat(
        [data.drop(columns=list(consumed)), pd.DataFrame(averaged, index=data.index)],
        axis=1,
    )

    print("Information about Merged DataFrame:")
    # info() prints its own report and returns None, so it is not wrapped in print().
    merged_data.info()

    return merged_data

#### Linear correlation

In [7]:
def correlation_selection_original(data, correlation_threshold = 0.2):
    """Keep the descriptors whose correlation with ``target`` reaches a threshold.

    Args:
        data: DataFrame with the descriptor columns and a ``target`` column.
        correlation_threshold: Minimum (signed) correlation with ``target``.

    Returns:
        DataFrame with the selected descriptor columns followed by ``target``.
    """
    # Compute the correlations here: the function used to read a
    # `correlation_Y` name that is not defined at module level (NameError).
    features = data.drop(columns=['target'])
    correlation_with_target = features.corrwith(data['target'])

    # NOTE(review): the comparison is signed, so strongly negatively
    # correlated descriptors are excluded - confirm this is intended.
    selected_features = correlation_with_target[
        correlation_with_target >= correlation_threshold
    ].index.tolist()

    print("Selected Features from original data:")
    print(selected_features)

    return data[selected_features + ['target']]

def correlation_selection_merged(data, correlation_threshold = 0.2, merge_threshold = 0.5):
    """Merge strongly correlated descriptors, then select by correlation with ``target``.

    Step 1: descriptor pairs whose absolute correlation reaches
    ``merge_threshold`` are averaged into a column named
    ``"<first>_<second>"``. Pairs are scanned in column order and each
    descriptor takes part in at most one merge.
    Step 2: of the resulting columns, those whose (signed) correlation with
    ``target`` reaches ``correlation_threshold`` are kept.

    Args:
        data: DataFrame with the descriptor columns and a ``target`` column.
        correlation_threshold: Minimum correlation with ``target`` to keep a column.
        merge_threshold: Minimum absolute descriptor-descriptor correlation
            for a merge (previously hard-coded to 0.5).

    Returns:
        DataFrame with the selected columns followed by ``target``.
    """
    # Keep the label out of the merge scan; it used to be able to be averaged
    # into a descriptor and dropped, which then raised KeyError below.
    features = data.drop(columns=['target'])
    correlation_matrix = features.corr()
    names = list(correlation_matrix.columns)
    strength = correlation_matrix.abs().to_numpy()

    consumed = set()
    averaged = {}
    # Scan in column order (not via an unordered set) so results are reproducible.
    for i, first in enumerate(names):
        if first in consumed:
            continue
        for j in range(i + 1, len(names)):
            second = names[j]
            # NaN correlations (e.g. constant columns) compare False here.
            if second not in consumed and strength[i, j] >= merge_threshold:
                averaged[f'{first}_{second}'] = (data[first] + data[second]) / 2
                consumed.update((first, second))
                break

    X_m = pd.concat(
        [features.drop(columns=list(consumed)), pd.DataFrame(averaged, index=data.index)],
        axis=1,
    )  # Input features (molecular descriptors, after merging)
    Y_m = data['target']  # Target variable (biological response)

    correlation_Y_m = X_m.corrwith(Y_m)
    selected_features_m = correlation_Y_m[correlation_Y_m >= correlation_threshold].index.tolist()

    print("Selected Features from merged data:")
    print(selected_features_m)

    selected = X_m[selected_features_m].copy()
    selected['target'] = Y_m
    return selected

As you can see, the linear correlation between the target value and the descriptors is not very high, so I have to experiment with other feature selection methods.

#### Tree based selection

In [8]:
def tree_based_original(data, n_estimators=100, top_n=50, random_state=42):
    """Select the ``top_n`` descriptors ranked by random-forest feature importance.

    Args:
        data: DataFrame with the descriptor columns and a ``target`` column.
        n_estimators: Number of trees, passed to ``RandomForestClassifier``.
        top_n: How many of the highest-ranked descriptors to keep.
        random_state: Seed passed to ``RandomForestClassifier``.

    Returns:
        DataFrame with the selected descriptor columns (most important first)
        followed by ``target``.
    """
    X = data.drop(columns=['target'])
    Y = data['target']

    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    rf.fit(X, Y)

    importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': rf.feature_importances_})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
    top_features = importance_df['Feature'].head(top_n).tolist()

    print("Top features:")
    # Inside a function a bare `top_features[:5]` expression is discarded,
    # so the list has to be printed explicitly.
    print(top_features[:5])

    return data[top_features + ['target']]

def tree_based_merged(data, n_estimators=100, top_n=50, random_state=42, correlation_threshold=0.5):
    """Merge strongly correlated descriptors, then rank columns with a random forest.

    Step 1: descriptor pairs whose absolute correlation reaches
    ``correlation_threshold`` are averaged into a column named
    ``"<first>_<second>"``. Pairs are scanned in column order and each
    descriptor takes part in at most one merge.
    Step 2: a random forest is fitted on the merged columns and the
    ``top_n`` columns with the highest feature importance are kept.

    Args:
        data: DataFrame with the descriptor columns and a ``target`` column.
        n_estimators: Number of trees, passed to ``RandomForestClassifier``.
        top_n: How many of the highest-ranked columns to keep.
        random_state: Seed passed to ``RandomForestClassifier``.
        correlation_threshold: Minimum absolute descriptor-descriptor
            correlation for a merge (previously hard-coded to 0.5).

    Returns:
        DataFrame with the selected columns (most important first) followed
        by ``target``.
    """
    # Keep the label out of the merge scan; it used to be able to be averaged
    # into a descriptor and dropped, which then raised KeyError below.
    features = data.drop(columns=['target'])
    correlation_matrix = features.corr()
    names = list(correlation_matrix.columns)
    strength = correlation_matrix.abs().to_numpy()

    consumed = set()
    averaged = {}
    # Scan in column order (not via an unordered set) so results are reproducible.
    for i, first in enumerate(names):
        if first in consumed:
            continue
        for j in range(i + 1, len(names)):
            second = names[j]
            # NaN correlations (e.g. constant columns) compare False here.
            if second not in consumed and strength[i, j] >= correlation_threshold:
                averaged[f'{first}_{second}'] = (data[first] + data[second]) / 2
                consumed.update((first, second))
                break

    X_m = pd.concat(
        [features.drop(columns=list(consumed)), pd.DataFrame(averaged, index=data.index)],
        axis=1,
    )
    Y_m = data['target']

    rf_m = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    rf_m.fit(X_m, Y_m)

    importance_df_m = pd.DataFrame({'Feature': X_m.columns, 'Importance': rf_m.feature_importances_})
    importance_df_m = importance_df_m.sort_values(by='Importance', ascending=False)
    top_features_m = importance_df_m['Feature'].head(top_n).tolist()

    print("Top featuresmerged:")
    # Inside a function a bare `top_features_m[:5]` expression is discarded,
    # so the list has to be printed explicitly.
    print(top_features_m[:5])

    selected = X_m[top_features_m].copy()
    selected['target'] = Y_m
    return selected

#### PCA

In [24]:
from sklearn.preprocessing import MinMaxScaler

def pca(data, n_components=50, random_state=None):
    """Project the descriptors onto principal components and standardize them.

    The descriptors are min-max scaled to [0, 1], reduced with PCA, and the
    resulting components are passed through ``StandardScaler``.

    Args:
        data: DataFrame with the descriptor columns and a ``target`` column.
        n_components: Passed unchanged to ``sklearn.decomposition.PCA``.
        random_state: Passed unchanged to ``PCA``; the default ``None`` keeps
            the previous behaviour.

    Returns:
        DataFrame with columns ``PC1_normalized`` ... ``PCk_normalized``
        followed by ``target``, indexed like ``data``.
    """
    X = data.drop(columns=['target'])

    scaler = MinMaxScaler(feature_range=(0, 1))
    X_scaled = scaler.fit_transform(X)

    # Named `reducer` so the local no longer shadows this function's own name.
    reducer = PCA(n_components=n_components, random_state=random_state)
    reducer.fit(X_scaled)
    X_pca = reducer.transform(X_scaled)

    pca_scaler = StandardScaler()
    X_pca_normalized = pca_scaler.fit_transform(X_pca)

    # Name the columns after the number of components actually produced;
    # range(n_components) fails when n_components is not an int.
    component_names = [f'PC{i+1}_normalized' for i in range(X_pca_normalized.shape[1])]
    principal_components_df = pd.DataFrame(
        data=X_pca_normalized,
        columns=component_names,
        index=data.index,
    )

    # Attach labels by position: aligning on index against a fresh RangeIndex
    # gave NaN or misplaced labels whenever `data` had a non-default index.
    principal_components_df['target'] = data['target'].to_numpy()

    return principal_components_df

In [25]:
# Preview the 10-component projection; the returned frame is shown as the cell output.
pca(data, 10)

Unnamed: 0,PC1_normalized,PC2_normalized,PC3_normalized,PC4_normalized,PC5_normalized,PC6_normalized,PC7_normalized,PC8_normalized,PC9_normalized,PC10_normalized,target
0,-0.907717,0.482206,-0.128484,-1.849404,-0.999578,-0.273188,0.248699,-0.300778,-0.486060,-0.067402,1
1,2.076563,2.084961,4.734596,-0.115921,-0.171535,-0.405104,-1.775952,-0.266095,2.011384,3.067987,1
2,-0.943033,0.637368,0.125603,-0.627555,-1.239454,0.095470,0.971724,-0.876554,0.191216,0.124700,1
3,-0.936173,0.762274,-0.164366,0.214169,0.026790,0.009333,-0.007323,-0.446322,-0.738950,0.255671,1
4,-1.113560,1.071062,-0.330413,1.623841,1.105684,0.193087,-0.945600,0.342518,0.104023,0.372876,0
...,...,...,...,...,...,...,...,...,...,...,...
3746,0.049068,-1.402846,0.011396,-0.783234,2.404541,0.956580,1.249742,0.241699,-1.064347,-0.373697,1
3747,1.395941,1.635597,2.839070,0.377879,0.461329,-1.984444,-1.197600,-1.075677,-1.526109,0.103762,1
3748,0.869268,0.859314,-0.360736,1.922090,-1.600359,-0.737172,0.034640,2.002952,1.144269,-0.385626,0
3749,-0.863932,0.578126,0.058405,0.760917,-1.031633,-0.625317,1.008153,0.088348,-0.772715,0.750037,1


Calling functions and viewing dataset:

In [10]:
#new_data = correlation_selection_merged(data)
#new_data = tree_based_merged(data)
#new_data = pca(data, n_components = 10)

#new_data

### Polynomial feature selection

In [11]:
# data_float32 = data.astype('float32')

In [12]:
# from sklearn.preprocessing import PolynomialFeatures

# # Feature Engineering
# poly = PolynomialFeatures(degree=1.5, interaction_only=True, include_bias=False)
# X_poly = poly.fit_transform(X)

# # Convert the polynomial feature matrix to a DataFrame
# X_poly_df = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(X.columns))

# # Concatenate the original features with the polynomial features
# X_combined = pd.concat([X, X_poly_df], axis=1)

This method causes memory problems.

## Summary of the feature selection
We selected features by their linear correlation with the target value, both on the original data and on data in which strongly correlated descriptors had first been merged (averaged). [selected_features, selected_features_m] 


We also selected features with a tree-based feature selection method, using both the original and the merged data. These lists represent the top 50 features. [top_features, top_features_m]


We tried polynomial feature selection as well, but in this case we had memory problems.

#### Selected features dataframe export functions:

You can get the dataframes containing the specific selected features by calling: 

correlation_selection_original(data, correlation_threshold), correlation_selection_merged(data, correlation_threshold), tree_based_original(data, n_estimators, top_n, random_state), tree_based_merged(data, n_estimators, top_n, random_state), pca(data, n_components)