In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import matthews_corrcoef

In [2]:
# Load the tab-separated input table.
data = pd.read_csv(filepath_or_buffer='./data/data.txt', sep='\t')

In [3]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,response,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V16553,V16554,V16555,V16556,V16557,V16558,V16559,V16560,V16561,V16562
0,1,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Count the rows per response class and keep the table for later reference.
class_sizes = data.groupby('response').size()
response_distribution = class_sizes.rename('count').reset_index()
response_distribution

Unnamed: 0,response,count
0,0,407
1,1,123


In [5]:
# Separate the label column from the feature columns.
target = data.loc[:, 'response']
input_features = data.drop('response', axis=1)

## Drop duplicated columns
Drop every column whose values are identical, row for row, to those of an earlier column, keeping only the first occurrence.

In [6]:
# Preview the features with one row per original column.
input_features.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,520,521,522,523,524,525,526,527,528,529
V1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
V2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
V3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
V4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
V5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Work on the transposed frame so that each original column becomes a row;
# the original column labels are discarded by reset_index(drop=True).
input_features = input_features.T.reset_index(drop=True)
num_columns_before_drop = len(input_features)

In [8]:
# Remove rows (i.e. original columns) that repeat an earlier row, then renumber.
input_features = input_features.drop_duplicates().reset_index(drop=True)

In [9]:
# Number of distinct original columns that remain.
num_columns_after_drop = len(input_features)

In [10]:
# Share of the original feature columns that survive de-duplication, in percent.
kept_percentage = num_columns_after_drop * 100/num_columns_before_drop
summary_template = 'Reducing number of columns from {} to {}, which is {:.2f}% of the initial state'
print(summary_template.format(num_columns_before_drop, num_columns_after_drop, kept_percentage))

Reducing number of columns from 16562 to 9545, which is 57.63% of the initial state


In [11]:
# Flip the frame again so that features are columns and samples are rows.
input_features = input_features.T
input_features.shape

(530, 9545)

In [12]:
# Number of distinct values per column; a count of 1 means the column is constant.
nunique = input_features.nunique()
cols_to_drop = nunique.loc[nunique.eq(1)].index
# Discard the constant columns and renumber the rows.
dropped_constant_columns = input_features.drop(columns=cols_to_drop).reset_index(drop=True)

In [13]:
# Shape after removing the single-valued columns.
dropped_constant_columns.shape

(530, 9543)

In [14]:
# Preview the remaining feature columns.
dropped_constant_columns.head()

Unnamed: 0,0,1,2,4,6,7,8,9,10,11,...,9535,9536,9537,9538,9539,9540,9541,9542,9543,9544
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1,1,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [15]:
# Relabel the surviving feature columns 0..n-1 so each one can be looked up by its number.
col_len = dropped_constant_columns.shape[1]
dropped_constant_columns.columns = list(range(col_len))

# Pair every feature label with its Matthews correlation coefficient against the response.
# NOTE(review): the coefficient is computed on all rows, before any train/test split.
correlated_colums = [
    (i, matthews_corrcoef(target, dropped_constant_columns[i]))
    for i in range(col_len)
]

In [16]:
# Turn the (column, coefficient) pairs into a two-column table.
correlated_colums = pd.DataFrame(
    data=correlated_colums,
    columns=['correlated_column', 'correlation'],
)

In [17]:
# Only the strength of the association matters here, so keep the magnitude
# and order the features from strongest to weakest.
correlated_colums['correlation'] = correlated_colums['correlation'].abs()
correlated_colums = correlated_colums.sort_values(by='correlation', ascending=False)

In [18]:
# Report how many features exceed each candidate correlation cut-off.
correlation_thresholds = np.arange(0.05, 0.5, 0.05)
for threshold in correlation_thresholds:
    above_threshold = correlated_colums['correlation'] > threshold
    num_corr_columns = int(above_threshold.sum())
    print('{0} columns whose correlation is higher than {1:.2f}'.format(num_corr_columns, threshold))

3598 columns whose correlation is higher than 0.05
1102 columns whose correlation is higher than 0.10
297 columns whose correlation is higher than 0.15
86 columns whose correlation is higher than 0.20
33 columns whose correlation is higher than 0.25
15 columns whose correlation is higher than 0.30
11 columns whose correlation is higher than 0.35
11 columns whose correlation is higher than 0.40
9 columns whose correlation is higher than 0.45


In [19]:
# Keep the features whose absolute correlation with the response exceeds 0.1.
strong_enough = correlated_colums['correlation'] > 0.1
columns = correlated_colums.loc[strong_enough, 'correlated_column'].tolist()

# Put the response back next to the selected features, response first.
dropped_constant_columns['response'] = target
dropped_constant_columns = dropped_constant_columns[['response'] + columns]

In [20]:
# Persist the response plus the selected features, without the row index.
dropped_constant_columns.to_csv(path_or_buf='./data/filtered_data.csv', index=False)

In [68]:
# Show the per-class row counts again.
response_distribution

Unnamed: 0,response,count
0,0,407
1,1,123


Now, let's build the training set from about 90% of the minority class and keep every remaining row for testing. Because the class distribution is uneven,
we sample the same number of rows from both classes `0` and `1`: 111 rows (0.9 * 123 ≈ 111) are drawn at random from each class. All rows that are not sampled form the test set, which is therefore much more imbalanced than the training set.

In [21]:
# Copy the row labels into a regular column so selected rows can still be
# identified after the index is reset.
dropped_constant_columns = dropped_constant_columns.assign(
    index_support=dropped_constant_columns.index
)

In [24]:
# Balanced training set: draw the same number of rows from each response class.
# A fixed seed makes the split (and the CSV files written from it) reproducible
# across kernel restarts.
SAMPLES_PER_CLASS = 111  # roughly 90% of the 123 rows in the smaller class
SPLIT_SEED = 42

# Iterating over the groups (instead of groupby.apply) hands each sample() call the
# complete sub-frame, so the 'response' column is kept regardless of how the
# installed pandas version treats grouping columns inside apply.
train_data = pd.concat(
    [
        class_rows.sample(n=SAMPLES_PER_CLASS, random_state=SPLIT_SEED)
        for _, class_rows in dropped_constant_columns.groupby('response')
    ]
).reset_index(drop=True)

# Every row that was not drawn for training becomes test data. 'index_support'
# still holds the original row label after the reset_index above.
test_data = dropped_constant_columns[
    ~dropped_constant_columns['index_support'].isin(train_data['index_support'])
].copy()

In [25]:
# The helper column has served its purpose; remove it from both splits.
train_data = train_data.drop(columns='index_support')
test_data = test_data.drop(columns='index_support')

In [26]:
# Rows per response class in the training split.
train_data.groupby('response').size().rename('count').reset_index()

Unnamed: 0,response,count
0,0,111
1,1,111


In [27]:
# Rows per response class in the test split.
test_data.groupby('response').size().rename('count').reset_index()

Unnamed: 0,response,count
0,0,296
1,1,12


In [28]:
# Write both splits to disk without the row index.
for split_frame, split_path in (
    (train_data, './data/train_data.csv'),
    (test_data, './data/test_data.csv'),
):
    split_frame.to_csv(split_path, index=False)