In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy.stats import pearsonr
from sklearn.decomposition import PCA

In [2]:
# Load the raw tab-separated dataset.
raw_data_path = './data/data.txt'
data = pd.read_csv(raw_data_path, sep='\t')

In [3]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,response,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V16553,V16554,V16555,V16556,V16557,V16558,V16559,V16560,V16561,V16562
0,1,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Count how many rows fall into each value of `response`.
class_sizes = data.groupby('response').size()
response_distribution = class_sizes.reset_index(name='count')
response_distribution

Unnamed: 0,response,count
0,0,407
1,1,123


In [5]:
# Separate the label column from the feature columns.
target = data['response']
input_features = data.drop(labels='response', axis='columns')

## Drop duplicated columns
Drop columns that are exact duplicates of another column (identical values in every row), keeping only the first occurrence of each.

In [6]:
# Preview the features with one feature per row (display only, nothing is stored).
input_features.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,520,521,522,523,524,525,526,527,528,529
V1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
V2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
V3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
V4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
V5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Put one feature on each row so duplicated features can be removed with
# drop_duplicates; the original column labels are replaced by 0..n-1.
input_features = input_features.T.reset_index(drop=True)
num_columns_before_drop = len(input_features.index)

In [8]:
# Keep the first occurrence of every distinct feature row, then renumber the rows.
input_features = input_features.drop_duplicates().reset_index(drop=True)

In [9]:
# Features are still on the rows here, so the row count is the feature count.
num_columns_after_drop = len(input_features.index)

In [10]:
# Share of features that survived de-duplication, in percent.
remaining_pct = num_columns_after_drop * 100/num_columns_before_drop
summary_template = 'Reducing number of columns from {} to {}, which is {:.2f}% of the initial state'
print(summary_template.format(num_columns_before_drop, num_columns_after_drop, remaining_pct))

Reducing number of columns from 16562 to 9545, which is 57.63% of the initial state


In [11]:
# Flip the frame again so that features are columns once more.
input_features = input_features.T
input_features.shape

(530, 9545)

In [12]:
# Columns with exactly one distinct value are constant; find and remove them.
nunique = input_features.nunique()
is_constant = nunique == 1
cols_to_drop = nunique[is_constant].index
dropped_constant_columns = input_features.drop(columns=cols_to_drop).reset_index(drop=True)

In [13]:
# Show (rows, columns) of the frame after removing constant columns.
dropped_constant_columns.shape

(530, 9543)

In [14]:
# Preview the first five rows; the column labels keep gaps where constant columns were removed.
dropped_constant_columns.head(n=5)

Unnamed: 0,0,1,2,4,6,7,8,9,10,11,...,9535,9536,9537,9538,9539,9540,9541,9542,9543,9544
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1,1,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [15]:
# Pairwise correlation coefficients between features. np.corrcoef treats each
# row as one variable by default, hence the transpose.
features_as_rows = dropped_constant_columns.T
corr_matrix = np.corrcoef(features_as_rows)
corr_matrix.shape

(9543, 9543)

In [16]:
# Only the strength of a correlation matters below, not its sign.
abs_corr_matrix = np.absolute(corr_matrix)

In [17]:
threshold = 0.1
# Collect (column position, |correlation|) for every pair (i, j) with j < i
# whose absolute correlation reaches the threshold. Positions refer to the axes
# of `abs_corr_matrix`, not to DataFrame column labels.
#
# The previous nested Python loop also tested `j not in col_corr`, but the set
# holds (position, value) tuples, so that test was always true and never
# skipped anything. It is removed and the scan is vectorised; the resulting set
# has the same contents.
# NOTE(review): if the intent was to skip pairs whose partner column was
# already flagged, that needs a separate set of flagged positions.
below_diagonal = np.tril(abs_corr_matrix >= threshold, k=-1)  # strictly j < i
rows, cols = np.nonzero(below_diagonal)
col_corr = set(zip(rows.tolist(), abs_corr_matrix[rows, cols].tolist()))

## Remove correlated columns
Remove columns that are highly correlated with one another.

In [18]:
def get_number_of_corr_columns(threshold, pairs=None):
    """Count the columns that have at least one correlation above ``threshold``.

    :param threshold: cut-off; a pair counts only when its correlation value
        is strictly greater than this
    :type threshold: float
    :param pairs: iterable of ``(column, correlation value)`` tuples to scan;
        when omitted, the notebook-level ``col_corr`` is used
    :return: number of distinct columns that appear in a qualifying pair
    :rtype: int
    """
    if pairs is None:
        # Backward-compatible default: fall back to the notebook global.
        pairs = col_corr
    return len({col for col, corr_val in pairs if corr_val > threshold})

In [19]:
# Report how many columns would be flagged at each candidate cut-off.
report_line = 'Number of correlated columns when threshold is {:.2f}: {}'
thresholds = np.arange(0.1, 1, 0.05)
for threshold in thresholds:
    flagged = get_number_of_corr_columns(threshold)
    print(report_line.format(threshold, flagged))

Number of correlated columns when threshold is 0.10: 9527
Number of correlated columns when threshold is 0.15: 9501
Number of correlated columns when threshold is 0.20: 9425
Number of correlated columns when threshold is 0.25: 9097
Number of correlated columns when threshold is 0.30: 8350
Number of correlated columns when threshold is 0.35: 7313
Number of correlated columns when threshold is 0.40: 6208
Number of correlated columns when threshold is 0.45: 4904
Number of correlated columns when threshold is 0.50: 3776
Number of correlated columns when threshold is 0.55: 3150
Number of correlated columns when threshold is 0.60: 2116
Number of correlated columns when threshold is 0.65: 1784
Number of correlated columns when threshold is 0.70: 1508
Number of correlated columns when threshold is 0.75: 398
Number of correlated columns when threshold is 0.80: 302
Number of correlated columns when threshold is 0.85: 88
Number of correlated columns when threshold is 0.90: 44
Number of correlated

In [20]:
# Cut-off used for the actual filtering step.
threshold = 0.3
# Distinct positions of columns with at least one correlation above the cut-off.
correlated_columns = list({col for col, corr_val in col_corr if corr_val > threshold})

8350 columns have a correlation value higher than 0.3 with at least one other column, so we remove them.

In [21]:
# `correlated_columns` holds positions along the correlation-matrix axes, while
# the frame's column labels still have gaps left by the constant-column drop.
# Passing the positions straight to drop(columns=...) would treat them as
# labels and remove the wrong columns, so translate positions to labels first.
positions_to_drop = np.asarray(correlated_columns, dtype=int)
labels_to_drop = dropped_constant_columns.columns[positions_to_drop]
dropped_constant_columns = dropped_constant_columns.drop(columns=labels_to_drop)
# Renumber the surviving feature columns 0..n-1.
dropped_constant_columns.columns = list(range(dropped_constant_columns.shape[1]))
print('Number of columns after dropping the correlated ones: {}'.format(dropped_constant_columns.shape[1]))

Number of columns after dropping the correlated ones: 1193


In [40]:
# Re-attach the label column (aligned on the row index) to the filtered features.
dropped_constant_columns = dropped_constant_columns.assign(response=target)

In [41]:
# Put the label first, followed by the renumbered feature columns 0..n-1.
feature_count = dropped_constant_columns.shape[1] - 1
columns_order = ['response', *range(feature_count)]
dropped_constant_columns = dropped_constant_columns.loc[:, columns_order]

In [43]:
# Persist the filtered dataset without the row index.
filtered_data_path = './data/filtered_data.csv'
dropped_constant_columns.to_csv(filtered_data_path, index=False)

In [36]:
# Show the per-class row counts computed earlier, as a reminder before splitting.
response_distribution

Unnamed: 0,response,count
0,0,407
1,1,123


Now, let's split the data into a training set and a test set. Because the class distribution is uneven, we sample the same number of rows from both classes `0` and `1` for training: 90% of the smaller class, i.e. 111 samples (0.9 * 123 ≈ 111), drawn randomly from each class. All remaining rows are used for testing.

In [69]:
# add supportive index column which allows tracking selected rows
dropped_constant_columns['index_support'] = dropped_constant_columns.index

In [70]:
# Fixed seed so the split (and the CSV files written from it) is reproducible.
SPLIT_SEED = 42
# Balanced training set: 111 rows drawn at random from each response class.
train_data = (
    dropped_constant_columns
    .groupby('response')
    .apply(lambda x: x.sample(n=111, axis=0, random_state=SPLIT_SEED))
    .reset_index(drop=True)
)
# Every row that was not sampled for training goes to the test set. `.copy()`
# makes it an independent frame, so later edits do not act on a slice of
# `dropped_constant_columns`.
in_train = dropped_constant_columns['index_support'].isin(train_data['index_support'])
test_data = dropped_constant_columns[~in_train].copy()

In [71]:
# The helper column was only needed to build the split; remove it again.
# Reassigning instead of dropping in place avoids mutating `test_data` while
# it may still be a slice of another frame, which can trigger pandas'
# SettingWithCopyWarning.
train_data = train_data.drop(columns='index_support')
test_data = test_data.drop(columns='index_support')

In [73]:
# Class balance of the training split.
train_data.groupby('response').size().rename('count').reset_index()

Unnamed: 0,response,count
0,0,111
1,1,111


In [75]:
# Class balance of the test split.
test_data.groupby('response').size().rename('count').reset_index()

Unnamed: 0,response,count
0,0,296
1,1,12


In [76]:
# Write both splits to disk without the row index.
train_data_path = './data/train_data.csv'
test_data_path = './data/test_data.csv'
train_data.to_csv(train_data_path, index=False)
test_data.to_csv(test_data_path, index=False)