# Importing the Prerequisite Libraries

In [1]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from sklearn.cluster import Birch
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler

%matplotlib inline

# Seed NumPy's legacy global random number generator so that any code relying on
# `np.random` is repeatable across runs of this notebook.
np.random.seed(0)

# Loading the Data

In [2]:
# Read the tab-separated data matrix; the file carries no header row.
training_data = pd.read_csv('data_tr.txt', header=None, sep='\t')

# Read the companion names file (also header-less); its first column supplies
# one label per column of `training_data`.
actual_data = pd.read_csv('gene_names.txt', header=None)
labels = actual_data[0].tolist()

# Attach the names as column labels.
training_data.columns = labels



# Feature Selection 

### Using the standard deviation, we drop all features with STD <= 0.10

In [3]:
# Drop every column whose standard deviation (as computed by `np.std`) is at or
# below 0.10. `genes` collects the dropped column names and `std_deviations`
# the matching standard deviations, in the same order.
genes = []
std_deviations = []
# `DataFrame.iteritems()` was removed in pandas 2.0; `items()` yields the same
# (column name, Series) pairs and also exists in older pandas versions.
for col_name, col_data in training_data.items():
    col_std = np.std(col_data)  # compute once, reuse for the test and the record
    if col_std <= 0.10:
        genes.append(col_name)
        std_deviations.append(col_std)
training_data.drop(genes, axis=1, inplace=True)

print("No. of features dropped ",len(genes))

No. of features dropped  4994


### Splitting the Training Data column-wise into 100 roughly equal chunks

In [4]:
# Partition the columns of `training_data` into 100 contiguous, near-equal
# chunks (a list of DataFrames). The split is done on positional column indices
# and applied with `iloc`, because calling `np.array_split` directly on a
# DataFrame goes through `DataFrame.swapaxes`, which recent pandas deprecates.
df_collection = [
    training_data.iloc[:, column_positions]
    for column_positions in np.array_split(np.arange(training_data.shape[1]), 100)
]

### Iterating over the samples and dropping features with Correlation > 0.60

In [5]:
# For each column chunk, drop from `training_data` every column whose absolute
# correlation with an earlier column of the same chunk exceeds 0.60.
for chunk in df_collection:
    corr_matrix = chunk.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once and the
    # diagonal (self-correlation of 1.0) is ignored. The builtin `bool` is used
    # because the `np.bool` alias was removed in NumPy 1.24.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    # A column is flagged when any entry above the diagonal in that column is
    # > 0.60; masked-out (NaN) entries compare False.
    to_drop = upper.columns[(upper > 0.60).any(axis=0)].tolist()
    training_data.drop(to_drop, axis=1, inplace=True)
    print("Features dropped ",len(to_drop))

Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  1
Features dropped  0
Features dropped  0
Features dropped  1
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  1
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  1
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0
Features dropped  0


### Applying Variance Threshold and Dropping the Quasi Constant Features

In [6]:
# Fit a variance filter and drop every column it does not support
# (threshold=0.1 on the variance).
variance_filter = VarianceThreshold(threshold=0.1)
variance_filter.fit(training_data)

# Resolve the retained column labels once; the original recomputed this lookup
# for every column inside the comprehension.
kept_columns = training_data.columns[variance_filter.get_support()]
qconstant_columns = [column for column in training_data.columns
                     if column not in kept_columns]

new_data = training_data.drop(qconstant_columns, axis=1)
# `training_data` is rebound to the very same object as `new_data`, so later
# in-place edits of one are visible through the other.
training_data = new_data

# Transposed view, printed for its shape.
new_data_T = new_data.T
print(new_data_T.shape)

(1038, 13177)


### Applying Correlation Again, this time to the Full Training Dataset

In [8]:
# Pairwise column correlations over the whole remaining feature set
# (Pearson is the pandas default, spelled out here for clarity).
correlation_matrix = training_data.corr(method='pearson')

In [9]:
# Flag every column that is strongly correlated (|r| > 0.8) with at least one
# column that precedes it in `correlation_matrix`, then drop the flagged columns.
# This replaces a nested Python loop of per-cell `.iloc[i, j]` lookups with a
# single vectorised mask; each flagged name now appears once instead of once
# per correlated partner, which does not change what `drop` removes.
strong_pairs = correlation_matrix.abs().to_numpy() > 0.8
# The strict lower triangle keeps only pairs (i, j) with j < i, i.e. the same
# pairs the original loop visited; the diagonal is excluded.
flagged_rows = np.tril(strong_pairs, k=-1).any(axis=1)
correlated_features = correlation_matrix.columns[flagged_rows].tolist()

training_data.drop(correlated_features, axis = 1, inplace = True)

In [12]:
# Number of feature columns that survived all the filters above.
final_features = training_data.shape[1]
print("Final list of Features ",final_features)

Final list of Features  1025


# Scaling / Normalization


In [13]:
# Min-max scale the selected features; the scaler is fitted and applied on the
# same data in one step.
scaler = MinMaxScaler()
scaled_features_train = scaler.fit_transform(training_data)

# PCA

In [22]:
# Reduce the scaled features to 45 principal components using the full SVD
# solver; `fitted_data` is the input to every clustering model below.
pca = PCA(svd_solver='full', n_components=45)
fitted_data = pca.fit_transform(scaled_features_train)


# Finally saving these features to CSV

In [20]:
# Persist the selected features. Note that this writes `training_data`, i.e. the
# filtered columns before min-max scaling and PCA, not `fitted_data`.
training_data.to_csv(path_or_buf='final_features.csv')

### We are now ready to use this data to train and test our models for evaluation

# KMeans

In [23]:
# Cluster the PCA-reduced data into 16 groups with k-means++ initialisation and
# report the silhouette score of the resulting labels.
# `n_init` is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', so leaving it implicit makes the score depend on the installed
# version (and raises a FutureWarning on the releases in between).
model = KMeans(n_clusters=16, random_state=0, init="k-means++", n_init=10)
model.fit(fitted_data)
predictions = model.predict(fitted_data)
kmeans_silhouette = silhouette_score(fitted_data, model.labels_)
print(kmeans_silhouette)

0.11152430940105786


# Birch

In [24]:
# Build the BIRCH model with 16 clusters and fit it on the PCA-reduced data
# (`fit` returns the estimator itself, so construction and training are chained).
model = Birch(n_clusters=16).fit(fitted_data)

# Assign every training sample to a cluster.
predictions = model.predict(fitted_data)

# Score the labels produced during fitting.
birch_silhouette = silhouette_score(fitted_data, model.labels_)
print(birch_silhouette)

0.1490037516769233


# Spectral Clustering

In [26]:
# Spectral clustering into 16 groups, scored with the silhouette coefficient.
# `SpectralClustering` is imported in the notebook's top import cell.
spectral = SpectralClustering(n_clusters=16, assign_labels='discretize', random_state=0)

# NOTE: unlike in the KMeans and Birch cells, `model` here holds the array of
# cluster labels returned by `fit_predict`, not an estimator; the name is kept
# so that anything reading `model` afterwards sees the same value as before.
model = spectral.fit_predict(fitted_data)

spec_silhouette = silhouette_score(fitted_data, model)
print(spec_silhouette)

0.1566103581508076
