# Supervised Learning

In [1]:
import numpy as np
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings("ignore")

## Step 1 - Fit GBR models

#### Read training data

In [3]:
# Load the training feature matrix from CSV; the first CSV column becomes the row index.
# presumably the features were already scaled upstream (the "_sc" suffix) — TODO confirm
X_train_sc = pd.read_csv("Files/X_train_sc.csv", index_col=0)

In [4]:
# Load the training targets (first CSV column as row index) and keep only the
# "tm" column, so that y_train is a Series from the start.
y_train = pd.read_csv("Files/y_train.csv", index_col=0)["tm"]

#### Select features

In [5]:
# Read one mutual-information score per line. The context manager guarantees the
# file handle is closed (the bare open(...) call left it open until garbage collection).
# Blank lines are deliberately NOT skipped: scores are matched to feature columns by
# position later on, so a malformed file should fail loudly rather than shift the scores.
with open("Files/mutual_info.txt") as mutual_info_file:
    mutual_info = [float(line.strip()) for line in mutual_info_file]

In [6]:
# Number of features to keep.
num_features = 200
# argsort is ascending, so the tail of the ordering holds the positions of the
# num_features highest mutual-information scores.
feature_order = np.argsort(mutual_info)
best_features = feature_order[-num_features:]

In [7]:
# Keep only the selected feature columns. Selection is positional, so entry k of
# mutual_info is assumed to describe column k of X_train_sc — TODO confirm.
X_train_sc_best = X_train_sc.iloc[:, best_features]

#### Method 1 - Run KMeans and fit GBR models

In [8]:
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingRegressor as GBR

In [9]:
# Number of KMeans clusters (one GBR model is later trained per cluster).
n_clusters_km = 6
# KMeans starts from random centroids. Fixing random_state makes the cluster
# assignment, and every per-cluster model built on top of it, reproducible
# across kernel restarts.
km = KMeans(n_clusters=n_clusters_km, n_init=20, max_iter=400, random_state=42)

In [10]:
# Fit KMeans on the selected training features; km.labels_ then holds one cluster id per training row.
km.fit(X_train_sc_best)

KMeans(max_iter=400, n_clusters=6, n_init=20)

In [11]:
# Report how many training rows were assigned to each KMeans cluster.
for cluster_id in range(n_clusters_km):
    cluster_size = int(np.count_nonzero(km.labels_ == cluster_id))
    print(cluster_size)

2670
4743
3747
10289
838
6116


In [12]:
# Train one gradient-boosting regressor per KMeans cluster, keyed by cluster id.
models_km = {}
for i in range(n_clusters_km):
    # Boolean mask over the training rows (positional, aligned with km.labels_).
    in_cluster = km.labels_ == i
    x_train_cluster = X_train_sc_best.loc[in_cluster]
    y_train_cluster = y_train.loc[in_cluster]
    # A fixed random_state makes each fitted model reproducible across runs.
    models_km[i] = GBR(random_state=42).fit(x_train_cluster, y_train_cluster)

#### Method 2 - Run KNN Regressor and fit GBR models

In [13]:
from sklearn.neighbors import KNeighborsRegressor as KNR

In [14]:
# Method 2 picks anchor rows from the training set at a fixed stride.
n_clusters_knn = 6
len_ = len(X_train_sc_best)        # number of training rows
range_ = len_ // n_clusters_knn    # stride between consecutive anchor rows

In [15]:
# Neighbourhood size: each neighbourhood holds 1/n_clusters_knn of the training rows.
n_neighbors = len_ // n_clusters_knn
# p=1 selects the Minkowski metric with power 1; fit() returns the estimator itself.
knn = KNR(n_neighbors=n_neighbors, p=1).fit(X_train_sc_best, y_train)
knn

KNeighborsRegressor(n_neighbors=4733, p=1)

In [16]:
models_knn = {}
for i in range(0, len_, range_):
    # get example
    example = X_train_sc_best.iloc[i, :].to_numpy().reshape(1, -1)
    # get distances and indices of k closest neighbors to example
    _, indices = knn.kneighbors(example)
    # mask X_train and y_train
    x_train_cluster = X_train_sc_best.iloc[indices[0], :]
    y_train_cluster = y_train.iloc[indices[0]]
    # compute "centroid" for each cluster
    centroid = np.mean(x_train_cluster, axis=0)
    # update models
    models_knn[i] = [centroid, GBR().fit(x_train_cluster, y_train_cluster)]

## Step 2 - Predict labels

#### Read testing data

In [17]:
# Load the raw test set; no index column is given, so rows get the default integer index.
data_test = pd.read_csv("data_test.csv")

In [18]:
# Keep the columns from position 2 up to, but not including, the last one.
# presumably the excluded columns are ids/metadata rather than features — verify against the data file
X_test = data_test.iloc[:, 2:-1]

#### Scale testing data

In [19]:
from sklearn.preprocessing import MinMaxScaler

In [20]:
# Scale every test feature with a scaler fitted on the test set itself.
# NOTE(review): the scaler that produced X_train_sc is not available in this notebook.
# If the training data were scaled with different min/max values, train and test
# features are not on the same scale — confirm, and reuse the training scaler if possible.
test_scaler = MinMaxScaler()
X_test_arr = test_scaler.fit(X_test).transform(X_test)
# Wrap the scaled array back into a DataFrame with the original column names.
X_test_sc = pd.DataFrame(X_test_arr, columns=X_test.columns)

#### Select features in testing data

In [21]:
# Apply the same positional feature selection that was used for the training set.
# NOTE(review): this assumes X_test_sc has its columns in the same order as X_train_sc — confirm.
X_test_sc_best = X_test_sc.iloc[:, best_features]

#### Method 1 - Predict testing data labels (KMeans)

In [22]:
# Route every test row to its KMeans cluster in one call, then let each cluster's GBR
# predict all of its rows at once. This replaces a per-row loop that called
# km.predict and model.predict once for every single row.
# Plain arrays are passed (as before), so columns are matched by position.
X_test_km = X_test_sc_best.to_numpy()
test_labels_km = km.predict(X_test_km)
y_preds_km_arr = np.empty(len(X_test_km), dtype=float)
# Only labels that actually occur are visited, so predict() never sees an empty batch.
for label in np.unique(test_labels_km):
    in_cluster = test_labels_km == label
    y_preds_km_arr[in_cluster] = models_km[label].predict(X_test_km[in_cluster])
# Same container as before: a list of predictions in test-row order.
y_preds_km = y_preds_km_arr.tolist()

In [23]:
# Sanity check: smallest and largest prediction of the KMeans-routed models.
min(y_preds_km), max(y_preds_km)

(22.8400196340927, 102.43688639568039)

#### Method 2 - Predict testing data labels (KNN Regressor)

In [24]:
from sklearn.metrics.pairwise import euclidean_distances

In [25]:
# Route every test row to the local model whose centroid is closest, then predict in
# one batch per model. The centroids are stacked once instead of being converted again
# for every test row, and distances are compared as plain numbers instead of (1, 1) arrays.
# NOTE(review): neighbourhoods were built with knn's p=1 metric, while routing here uses
# Euclidean distance — confirm this mismatch is intended.
cluster_keys = list(models_knn)
centroids = np.vstack([np.asarray(models_knn[key][0], dtype=float) for key in cluster_keys])
# Plain arrays are passed (as before), so columns are matched by position.
X_test_knn = X_test_sc_best.to_numpy()
# dist_to_centroids[r, c] is the distance from test row r to centroid c.
dist_to_centroids = euclidean_distances(X_test_knn, centroids)
# argmin returns the first minimum, matching the strict "<" of the original scan.
nearest = dist_to_centroids.argmin(axis=1)
y_preds_knn_arr = np.empty(len(X_test_knn), dtype=float)
# Only centroids that actually won some rows are visited, so predict() never sees an empty batch.
for pos in np.unique(nearest):
    rows = nearest == pos
    gbr = models_knn[cluster_keys[pos]][1]
    y_preds_knn_arr[rows] = gbr.predict(X_test_knn[rows])
# Same container as before: a list of predictions in test-row order.
y_preds_knn = y_preds_knn_arr.tolist()

In [26]:
# Sanity check: smallest and largest prediction of the neighbourhood-based models.
min(y_preds_knn), max(y_preds_knn)

(34.27549116967443, 107.35855377349316)

#### Create csv files with predictions

##### KMeans

In [27]:
# Pair each test sequence id with its KMeans-routed prediction (column "tm").
# Both pieces carry the default integer index, so concat lines them up row by row.
tm_km = pd.Series(y_preds_km, name="tm")
preds_km = pd.concat([data_test["seq_id"], tm_km], axis=1)

In [28]:
# Write the KMeans-based predictions to CSV, leaving out the row index.
preds_km.to_csv("novozymes_km.csv", index=False)

##### KNN Regressor

In [29]:
# Pair each test sequence id with its neighbourhood-model prediction (column "tm").
# Both pieces carry the default integer index, so concat lines them up row by row.
tm_knn = pd.Series(y_preds_knn, name="tm")
preds_knn = pd.concat([data_test["seq_id"], tm_knn], axis=1)

In [30]:
# Write the neighbourhood-model predictions to CSV, leaving out the row index.
preds_knn.to_csv("novozymes_knn.csv", index=False)