<a href="https://colab.research.google.com/github/vasudevgupta7/prml-assignments/blob/main/notebooks/ME18B181_ME18B182.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Fetch the four contest CSVs into the working directory.
# -nc (no-clobber) makes the cell idempotent: on a re-run wget skips files that
# are already present instead of downloading them again as "<name>.csv.1".
!wget -nc https://huggingface.co/datasets/vasudevgupta/prml_data_contest/resolve/main/Dataset_1_Testing.csv
!wget -nc https://huggingface.co/datasets/vasudevgupta/prml_data_contest/resolve/main/Dataset_1_Training.csv
!wget -nc https://huggingface.co/datasets/vasudevgupta/prml_data_contest/resolve/main/Dataset_2_Testing.csv
!wget -nc https://huggingface.co/datasets/vasudevgupta/prml_data_contest/resolve/main/Dataset_2_Training.csv

In [2]:
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.feature_selection import SelectKBest, RFE
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, matthews_corrcoef, make_scorer

SEED = 42

In [3]:
def load_data(train_file1, test_file1, train_file2, test_file2):
  """Read both contest datasets and split the training files into features/labels.

  Every CSV is read with its first column as the index and its first row as
  the header, then transposed so that each CSV column becomes one row of the
  returned matrices. In the training files the trailing rows hold the labels:
  the last 2 rows of dataset 1 (co1, co2) and the last 4 rows of dataset 2
  (co3..co6); they are cast to int32.

  Returns:
    (X1, X2, (co1, co2, co3, co4, co5, co6), X1_test, X2_test)
  """

  def read_frame(path):
    # Same parsing options for all four files.
    return pd.read_csv(path, index_col=0, header=0)

  def split_training(frame, n_label_rows):
    # Transposed values with the trailing label columns dropped.
    features = frame.values.T[:, :-n_label_rows]
    # One int32 vector per label row, in file order.
    labels = tuple(
        frame.iloc[row, :].values.astype(np.int32)
        for row in range(-n_label_rows, 0)
    )
    return features, labels

  X1, labels_ds1 = split_training(read_frame(train_file1), 2)
  X1_test = read_frame(test_file1).values.T

  X2, labels_ds2 = split_training(read_frame(train_file2), 4)
  X2_test = read_frame(test_file2).values.T

  return X1, X2, labels_ds1 + labels_ds2, X1_test, X2_test

In [4]:
# Local file names of the CSVs fetched by the download cell.
train_file1, test_file1 = "Dataset_1_Training.csv", "Dataset_1_Testing.csv"
train_file2, test_file2 = "Dataset_2_Training.csv", "Dataset_2_Testing.csv"

X1, X2, targets, X1_test, X2_test = load_data(
    train_file1, test_file1, train_file2, test_file2
)
(co1, co2, co3, co4, co5, co6) = targets

# Sanity check: print the shape of every array, grouped by dataset.
print("dataset-1:", *(arr.shape for arr in (X1, co1, co2, X1_test)))
print("dataset-2:", *(arr.shape for arr in (X2, co3, co4, co5, co6, X2_test)))

dataset-1: (130, 22283) (130,) (130,) (100, 22283)
dataset-2: (340, 54675) (340,) (340,) (340,) (340,) (214, 54675)


## CO1

In [5]:
# CO1 pipeline on dataset 1: PCA -> RFE (linear SVC) -> bagged linear SVCs.

# PCA is fitted on the training matrix only and then applied to the test matrix.
pca = PCA(
    n_components=0.9,
    svd_solver="full",
    random_state=SEED,
)
X1_pca, X1_pca_test = pca.fit_transform(X1), pca.transform(X1_test)

# Recursive feature elimination over the PCA components, one at a time, down to 60.
rfe = RFE(
    estimator=SVC(kernel="linear", random_state=SEED),
    n_features_to_select=60,
    step=1,
)
X1_rfe, X1_rfe_test = rfe.fit_transform(X1_pca, co1), rfe.transform(X1_pca_test)

# Ensemble of 100 linear SVCs, each given all samples and all features.
bgc1 = BaggingClassifier(
    SVC(kernel="linear", random_state=SEED),
    n_estimators=100,
    max_samples=1.0,
    max_features=1.0,
    random_state=SEED,
)
bgc1.fit(X1_rfe, co1)

co1_pred = bgc1.predict(X1_rfe_test)
co1_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

## CO2

In [6]:
# CO2 pipeline on dataset 1: PCA -> RFE (linear SVC) -> bagged RBF SVCs.

# PCA is refitted here so the cell does not depend on state left by the CO1 cell.
pca = PCA(
    n_components=0.9,
    svd_solver="full",
    random_state=SEED,
)
X1_pca, X1_pca_test = pca.fit_transform(X1), pca.transform(X1_test)

# Recursive feature elimination against the co2 labels, down to 70 components.
rfe = RFE(
    estimator=SVC(kernel="linear", random_state=SEED),
    n_features_to_select=70,
    step=1,
)
X1_rfe, X1_rfe_test = rfe.fit_transform(X1_pca, co2), rfe.transform(X1_pca_test)

# 75 RBF SVCs, each given 75% of the samples and 90% of the features.
bgc2 = BaggingClassifier(
    SVC(kernel="rbf", random_state=SEED),
    n_estimators=75,
    max_samples=0.75,
    max_features=0.9,
    random_state=SEED,
)
bgc2.fit(X1_rfe, co2)

co2_pred = bgc2.predict(X1_rfe_test)
co2_pred

array([1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0], dtype=int32)

## CO3

In [7]:
# CO3 pipeline on dataset 2: PCA -> RFE (linear SVC) -> bagged RBF SVCs (C=10).

# PCA is fitted on the dataset-2 training matrix and applied to its test matrix.
pca = PCA(
    n_components=0.9,
    svd_solver="full",
    random_state=SEED,
)
X2_pca, X2_pca_test = pca.fit_transform(X2), pca.transform(X2_test)

# Recursive feature elimination against the co3 labels, down to 75 components.
rfe = RFE(
    estimator=SVC(kernel="linear", random_state=SEED),
    n_features_to_select=75,
    step=1,
)
X2_rfe, X2_rfe_test = rfe.fit_transform(X2_pca, co3), rfe.transform(X2_pca_test)

# 100 RBF SVCs with C=10, each given 75% of the samples and all features.
bgc3 = BaggingClassifier(
    SVC(kernel="rbf", C=10, random_state=SEED),
    n_estimators=100,
    max_samples=0.75,
    max_features=1.0,
    random_state=SEED,
)
bgc3.fit(X2_rfe, co3)

co3_pred = bgc3.predict(X2_rfe_test)
co3_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], dtype=int32)

## CO4, CO5, CO6

In [8]:
# Baseline hyper-parameters for train_and_predict; per-target overrides are
# merged on top of a copy of this mapping. Keys follow "<Stage>__<param>".
# Note: the "RFECV__*" keys are consumed by a plain RFE step in train_and_predict.
DEFAULT_CONFIGS = dict(
    # univariate feature selection
    SelectKBest__k=1000,
    # dimensionality reduction
    PCA__n_components=300,
    # SVC used as the ranking estimator inside recursive feature elimination
    RFECV__SVC__C=0.1,
    RFECV__SVC__gamma=1,
    RFECV__SVC__kernel='linear',
    RFECV__n_features_to_select=10,
    # SVC used as the base estimator of the bagging ensemble
    BaggingClassifier__SVC__C=0.1,
    BaggingClassifier__SVC__gamma=1,
    BaggingClassifier__SVC__kernel="linear",
    BaggingClassifier__n_estimators=100,
)

def train_and_predict(X, y, X_test, configs=None):
  """Fit SelectKBest -> PCA -> RFE -> bagged SVC on (X, y) and predict X_test.

  Every stage is fitted on the training data only and then applied to the
  test data, so the test matrix never influences feature selection.

  Args:
    X: 2-D training feature matrix (indexed column-wise with a boolean mask).
    y: training targets, passed to every supervised `fit` call.
    X_test: 2-D test feature matrix with the same columns as `X`.
    configs: optional mapping of overrides applied on top of DEFAULT_CONFIGS.
      `None` (the default) or an empty mapping means "use the defaults".

  Returns:
    The result of `BaggingClassifier.predict` on the transformed `X_test`.
  """
  # Merge overrides into a copy so that neither DEFAULT_CONFIGS nor the
  # caller's dict is ever mutated.
  cfg = DEFAULT_CONFIGS.copy()
  cfg.update(configs or {})
  print(cfg)

  # Stage 1: univariate selection of the k best-scoring columns.
  features_selector = SelectKBest(k=cfg["SelectKBest__k"])
  features_selector.fit(X, y)

  ftrs = features_selector.get_support()  # boolean mask over the columns
  X = X[:, ftrs]
  X_test = X_test[:, ftrs]

  print("After SelectKBest, X_shape:", X.shape)

  # Stage 2: PCA on the selected columns.
  pca = PCA(n_components=cfg["PCA__n_components"], random_state=SEED)
  X = pca.fit_transform(X)
  X_test = pca.transform(X_test)
  print("After PCA, X_shape:", X.shape)

  # Stage 3: recursive feature elimination over the PCA components.
  # The config keys are prefixed "RFECV__" but a plain RFE is what is built here.
  rfe_svc = SVC(
      C=cfg["RFECV__SVC__C"],
      gamma=cfg["RFECV__SVC__gamma"],
      kernel=cfg["RFECV__SVC__kernel"],
      random_state=SEED,
  )
  rfe = RFE(rfe_svc, n_features_to_select=cfg["RFECV__n_features_to_select"])

  X = rfe.fit_transform(X, y)
  X_test = rfe.transform(X_test)

  print("After RFE, X_shape:", X.shape)

  # Stage 4: bagging ensemble of SVCs.
  bgc_svc = SVC(
      C=cfg["BaggingClassifier__SVC__C"],
      gamma=cfg["BaggingClassifier__SVC__gamma"],
      kernel=cfg["BaggingClassifier__SVC__kernel"],
      random_state=SEED,
  )
  # The base estimator is passed positionally, like the CO1-CO3 cells do: the
  # keyword was renamed from `base_estimator` to `estimator` in scikit-learn 1.2
  # and the old name was removed in 1.4, so either keyword breaks on some
  # version while the positional form works on all of them.
  bgc = BaggingClassifier(
      bgc_svc,
      n_estimators=cfg["BaggingClassifier__n_estimators"],
      random_state=SEED,
  )
  bgc.fit(X, y)

  return bgc.predict(X_test)

In [9]:
# CO4: widen the univariate filter to 3000 columns and keep 100 components after RFE.
configs = dict(SelectKBest__k=3000, RFECV__n_features_to_select=100)
co4_pred = train_and_predict(X2, co4, X2_test, configs=configs)

{'SelectKBest__k': 3000, 'PCA__n_components': 300, 'RFECV__SVC__C': 0.1, 'RFECV__SVC__gamma': 1, 'RFECV__SVC__kernel': 'linear', 'RFECV__n_features_to_select': 100, 'BaggingClassifier__SVC__C': 0.1, 'BaggingClassifier__SVC__gamma': 1, 'BaggingClassifier__SVC__kernel': 'linear', 'BaggingClassifier__n_estimators': 100}
After SelectKBest, X_shape: (340, 3000)
After PCA, X_shape: (340, 300)
After RFE, X_shape: (340, 100)


In [10]:
# CO5: same filter width as CO4, but keep 105 components after RFE.
configs = dict(SelectKBest__k=3000, RFECV__n_features_to_select=105)
co5_pred = train_and_predict(X2, co5, X2_test, configs=configs)

{'SelectKBest__k': 3000, 'PCA__n_components': 300, 'RFECV__SVC__C': 0.1, 'RFECV__SVC__gamma': 1, 'RFECV__SVC__kernel': 'linear', 'RFECV__n_features_to_select': 105, 'BaggingClassifier__SVC__C': 0.1, 'BaggingClassifier__SVC__gamma': 1, 'BaggingClassifier__SVC__kernel': 'linear', 'BaggingClassifier__n_estimators': 100}
After SelectKBest, X_shape: (340, 3000)
After PCA, X_shape: (340, 300)
After RFE, X_shape: (340, 105)


In [11]:
# CO6: 3000 filtered columns, 95 components after RFE, and an RBF kernel for
# the bagged SVCs (the RFE ranking SVC keeps its default linear kernel).
configs = dict(
    SelectKBest__k=3000,
    BaggingClassifier__SVC__kernel="rbf",
    RFECV__n_features_to_select=95,
)

co6_pred = train_and_predict(X2, co6, X2_test, configs=configs)

{'SelectKBest__k': 3000, 'PCA__n_components': 300, 'RFECV__SVC__C': 0.1, 'RFECV__SVC__gamma': 1, 'RFECV__SVC__kernel': 'linear', 'RFECV__n_features_to_select': 95, 'BaggingClassifier__SVC__C': 0.1, 'BaggingClassifier__SVC__gamma': 1, 'BaggingClassifier__SVC__kernel': 'rbf', 'BaggingClassifier__n_estimators': 100}
After SelectKBest, X_shape: (340, 3000)
After PCA, X_shape: (340, 300)
After RFE, X_shape: (340, 95)


## Submission

In [12]:
def make_submission(co1_pred, co2_pred, co3_pred, co4_pred, co5_pred, co6_pred):
  """Stack the six prediction vectors into one submission table.

  The vectors are concatenated in the order co1..co6 and cast to int32.

  Returns:
    A DataFrame with two columns: "Id" (0, 1, 2, ... over the concatenated
    vector) and "Predicted" (the int32 predictions).
  """
  stacked = np.concatenate(
      (co1_pred, co2_pred, co3_pred, co4_pred, co5_pred, co6_pred)
  ).astype(np.int32)
  row_ids = np.arange(len(stacked))
  return pd.DataFrame({"Id": row_ids, "Predicted": stacked})

# Assemble the predictions in CO1..CO6 order and write them out without the
# DataFrame index, so the file holds only the "Id" and "Predicted" columns.
submission = make_submission(
    co1_pred, co2_pred, co3_pred,
    co4_pred, co5_pred, co6_pred,
)
submission.to_csv(path_or_buf="submission.csv", index=False)