In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import matthews_corrcoef


# data_dir = "/kaggle/input/cbbl-recruitment-semester-2-classification"
data_dir = "."
random_state = 42

# Data Processing

In [2]:
train_df = pd.read_csv(f"{data_dir}/train.csv")
X_train, y_train = train_df.iloc[:, 1:-1], train_df.iloc[:, -1]
X_train.shape, y_train.shape

((1775, 20), (1775,))

In [3]:
test_df = pd.read_csv(f"{data_dir}/test.csv")
X_test = test_df.iloc[:, 1:]
X_test

Unnamed: 0,Material,Type,Shape,Coat/Functional Group,Synthesis_Method,Surface_Charge,Cell_Type,No_of_Cells (cells/well),Human_Animal,Cell_Source,Cell_Tissue,Cell_Morphology,Cell_Age,Cell Line_Primary Cell,Time (hr),Concentration (ug/ml),Test,Test_Indicator,Size,Zeta
0,Ag,I,Sphere,Citrate,Commercial,Negative,CCL-110,5000,H,Human,Skin,Fibroblast,Fetus,P,24,0.50,MTS,TetrazoliumSalt,39.94,-23.5
1,Chitosan,O,Irregular,,Ionotropic Gelation Method,Positive,MBMC,10000,A,Mouse,BoneMarrow,Spindle,Adult,P,72,31.25,MTT,TetrazoliumSalt,475.20,71.3
2,NiO,I,Sphere,,Commercial,Positive,HepG2,10000,H,Human,Liver,Epithelial,Adult,L,24,100.00,MTS,TetrazoliumSalt,21.60,25.3
3,TiO2,I,Sphere,,Commercial,Negative,A549,30000,H,Human,Lung,Epithelial,Adult,L,48,10.00,MTT,TetrazoliumSalt,81.80,-8.6
4,Chitosan,O,Irregular,,Ionotropic Gelation Method,Positive,MBMC,10000,A,Mouse,BoneMarrow,Spindle,Adult,P,24,1000.00,MTT,TetrazoliumSalt,384.60,62.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
757,Au,I,Sphere,CYS,Chemical Reduction,Negative,L929,5000,A,Mouse,SubcutaneousConnectiveTissue,Fibroblast,Adult,L,24,5.00,MTT,TetrazoliumSalt,25.76,-46.8
758,MWCNT,C,Nanotube,,Commercial,Neutral,HUVEC,40000,H,Human,Umbilical Vein,Endothelial,Embryonic,P,24,32.00,NRU,NeutralRed,17.60,0.9
759,PLGA,O,Sphere,Chitosan,Emulsion-Solvent Evaporation,Positive,Colo_205,10000,H,Human,Colon,Epithelial,Adult,L,24,1250.00,MTS,TetrazoliumSalt,204.00,10.0
760,C60,C,Sphere,Dextran,Commercial,Negative,C6,10000,A,Rat,Brain,Fibroblast,Adult,L,24,25.00,MTT,TetrazoliumSalt,30.00,-16.9


## Feature Scaling

In [4]:
num_cols = X_train.select_dtypes(include=["float64", "int64"]).columns.tolist()

scaler = StandardScaler()
# scaler = MinMaxScaler()

# Scaling numerical features in the training set
scaled_X_train = pd.DataFrame(
    scaler.fit_transform(X_train[num_cols]),
    columns=scaler.get_feature_names_out(num_cols)
)
X_train = pd.concat([X_train.drop(columns=num_cols), scaled_X_train], axis=1)

# Scaling numerical features in the test set
scaled_X_test = pd.DataFrame(
    scaler.transform(X_test[num_cols]),
    columns=scaler.get_feature_names_out(num_cols)
)
X_test = pd.concat([X_test.drop(columns=num_cols), scaled_X_test], axis=1)

In [5]:
X_train

Unnamed: 0,Material,Type,Shape,Coat/Functional Group,Synthesis_Method,Surface_Charge,Cell_Type,Human_Animal,Cell_Source,Cell_Tissue,Cell_Morphology,Cell_Age,Cell Line_Primary Cell,Test,Test_Indicator,No_of_Cells (cells/well),Time (hr),Concentration (ug/ml),Size,Zeta
0,Pt,I,Sphere,PVP,Chemical Reduction,Negative,IMR90,H,Human,Lung,Fibroblast,Adult,L,CellTiterGlo,LuciferaseEnzyme,-0.418714,-0.453925,-0.242830,-0.667768,0.168256
1,Au,I,Sphere,CYS,Chemical Reduction,Negative,L929,A,Mouse,SubcutaneousConnectiveTissue,Fibroblast,Adult,L,MTT,TetrazoliumSalt,-0.418714,-0.453925,-0.208418,-0.525181,-1.072697
2,EudragitRL,O,Sphere,,Emulsion-Solvent Evaporation,Positive,THP-1,H,Human,Blood,Monocyte,Adult,L,MTT,TetrazoliumSalt,-0.418714,-0.453925,-0.242830,0.236503,2.215188
3,Au,I,Sphere,Citrate,Chemical Reduction,Negative,Jurkat,H,Human,Blood,Lymphoblast,Adult,L,MTT,TetrazoliumSalt,3.232934,-0.453925,-0.275178,-0.648110,-2.742227
4,SLN,O,Sphere,,Emulsion-Solvent Evaporation,Negative,A549,H,Human,Lung,Epithelial,Adult,L,NRU,TetrazoliumSalt,0.282107,-0.453925,4.775996,-0.049194,-0.042834
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1770,Dendrimer,O,Sphere,C12,Commercial,Positive,hNPC,H,Human,Brain,Neuronal,Embryonic,P,MTT,TetrazoliumSalt,0.282107,1.694532,-0.275866,-0.664491,2.166254
1771,ZnO,I,Sphere,,Commercial,Positive,HCMEC,H,Human,Heart,Epithelial,Adult,P,MTT,TetrazoliumSalt,-0.086746,-0.991039,-0.277105,-0.397142,1.469977
1772,C70,C,Sphere,,Commercial,Negative,V79,A,Hamster,Lung,Fibroblast,Adult,L,MTT,TetrazoliumSalt,-0.381829,-0.453925,-0.270360,-0.320475,-0.695294
1773,Polystyrene,O,Sphere,COOH,Commercial,Negative,HeLa,H,Human,Cervix,Epithelial,Adult,L,CCK-8,TetrazoliumSalt,-0.418714,0.620304,-0.270360,-0.366344,0.136272


In [6]:
X_test

Unnamed: 0,Material,Type,Shape,Coat/Functional Group,Synthesis_Method,Surface_Charge,Cell_Type,Human_Animal,Cell_Source,Cell_Tissue,Cell_Morphology,Cell_Age,Cell Line_Primary Cell,Test,Test_Indicator,No_of_Cells (cells/well),Time (hr),Concentration (ug/ml),Size,Zeta
0,Ag,I,Sphere,Citrate,Commercial,Negative,CCL-110,H,Human,Skin,Fibroblast,Fetus,P,MTS,TetrazoliumSalt,-0.418714,-0.453925,-0.276554,-0.432264,-0.327486
1,Chitosan,O,Irregular,,Ionotropic Gelation Method,Positive,MBMC,A,Mouse,BoneMarrow,Spindle,Adult,P,MTT,TetrazoliumSalt,-0.381829,1.694532,-0.234227,2.419859,2.704533
2,NiO,I,Sphere,,Commercial,Positive,HepG2,H,Human,Liver,Epithelial,Adult,L,MTS,TetrazoliumSalt,-0.381829,-0.453925,-0.139593,-0.552440,1.233300
3,TiO2,I,Sphere,,Commercial,Negative,A549,H,Human,Lung,Epithelial,Adult,L,MTT,TetrazoliumSalt,-0.234287,0.620304,-0.263478,-0.157969,0.149066
4,Chitosan,O,Irregular,,Ionotropic Gelation Method,Positive,MBMC,A,Mouse,BoneMarrow,Spindle,Adult,P,MTT,TetrazoliumSalt,-0.381829,-0.453925,1.099249,1.826185,2.410287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
757,Au,I,Sphere,CYS,Chemical Reduction,Negative,L929,A,Mouse,SubcutaneousConnectiveTissue,Fibroblast,Adult,L,MTT,TetrazoliumSalt,-0.418714,-0.453925,-0.270360,-0.525181,-1.072697
758,MWCNT,C,Nanotube,,Commercial,Neutral,HUVEC,H,Human,Umbilical Vein,Endothelial,Embryonic,P,NRU,NeutralRed,-0.160517,-0.453925,-0.233195,-0.578651,0.452907
759,PLGA,O,Sphere,Chitosan,Emulsion-Solvent Evaporation,Positive,Colo_205,H,Human,Colon,Epithelial,Adult,L,MTS,TetrazoliumSalt,-0.381829,-0.453925,1.443372,0.642770,0.743955
760,C60,C,Sphere,Dextran,Commercial,Negative,C6,A,Rat,Brain,Fibroblast,Adult,L,MTT,TetrazoliumSalt,-0.381829,-0.453925,-0.242830,-0.497398,-0.116396


## Feature Encoding

In [7]:
cat_cols = X_train.select_dtypes(include=["object"]).columns.tolist()
label_encoding_cols = ["Type"]
one_hot_encoding_cols = list(set(cat_cols) - set(label_encoding_cols))

In [8]:
# ONE-HOT ENCODING
encoder = OneHotEncoder(sparse_output=False, drop='first')

# Encoding the categorical columns in the training set
encoded_X_train = pd.DataFrame(
    encoder.fit_transform(X_train[one_hot_encoding_cols]),
    columns=encoder.get_feature_names_out(one_hot_encoding_cols)
)
X_train = pd.concat([X_train.drop(columns=one_hot_encoding_cols), encoded_X_train], axis=1)

# Encoding the categorical columns in the test set
encoded_X_test = pd.DataFrame(
    encoder.transform(X_test[one_hot_encoding_cols]),
    columns=encoder.get_feature_names_out(one_hot_encoding_cols)
)
X_test = pd.concat([X_test.drop(columns=one_hot_encoding_cols), encoded_X_test], axis=1)

In [9]:
# LABEL ENCODING
encoder = LabelEncoder()

# Encoding the Type column
for cols in label_encoding_cols:
    X_train[cols] = encoder.fit_transform(X_train[cols])
    X_test[cols] = encoder.transform(X_test[cols])

In [10]:
X_train

Unnamed: 0,Type,No_of_Cells (cells/well),Time (hr),Concentration (ug/ml),Size,Zeta,Test_CCK-8,Test_CellTiterBlue,Test_CellTiterGlo,Test_CoulterCounter,...,Cell_Tissue_RespiratoryTract,Cell_Tissue_Retina,Cell_Tissue_Skin,Cell_Tissue_Stomach,Cell_Tissue_SubcutaneousConnectiveTissue,Cell_Tissue_Teeth,Cell_Tissue_Umbilical Vein,Human_Animal_H,Cell_Age_Embryonic,Cell_Age_Fetus
0,1,-0.418714,-0.453925,-0.242830,-0.667768,0.168256,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1,-0.418714,-0.453925,-0.208418,-0.525181,-1.072697,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2,-0.418714,-0.453925,-0.242830,0.236503,2.215188,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1,3.232934,-0.453925,-0.275178,-0.648110,-2.742227,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2,0.282107,-0.453925,4.775996,-0.049194,-0.042834,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1770,2,0.282107,1.694532,-0.275866,-0.664491,2.166254,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1771,1,-0.086746,-0.991039,-0.277105,-0.397142,1.469977,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1772,0,-0.381829,-0.453925,-0.270360,-0.320475,-0.695294,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1773,2,-0.418714,0.620304,-0.270360,-0.366344,0.136272,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [11]:
X_test

Unnamed: 0,Type,No_of_Cells (cells/well),Time (hr),Concentration (ug/ml),Size,Zeta,Test_CCK-8,Test_CellTiterBlue,Test_CellTiterGlo,Test_CoulterCounter,...,Cell_Tissue_RespiratoryTract,Cell_Tissue_Retina,Cell_Tissue_Skin,Cell_Tissue_Stomach,Cell_Tissue_SubcutaneousConnectiveTissue,Cell_Tissue_Teeth,Cell_Tissue_Umbilical Vein,Human_Animal_H,Cell_Age_Embryonic,Cell_Age_Fetus
0,1,-0.418714,-0.453925,-0.276554,-0.432264,-0.327486,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,2,-0.381829,1.694532,-0.234227,2.419859,2.704533,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,-0.381829,-0.453925,-0.139593,-0.552440,1.233300,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1,-0.234287,0.620304,-0.263478,-0.157969,0.149066,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2,-0.381829,-0.453925,1.099249,1.826185,2.410287,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
757,1,-0.418714,-0.453925,-0.270360,-0.525181,-1.072697,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
758,0,-0.160517,-0.453925,-0.233195,-0.578651,0.452907,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
759,2,-0.381829,-0.453925,1.443372,0.642770,0.743955,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
760,0,-0.381829,-0.453925,-0.242830,-0.497398,-0.116396,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Model Training

In [12]:
model = SVC(
    random_state=random_state,
    C=1.0,
    kernel="rbf",
    degree=3,
    gamma="scale",
    coef0=0.0,
)
model.fit(X_train, y_train)

print("Training MCC:", matthews_corrcoef(y_train, model.predict(X_train)))

Training MCC: 0.6312499745005428


# Model Optimization

In [13]:
param_grid = {
    "C": [0.1, 1, 10, 100],
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "degree": [2, 3, 4],
    "gamma": ["scale", "auto"],
    "coef0": [0.0, 0.1, 0.5],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='matthews_corrcoef',
    n_jobs=-1,
    cv=5,
    verbose=2,
)

In [14]:
grid_search.fit(X_train, y_train)

best_model_class = grid_search.best_estimator_
best_score_class = grid_search.best_score_
print(f"Best Model Hyperparameters: {grid_search.best_params_}")
print(f"Best Model Score (MCC): {best_score_class}")

# Store the best models (optional, but good practice)
# You can save these models using joblib or pickle
# Example using joblib:
# import joblib
# joblib.dump(best_model_class, 'best_random_forest_classifier.pkl')

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Best Model Hyperparameters: {'C': 100, 'coef0': 0.5, 'degree': 4, 'gamma': 'scale', 'kernel': 'poly'}
Best Model Score (MCC): 0.6339648814489933


# Model Evaluation

In [None]:
result_df = pd.DataFrame(data={"ID": test_df.iloc[:, 0], "Target": model.predict(X_test)}, dtype=int)
result_df.to_csv("submission.csv", index=False)

# !head submission.csv

: 