In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import matthews_corrcoef


# data_dir = "/kaggle/input/cbbl-recruitment-semester-2-classification"
data_dir = "."
random_state = 42

# Data Processing

In [2]:
train_df = pd.read_csv(f"{data_dir}/train.csv")
X_train, y_train = train_df.iloc[:, 1:-1], train_df.iloc[:, -1]
X_train.shape, y_train.shape

((1775, 20), (1775,))

In [3]:
test_df = pd.read_csv(f"{data_dir}/test.csv")
X_test = test_df.iloc[:, 1:]
X_test

Unnamed: 0,Material,Type,Shape,Coat/Functional Group,Synthesis_Method,Surface_Charge,Cell_Type,No_of_Cells (cells/well),Human_Animal,Cell_Source,Cell_Tissue,Cell_Morphology,Cell_Age,Cell Line_Primary Cell,Time (hr),Concentration (ug/ml),Test,Test_Indicator,Size,Zeta
0,Ag,I,Sphere,Citrate,Commercial,Negative,CCL-110,5000,H,Human,Skin,Fibroblast,Fetus,P,24,0.50,MTS,TetrazoliumSalt,39.94,-23.5
1,Chitosan,O,Irregular,,Ionotropic Gelation Method,Positive,MBMC,10000,A,Mouse,BoneMarrow,Spindle,Adult,P,72,31.25,MTT,TetrazoliumSalt,475.20,71.3
2,NiO,I,Sphere,,Commercial,Positive,HepG2,10000,H,Human,Liver,Epithelial,Adult,L,24,100.00,MTS,TetrazoliumSalt,21.60,25.3
3,TiO2,I,Sphere,,Commercial,Negative,A549,30000,H,Human,Lung,Epithelial,Adult,L,48,10.00,MTT,TetrazoliumSalt,81.80,-8.6
4,Chitosan,O,Irregular,,Ionotropic Gelation Method,Positive,MBMC,10000,A,Mouse,BoneMarrow,Spindle,Adult,P,24,1000.00,MTT,TetrazoliumSalt,384.60,62.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
757,Au,I,Sphere,CYS,Chemical Reduction,Negative,L929,5000,A,Mouse,SubcutaneousConnectiveTissue,Fibroblast,Adult,L,24,5.00,MTT,TetrazoliumSalt,25.76,-46.8
758,MWCNT,C,Nanotube,,Commercial,Neutral,HUVEC,40000,H,Human,Umbilical Vein,Endothelial,Embryonic,P,24,32.00,NRU,NeutralRed,17.60,0.9
759,PLGA,O,Sphere,Chitosan,Emulsion-Solvent Evaporation,Positive,Colo_205,10000,H,Human,Colon,Epithelial,Adult,L,24,1250.00,MTS,TetrazoliumSalt,204.00,10.0
760,C60,C,Sphere,Dextran,Commercial,Negative,C6,10000,A,Rat,Brain,Fibroblast,Adult,L,24,25.00,MTT,TetrazoliumSalt,30.00,-16.9


## Feature Encoding

In [4]:
cat_cols = X_train.select_dtypes(include=["object"]).columns.tolist()
encoder = LabelEncoder()

for cols in cat_cols:
    X_train[cols] = encoder.fit_transform(X_train[cols])
    X_test[cols] = encoder.transform(X_test[cols])

In [5]:
X_train

Unnamed: 0,Material,Type,Shape,Coat/Functional Group,Synthesis_Method,Surface_Charge,Cell_Type,No_of_Cells (cells/well),Human_Animal,Cell_Source,Cell_Tissue,Cell_Morphology,Cell_Age,Cell Line_Primary Cell,Time (hr),Concentration (ug/ml),Test,Test_Indicator,Size,Zeta
0,20,1,6,23,1,0,26,5000.0,1,2,15,2,0,0,24,25.0,3,4,4.00,-8.00
1,2,1,6,4,1,0,31,5000.0,0,4,22,2,0,0,24,50.0,8,7,25.76,-46.80
2,11,2,6,31,6,2,46,5000.0,1,2,2,7,0,0,24,25.0,8,7,142.00,56.00
3,2,1,6,6,1,0,28,500000.0,1,2,2,5,0,0,24,1.5,8,7,7.00,-99.00
4,21,2,6,31,6,0,4,100000.0,1,2,15,1,0,0,24,3671.1,9,7,98.40,-14.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1770,9,2,6,1,4,2,53,100000.0,1,2,4,10,1,1,72,1.0,8,7,4.50,54.47
1771,25,1,6,31,4,2,14,50000.0,1,2,12,1,0,1,12,0.1,8,7,45.30,32.70
1772,4,0,6,31,4,0,49,10000.0,0,1,15,2,0,0,24,5.0,8,7,57.00,-35.00
1773,19,2,6,2,4,0,24,5000.0,1,2,6,1,0,0,48,5.0,1,7,50.00,-9.00


In [6]:
X_test

Unnamed: 0,Material,Type,Shape,Coat/Functional Group,Synthesis_Method,Surface_Charge,Cell_Type,No_of_Cells (cells/well),Human_Animal,Cell_Source,Cell_Tissue,Cell_Morphology,Cell_Age,Cell Line_Primary Cell,Time (hr),Concentration (ug/ml),Test,Test_Indicator,Size,Zeta
0,0,1,6,6,4,0,10,5000,1,2,20,2,2,1,24,0.50,7,7,39.94,-23.5
1,6,2,2,31,11,2,32,10000,0,4,3,11,0,1,72,31.25,8,7,475.20,71.3
2,16,1,6,31,4,2,25,10000,1,2,14,1,0,0,24,100.00,7,7,21.60,25.3
3,24,1,6,31,4,0,4,30000,1,2,15,1,0,0,48,10.00,8,7,81.80,-8.6
4,6,2,2,31,11,2,32,10000,0,4,3,11,0,1,24,1000.00,8,7,384.60,62.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
757,2,1,6,4,1,0,31,5000,0,4,22,2,0,0,24,5.00,8,7,25.76,-46.8
758,14,0,3,31,4,1,22,40000,1,2,24,0,1,1,24,32.00,9,5,17.60,0.9
759,17,2,6,5,6,2,13,10000,1,2,7,1,0,0,24,1250.00,7,7,204.00,10.0
760,3,0,6,12,4,0,9,10000,0,5,4,2,0,0,24,25.00,8,7,30.00,-16.9


# Model Training

In [7]:
model = MLPClassifier(
    random_state=random_state,
)
model.fit(X_train, y_train)

print("Training MCC:", matthews_corrcoef(y_train, model.predict(X_train)))

Training MCC: 0.14958849146374392


# Model Optimization

In [8]:
param_grid = {
    "hidden_layer_sizes": [(50, 50), (100, 100), (150, 150)],
    "activation": ["relu", "tanh"],
    "solver": ["adam", "sgd"],
    "alpha": [0.0001, 0.001, 0.01],
    "learning_rate": ["constant", "adaptive"],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='matthews_corrcoef',
    n_jobs=-1,
    cv=5,
    verbose=2,
)

In [9]:
grid_search.fit(X_train, y_train)

best_model_class = grid_search.best_estimator_
best_score_class = grid_search.best_score_
print(f"Best Model Hyperparameters: {grid_search.best_params_}")
print(f"Best Model Score (MCC): {best_score_class}")

# Store the best models (optional, but good practice)
# You can save these models using joblib or pickle
# Example using joblib:
# import joblib
# joblib.dump(best_model_class, 'best_random_forest_classifier.pkl')

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Model Hyperparameters: {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (150, 150), 'learning_rate': 'constant', 'solver': 'adam'}
Best Model Score (MCC): 0.20085019222299366


# Model Evaluation

In [10]:
result_df = pd.DataFrame(data={"ID": test_df.iloc[:, 0], "Target": model.predict(X_test)}, dtype=int)
result_df.to_csv("submission.csv", index=False)

# !head submission.csv