In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


# Seed handed to the train/validation split so the hold-out set is repeatable.
random_state = 42

# Folder that contains train.csv and test.csv. Use the commented Kaggle path
# instead when the notebook runs on Kaggle.
# data_dir = "/kaggle/input/cbbl-recruitment-semester-2-classification"
data_dir = "."

# Data Processing

In [19]:
# Read the labelled training table.
train_df = pd.read_csv(f"{data_dir}/train.csv")

# Extract the target column as a NumPy array (no encoding is applied here).
y_train = train_df.loc[:, "Target"].to_numpy()

# The feature frame is the training table without the target column.
train_feats = train_df.drop("Target", axis="columns")

# Preview the first five feature rows.
train_feats.head()

Unnamed: 0,ID,Material,Type,Shape,Coat/Functional Group,Synthesis_Method,Surface_Charge,Cell_Type,No_of_Cells (cells/well),Human_Animal,...,Cell_Tissue,Cell_Morphology,Cell_Age,Cell Line_Primary Cell,Time (hr),Concentration (ug/ml),Test,Test_Indicator,Size,Zeta
0,472,Pt,I,Sphere,PVP,Chemical Reduction,Negative,IMR90,5000.0,H,...,Lung,Fibroblast,Adult,L,24,25.0,CellTiterGlo,LuciferaseEnzyme,4.0,-8.0
1,232,Au,I,Sphere,CYS,Chemical Reduction,Negative,L929,5000.0,A,...,SubcutaneousConnectiveTissue,Fibroblast,Adult,L,24,50.0,MTT,TetrazoliumSalt,25.76,-46.8
2,2041,EudragitRL,O,Sphere,,Emulsion-Solvent Evaporation,Positive,THP-1,5000.0,H,...,Blood,Monocyte,Adult,L,24,25.0,MTT,TetrazoliumSalt,142.0,56.0
3,437,Au,I,Sphere,Citrate,Chemical Reduction,Negative,Jurkat,500000.0,H,...,Blood,Lymphoblast,Adult,L,24,1.5,MTT,TetrazoliumSalt,7.0,-99.0
4,68,SLN,O,Sphere,,Emulsion-Solvent Evaporation,Negative,A549,100000.0,H,...,Lung,Epithelial,Adult,L,24,3671.1,NRU,TetrazoliumSalt,98.4,-14.6


In [20]:
# Read the test table that the submission is generated from, then preview
# its first five rows.
test_feats = pd.read_csv(data_dir + "/test.csv")
test_feats.head()

Unnamed: 0,ID,Material,Type,Shape,Coat/Functional Group,Synthesis_Method,Surface_Charge,Cell_Type,No_of_Cells (cells/well),Human_Animal,...,Cell_Tissue,Cell_Morphology,Cell_Age,Cell Line_Primary Cell,Time (hr),Concentration (ug/ml),Test,Test_Indicator,Size,Zeta
0,32,Ag,I,Sphere,Citrate,Commercial,Negative,CCL-110,5000,H,...,Skin,Fibroblast,Fetus,P,24,0.5,MTS,TetrazoliumSalt,39.94,-23.5
1,376,Chitosan,O,Irregular,,Ionotropic Gelation Method,Positive,MBMC,10000,A,...,BoneMarrow,Spindle,Adult,P,72,31.25,MTT,TetrazoliumSalt,475.2,71.3
2,71,NiO,I,Sphere,,Commercial,Positive,HepG2,10000,H,...,Liver,Epithelial,Adult,L,24,100.0,MTS,TetrazoliumSalt,21.6,25.3
3,2232,TiO2,I,Sphere,,Commercial,Negative,A549,30000,H,...,Lung,Epithelial,Adult,L,48,10.0,MTT,TetrazoliumSalt,81.8,-8.6
4,2018,Chitosan,O,Irregular,,Ionotropic Gelation Method,Positive,MBMC,10000,A,...,BoneMarrow,Spindle,Adult,P,24,1000.0,MTT,TetrazoliumSalt,384.6,62.1


## Feature Scaling

In [21]:
# Numeric feature columns. "ID" is numeric too but is removed from the list,
# so it is left unscaled.
num_cols = train_feats.select_dtypes(include=["float64", "int64"]).columns.tolist()
num_cols.remove("ID")

scaler = StandardScaler()


def _scaled_frame(scaled_values, index):
    """Wrap an array of scaled values in a DataFrame labelled like the source.

    Args:
        scaled_values: 2-D array returned by the scaler for ``num_cols``.
        index: Row index of the frame the values were computed from.

    Returns:
        A DataFrame with the scaler's output column names and the given row
        index, so a column-wise concat with the source frame lines up row by
        row even when that frame does not carry a default RangeIndex.
    """
    return pd.DataFrame(
        scaled_values,
        columns=scaler.get_feature_names_out(num_cols),
        index=index,
    )


# Scaling numerical features in the training set.
# The scaler statistics are learned here and reused unchanged for the test set.
# NOTE(review): the fit happens before the train/validation split done later
# in the notebook, so the validation rows contribute to the scaling statistics.
scaled_train_feat = scaler.fit_transform(train_feats[num_cols])
scaled_train_df = _scaled_frame(scaled_train_feat, train_feats.index)
# The scaled columns are appended after the remaining (non-numeric and ID) columns.
train_feats = pd.concat([train_feats.drop(columns=num_cols), scaled_train_df], axis=1)

# Scaling numerical features in the test set (transform only, no refit).
scaled_test_feat = scaler.transform(test_feats[num_cols])
scaled_test_df = _scaled_frame(scaled_test_feat, test_feats.index)
test_feats = pd.concat([test_feats.drop(columns=num_cols), scaled_test_df], axis=1)

In [22]:
# Preview the training features after scaling (first five rows).
train_feats.head()

Unnamed: 0,ID,Material,Type,Shape,Coat/Functional Group,Synthesis_Method,Surface_Charge,Cell_Type,Human_Animal,Cell_Source,...,Cell_Morphology,Cell_Age,Cell Line_Primary Cell,Test,Test_Indicator,No_of_Cells (cells/well),Time (hr),Concentration (ug/ml),Size,Zeta
0,472,Pt,I,Sphere,PVP,Chemical Reduction,Negative,IMR90,H,Human,...,Fibroblast,Adult,L,CellTiterGlo,LuciferaseEnzyme,-0.418714,-0.453925,-0.24283,-0.667768,0.168256
1,232,Au,I,Sphere,CYS,Chemical Reduction,Negative,L929,A,Mouse,...,Fibroblast,Adult,L,MTT,TetrazoliumSalt,-0.418714,-0.453925,-0.208418,-0.525181,-1.072697
2,2041,EudragitRL,O,Sphere,,Emulsion-Solvent Evaporation,Positive,THP-1,H,Human,...,Monocyte,Adult,L,MTT,TetrazoliumSalt,-0.418714,-0.453925,-0.24283,0.236503,2.215188
3,437,Au,I,Sphere,Citrate,Chemical Reduction,Negative,Jurkat,H,Human,...,Lymphoblast,Adult,L,MTT,TetrazoliumSalt,3.232934,-0.453925,-0.275178,-0.64811,-2.742227
4,68,SLN,O,Sphere,,Emulsion-Solvent Evaporation,Negative,A549,H,Human,...,Epithelial,Adult,L,NRU,TetrazoliumSalt,0.282107,-0.453925,4.775996,-0.049194,-0.042834


In [23]:
# Preview the test features after scaling (first five rows).
test_feats.head()

Unnamed: 0,ID,Material,Type,Shape,Coat/Functional Group,Synthesis_Method,Surface_Charge,Cell_Type,Human_Animal,Cell_Source,...,Cell_Morphology,Cell_Age,Cell Line_Primary Cell,Test,Test_Indicator,No_of_Cells (cells/well),Time (hr),Concentration (ug/ml),Size,Zeta
0,32,Ag,I,Sphere,Citrate,Commercial,Negative,CCL-110,H,Human,...,Fibroblast,Fetus,P,MTS,TetrazoliumSalt,-0.418714,-0.453925,-0.276554,-0.432264,-0.327486
1,376,Chitosan,O,Irregular,,Ionotropic Gelation Method,Positive,MBMC,A,Mouse,...,Spindle,Adult,P,MTT,TetrazoliumSalt,-0.381829,1.694532,-0.234227,2.419859,2.704533
2,71,NiO,I,Sphere,,Commercial,Positive,HepG2,H,Human,...,Epithelial,Adult,L,MTS,TetrazoliumSalt,-0.381829,-0.453925,-0.139593,-0.55244,1.2333
3,2232,TiO2,I,Sphere,,Commercial,Negative,A549,H,Human,...,Epithelial,Adult,L,MTT,TetrazoliumSalt,-0.234287,0.620304,-0.263478,-0.157969,0.149066
4,2018,Chitosan,O,Irregular,,Ionotropic Gelation Method,Positive,MBMC,A,Mouse,...,Spindle,Adult,P,MTT,TetrazoliumSalt,-0.381829,-0.453925,1.099249,1.826185,2.410287


## Feature Encoding

In [24]:
# Every object-dtype column of the training features is treated as categorical.
cat_cols = list(train_feats.select_dtypes(include="object").columns)

# Disabled: split of the categorical columns between label and one-hot encoding.
# label_encoding_cols = ["Shape"]
# one_hot_encoding_cols = list(set(cat_cols) - set(label_encoding_cols))

In [25]:
# ONE-HOT ENCODING
# Disabled alternative to the label-encoding cell that follows: it would
# one-hot encode the columns listed in one_hot_encoding_cols, fitting the
# encoder on the training set and reusing it for the test set.
# one_hot_encoding_cols is only defined in a commented-out line of the
# previous cell, so that line must be restored before enabling this cell.
# encoder = OneHotEncoder(sparse_output=False, drop='first')

# Encoding the categorical columns in the training set
# encoded_train_features = encoder.fit_transform(train_feats[one_hot_encoding_cols])
# encoded_df = pd.DataFrame(
#     encoded_train_features,
#     columns=encoder.get_feature_names_out(one_hot_encoding_cols)
# )
# train_feats = pd.concat([train_feats.drop(columns=one_hot_encoding_cols), encoded_df], axis=1)

# Encoding the categorical columns in the test set
# encoded_test_features = encoder.transform(test_feats[one_hot_encoding_cols])
# encoded_df = pd.DataFrame(
#     encoded_test_features,
#     columns=encoder.get_feature_names_out(one_hot_encoding_cols)
# )
# test_feats = pd.concat([test_feats.drop(columns=one_hot_encoding_cols), encoded_df], axis=1)

In [26]:
# LABEL ENCODING
# Replace every column in cat_cols (not just one column) with integer codes.
# The codes are learned from the training set and then applied to the test set.
# To encode only a subset, loop over label_encoding_cols instead (currently
# commented out in the cell that builds cat_cols).
encoder = LabelEncoder()

for col in cat_cols:
    # fit_transform refits the shared encoder, so encoder.classes_ always
    # describes the column currently being processed.
    train_feats[col] = encoder.fit_transform(train_feats[col])

    # LabelEncoder.transform raises ValueError for a label it was not fitted
    # on, which would abort the whole submission if the test set contains a
    # category absent from training. Encode only the values seen in training
    # and give the rest the sentinel code -1; when there are no unseen values
    # the resulting codes are exactly what transform() would have produced.
    test_col = test_feats[col]
    seen = test_col.isin(encoder.classes_).to_numpy()
    test_codes = np.full(len(test_col), -1, dtype=np.int64)
    test_codes[seen] = encoder.transform(test_col[seen])
    test_feats[col] = test_codes

In [27]:
# Preview the training features after encoding (first five rows).
train_feats.head()

Unnamed: 0,ID,Material,Type,Shape,Coat/Functional Group,Synthesis_Method,Surface_Charge,Cell_Type,Human_Animal,Cell_Source,...,Cell_Morphology,Cell_Age,Cell Line_Primary Cell,Test,Test_Indicator,No_of_Cells (cells/well),Time (hr),Concentration (ug/ml),Size,Zeta
0,472,20,1,6,23,1,0,26,1,2,...,2,0,0,3,4,-0.418714,-0.453925,-0.24283,-0.667768,0.168256
1,232,2,1,6,4,1,0,31,0,4,...,2,0,0,8,7,-0.418714,-0.453925,-0.208418,-0.525181,-1.072697
2,2041,11,2,6,31,6,2,46,1,2,...,7,0,0,8,7,-0.418714,-0.453925,-0.24283,0.236503,2.215188
3,437,2,1,6,6,1,0,28,1,2,...,5,0,0,8,7,3.232934,-0.453925,-0.275178,-0.64811,-2.742227
4,68,21,2,6,31,6,0,4,1,2,...,1,0,0,9,7,0.282107,-0.453925,4.775996,-0.049194,-0.042834


In [28]:
# Preview the test features after encoding (first five rows).
test_feats.head()

Unnamed: 0,ID,Material,Type,Shape,Coat/Functional Group,Synthesis_Method,Surface_Charge,Cell_Type,Human_Animal,Cell_Source,...,Cell_Morphology,Cell_Age,Cell Line_Primary Cell,Test,Test_Indicator,No_of_Cells (cells/well),Time (hr),Concentration (ug/ml),Size,Zeta
0,32,0,1,6,6,4,0,10,1,2,...,2,2,1,7,7,-0.418714,-0.453925,-0.276554,-0.432264,-0.327486
1,376,6,2,2,31,11,2,32,0,4,...,11,0,1,8,7,-0.381829,1.694532,-0.234227,2.419859,2.704533
2,71,16,1,6,31,4,2,25,1,2,...,1,0,0,7,7,-0.381829,-0.453925,-0.139593,-0.55244,1.2333
3,2232,24,1,6,31,4,0,4,1,2,...,1,0,0,8,7,-0.234287,0.620304,-0.263478,-0.157969,0.149066
4,2018,6,2,2,31,11,2,32,0,4,...,11,0,1,8,7,-0.381829,-0.453925,1.099249,1.826185,2.410287


## Splitting the Data

In [29]:
# Keep the test IDs aside; they are written next to the predictions later.
test_IDs = test_feats["ID"].to_numpy()

# Model inputs are the feature frames without the "ID" column, as NumPy arrays.
X_train = train_feats.drop("ID", axis="columns").to_numpy()
X_test = test_feats.drop("ID", axis="columns").to_numpy()

# Show the shapes of the training inputs, training targets and test inputs.
tuple(arr.shape for arr in (X_train, y_train, X_test))

((1775, 20), (1775,), (762, 20))

In [30]:
# TRAIN TEST SPLIT
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=random_state)
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((1597, 20), (1597,), (178, 20), (178,))

# Training the Model

In [31]:
# Disabled hyper-parameter search, kept for reference only.
# As written it targets a classifier (RandomForestClassifier scored with
# matthews_corrcoef), whereas the model fitted in the next cell is a
# LinearRegression. Neither RandomForestClassifier nor matthews_corrcoef is
# imported in the import cell at the top of this notebook, so both would have
# to be imported before this cell could be re-enabled.
# NOTE(review): GridSearchCV's `scoring` takes a scorer name or scorer
# callable; a bare metric function such as matthews_corrcoef would presumably
# need wrapping with make_scorer -- confirm before re-enabling.
# params = {
#     # "n_estimators": [100, 200, 300, 400, 500],
#     # "criterion": ["gini", "entropy", "log_loss"],
#     # "max_depth": [None, 10, 20, 30, 40],
#     # "min_samples_split": [2, 5, 10],
#     # "min_samples_leaf": [1, 2, 4],
#     # "max_features": ["auto", "sqrt"],
#     # "bootstrap": [True, False],
#     # "oob_score": [True, False],
#     # "warm_start": [True, False],
#     "class_weight": [None, "balanced", "balanced_subsample"],
# }

# grid_search = GridSearchCV(
#     estimator=RandomForestClassifier(random_state=random_state),
#     param_grid=params,
#     scoring=matthews_corrcoef,
#     cv=5,
#     verbose=4,
#     n_jobs=-1,
# )

# grid_search.fit(X_train, y_train)
# grid_search.best_params_

In [32]:
# Fit a linear regression on the training split.
model = LinearRegression().fit(X_train, y_train)

# Compare R-squared on the rows used for fitting with R-squared on the
# held-out validation rows.
train_r2 = r2_score(y_train, model.predict(X_train))
val_r2 = r2_score(y_val, model.predict(X_val))

print("Training R-squared:", train_r2)
print("Validation R-squared:", val_r2)

Training R-squared: 0.20193231259017164
Validation R-squared: 0.1972664950708395


# Inference

In [33]:
# Predict on the test features.
raw_pred = model.predict(X_test)

# NOTE(review): predictions are divided by 100 before being written out. The
# reason is not visible in this notebook -- presumably the submission expects
# a different scale than the training target; confirm.
y_pred = raw_pred / 100.0

# One row per test ID, written without the DataFrame index.
result_df = pd.DataFrame({"ID": test_IDs, "Target": y_pred})
result_df.to_csv("submission.csv", index=False)

# !head submission.csv