In [None]:

import category_encoders as ce
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
def load_classification_data(train_path="train.csv", test_path="test.csv"):
    """Load the classification train/test CSV files and pick the target column.

    Parameters
    ----------
    train_path : str or path-like, optional
        Location of the training CSV. Defaults to ``"train.csv"``.
    test_path : str or path-like, optional
        Location of the test CSV. Defaults to ``"test.csv"``.

    Returns
    -------
    tuple
        ``(train_df, test_df, target_col)`` where ``target_col`` is
        ``"target"`` when the training frame has a column with that name,
        and the training frame's last column otherwise.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    # Fall back to the last column when there is no column literally named "target".
    target_col = "target" if "target" in train_df.columns else train_df.columns[-1]
    return train_df, test_df, target_col
# Read both CSVs once and keep the frames plus the detected target column name
# as notebook-level variables for the cells below.
train_df, test_df, target_col = load_classification_data()


In [None]:
# Build the feature lists from the predictors only. Dropping the target first
# means (a) an object/category target can never end up in `cat_features` and be
# imputed/encoded as if it were a feature, and (b) there is no `list.remove`
# call that raises ValueError when the target is not an int64/float64 column.
feature_df = train_df.drop(columns=[target_col])
cat_features = feature_df.select_dtypes(include=["object", "category"]).columns.tolist()
num_features = feature_df.select_dtypes(include=["int64", "float64"]).columns.tolist()


In [None]:
imputer = SimpleImputer(strategy="most_frequent")
train_df[cat_features] = imputer.fit_transform(train_df[cat_features])
test_df[cat_features] = imputer.transform(test_df[cat_features])


In [None]:
encoder = ce.TargetEncoder(cols=cat_features)
train_df[cat_features] = encoder.fit_transform(train_df[cat_features], train_df[target_col])
test_df[cat_features] = encoder.transform(test_df[cat_features])


In [None]:
scaler = StandardScaler()
train_df[num_features] = scaler.fit_transform(train_df[num_features])
test_df[num_features] = scaler.transform(test_df[num_features])

In [None]:
X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop(columns=[target_col]), train_df[target_col], test_size=0.2, random_state=42, stratify=train_df[target_col]
)

In [None]:
# Hyper-parameter candidates for the LightGBM search: two values for each of
# four parameters (16 combinations). max_depth=-1 is LightGBM's "no limit" value.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1],
    num_leaves=[15, 20],
    max_depth=[-1, 10],
)

In [None]:
# Exhaustive search over `param_grid` with 3-fold cross-validation on the
# training split; candidates are ranked by mean CV accuracy.
# GridSearchCV must be imported from sklearn.model_selection in the notebook's
# import cell -- this cell fails with NameError on a fresh kernel otherwise.
# NOTE(review): the captured training log reports ~18.7% positives, so accuracy
# has a high majority-class baseline; consider whether another metric fits better.
grid_search = GridSearchCV(
    LGBMClassifier(random_state=42),
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
grid_search.fit(X_train, y_train)

# With the default refit=True, best_estimator_ is the winning configuration
# retrained on all of X_train.
best_model = grid_search.best_estimator_
# Class predictions for the held-out validation rows.
predictions = best_model.predict(X_val)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[LightGBM] [Info] Number of positive: 89858, number of negative: 390142
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031611 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1789
[LightGBM] [Info] Number of data points in the train set: 480000, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.187204 -> initscore=-1.468280
[LightGBM] [Info] Start training from score -1.468280


In [None]:
# Share of validation rows whose predicted class equals the true label.
accuracy = accuracy_score(y_true=y_val, y_pred=predictions)

In [None]:
# Show the validation accuracy as the cell output.
# NOTE(review): the LightGBM log in this notebook reports pavg ~= 0.187, so
# always predicting the majority class would already score roughly 0.81 --
# read this number against that baseline.
accuracy

0.827725

In [None]:
# Regression: predict gemstone price with XGBoost
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor



In [None]:
# Load the gemstone training data used for the regression task.
# NOTE: this rebinds `train_df`, replacing the classification frame that the
# earlier cells of this notebook stored under the same name.
train_df = pd.read_csv("gemstone_train.csv")


In [None]:
# Separate the regression target (price) from its predictors.
target_col = "price"
y_reg = train_df[target_col]
X_reg = train_df.drop(columns=target_col)

In [None]:
# Partition the predictor columns by dtype: numeric columns vs. everything else.
num_cols = X_reg.select_dtypes(include="number").columns
cat_cols = X_reg.select_dtypes(exclude="number").columns


In [None]:
# Numeric gaps are filled with each column's median.
X_reg[num_cols] = X_reg[num_cols].fillna(X_reg[num_cols].median())

# Non-numeric gaps are filled with each column's most frequent value (first
# row of DataFrame.mode()). mode() returns a frame with no rows when there are
# no non-numeric columns (or none of them has a non-missing value), in which
# case `.iloc[0]` would raise IndexError -- so only fill when a mode exists.
cat_modes = X_reg[cat_cols].mode()
if not cat_modes.empty:
    X_reg[cat_cols] = X_reg[cat_cols].fillna(cat_modes.iloc[0])

In [None]:
# One-hot encode the non-numeric predictors; drop_first=True omits the first
# level of each encoded column so the indicators are not redundant.
X_reg = pd.get_dummies(data=X_reg, drop_first=True)



In [None]:
# Row-wise summary features computed across the original numeric columns only
# (`num_cols` was captured before these columns were added, so the new columns
# never feed into one another).
# NOTE(review): if `num_cols` contains an identifier-like column it will
# dominate these aggregates -- confirm against the dataset schema.
numeric_part = X_reg[num_cols]
X_reg["sum_features"] = numeric_part.sum(axis=1)
X_reg["mean_features"] = numeric_part.mean(axis=1)
X_reg["std_features"] = numeric_part.std(axis=1)
# The 1e-5 offset keeps the denominator away from zero when the row minimum is 0.
X_reg["max_min_ratio"] = numeric_part.max(axis=1) / (numeric_part.min(axis=1) + 1e-5)



In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split reproducible.
X_train_reg, X_val_reg, y_train_reg, y_val_reg = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits. By default the results are NumPy arrays, so
# the DataFrame column names are not carried forward.
scaler = StandardScaler().fit(X_train_reg)
X_train_reg = scaler.transform(X_train_reg)
X_val_reg = scaler.transform(X_val_reg)


In [None]:
# Base regressor whose hyper-parameters are tuned below.
reg_model = XGBRegressor(objective="reg:squarederror", random_state=42)

# Two values for each of four parameters -> 16 candidate configurations.
# NOTE: this rebinds `param_grid`, replacing the LightGBM grid defined earlier.
param_grid = {
    "n_estimators": [100, 200],
    "learning_rate": [0.01, 0.1],
    "max_depth": [3, 7],
    "subsample": [0.8, 1.0],
}

# 3-fold cross-validated search ranked by R^2.
# GridSearchCV must be imported from sklearn.model_selection in the notebook's
# import cell -- this cell fails with NameError on a fresh kernel otherwise.
grid_search = GridSearchCV(
    reg_model,
    param_grid,
    cv=3,
    scoring="r2",
    verbose=3,
    n_jobs=-1,
)
grid_search.fit(X_train_reg, y_train_reg)

# With the default refit=True, best_estimator_ is the winning configuration
# retrained on the full training split.
best_model = grid_search.best_estimator_


Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [None]:
# Coefficient of determination (R^2) of the tuned model on the held-out rows.
# NOTE(review): the variable shares its name with sklearn.metrics.r2_score;
# importing that function later in this notebook would be shadowed by it.
r2_score = best_model.score(X=X_val_reg, y=y_val_reg)
r2_score

0.9781777262687683