# KAGGLE - Binary Classification code

## 1. Import libraries

In [None]:
import pandas as pd
import numpy as np
import optuna
import os
import matplotlib.pyplot as plt
import statsmodels.api as sm
from datetime import datetime
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import log_loss, roc_auc_score, recall_score, precision_score, average_precision_score, f1_score, classification_report, accuracy_score,confusion_matrix, silhouette_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import make_scorer

from imblearn.under_sampling import RandomUnderSampler
import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")
# Switch the working directory to the author's local checkout and echo it.
# NOTE(review): this absolute path is machine-specific; os.chdir raises if
# the directory does not exist.
os.chdir("/Users/yj.noh/Documents/GitHub")
print(os.getcwd())

## 2. Load dataset

In [None]:
# Read the train and test splits from the local kaggle-Binary checkout.
train = pd.read_csv(
    "/Users/yj.noh/Documents/GitHub/kaggle-Binary/train.csv",
    encoding="utf-8",
)
test = pd.read_csv(
    "/Users/yj.noh/Documents/GitHub/kaggle-Binary/test.csv",
    encoding="utf-8",
)

# Shapes on separate lines, train first.
# Sizes noted by the author: train 101,763 rows x 23 columns; test 67,842 rows.
print(train.shape, test.shape, sep="\n")

## 3. Data EDA

In [None]:
# Missing-value count per column, train first and then test.
print(train.isna().sum(), test.isna().sum(), sep="\n")

In [None]:
# Descriptive statistics of the training frame, shown as the cell output.
train.describe()

In [None]:
# Print column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Are there any variables that could be treated as categorical?
# Count the distinct values in each column to find out.
n = train.nunique(axis=0) 
print("No.of.unique values :",  n)

In [None]:
# Duplicate check: keep the rows that fully repeat an earlier row and
# display them as the cell output.
duplicates = train[train.duplicated()]
duplicates

In [None]:
# Cast the outcome column to int.
train["defects"] = train["defects"].astype(int) 

# Class balance of the target.
print(train["defects"].value_counts()) # counts noted by the author -> 0: 78699, 1: 23064

In [None]:
# Feature matrices: remove the row identifier and, for train, the label too.
X_train = train.drop(columns=['defects', 'id'])
X_test = test.drop(columns=['id'])

# Label vector, flattened from a single-column frame into a 1-D array.
y_train = train[['defects']].to_numpy().ravel()

print(y_train.shape)



In [None]:
# Pairwise Pearson correlations between the features, drawn as an annotated
# heatmap on a large square canvas.
cor = X_train.corr(method='pearson')
fig, ax = plt.subplots(figsize=(18, 18))
ax = sns.heatmap(cor, annot=True, ax=ax)
plt.show()

In [None]:
# Show the correlation matrix itself as the cell output.
cor

## 4. scale

In [None]:
# Column labels of both feature frames, train first and then test.
print(X_train.columns, X_test.columns, sep="\n")

In [None]:
# Learn the min-max ranges from the training features only, then apply the
# same transform to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames with the original row and
# column labels.
X_train = pd.DataFrame(
    data=X_train_scaled,
    index=X_train.index,
    columns=X_train.columns,
)
X_test = pd.DataFrame(
    data=X_test_scaled,
    index=X_test.index,
    columns=X_test.columns,
)

X_train

## 5. GLM - benchmark model

In [None]:
# Benchmark model: binomial GLM (logistic regression) on all scaled features.
# statsmodels does not add an intercept on its own, so a constant column is
# prepended here; without it the model is forced through the origin and the
# coefficient estimates and p-values in the summary are distorted.
glm_1 = sm.GLM(y_train, sm.add_constant(X_train), family=sm.families.Binomial())
glm1_fit = glm_1.fit()
print(glm1_fit.summary())

In [None]:
# Variables left out of the second GLM, chosen from the p-values of the first.
# NOTE(review): the original note reads "exclude the ones with low p-value";
# presumably the statistically insignificant terms are meant - confirm.
excluded_variables = [
    'v(g)', 'ev(g)', 'v', 'd', 'e', 'b', 'total_Op', 'branchCount',
]
X_train_v2 = X_train.drop(columns=excluded_variables)
print(X_train_v2.columns)

In [None]:
# Second benchmark: binomial GLM on the reduced feature set X_train_v2.
# statsmodels does not add an intercept on its own, so a constant column is
# prepended here; without it the model is forced through the origin and the
# coefficient estimates and p-values in the summary are distorted.
glm_2 = sm.GLM(y_train, sm.add_constant(X_train_v2), family=sm.families.Binomial())
glm2_fit = glm_2.fit()
print(glm2_fit.summary())

### 5.1.Recursive Feature Elimination

In [None]:
# Recursive feature elimination driven by a logistic-regression estimator.
# This runs unconditionally, like every other cell: the previous
# `if __name__ == "__main__":` guard defined `selected_columns` only when the
# file was executed as the main module, while X_train_v3 below (and the
# class-weight search that uses it) needed it in every case.
lr = LogisticRegression()

# step=10 removes ten features per elimination round; n_features_to_select is
# left at its default.
rfe = RFE(lr, step=10)
fit = rfe.fit(X_train, y_train)
print(f"Features: {X_train.columns}")
print(f"Num Features: {fit.n_features_}")
print(f"Selected Features: {fit.support_}")
print(f"Feature Ranking: {fit.ranking_}")

# Names of the columns whose support mask entry is True, in column order.
selected_columns = X_train.columns[fit.support_].tolist()
print(f"Selected columns: {selected_columns}")

# Feature set restricted to the columns RFE kept.
X_train_v3 = X_train[selected_columns]

In [None]:
# Candidate class-weight pairs: the weight of class 0 runs from 0.10 to 0.60
# in steps of 0.05 and class 1 receives the complement.
class_weights_to_test = [
    {0: pct / 100, 1: (100 - pct) / 100} for pct in range(10, 65, 5)
]

best_class_weights = []
best_accuracies = []

# For each feature set, keep the class weighting with the highest mean
# 10-fold cross-validated accuracy (the first one wins on ties).
for data in [X_train, X_train_v2, X_train_v3]:
    best_class_weight, best_accuracy = None, 0.0

    for class_weight in class_weights_to_test:
        model = LogisticRegression(class_weight=class_weight)
        fold_scores = cross_val_score(
            model,
            data,
            y_train,
            cv=10,
            scoring=make_scorer(accuracy_score),
        )
        accuracy = fold_scores.mean()

        if accuracy > best_accuracy:
            best_accuracy, best_class_weight = accuracy, class_weight

    best_class_weights.append(best_class_weight)
    best_accuracies.append(best_accuracy)

# Print the results, one line per feature set.
for i, (best_class_weight, accuracy) in enumerate(zip(best_class_weights, best_accuracies), start=1):
    print(f"Data (X_train_v{i}) - Best Class Weight: {best_class_weight}, Accuracy: {accuracy:.4f}")



## 6. LGBM

In [None]:
def objective(trial):
    """Optuna objective: mean 10-fold CV accuracy of a LightGBM classifier.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean accuracy over the 10 cross-validation folds, computed on the
        module-level ``X_train`` / ``y_train``.
    """
    # Sampling order is kept: learning_rate, n_estimators, max_depth, num_leaves.
    search_space = dict(
        learning_rate=trial.suggest_float("learning_rate", 0.005, 0.2),
        n_estimators=trial.suggest_int("n_estimators", 50, 200),
        max_depth=trial.suggest_int("max_depth", 3, 7),
        num_leaves=trial.suggest_int("num_leaves", 16, 64),
    )
    classifier = lgb.LGBMClassifier(**search_space)

    fold_scores = cross_val_score(
        classifier, X_train, y_train, cv=10, scoring='accuracy', n_jobs=-1
    )
    return fold_scores.mean()


# Search LightGBM hyperparameters with Optuna, maximising the value returned
# by objective() over 50 trials.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Rebuild a classifier from the best hyperparameters found.
best_params = study.best_params
best_lgbm_model = lgb.LGBMClassifier(**best_params)

# Re-score the chosen configuration with the same 10-fold CV accuracy.
accuracy = cross_val_score(best_lgbm_model, X_train, y_train, cv=10, scoring='accuracy', n_jobs=-1).mean()

print(f'Best LightGBM Model Accuracy: {accuracy:.4f}')

# Fit on the full training set and predict class labels for the test features.
best_lgbm_model.fit(X_train, y_train)
y_pred = best_lgbm_model.predict(X_test)

# The 'id' column was dropped when X_test was built, so X_test['id'] raises
# KeyError. The ids are read from the raw `test` frame instead; X_test keeps
# test's index, so rows stay aligned with y_pred.
# NOTE(review): the output header is kept as 'ID' although the source column
# is 'id' - confirm against the expected submission format.
output_df = pd.DataFrame({'ID': test['id'], 'defects': y_pred})

output_df.to_csv('predictions.csv', index=False)
