In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import itertools
import scipy
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
greeks = pd.read_csv("greeks.csv")
test_data = pd.read_csv("test.csv")
data = pd.read_csv("train.csv", index_col='Id')
data.columns = data.columns.str.replace(' ', '')
test_data.columns = test_data.columns.str.replace(' ', '')

In [201]:
# Visualize the data in line chart
feature_names = ['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BN', 'BQ', 'BR', 'BZ', 'CB', 'CF', 'CH', 'CL', 'CR', 'CS', 'CU', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DV', 'DY', 'EB', 'EE', 'EG', 'EH', 'EP', 'EU', 'FE', 'FI', 'FR', 'GB', 'GE', 'GF', 'GH', 'GI']
# feature_names = ['AB']

def lineChart(feature_name):
    feature_data = data[feature_name].to_numpy()
    plt.plot(feature_data)
    plt.show()

# remove the outliers using IQR logic
def removeOutliers(feature_data):
    # print(feature_data)
    mean = np.mean(feature_data)
    #print(mean)
    q1, q3 = np.percentile(feature_data, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)
    outliers = [x for x in feature_data if x < lower_bound or x > upper_bound]
    feature_data_replaced = [mean if x < lower_bound or x > upper_bound else x for x in feature_data]
    #print("The count of outliers are:", len(outliers))
    # print("The data with replaced outliers is:", feature_data_replaced)
    return feature_data_replaced

for feature_name in feature_names:
    # lineChart(feature_name)
    feature_data = data[feature_name]
    feature_data_new = removeOutliers(feature_data)
    # print(len(feature_data), len(feature_data_new))
    data[feature_name] = feature_data_new

# convert 'EJ' to a categorical variable, create onehot encoded columns
one_hot_encoded = pd.get_dummies(data['EJ']).add_prefix('EJ_')
df_encoded = pd.concat([data, one_hot_encoded], axis=1)

df_encoded = df_encoded.drop('EJ', axis=1)

data = df_encoded

In [222]:
#replace null columns with mean
numeric_col = [column for column in data.columns if column not in ['Id', 'EJ', 'Class']]
# Prepare the data - encode, remove inf, -inf and fill NaNs
for column in numeric_col:
    data[column] = data[column].replace([np.inf, -np.inf], np.nan)
    data[column] = data[column].fillna(data[column].mean())

In [224]:
from sklearn.utils import resample

# Separate majority and minority classes based on the target variable
majority_class = data[data['Class'] == 0]
minority_class_1 = data[data['Class'] == 1]

In [225]:
# Upsample the minority classes
minority_class_upsampled_1 = resample(minority_class_1,
                                      replace=True,
                                      n_samples=len(majority_class),
                                      random_state=42)


# Concatenate the upsampled minority classes with the majority class
df_upsampled = pd.concat([majority_class, minority_class_upsampled_1])

data = df_upsampled

In [226]:
# Prepare the data - encode, remove inf, -inf and fill NaNs
for column in data.columns:
    if data[column].dtype == 'object':
        data[column] = data[column].astype('category').cat.codes

In [228]:
features = [column for column in data.columns if column not in ['Id', 'Class']]
target = 'Class'

#### ML

In [229]:
# Use Random Forest to get feature importances
rf = RandomForestClassifier(random_state=42)
rf.fit(data[features], data[target])

RandomForestClassifier(random_state=42)

In [230]:
# Get feature importances
importances = rf.feature_importances_

# Create a DataFrame for visualization
feature_importances = pd.DataFrame({'Feature': features, 'Importance': importances})
feature_importances = feature_importances.sort_values('Importance', ascending=False)

In [231]:
# Select top features
top_features = feature_importances['Feature'][:30].append(feature_importances['Feature'][53:54])

In [257]:
# Train/Test Split
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [258]:
# Scaling
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [259]:
# Model Definition with RandomForest and Cross-validation
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_leaf=5, random_state=42)
scores = cross_val_score(rf_model, X_train_scaled, y_train, cv=5, scoring='neg_log_loss')
log_loss_scores = -scores
print(f'Log Loss scores for each fold: {log_loss_scores}')
average_log_loss = np.mean(log_loss_scores)
print(f'Average Log Loss: {average_log_loss}')

Log Loss scores for each fold: [0.19507062 0.24395133 0.18546459 0.20514245 0.23890138]
Average Log Loss: 0.2137060746529642


In [235]:
import lightgbm as lgb
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
 

In [260]:
# Define the LightGBM model
lgb_model = lgb.LGBMClassifier(objective='binary', metric='binary_logloss', reg_lambda=0.3) # reg_alpha=0.2,

# Define the cross-validation strategy
cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

# Define the scorer as log loss
scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

# Perform cross-validation and calculate the scores
scores = cross_val_score(lgb_model, X_train_scaled, y_train, cv=cv, scoring=scorer)
scores = -scores

# Calculate the average score
avg_score = np.mean(scores)

print("Cross-Validation Scores (Log Loss):")
print(scores)
print("Average Log Loss Score:", avg_score)

Cross-Validation Scores (Log Loss):
[0.00880142 0.07640653 0.02517437 0.10919002 0.03238571 0.07740026
 0.01107517 0.06854812 0.07920504 0.00893975]
Average Log Loss Score: 0.049712639272142964


In [248]:
# Fit the model with the training data
lgb_model.fit(X_train_scaled, y_train)

LGBMClassifier(metric='binary_logloss', objective='binary', reg_lambda=0.3)

In [253]:
# Prepare the test data
test_data_clean = test_data[features]
test_data_clean = test_data_clean.replace([np.inf, -np.inf], np.nan)
test_data_clean.fillna(test_data_clean.median(numeric_only=True), inplace=True)
test_data_clean['EJ'] = test_data_clean['EJ'].astype('category').cat.codes
test_data_scaled = scaler.transform(test_data_clean)

In [254]:
# Predictions using the fitted model
test_preds = lgb_model.predict_proba(test_data_scaled)

In [255]:
# Create a submission file
submission = pd.DataFrame(test_data['Id'], columns=['Id'])
submission[['class_0', 'class_1']] = test_preds
#submission.to_csv('submission.csv', index=False)

In [256]:
submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.75089,0.24911
1,010ebe33f668,0.75089,0.24911
2,02fa521e1838,0.75089,0.24911
3,040e15f562a2,0.75089,0.24911
4,046e85c7cc7f,0.75089,0.24911


In [33]:
import xgboost as xgb

from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import make_scorer, log_loss


# Define the base XGBoost model
base_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

# Wrap the base model with MultiOutputClassifier
#model = MultiOutputClassifier(base_model)

# Define the scorer as log loss
scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

# Perform cross-validation and calculate the scores
scores = cross_val_score(base_model, X_train_scaled, y_train, scoring=scorer)#cv=5,

# Calculate the average score
avg_score = np.mean(scores)

print("Cross-Validation Scores (Log Loss):")
print(scores)
print("Average Log Loss Score:", avg_score)
#try lgbm without the minus before scorer tomorrow

Cross-Validation Scores (Log Loss):
[-0.10276949 -0.04446035 -0.065472   -0.0652326  -0.08916046]
Average Log Loss Score: -0.07341897996701433


In [294]:
# Fit the model with the training data
base_model.fit(X_train_scaled, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)

In [295]:
# Predictions using the fitted model
test_preds = base_model.predict_proba(test_data_scaled)

In [296]:
test_preds

array([[0.4411614, 0.5588386],
       [0.4411614, 0.5588386],
       [0.4411614, 0.5588386],
       [0.4411614, 0.5588386],
       [0.4411614, 0.5588386]], dtype=float32)