In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [None]:
# Load data/train.csv into a DataFrame and preview its first five rows.
train = pd.read_csv(filepath_or_buffer='data/train.csv')
train.head(n=5)

In [None]:
# Load data/test.csv into a DataFrame and preview its first five rows.
test = pd.read_csv(filepath_or_buffer='data/test.csv')
test.head(n=5)

In [None]:
# Keep the test ids for the submission file. `.copy()` makes `submission` an
# independent frame, so adding a prediction column to it later does not raise
# pandas' SettingWithCopyWarning.
submission = test[['id']].copy()

# Drop `id` from both frames so it is not picked up as a model feature.
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

In [None]:
# Keep only rows with a strictly positive target. `.copy()` detaches the
# filtered result so that later column assignments on `train` do not trigger
# pandas' SettingWithCopyWarning.
train = train[train['target'] > 0].copy()

In [None]:
# Features: every training column except the target.
x_train = train.loc[:, train.columns != 'target']

# Target kept as a single-column DataFrame (2-D), which is the shape the
# GaussianMixture fit further down is given.
y_train = train[['target']]

# Take a copy rather than an alias: `test` gets extra columns added to it
# later in the notebook, and an alias would silently carry those columns into
# the model's feature matrix if the modelling cell were re-run.
x_test = test.copy()

In [None]:
from sklearn.mixture import GaussianMixture
# Two-component Gaussian mixture fitted on the target values alone.
gm = GaussianMixture(n_components=2, random_state=0)
gm.fit(y_train)

# Most likely mixture component for each training row.
y_train2 = gm.predict(y_train)

In [None]:
# Colour the target histogram by mixture component. The component labels are
# attached to a temporary frame via `assign`, so `train` itself never gains a
# target-derived `class` column (which would end up among the features if the
# feature-selection cell were re-run) and no SettingWithCopyWarning is raised.
fig = px.histogram(
    train.assign(**{'class': y_train2}),
    x='target',
    color='class',
    marginal="box",
    barmode="overlay",
    nbins=200,
)
fig.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
# Fit a default k-nearest-neighbours classifier that predicts the mixture
# component from the features.
clf = KNeighborsClassifier().fit(x_train, y_train2)

# NOTE: predictions are made on the same rows the classifier was fitted on,
# so the percentage displayed below is an in-sample (training) accuracy.
pred = clf.predict(x_train)
np.round(accuracy_score(y_true=y_train2, y_pred=pred) * 100., 2)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
N_SPLITS = 5  # default number of cross-validation folds
SEED = 42     # default random seed (fold shuffling and the forest itself)


def run_model(X, y, X_test, n_splits=N_SPLITS, seed=SEED):
    """
    Train a random forest with K-fold cross-validation and predict X_test.

    Baseline is based on
    https://www.kaggle.com/ttahara/tps-jan-2021-gbdts-baseline

    One RandomForestRegressor is fitted per fold. Each fold's model scores its
    held-out rows (RMSE is printed) and also predicts X_test; the returned
    test prediction is the mean over the per-fold models.

    Arg:
    * X: training data containing features (pandas DataFrame; rows are
      selected positionally with .iloc)
    * y: training data containing target variables; a 1-D sequence or a
      single-column DataFrame/array with one value per row of X
    * X_test: test data to predict
    * n_splits: number of folds (defaults to N_SPLITS)
    * seed: random seed used for both the fold shuffling and the random
      forest, so repeated runs give the same result (defaults to SEED)

    Returns:
    * predictions for X_test (1-D numpy array of length len(X_test))

    Raises:
    * ValueError: if y does not hold exactly one value per row of X
    """
    # Flatten the target to 1-D. Passing a column vector (n, 1) to
    # RandomForestRegressor.fit works but emits a DataConversionWarning
    # on every fold.
    y = np.ravel(y)
    if len(y) != len(X):
        raise ValueError(
            f"y must hold one target value per row of X "
            f"(got {len(y)} values for {len(X)} rows)"
        )

    # Out-of-fold predictions for the training rows and the running sum of
    # the per-fold test predictions.
    y_oof_pred = np.zeros(len(X))
    y_test_pred = np.zeros(len(X_test))

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for fold, (train_idx, val_idx) in enumerate(kf.split(X), start=1):
        print(f"Fold {fold}:")

        # Prepare training and validation data
        X_fit, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_fit, y_val = y[train_idx], y[val_idx]

        # Define model; seeded so the whole run is reproducible
        reg = RandomForestRegressor(random_state=seed)
        reg.fit(X_fit, y_fit)

        # Calculate evaluation metric: Root Mean Squared Error (RMSE)
        y_val_pred = reg.predict(X_val)
        score = np.sqrt(mean_squared_error(y_val, y_val_pred))
        print(f"RMSE: {score:.5f}\n")

        y_oof_pred[val_idx] = y_val_pred

        # Accumulate this fold's test predictions
        y_test_pred += reg.predict(X_test)

    # Calculate evaluation metric for out of fold validation set
    oof_score = np.sqrt(mean_squared_error(y, y_oof_pred))
    print(f"OOF RMSE: {oof_score:.5f}")

    # Average predictions over all folds
    return y_test_pred / n_splits


# Cross-validated training run; y_pred holds the fold-averaged test predictions.
y_pred = run_model(X=x_train, y=y_train, X_test=x_test)

In [None]:
# Attach the predictions to the submission frame. `assign` returns a new
# frame, so this cell can be re-run safely and never writes into a slice of
# `test` (which is what raised SettingWithCopyWarning before).
submission = submission.assign(target=y_pred)

# Build the comparison frame from temporary copies instead of adding `target`
# and `label` columns to `train` / `test` in place: mutating `test` would also
# change any frame that aliases it (such as the model's test features).
df = pd.concat([
    train.assign(label='train'),
    test.assign(target=y_pred, label='test'),
])

# Overlay the observed training targets and the predicted test targets.
fig = px.histogram(df, x='target', color='label', marginal="box", barmode="overlay")
fig.show()

In [None]:
# Write `submission` to submission.csv, leaving the row index out of the file.
submission.to_csv(path_or_buf='submission.csv', index=False)