In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score

# Baseline check: fit a binary LightGBM classifier on TF-IDF features where the
# label is only the fold id of a shuffled 2-way split (not derived from the
# essay text), then report ROC AUC on a held-out fold.
# NOTE(review): presumably this is the reference point for the train-vs-test
# classifier in the next cell -- confirm intent with the author.
train = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')
test = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')

# 'target' is 0 or 1: which half of a shuffled, score-stratified 2-fold split
# the row landed in. Assigning through .loc creates the column as float.
kfold = StratifiedKFold(n_splits = 2, shuffle = True, random_state = 1)
for num, (train_index, val_index) in enumerate(kfold.split(train, train['score'])):
    train.loc[val_index, 'target'] = int(num)

# 'fold' is an independent 5-way split (different seed); fold 0 is the hold-out.
kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 2)
for num, (train_index, val_index) in enumerate(kfold.split(train, train['score'])):
    train.loc[val_index, 'fold'] = int(num)

# The TF-IDF vocabulary is fitted on every training essay, hold-out rows included.
vec = TfidfVectorizer(max_features = 10000)
data = vec.fit_transform(train['full_text'])

# Row labels double as positions in the sparse matrix because `train` still has
# the default index produced by read_csv.
train_ind = train[train['fold'] != 0].index
valid_ind = train[train['fold'] == 0].index
x_train, x_valid = data[train_ind], data[valid_ind]
y_train, y_valid = train['target'][train_ind], train['target'][valid_ind]
train_dataset = lgb.Dataset(x_train, y_train)
valid_dataset = lgb.Dataset(x_valid, y_valid)

# Binary-classification settings; the next cell reads `params` as well.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting': 'gbdt',
    'seed': 42,
    'num_leaves': 31,
    'learning_rate': 0.1,
    'n_jobs': -1,
}

# `verbose` is a boolean flag on lgb.early_stopping; the previous value (50)
# was merely truthy and did not set a logging period. If a metric line every
# 50 rounds is wanted, add lgb.log_evaluation(period=50) to `callbacks`.
early_stopping = lgb.early_stopping(50, verbose = True)

# num_boost_round is only an upper bound; the early-stopping callback ends
# training after 50 rounds without improvement.
model = lgb.train(
    params = params,
    train_set = train_dataset,
    num_boost_round = 10000,
    valid_sets = [train_dataset, valid_dataset],
    callbacks = [early_stopping]
)
pred = model.predict(x_valid)
score = roc_auc_score(y_valid, pred)
print(f'Roc Auc Score: {score}')

In [None]:
# Train-vs-test classifier: label every training essay 0 and every test essay 1,
# fit a binary LightGBM model on TF-IDF features and report hold-out ROC AUC.
# The resulting `score` is read by the cell that writes the submission.
train = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')
test = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')

train['target'] = 0.0
test['target'] = 1.0
# From here on `train` holds BOTH sets stacked, re-indexed 0..n-1 so that row
# labels can be used as positions in the sparse feature matrix below.
train = pd.concat([train, test], axis = 0, ignore_index = True)

# 3-way split stratified on the train/test label; fold 0 is the hold-out.
kfold = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 1)
for num, (train_index, val_index) in enumerate(kfold.split(train, train['target'])):
    train.loc[val_index, 'fold'] = int(num)

vec = TfidfVectorizer(max_features = 10000)
data = vec.fit_transform(train['full_text'])

train_ind = train[train['fold'] != 0].index
valid_ind = train[train['fold'] == 0].index
x_train, x_valid = data[train_ind], data[valid_ind]
y_train, y_valid = train['target'][train_ind], train['target'][valid_ind]
train_dataset = lgb.Dataset(x_train, y_train)
valid_dataset = lgb.Dataset(x_valid, y_valid)

# Pin objective/metric here instead of trusting the global `params` as-is: the
# submission cell rebinds `params` to a regression config, so re-running this
# cell afterwards would otherwise fit a regressor on the 0/1 labels.
adv_params = {**params, 'objective': 'binary', 'metric': 'auc'}

# Use a fresh early-stopping callback for this run rather than the object that
# already went through a previous lgb.train call.
# NOTE(review): older LightGBM releases appear to keep best-score bookkeeping
# inside the callback between runs -- a new callback avoids depending on that.
model = lgb.train(
    params = adv_params,
    train_set = train_dataset,
    num_boost_round = 10000,
    valid_sets = [train_dataset, valid_dataset],
    callbacks = [lgb.early_stopping(50, verbose = True)]
)
pred = model.predict(x_valid)
score = roc_auc_score(y_valid, pred)
print(f'Roc Auc Score: {score}')

In [None]:
# Write submission.csv. `score` is whatever ROC AUC the most recently executed
# cell above left in the namespace.
if score >= 0.60:
    # The classifier reached AUC >= 0.60: write the sample submission unchanged
    # instead of model predictions.
    sub = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv')
    sub.to_csv('submission.csv', index = False)
else:
    # Otherwise fit a TF-IDF + LightGBM regressor on the full training set and
    # predict the essay score for every test row.
    train = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')
    vec = TfidfVectorizer(max_features = 10000)
    train_transform = vec.fit_transform(train['full_text'])
    train_dataset = lgb.Dataset(train_transform, train['score'])
    # NOTE: this rebinds the global `params` (previously the binary-classifier
    # settings) to a regression configuration.
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting': 'gbdt',
        'seed': 42,
        'num_leaves': 31,
        'learning_rate': 0.1,
        'n_jobs': -1
    }
    # Fixed 300 rounds, no early stopping; valid_sets only makes the training
    # RMSE available for logging.
    model = lgb.train(
        params = params,
        train_set = train_dataset,
        num_boost_round = 300,
        valid_sets = [train_dataset],
    )
    test = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')
    # Reuse the vocabulary fitted on the training essays.
    test_transform = vec.transform(test['full_text'])
    # A regressor's raw output is unbounded, so clamp it to the label range
    # observed in training before rounding; otherwise a prediction could round
    # to a score that never occurs in the training data.
    score_min, score_max = train['score'].min(), train['score'].max()
    test_pred = np.clip(model.predict(test_transform), score_min, score_max).round().astype(np.int32)
    test['score'] = test_pred
    test = test[['essay_id', 'score']]
    test.to_csv('submission.csv', index = False)