In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
import lightgbm as lgb

# Load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Split the training data into training and validation sets
train_data, val_data, train_labels, val_labels = train_test_split(train_df['text'], train_df['label'], test_size=0.2, random_state=42)

# Preprocess the text data using CountVectorizer
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_data).astype(np.float64)
val_features = vectorizer.transform(val_data).astype(np.float64)
test_features = vectorizer.transform(test_df['text']).astype(np.float64)

# Create a LightGBM classifier and specify its hyperparameters
params = {
    'objective': 'multiclass',
    'num_class': 8,
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
clf = lgb.LGBMClassifier(**params)

# Train the LightGBM model
clf.fit(train_features, train_labels, eval_set=[(val_features, val_labels)], early_stopping_rounds=10)

# Evaluate the performance of the model on the validation set
val_pred = clf.predict(val_features)
val_f1_score = f1_score(val_labels, val_pred, average='macro')
print('Validation F1 score:', val_f1_score)

# Make predictions on the test data and save to a CSV file
test_pred = clf.predict(test_features)
submission_df = pd.DataFrame({'id': test_df['id'], 'label': test_pred})
submission_df.to_csv('submission.csv', index=False)



You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's multi_logloss: 1.46008
[2]	valid_0's multi_logloss: 1.35935
[3]	valid_0's multi_logloss: 1.276
[4]	valid_0's multi_logloss: 1.20615
[5]	valid_0's multi_logloss: 1.14393
[6]	valid_0's multi_logloss: 1.08876
[7]	valid_0's multi_logloss: 1.03917
[8]	valid_0's multi_logloss: 0.995053
[9]	valid_0's multi_logloss: 0.955987
[10]	valid_0's multi_logloss: 0.919875
[11]	valid_0's multi_logloss: 0.887069
[12]	valid_0's multi_logloss: 0.855921
[13]	valid_0's multi_logloss: 0.828462
[14]	valid_0's multi_logloss: 0.803111
[15]	valid_0's multi_logloss: 0.779036
[16]	valid_0's multi_logloss: 0.756383
[17]	valid_0's multi_logloss: 0.735838
[18]	valid_0's multi_logloss: 0.716171
[19]	valid_0's multi_logloss: 0.697567
[20]	valid_0's multi_logloss: 0.680118
[21]	valid_0's multi_logloss: 0.663795
[22]	valid_0's multi_logloss: 0.648961
[23]	valid_0's multi_logloss: 0.634907
[24]	valid_0's multi_logloss: 0.621961
[25]	valid_0's multi_log