In [2]:
import sys
sys.path.append("../../../")

from modules.acquire_data import DataAcquire
from modules.analyze_data import analyze_data
from modules.create_model import create_model

da = DataAcquire()


In [1]:
import wandb



In [3]:
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/takizawakeiya/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtkzw-0529[0m ([33mtkzw-0529-prudential-financial[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
import os 
os.environ["WANDB_API_KEY"] = "d63df136f169bffa17486544a5339caee2aec09c"

In [5]:
wandb.init(project="project_demo")

In [42]:
import os
import random
import numpy as np
import pandas as pd
import wandb

from wandb.integration.lightgbm import log_summary, wandb_callback

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder




In [43]:
print(f"wandb version: {wandb.__version__}")
print(f"lightgbm version: {lgb.__version__}")


wandb version: 0.19.9
lightgbm version: 4.5.0


In [44]:
df_train = da.get_data_and_columns("Train")
categorical_columns = ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']
label_encoders = {}

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    df_train[col] = label_encoders[col].fit_transform(df_train[col].astype(str))

DATA ACQUIRE:COMPLETE

COLUMNS
----------------------------------------
COLUMN :  D-TYPE :  #NANS
id : int64 : 0 nans
Podcast_Name : object : 0 nans
Episode_Title : object : 0 nans
Episode_Length_minutes : float64 : 87093 nans
Genre : object : 0 nans
Host_Popularity_percentage : float64 : 0 nans
Publication_Day : object : 0 nans
Publication_Time : object : 0 nans
Guest_Popularity_percentage : float64 : 146030 nans
Number_of_Ads : float64 : 1 nans
Episode_Sentiment : object : 0 nans
Listening_Time_minutes : float64 : 0 nans



In [45]:
df_train.columns

Index(['id', 'Podcast_Name', 'Episode_Title', 'Episode_Length_minutes',
       'Genre', 'Host_Popularity_percentage', 'Publication_Day',
       'Publication_Time', 'Guest_Popularity_percentage', 'Number_of_Ads',
       'Episode_Sentiment', 'Listening_Time_minutes'],
      dtype='object')

In [46]:
class CFG:
    exp_name = 'exp001'
    test_size = 0.2
    random_state = 529
    learning_rate = 0.1
    num_leaves = 31
    n_estimators = 10000
    feature_fraction = 0.9
    stopping_rounds = 50
    objective = 'regression'
    metric = 'rmse'
    features = ['id', 'Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre', 
               'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time', 
               'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment']
    target = ['Listening_Time_minutes']
    # カテゴリカル特徴量のリストを追加
    categorical_features = ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 
                          'Publication_Time', 'Episode_Sentiment']
    
config = CFG()


In [47]:
def seed_everything(seed: int):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything(config.random_state)
    

In [48]:
def class_to_dict(obj):
    return {k: getattr(obj, k) for k in dir(obj) if not k.startswith('_') and not callable(getattr(obj, k))}

class_to_dict(config)
 

{'categorical_features': ['Podcast_Name',
  'Episode_Title',
  'Genre',
  'Publication_Day',
  'Publication_Time',
  'Episode_Sentiment'],
 'exp_name': 'exp001',
 'feature_fraction': 0.9,
 'features': ['id',
  'Podcast_Name',
  'Episode_Title',
  'Episode_Length_minutes',
  'Genre',
  'Host_Popularity_percentage',
  'Publication_Day',
  'Publication_Time',
  'Guest_Popularity_percentage',
  'Number_of_Ads',
  'Episode_Sentiment'],
 'learning_rate': 0.1,
 'metric': 'rmse',
 'n_estimators': 10000,
 'num_leaves': 31,
 'objective': 'regression',
 'random_state': 529,
 'stopping_rounds': 50,
 'target': ['Listening_Time_minutes'],
 'test_size': 0.2}

In [49]:
wandb.init(project="sample_project",
           config=class_to_dict(config),
           name=config.exp_name,
           )



In [50]:
X_train, X_test, y_train, y_test = train_test_split(df_train[config.features],
                                                    df_train[config.target],
                                                    test_size=config.test_size,
                                                    random_state=config.random_state)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((600000, 11), (150000, 11), (600000, 1), (150000, 1))

In [51]:
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

params = {
    'learning_rate': config.learning_rate,
    'num_leaves': config.num_leaves,
    'objective': config.objective,
    'metric': config.metric,
    'feature_fraction': config.feature_fraction,
}

model = lgb.train(
                  params, 
                  train_data,
                  num_boost_round=config.n_estimators,
                  callbacks = [
                               lgb.early_stopping(stopping_rounds=config.stopping_rounds, verbose=True),
                               lgb.log_evaluation(period=100), wandb_callback()
                              ],
                  valid_sets = [train_data, test_data],
                  )

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002115 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1199
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 11
[LightGBM] [Info] Start training from score 45.410277
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 13.0058	valid_1's rmse: 13.1006
[200]	training's rmse: 12.8889	valid_1's rmse: 13.0675
[300]	training's rmse: 12.7882	valid_1's rmse: 13.0466
[400]	training's rmse: 12.6986	valid_1's rmse: 13.0281
[500]	training's rmse: 12.6177	valid_1's rmse: 13.0154
[600]	training's rmse: 12.5395	valid_1's rmse: 13.0065
[700]	training's rmse: 12.4667	valid_1's rmse: 12.9975
[800]	training's rmse: 12.3996	valid_1's rmse: 12.9896
[900]	training's rmse: 12.3327	valid_1's rmse: 12.9842
[1000]	training's rmse: 12.2715	vali

In [53]:
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE: {rmse}")

# モデルの保存
model.save_model('lgb_model.txt')



RMSE: 12.949750122180982




<lightgbm.basic.Booster at 0x30123bb50>

In [54]:
log_summary(model, save_model_checkpoint=True)

In [55]:
wandb.finish()

0,1
iteration,▁▁▂▃▃▄▅▅▆▆▆▆▆▇▇▇████▁▁▁▂▂▃▃▃▃▄▄▄▄▄▄▅▅▆▆█
training_rmse,█▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▅▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▁
valid_1_rmse,▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_iteration,2006
iteration,2055
