In [1]:
import xgboost as xgb
import sklearn

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc,accuracy_score,roc_auc_score
from sklearn.model_selection import StratifiedKFold
# Onehot encoding

from sklearn.preprocessing import LabelEncoder

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy import sparse
import gc
import os
import random

In [2]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    #torch.manual_seed(seed)
    #torch.cuda.manual_seed(seed)
    #torch.backends.cudnn.deterministic = True
    #torch.backends.cudnn.benchmark = False
seed_everything()

In [3]:
'''
!mkdir input/
%cd input/
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
!unzip NewsAggregatorDataset.zip
'''

'\n!mkdir input/\n%cd input/\n!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip\n!unzip NewsAggregatorDataset.zip\n'

In [4]:
# 読込時のエラー回避のためダブルクォーテーションをシングルクォーテーションに置換
#!sed -e 's/"/'\''/g' ./input/newsCorpora.csv > ./input/newsCorpora_re.csv
#%cd ..

In [5]:


# データの読込
df = pd.read_csv('./input/newsCorpora_re.csv', header=None, sep='\t', names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])

# データの抽出
df = df.loc[df['PUBLISHER'].isin(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']), ['TITLE', 'CATEGORY']]



In [6]:
df.head()

Unnamed: 0,TITLE,CATEGORY
12,Europe reaches crunch point on banking union,b
13,ECB FOCUS-Stronger euro drowns out ECB's messa...,b
19,"Euro Anxieties Wane as Bunds Top Treasuries, S...",b
20,Noyer Says Strong Euro Creates Unwarranted Eco...,b
29,REFILE-Bad loan triggers key feature in ECB ba...,b


In [7]:
# データの分割
df_train, df_valid_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=123, stratify=df['CATEGORY'])
df_valid, df_test = train_test_split(df_valid_test, test_size=0.5, shuffle=True, random_state=123, stratify=df_valid_test['CATEGORY'])
df_train.reset_index(drop=True, inplace=True)
df_valid.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

print(df_train.head())

                                               TITLE CATEGORY
0  REFILE-UPDATE 1-European car sales up for sixt...        b
1  Amazon Plans to Fight FTC Over Mobile-App Purc...        t
2  Kids Still Get Codeine In Emergency Rooms Desp...        m
3  What On Earth Happened Between Solange And Jay...        e
4  NATO Missile Defense Is Flight Tested Over Hawaii        b


In [10]:
vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        stop_words= None,ngram_range=(1,3),dtype=np.float32)
vect_char = TfidfVectorizer(max_features=40000, lowercase=True, analyzer='char',
                        stop_words= None,ngram_range=(3,6),dtype=np.float32)

In [11]:
# Word ngram vector
tr_vect = vect_word.fit_transform(df_train['TITLE'])
vl_vect = vect_word.transform(df_valid['TITLE'])
ts_vect = vect_word.transform(df_test['TITLE'])

# Character n gram vector
tr_vect_char = vect_char.fit_transform(df_train['TITLE'])
vl_vect_char = vect_char.transform(df_valid['TITLE'])
ts_vect_char = vect_char.transform(df_test['TITLE'])
gc.collect()

0

In [12]:
tr_vect.shape

(10684, 20000)

In [13]:
X = sparse.hstack([tr_vect, tr_vect_char])
x_val = sparse.hstack([vl_vect, vl_vect_char])
x_test = sparse.hstack([ts_vect, ts_vect_char])

In [14]:
le = LabelEncoder()
y_tr = le.fit_transform(df_train['CATEGORY'].values)
y_vl = le.transform(df_valid['CATEGORY'].values)
y_te = le.transform(df_test['CATEGORY'].values)

In [15]:
svd = TruncatedSVD(n_components=300, random_state=42)
X = svd.fit_transform(tr_vect)
x_val = svd.transform(vl_vect)
x_test = svd.transform(ts_vect)

In [16]:
X.shape

(10684, 300)

In [17]:
y_vl.shape

(1336,)

In [18]:
y_tr.shape

(10684,)

In [19]:
model = xgb.XGBClassifier()
model.fit(X, y_tr)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [20]:
# 検証データを予測する
y_pred = model.predict_proba(x_val)
y_pred_max = np.argmax(y_pred, axis=1)  # 最尤と判断したクラスの値にする

accuracy = sum(y_vl == y_pred_max) / len(y_vl)
print(accuracy)

0.8450598802395209


In [21]:
print(roc_auc_score(y_vl, y_pred, multi_class='ovo'))
print(log_loss(y_vl, y_pred))

0.9338559146568496
0.5067125110771251


In [22]:
# 評価データを予測する
y_pred = model.predict_proba(x_test)
y_pred_max = np.argmax(y_pred, axis=1)  # 最尤と判断したクラスの値にする

accuracy = sum(y_te == y_pred_max) / len(y_te)
print(accuracy)

0.8555389221556886


In [23]:
print(roc_auc_score(y_te, y_pred, multi_class='ovo'))
print(log_loss(y_te, y_pred))

0.9422619609725721
0.46472405979445847


# Gensim

In [22]:
!pip freeze > requirements.lock

In [21]:
class SWEM():
    """
    Simple Word-Embeddingbased Models (SWEM)
    https://arxiv.org/abs/1805.09843v1
    """

    def __init__(self, w2v, tokenizer, oov_initialize_range=(-0.01, 0.01)):
        self.w2v = w2v
        self.tokenizer = tokenizer
        self.vocab = set(self.w2v.vocab.keys())
        self.embedding_dim = self.w2v.vector_size
        self.oov_initialize_range = oov_initialize_range

        if self.oov_initialize_range[0] > self.oov_initialize_range[1]:
            raise ValueError("Specify valid initialize range: "
                             f"[{self.oov_initialize_range[0]}, {self.oov_initialize_range[1]}]")

    def get_word_embeddings(self, text):
        np.random.seed(abs(hash(text)) % (10 ** 8))

        vectors = []
        for word in self.tokenizer(text):
            if word in self.vocab:
                vectors.append(self.w2v[word])
            else:
                vectors.append(np.random.uniform(self.oov_initialize_range[0],
                                                 self.oov_initialize_range[1],
                                                 self.embedding_dim))
        return np.array(vectors)

    def average_pooling(self, text):
        word_embeddings = self.get_word_embeddings(text)
        return np.mean(word_embeddings, axis=0)

    def max_pooling(self, text):
        word_embeddings = self.get_word_embeddings(text)
        return np.max(word_embeddings, axis=0)

    def concat_average_max_pooling(self, text):
        word_embeddings = self.get_word_embeddings(text)
        return np.r_[np.mean(word_embeddings, axis=0), np.max(word_embeddings, axis=0)]

    def hierarchical_pooling(self, text, n):
        word_embeddings = self.get_word_embeddings(text)

        text_len = word_embeddings.shape[0]
        if n > text_len:
            raise ValueError(f"window size must be less than text length / window_size:{n} text_length:{text_len}")
        window_average_pooling_vec = [np.mean(word_embeddings[i:i + n], axis=0) for i in range(text_len - n + 1)]

        return np.max(window_average_pooling_vec, axis=0)

In [27]:
from gensim.models import KeyedVectors

from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
nlp = English()

wv = KeyedVectors.load_word2vec_format(
    './input/GoogleNews-vectors-negative300.bin', binary=True)

swem = SWEM(wv, nlp.tokenizer)

In [28]:
# Word ngram vector
tr_vect = np.array([swem.average_pooling(text) for text in df_train['TITLE'].tolist()])
vl_vect = np.array([swem.average_pooling(text) for text in df_valid['TITLE'].tolist()])
ts_vect = np.array([swem.average_pooling(text) for text in df_test['TITLE'].tolist()])

In [29]:
tr_vect.shape

(10684, 300)

In [30]:
y_tr.shape

(10684,)

In [31]:
pd.Series(y_tr).value_counts()

0    4501
1    4235
3    1220
2     728
dtype: int64

In [32]:
model = xgb.XGBClassifier()
model.fit(X, y_tr)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [33]:
# 検証データを予測する
y_pred = model.predict_proba(vl_vect)
y_pred_max = np.argmax(y_pred, axis=1)  # 最尤と判断したクラスの値にする

accuracy = sum(y_vl == y_pred_max) / len(y_vl)
print('{:.4f}'.format(accuracy))
print('{:.4f}'.format(roc_auc_score(y_vl, y_pred, multi_class='ovo')))
print('{:.4f}'.format(log_loss(y_vl, y_pred)))

0.4004
0.5017
1.9219


In [34]:
# 評価データを予測する
y_pred = model.predict_proba(ts_vect)
y_pred_max = np.argmax(y_pred, axis=1)  # 最尤と判断したクラスの値にする

accuracy = sum(y_te == y_pred_max) / len(y_te)
print('{:.4f}'.format(accuracy))
print('{:.4f}'.format(roc_auc_score(y_te, y_pred, multi_class='ovo')))
print('{:.4f}'.format(log_loss(y_te, y_pred)))

0.4266
0.5171
1.8261


# GloVe

In [35]:
# GloVeダウンロード
#!wget https://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove.6B.zip

In [38]:
EMBEDDING_FILE='./input/glove.6B.300d.txt'

In [39]:
# Read the glove word vectors (space delimited strings) into a dictionary from word->vector.

def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))

In [42]:
from keras.preprocessing.text import Tokenizer

2021-12-16 13:46:59.097475: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-16 13:46:59.097564: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [40]:
embedding_dict={}
with open(EMBEDDING_FILE,'r') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors
f.close()

In [45]:
class SWEM_Glove():
    """
    Simple Word-Embeddingbased Models (SWEM)
    https://arxiv.org/abs/1805.09843v1
    """

    def __init__(self, dic, tokenizer, oov_initialize_range=(-0.01, 0.01)):
        self.tokenizer = tokenizer
        self.dic = dic
        self.embedding_dim = self.dic['a'].shape[0]
        self.oov_initialize_range = oov_initialize_range

        if self.oov_initialize_range[0] > self.oov_initialize_range[1]:
            raise ValueError("Specify valid initialize range: "
                             f"[{self.oov_initialize_range[0]}, {self.oov_initialize_range[1]}]")

    def get_word_embeddings(self, text):
        np.random.seed(abs(hash(text)) % (10 ** 8))

        vectors = []
        for word in text.split():
            if word in self.dic:
                vectors.append(self.dic[word])
            else:
                vectors.append(np.random.uniform(self.oov_initialize_range[0],
                                                 self.oov_initialize_range[1],
                                                 self.embedding_dim))
        return np.array(vectors)

    def average_pooling(self, text):
        word_embeddings = self.get_word_embeddings(text)
        return np.mean(word_embeddings, axis=0)

tokenizer = Tokenizer()
swem = SWEM_Glove(embedding_dict, tokenizer)

In [46]:
# Word ngram vector
tr_vect = np.array([swem.average_pooling(text) for text in df_train['TITLE'].tolist()])
vl_vect = np.array([swem.average_pooling(text) for text in df_valid['TITLE'].tolist()])
ts_vect = np.array([swem.average_pooling(text) for text in df_test['TITLE'].tolist()])

In [47]:
model = xgb.XGBClassifier()
model.fit(X, y_tr)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [49]:
# 検証データを予測する
y_pred = model.predict_proba(vl_vect)
y_pred_max = np.argmax(y_pred, axis=1)  # 最尤と判断したクラスの値にする

accuracy = sum(y_vl == y_pred_max) / len(y_vl)
print('{:.4f}'.format(accuracy))
print('{:.4f}'.format(roc_auc_score(y_vl, y_pred, multi_class='ovo')))
print('{:.4f}'.format(log_loss(y_vl, y_pred)))

# 評価データを予測する
y_pred = model.predict_proba(ts_vect)
y_pred_max = np.argmax(y_pred, axis=1)  # 最尤と判断したクラスの値にする

accuracy = sum(y_te == y_pred_max) / len(y_te)
print('{:.4f}'.format(accuracy))
print('{:.4f}'.format(roc_auc_score(y_te, y_pred, multi_class='ovo')))
print('{:.4f}'.format(log_loss(y_te, y_pred)))

0.4199
0.5247
1.8890
0.4476
0.5363
1.8704


# FastText

In [50]:
import fasttext

In [51]:
#model2 = FastText.load_fasttext_format('cc.en.300.bin')
FASTTEXT_MODEL_BIN = "input/cc.en.300.bin"
#this works
ft_model = fasttext.load_model(FASTTEXT_MODEL_BIN)
ft_model.get_word_vector("additional").shape



(300,)

In [52]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()

In [53]:
class SWEM_FastText():
    """
    Simple Word-Embeddingbased Models (SWEM)
    https://arxiv.org/abs/1805.09843v1
    """

    def __init__(self, dic, tokenizer, oov_initialize_range=(-0.01, 0.01)):
        self.tokenizer = tokenizer
        self.dic = dic
        self.embedding_dim = self.dic['a'].shape[0]
        self.oov_initialize_range = oov_initialize_range

        if self.oov_initialize_range[0] > self.oov_initialize_range[1]:
            raise ValueError("Specify valid initialize range: "
                             f"[{self.oov_initialize_range[0]}, {self.oov_initialize_range[1]}]")

    def get_word_embeddings(self, text):
        np.random.seed(abs(hash(text)) % (10 ** 8))

        vectors = []
        for word in text.split():
            if word in self.dic:
                vectors.append(self.dic[word])
            else:
                vectors.append(np.random.uniform(self.oov_initialize_range[0],
                                                 self.oov_initialize_range[1],
                                                 self.embedding_dim))
        return np.array(vectors)

    def average_pooling(self, text):
        word_embeddings = self.get_word_embeddings(text)
        return np.mean(word_embeddings, axis=0)

swem = SWEM_FastText(ft_model, tokenizer)

In [54]:
# Word ngram vector
tr_vect = np.array([swem.average_pooling(text) for text in df_train['TITLE'].tolist()])
vl_vect = np.array([swem.average_pooling(text) for text in df_valid['TITLE'].tolist()])
ts_vect = np.array([swem.average_pooling(text) for text in df_test['TITLE'].tolist()])

In [55]:
le = LabelEncoder()
y_tr = le.fit_transform(df_train['CATEGORY'].values)
y_vl = le.transform(df_valid['CATEGORY'].values)
y_te = le.transform(df_test['CATEGORY'].values)

In [56]:
model = xgb.XGBClassifier()
model.fit(tr_vect, y_tr)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [57]:
# 検証データを予測する
y_pred = model.predict_proba(vl_vect)
y_pred_max = np.argmax(y_pred, axis=1)  # 最尤と判断したクラスの値にする

accuracy = sum(y_vl == y_pred_max) / len(y_vl)
print('{:.4f}'.format(accuracy))
print('{:.4f}'.format(roc_auc_score(y_vl, y_pred, multi_class='ovo')))
print('{:.4f}'.format(log_loss(y_vl, y_pred)))

# 評価データを予測する
y_pred = model.predict_proba(ts_vect)
y_pred_max = np.argmax(y_pred, axis=1)  # 最尤と判断したクラスの値にする

accuracy = sum(y_te == y_pred_max) / len(y_te)
print('{:.4f}'.format(accuracy))
print('{:.4f}'.format(roc_auc_score(y_te, y_pred, multi_class='ovo')))
print('{:.4f}'.format(log_loss(y_te, y_pred)))

0.8728
0.9476
0.4293
0.8900
0.9548
0.3723


# BERT

In [7]:
import torch
import transformers

from transformers import BertTokenizer


class BertSequenceVectorizer:
    def __init__(self, model_name="bert-base-uncased", max_len=128):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.array:
        inp = self.tokenizer.encode(sentence)
        len_inp = len(inp)

        if len_inp >= self.max_len:
            inputs = inp[:self.max_len]
            masks = [1] * self.max_len
        else:
            inputs = inp + [0] * (self.max_len - len_inp)
            masks = [1] * len_inp + [0] * (self.max_len - len_inp)

        inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)

        bert_out = self.bert_model(inputs_tensor, masks_tensor)
        seq_out, pooled_out = bert_out['last_hidden_state'], bert_out['pooler_output']

        if torch.cuda.is_available():    
            return seq_out[0][0].cpu().detach().numpy() # 0番目は [CLS] token, 768 dim の文章特徴量
        else:
            return seq_out[0][0].detach().numpy()

2021-12-16 13:22:19.719065: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-16 13:22:19.719136: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [9]:
BSV = BertSequenceVectorizer(
    model_name="bert-base-uncased",
    max_len=128)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
class SWEM_BERT():
    """
    Simple Word-Embeddingbased Models (SWEM)
    https://arxiv.org/abs/1805.09843v1
    """

    def __init__(self, dic, tokenizer, oov_initialize_range=(-0.01, 0.01)):
        self.tokenizer = tokenizer
        self.dic = dic
        self.embedding_dim = self.dic.vectorize('a').shape[0]
        self.oov_initialize_range = oov_initialize_range

        if self.oov_initialize_range[0] > self.oov_initialize_range[1]:
            raise ValueError("Specify valid initialize range: "
                             f"[{self.oov_initialize_range[0]}, {self.oov_initialize_range[1]}]")

    def get_word_embeddings(self, text):
        np.random.seed(abs(hash(text)) % (10 ** 8))

        vectors = []
        for word in text.split():
            vectors.append(self.dic.vectorize(word))
            #else:
            #   vectors.append(np.random.uniform(self.oov_initialize_range[0],
            #                                     self.oov_initialize_range[1],
             #                                    self.embedding_dim))
        return np.array(vectors)

    def average_pooling(self, text):
        word_embeddings = self.get_word_embeddings(text)
        return np.mean(word_embeddings, axis=0)

swem = SWEM_BERT(BSV, BSV.tokenizer)

In [None]:
# Word vector
tr_vect = np.array([swem.average_pooling(text) for text in df_train['TITLE'].tolist()])
vl_vect = np.array([swem.average_pooling(text) for text in df_valid['TITLE'].tolist()])
ts_vect = np.array([swem.average_pooling(text) for text in df_test['TITLE'].tolist()])

In [None]:
le = LabelEncoder()
y_tr = le.fit_transform(df_train['CATEGORY'].values)
y_vl = le.transform(df_valid['CATEGORY'].values)
y_te = le.transform(df_test['CATEGORY'].values)

In [None]:
model = lgb.LGBMClassifier()
model.fit(tr_vect, y_tr)