In [1]:
import pandas as pd

tweet_df = pd.read_csv('drive/MyDrive/data/cleaned_airline_tweets.csv')

In [2]:
tweet_df.head()

Unnamed: 0,text,sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,"@VirginAmerica it was amazing, and arrived an ...",positive
2,@VirginAmerica I &lt;3 pretty graphics. so muc...,positive
3,@VirginAmerica So excited for my first cross c...,positive
4,I ❤️ flying @VirginAmerica. ☺️👍,positive


In [3]:
# from pandas_profiling import ProfileReport

# profile = ProfileReport(tweet_df, title="Tweets Report", explorative=True)

# profile

In [4]:
tweet_df['sentiment'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
sentiment,Unnamed: 1_level_1
positive,0.348705
neutral,0.336528
negative,0.314767


In [5]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(tweet_df, test_size=0.2, random_state=0, stratify=tweet_df['sentiment'])

print(f'Count of tweets in training set: {train.shape[0]:,}')
print(f'Count of tweets in testing set: {test.shape[0]:,}')

Count of tweets in training set: 3,088
Count of tweets in testing set: 772


In [6]:
# テキストのベクトル化

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# 訓練データの回数ベクトル化
cv = CountVectorizer()
single_word = cv.fit_transform(train['text'])

print(single_word.shape)

(3088, 6018)


In [8]:
pd.DataFrame(single_word.todense(), columns=cv.get_feature_names_out())

Unnamed: 0,00,000,000114,000ft,00pm,0167560070877,02,0200,03,0400,...,zacks_com,zakkohane,zero,zf5wjgtxzt,zgoqoxjbqy,zj76,zone,zsdgzydnde,zukes,zv2pt6trk9
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3083,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3084,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3085,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3086,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# 最もよく使われる20個を選択する
cv = CountVectorizer(max_features=20)

limited_vocab = cv.fit_transform(train['text'])

pd.DataFrame(limited_vocab.toarray(), index = train['text'], columns = cv.get_feature_names_out())

Unnamed: 0_level_0,americanair,and,flight,for,in,is,it,jetblue,me,my,of,on,southwestair,thanks,the,to,united,usairways,you,your
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
@JetBlue Maybe I'll just go to Cleveland instead.,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
smh RT @JetBlue: Our fleet's on fleek. http://t.co/IRiXaIfJJX,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
@SouthwestAir I would.,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
@USAirways trying to Cancelled Flight a flight urgently...get hung up on twice??? Sweet refund policy,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0
@AmericanAir you are beyond redemption. Jfk. Baggage claim looks like a luggage warehouse,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
“@JetBlue: Our fleet's on fleek. http://t.co/b5ttno68xu” I just 🙈,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
"@united caught earlier flight to ORD. Gate checked bag, and you've lost it at O'Hare. original flight lands in 20minutes. #frustrating!",0,1,2,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0
@AmericanAir hi when will your next set of flights be out for next year from Dublin???,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
@SouthwestAir Finally! Integration w/ passbook is a great Valentine gift - better then chocoLate Flight. You do heart me.,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0


In [10]:
# unigram, bigram, trigramを考慮する
cv = CountVectorizer(ngram_range=(1, 3))
more_ngrams = cv.fit_transform(train['text'])

print(more_ngrams.shape)  # 70,613 features!

pd.DataFrame(more_ngrams.toarray(), index = train['text'], columns = cv.get_feature_names_out()).head()

(3088, 70613)


Unnamed: 0_level_0,00,00 phone,00 phone hold,00 pm,00 pm that,000,000 air,000 air miles,000 crewmembers,000 crewmembers embody,...,zj76 how,zj76 how did,zone,zone was,zone was after,zsdgzydnde,zukes,zukes non,zukes non vegan,zv2pt6trk9
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
@JetBlue Maybe I'll just go to Cleveland instead.,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
smh RT @JetBlue: Our fleet's on fleek. http://t.co/IRiXaIfJJX,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
@SouthwestAir I would.,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
@USAirways trying to Cancelled Flight a flight urgently...get hung up on twice??? Sweet refund policy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
@AmericanAir you are beyond redemption. Jfk. Baggage claim looks like a luggage warehouse,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# 訓練データに含まれる最も一般的な単語
cv = CountVectorizer(max_features=10)
cv.fit(train['text'])

cv.get_feature_names_out()

array(['and', 'flight', 'for', 'jetblue', 'on', 'southwestair', 'the',
       'to', 'united', 'you'], dtype=object)

In [12]:
# ストップワードを除去
cv = CountVectorizer(stop_words='english', max_features=10)  # A
cv.fit(train['text'])

cv.get_feature_names_out()

array(['americanair', 'flight', 'http', 'jetblue', 'service',
       'southwestair', 'thank', 'thanks', 'united', 'usairways'],
      dtype=object)

In [13]:
import numpy as np
np.random.seed(0)
import random
random.seed(0)

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.pipeline import Pipeline
import time

In [14]:
def advanced_grid_search(x_train, y_train, x_test, y_test, ml_pipeline, params, cv=3, include_probas=False, is_regression=False):
    '''
    This helper function will grid search a machine learning pipeline with feature engineering included
    and print out a classification report for the best param set.
    Best here is defined as having the best cross-validated accuracy on the training set
    '''

    model_grid_search = GridSearchCV(ml_pipeline, param_grid=params, cv=cv, error_score=-1)
    start_time = time.time()  # capture the start time

    model_grid_search.fit(x_train, y_train)

    best_model = model_grid_search.best_estimator_

    y_preds = best_model.predict(x_test)

    if is_regression:
        rmse = np.sqrt(mean_squared_error(y_pred=y_preds, y_true=test_set['pct_change_eod']))
        print(f'RMSE: {rmse:.5f}')
    else:
        print(classification_report(y_true=y_test, y_pred=y_preds))
    print(f'Best params: {model_grid_search.best_params_}')
    end_time = time.time()
    print(f"Overall took {(end_time - start_time):.2f} seconds")

    if include_probas:
        y_probas = best_model.predict_proba(x_test).max(axis=1)
        return best_model, y_preds, y_probas

    return best_model, y_preds

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# ロジスティック回帰
clf = LogisticRegression(max_iter=10000)

ml_pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', clf)
])

params = {
    'vectorizer__lowercase': [True, False],  # トークン化前にすべてのテキストを小文字にする
    'vectorizer__stop_words': [None, 'english'],
    'vectorizer__max_features': [100, 1000, 5000],
    'vectorizer__ngram_range': [(1, 1), (1, 3)],

    'classifier__C': [1e-1, 1e0, 1e1]
}

# 79%の正解率
print("Count Vectorizer + Log Reg\n=====================")
advanced_grid_search(  # D
    train['text'], train['sentiment'], test['text'], test['sentiment'],
    ml_pipeline, params
)

Count Vectorizer + Log Reg
              precision    recall  f1-score   support

    negative       0.79      0.77      0.78       243
     neutral       0.75      0.78      0.77       260
    positive       0.84      0.83      0.84       269

    accuracy                           0.79       772
   macro avg       0.79      0.79      0.79       772
weighted avg       0.79      0.79      0.79       772

Best params: {'classifier__C': 1.0, 'vectorizer__lowercase': True, 'vectorizer__max_features': 5000, 'vectorizer__ngram_range': (1, 1), 'vectorizer__stop_words': None}
Overall took 187.05 seconds


(Pipeline(steps=[('vectorizer', CountVectorizer(max_features=5000)),
                 ('classifier', LogisticRegression(max_iter=10000))]),
 array(['negative', 'neutral', 'positive', 'neutral', 'neutral',
        'positive', 'neutral', 'negative', 'positive', 'negative',
        'neutral', 'negative', 'negative', 'neutral', 'neutral',
        'negative', 'negative', 'neutral', 'positive', 'negative',
        'positive', 'positive', 'neutral', 'negative', 'neutral',
        'negative', 'neutral', 'neutral', 'positive', 'neutral',
        'negative', 'neutral', 'neutral', 'neutral', 'positive',
        'negative', 'neutral', 'positive', 'neutral', 'positive',
        'positive', 'neutral', 'neutral', 'negative', 'neutral',
        'negative', 'negative', 'positive', 'negative', 'positive',
        'negative', 'negative', 'neutral', 'negative', 'positive',
        'positive', 'positive', 'neutral', 'positive', 'positive',
        'positive', 'negative', 'positive', 'positive', 'negative',

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDFベクトル化
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10)

tfdf_text = tfidf_vectorizer.fit_transform(train['text'])
pd.DataFrame(tfdf_text.toarray(), index = train['text'], columns = tfidf_vectorizer.get_feature_names_out())

Unnamed: 0_level_0,americanair,flight,http,jetblue,service,southwestair,thank,thanks,united,usairways
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
@JetBlue Maybe I'll just go to Cleveland instead.,0.0,0.000000,0.00000,1.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000
smh RT @JetBlue: Our fleet's on fleek. http://t.co/IRiXaIfJJX,0.0,0.000000,0.78863,0.614869,0.0,0.000000,0.0,0.0,0.000000,0.000000
@SouthwestAir I would.,0.0,0.000000,0.00000,0.000000,0.0,1.000000,0.0,0.0,0.000000,0.000000
@USAirways trying to Cancelled Flight a flight urgently...get hung up on twice??? Sweet refund policy,0.0,0.877741,0.00000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.479136
@AmericanAir you are beyond redemption. Jfk. Baggage claim looks like a luggage warehouse,1.0,0.000000,0.00000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
“@JetBlue: Our fleet's on fleek. http://t.co/b5ttno68xu” I just 🙈,0.0,0.000000,0.78863,0.614869,0.0,0.000000,0.0,0.0,0.000000,0.000000
"@united caught earlier flight to ORD. Gate checked bag, and you've lost it at O'Hare. original flight lands in 20minutes. #frustrating!",0.0,0.898683,0.00000,0.000000,0.0,0.000000,0.0,0.0,0.438599,0.000000
@AmericanAir hi when will your next set of flights be out for next year from Dublin???,1.0,0.000000,0.00000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000
@SouthwestAir Finally! Integration w/ passbook is a great Valentine gift - better then chocoLate Flight. You do heart me.,0.0,0.703237,0.00000,0.000000,0.0,0.710956,0.0,0.0,0.000000,0.000000


In [17]:
# 最も独自性のあるトークンの一覧化
tfidf_vectorizer = TfidfVectorizer()

tfidf_vectorizer.fit(train['text'])

idf = pd.DataFrame({'feature_name':tfidf_vectorizer.get_feature_names_out(), 'idf_weights':tfidf_vectorizer.idf_})
idf.sort_values('idf_weights', ascending=True)

Unnamed: 0,feature_name,idf_weights
5401,to,1.932281
5316,the,2.163475
5983,you,2.288016
2419,for,2.375028
5608,united,2.497463
...,...,...
3460,lucas,8.342456
3461,lucia,8.342456
1320,cbv7f3kbkx,8.342456
3450,lowstandards,8.342456


In [18]:
ml_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', clf)
])

# 正解率は84%に向上
print("TF-IDF Vectorizer + Log Reg\n=====================")
advanced_grid_search(
    train['text'], train['sentiment'], test['text'], test['sentiment'],
    ml_pipeline, params
)

TF-IDF Vectorizer + Log Reg
              precision    recall  f1-score   support

    negative       0.80      0.84      0.82       243
     neutral       0.82      0.80      0.81       260
    positive       0.89      0.87      0.88       269

    accuracy                           0.84       772
   macro avg       0.84      0.84      0.84       772
weighted avg       0.84      0.84      0.84       772

Best params: {'classifier__C': 1.0, 'vectorizer__lowercase': True, 'vectorizer__max_features': 5000, 'vectorizer__ngram_range': (1, 3), 'vectorizer__stop_words': None}
Overall took 163.79 seconds


(Pipeline(steps=[('vectorizer',
                  TfidfVectorizer(max_features=5000, ngram_range=(1, 3))),
                 ('classifier', LogisticRegression(max_iter=10000))]),
 array(['negative', 'neutral', 'positive', 'neutral', 'neutral',
        'positive', 'neutral', 'negative', 'positive', 'negative',
        'neutral', 'negative', 'negative', 'positive', 'neutral',
        'negative', 'negative', 'negative', 'positive', 'negative',
        'positive', 'positive', 'positive', 'negative', 'neutral',
        'negative', 'neutral', 'neutral', 'positive', 'neutral',
        'negative', 'neutral', 'neutral', 'positive', 'positive',
        'negative', 'neutral', 'positive', 'neutral', 'positive',
        'positive', 'neutral', 'neutral', 'negative', 'neutral',
        'negative', 'negative', 'positive', 'negative', 'positive',
        'negative', 'negative', 'negative', 'positive', 'positive',
        'positive', 'positive', 'neutral', 'positive', 'positive',
        'positive', 'neg

In [19]:
#
# 特徴量改善
#

In [20]:
from nltk.stem import SnowballStemmer

# ステミング
snowball_stemmer = SnowballStemmer(language='english')

snowball_stemmer.stem('waiting')

'wait'

In [21]:
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords

stemmed_stopwords = list(map(snowball_stemmer.stem, stopwords.words('english')))

import re

def stem_tokenizer(_input):
    tokenized_words = re.sub(r"[^A-Za-z0-9\-]", " ", _input).lower().split()
    return [snowball_stemmer.stem(word) for word in tokenized_words if snowball_stemmer.stem(word) not in stemmed_stopwords]

stem_tokenizer('waiting for the plane')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['wait', 'plane']

In [22]:
ml_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=stem_tokenizer)),  # A
    ('classifier', clf)
])

params = {
#     'vectorizer__lowercase': [True, False],
#     'vectorizer__stop_words': [],  # B

    'vectorizer__max_features': [100, 1000, 5000],
    'vectorizer__ngram_range': [(1, 1), (1, 3)],

    'classifier__C': [1e-1, 1e0, 1e1]

}

# ステミングで性能が低下
print("Stemming + Log Reg\n=====================")
advanced_grid_search(
    # remove cleaning
    train['text'], train['sentiment'],
    test['text'], test['sentiment'],
    ml_pipeline, params
)

Stemming + Log Reg




              precision    recall  f1-score   support

    negative       0.80      0.81      0.80       243
     neutral       0.77      0.78      0.78       260
    positive       0.86      0.84      0.85       269

    accuracy                           0.81       772
   macro avg       0.81      0.81      0.81       772
weighted avg       0.81      0.81      0.81       772

Best params: {'classifier__C': 1.0, 'vectorizer__max_features': 5000, 'vectorizer__ngram_range': (1, 1)}
Overall took 143.67 seconds


(Pipeline(steps=[('vectorizer',
                  TfidfVectorizer(max_features=5000,
                                  tokenizer=<function stem_tokenizer at 0x79c81e5f6f80>)),
                 ('classifier', LogisticRegression(max_iter=10000))]),
 array(['negative', 'neutral', 'positive', 'neutral', 'neutral',
        'positive', 'neutral', 'neutral', 'positive', 'negative',
        'neutral', 'negative', 'negative', 'neutral', 'neutral',
        'negative', 'negative', 'neutral', 'positive', 'negative',
        'positive', 'positive', 'positive', 'negative', 'neutral',
        'negative', 'neutral', 'neutral', 'positive', 'neutral',
        'negative', 'neutral', 'neutral', 'positive', 'positive',
        'negative', 'neutral', 'positive', 'neutral', 'positive',
        'positive', 'neutral', 'positive', 'negative', 'neutral',
        'negative', 'negative', 'negative', 'negative', 'positive',
        'negative', 'negative', 'negative', 'positive', 'positive',
        'positive', 'pos

In [None]:
#
# 特徴量抽出
#

In [None]:
from sklearn.decomposition import TruncatedSVD

# SVD（特異値分解）による特徴量抽出・次元削減
ml_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('reducer', TruncatedSVD()),
    ('classifier', clf)
])

params = {
    'vectorizer__lowercase': [True, False],
    'vectorizer__stop_words': [None, 'english'],
    'vectorizer__max_features': [5000],
    'vectorizer__ngram_range': [(1, 3)],

    'reducer__n_components': [500, 1000, 1500, 2000],  # number of components to reduce to

    'classifier__C': [1e-1, 1e0, 1e1]

}

print("SVD + Log Reg\n=====================")
advanced_grid_search(
    train['text'], train['sentiment'],
    test['text'], test['sentiment'],
    ml_pipeline, params
)

In [None]:
#
# 特徴量学習
#

In [None]:
from keras.layers import Input, Dense      # A
from keras.models import Model, Sequential # A
import tensorflow as tf                    # A

n_inputs = vectorized_X_train.shape[1]
n_bottleneck = 2000  # B

# encoder
visible = Input(shape=(n_inputs,), name='input')
e = Dense(n_inputs//2, activation='relu', name='encoder')(visible)
# code/bottleneck
bottleneck = Dense(n_bottleneck, name='bottleneck')(e)

# decoder
d = Dense(n_inputs//2, activation='relu', name='decoder')(bottleneck)
# output layer
output = Dense(n_inputs, activation='relu', name='output')(d)

# define autoencoder model
autoencoder = Model(inputs=visible, outputs=output)


autoencoder.compile(optimizer='adam', loss='mse')

In [None]:
#
# BERTによる転移学習
#

In [24]:
from transformers import BertTokenizer, BertModel
import torch

bert_model = BertModel.from_pretrained('bert-base-uncased')

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tweet = 'I hate this airline'

token_ids = torch.tensor(bert_tokenizer.encode(tweet)).unsqueeze(0)

bert_model(token_ids)[1].shape

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

torch.Size([1, 768])

In [None]:
from tqdm import tqdm
import numpy as np

def batch_embed_text(bert_model, tokenizer, text_iterable, batch_size=256):
    ''' This helper method will batch embed an iterable of text using a given tokenizer and bert model '''
    encoding = tokenizer.batch_encode_plus(text_iterable, padding=True)
    input_ids = np.vstack(encoding['input_ids'])
    attention_mask = np.vstack(encoding['attention_mask'])

    def batch_array_idx(np_array, batch_size):
        for i in tqdm(range(0, np_array.shape[0], batch_size)):
            yield i, i + batch_size

    embedded = None

    for start_idx, end_idx in batch_array_idx(input_ids, batch_size=batch_size):
        batch_bert = bert_model(
            torch.tensor(input_ids[start_idx:end_idx]),
            attention_mask=torch.tensor(attention_mask[start_idx:end_idx])
        )[1].detach().numpy()
        if embedded is None:
            embedded = batch_bert
        else:
            embedded = np.vstack([embedded, batch_bert])

    return embedded

bert_X_train = batch_embed_text(bert_model, bert_tokenizer, train['text'])

bert_X_test = batch_embed_text(bert_model, bert_tokenizer, test['text'])

In [None]:
ml_pipeline = Pipeline([
    ('classifier', clf)
])

params = {
    'classifier__C': [1e-1, 1e0, 1e1]
}

print("BERT + Log Reg\n=====================")
advanced_grid_search(
    bert_X_train, train['sentiment'], bert_X_test, test['sentiment'],
    ml_pipeline, params
)