In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from transformers import AutoTokenizer
import xgboost as xgb

In [38]:
#Toy dataset. Review content on scale of a few sentences rather than a few paragraphs
toy_df = pd.read_csv('amazon_cells_labelled.txt',  names=['review', 'sentiment'], sep='\t')

reviews = df['review'].values
labels = df['sentiment'].values
reviews_train, reviews_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.2, random_state=1000)

With random guessing we get an f1 score of around 0.5 which makes sense.

In [39]:
predictions = np.random.randint(2, size = len(y_test))

In [40]:
f1_score(predictions, y_test)

0.4824120603015075

Now lets try a simple ootb tokenizer and Linear Regression Model

In [66]:
tokenizer = CountVectorizer()

x_train = tokenizer.fit_transform(reviews_train)
x_test = tokenizer.transform(reviews_test)

classifier = LogisticRegression()
classifier.fit(x_train, y_train)

prediction = classifier.predict(x_test)

f1_score(prediction, y_test)

0.7916666666666666

Now xgBoost on simple tokens

In [67]:
classifier = xgb.XGBClassifier()
classifier.fit(np.array(X_train),y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [70]:
f1_score(classifier.predict(np.array(X_train)),y_train)

0.9586983729662076

Improvement as to be expected. Now lets try a serious tokenizer with xgBoost

In [63]:
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

X_train = tokenizer(
    list(reviews_train), padding = True, truncation = True, max_length = 5)['input_ids']
X_test = tokenizer(
    list(reviews_test), padding = True, truncation = True, max_length = 5)['input_ids']

In [64]:
classifier = xgb.XGBClassifier()
classifier.fit(np.array(X_train),y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [65]:
prediction = classifier.predict(np.array(X_test))
f1_score(prediction, y_test)

0.5549738219895288