In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from transformers import AutoTokenizer
import xgboost as xgb

In [119]:
#Toy dataset. Review content on scale of a few sentences rather than a few paragraphs
df = pd.read_csv('amazon_cells_labelled.txt',  names=['review', 'sentiment'], sep='\t')

reviews = df['review'].values
labels = df['sentiment'].values
reviews_train, reviews_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.2, random_state=1000)

In [124]:
content = pd.read_csv('Data/content.csv')
blah = pd.read_csv('Data/reviews.csv')

In [129]:
def convertScore(score):
    if score <= 6: #trainContentScorePairs.score.quantile(.5):
        y = 0
    else:
        y = 1
    return y
blah['label'] = blah['score'].transform(lambda x: convertScore(x))
reviews = content.content.values
labels = blah['label'].values
reviews_train, reviews_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.2, random_state=1000)

With random guessing we get an f1 score of around 0.5 which makes sense.

In [130]:
predictions = np.random.randint(2, size = len(y_test))

In [131]:
f1_score(predictions, y_test)

0.6090022816842978

Now lets try a simple ootb tokenizer and Linear Regression Model

In [132]:
tokenizer = CountVectorizer()

x_train = tokenizer.fit_transform(reviews_train)
x_test = tokenizer.transform(reviews_test)

classifier = LogisticRegression()
classifier.fit(x_train, y_train)

prediction = classifier.predict(x_test)

f1_score(prediction, y_test)

ValueError: np.nan is an invalid document, expected byte or unicode string.

Improvement as to be expected. Now lets try a serious tokenizer with xgBoost

Now xgBoost on simple tokens

In [123]:
classifier = xgb.XGBClassifier()
classifier.fit(x_train.toarray(), y_train)

prediction = classifier.predict(x_test.toarray())

f1_score(prediction, y_test)





0.7653061224489796

Now xgb on custom tokenizer

In [56]:
classifier = xgb.XGBClassifier()
classifier.fit(X_train.toarray(),y_train)
prediction = classifier.predict(x_test.toarray())

f1_score(prediction, y_test)



0.7653061224489796

In [88]:
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

X_train = tokenizer(
    list(reviews_train), padding = True, truncation = True, max_length = 5)['input_ids']
X_test = tokenizer(
    list(reviews_test), padding = True, truncation = True, max_length = 5)['input_ids']

In [76]:
X_train = x_train.toarray()
X_test = x_test.toarray()

Poor results on test. Classic overfitting. Need more regularization. Lets do a grid search

In [89]:
param_grid = {
              "reg_alpha" : [0.01,0.05, 0.1, 0.5, 1, 2, 5,],
              "reg_lambda" : [0.01, 0.05, 0.1, 1,0.5, 5]
}

In [90]:
folds = 3
param_comb = 5
skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)
xgb_ = xgb.XGBClassifier()

In [91]:
from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(xgb_, param_distributions=param_grid, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X_train,y_train), verbose=3, random_state=1001 )

In [92]:
random_search.fit(np.array(X_train),y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits






RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7fec6bf18040>,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing...
                                           n_estimators=100, n_jobs=None,
                                           num_parallel_tree=None,
                                           random_state=None, reg_alpha=None,
                                           reg

In [93]:
prediction = random_search.predict(np.array(X_test))
f1_score(prediction, y_test)

0.5531914893617021

The Distilbert tokens really dont work well with the simpler algorithms. I guess that makes sense.