In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from transformers import AutoTokenizer
import xgboost as xgb


In [110]:
#data = 'amazon'
data = 'pitchfork'


if data == "amazon":
    df = pd.read_csv('amazon_cells_labelled.txt',  names=['review', 'sentiment'], sep='\t')
    reviews = df['review'].values
    labels = df['sentiment'].values
elif data == 'pitchfork':
    content_df = pd.read_csv('Data/content.csv')
    review_df = pd.read_csv('Data/reviews.csv')
    reviews = content_df.content.fillna(' ').values
    labels = (review_df.score <= 6).astype(int).values
    
reviews_train, reviews_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.2, random_state=1000)

In [111]:
#Tokenize Review Content

tokenizer = CountVectorizer()
x_train = tokenizer.fit_transform(reviews_train)
x_test = tokenizer.transform(reviews_test)

In [112]:
def predictions(model):
    if model == "random":
        
        predictions = np.random.randint(2, size = len(y_test))
        
    elif model == 'lr':
        
        classifier = LogisticRegression()
        classifier.fit(x_train, y_train)
        predictions = classifier.predict(x_test)
        
    elif model == 'xgb':
        
        classifier = xgb.XGBClassifier()
        param_grid = {
            "reg_alpha" : [0.01, 0.1, 1],
            "n_estimators" : [50, 125, 200],
            "learning_rate" : [0.01, 0.1, 0.2],
        };folds = 4; param_comb = 10
        skf = StratifiedKFold(n_splits=folds, shuffle = True)
        random_search = RandomizedSearchCV(xgb_, param_distributions=param_grid, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(x_train,y_train))
        random_search.fit(x_train.toarray(), y_train)
        predictions = random_search.predict(x_test.toarray())
    return predictions

In [107]:
#toy dataset (amazon)

print(f'Random F1 Score:{f1_score(predictions("random"), y_test)}')
print(f'LR F1 Score: {f1_score(predictions("lr"), y_test)}')
print(f'XGB F1 Score: {f1_score(predictions("xgb"), y_test)}')

Random F1 Score:0.4512820512820513
LR F1 Score: 0.7916666666666666




XGB F1 Score: 0.7812499999999999


In [114]:
#actual dataset (pitchfork)

print(f'Random F1 Score:{f1_score(predictions("random"), y_test)}')
print(f'LR F1 Score: {f1_score(predictions("lr"), y_test)}')
print(f'XGB F1 Score: {f1_score(predictions("xgb"), y_test)}')

Random F1 Score:0.27233372687918145


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR F1 Score: 0.4983766233766233


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}