In [1]:
# Execute the shared utility notebooks inside this kernel's namespace (-i),
# so whatever they define becomes available to the cells below.
# NOTE(review): `small_model`, used further down, is not defined anywhere in
# this notebook -- presumably it comes from one of these two files; confirm.
%run -i "../util/file_utils.ipynb"
%run -i "../util/lang_utils.ipynb"

In [2]:
from datasets import load_dataset

# Each split is restricted to its first 15% plus its last 15% of rows, which
# keeps the experiment small while sampling both ends of the split.
train_dataset = load_dataset("rotten_tomatoes", split="train[:15%]+train[-15%:]")
test_dataset = load_dataset("rotten_tomatoes", split="test[:15%]+test[-15%:]")

# Report the number of examples in each subset, one per line.
print(len(train_dataset), len(test_dataset), sep="\n")

2560
320


In [3]:
# Peek at the raw text of the first training example.
train_dataset[0]['text']

'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'

In [4]:
# Build a simple text vectorizer
class POS_vectorizer:
    """Turn a text into a fixed-length vector of part-of-speech counts.

    The wrapped model is called on the input text and must return a sized,
    iterable sequence of tokens that each expose a ``pos_`` attribute
    (e.g. a loaded spaCy pipeline).
    """

    def __init__(self, spacy_model):
        """Store the model used to tokenize and tag every input text."""
        self.model = spacy_model

    def vectorize(self, input_text):
        """Return ``[token_count, VERB, NOUN, PRON, ADJ, ADV, AUX, PROPN, NUM, PUNCT]``.

        Args:
            input_text: The text to analyse.

        Returns:
            A list of 10 integers: the total number of tokens followed by the
            number of tokens carrying each tracked tag, in the order above.
            Tokens with any other tag only contribute to the total.
        """
        doc = self.model(input_text)
        tracked_tags = ("VERB", "NOUN", "PRON", "ADJ", "ADV", "AUX",
                        "PROPN", "NUM", "PUNCT")
        # Insertion order of the dict fixes the column order of the vector.
        tag_counts = dict.fromkeys(tracked_tags, 0)
        for token in doc:
            tag = token.pos_
            if tag in tag_counts:
                tag_counts[tag] += 1
        return [len(doc), *tag_counts.values()]

In [5]:
# Smoke-test the POS vectorizer on the first training review.
vectorizer = POS_vectorizer(small_model)
sample_text = train_dataset[0]["text"]
vector = vectorizer.vectorize(sample_text)

# Show the input text and, on the next line, its feature vector.
print(sample_text, vector, sep="\n")

the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .
[38, 3, 8, 1, 5, 1, 3, 2, 0, 5]


In [6]:
# Build up training and test matrices
import numpy as np
import pandas as pd

# Shuffle the training rows with a fixed seed so that a fresh
# "Restart Kernel -> Run All" produces the same row order every time.
train_df = (
    train_dataset.to_pandas()  # two columns: text, label
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
test_df = test_dataset.to_pandas()

# One POS-count vector per review; stacking the per-row lists yields a 2-D
# array with one row per review and one column per feature.
train_df["vector"] = train_df["text"].apply(vectorizer.vectorize)
X_train = np.array(train_df["vector"].to_list())
y_train = np.array(train_df["label"].to_list())

test_df["vector"] = test_df["text"].apply(vectorizer.vectorize)
X_test = np.array(test_df["vector"].to_list())
y_test = np.array(test_df["label"].to_list())



In [7]:
# Sanity check: feature matrices and label vectors must agree on row counts.
tuple(arr.shape for arr in (X_train, y_train, X_test, y_test))

((2560, 10), (2560,), (320, 10), (320,))

In [8]:
# Build a classification model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fit on the POS-count features; `fit` returns the estimator itself, so the
# construction and training can be chained.
model = LogisticRegression(C=0.1, max_iter=1000).fit(X_train, y_train)

# Score the held-out subset and print per-class precision/recall/F1.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.58      0.56      0.57       160
           1       0.58      0.59      0.58       160

    accuracy                           0.58       320
   macro avg       0.58      0.58      0.58       320
weighted avg       0.58      0.58      0.58       320



In [9]:
# BOW vectorizer
# Run the shared classifier utilities inside this kernel's namespace (-i).
# NOTE(review): `load_train_test_dataset_pd`, called in the next cell, is not
# defined in this notebook -- presumably it comes from this file; confirm.
%run -i "../util/util_simple_classifier.ipynb"
from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): `sys` is not used in any of the cells visible here.
import sys

In [10]:
# NOTE(review): the second name is `text_df`, not `test_df` -- this looks like
# a typo, and it leaves `test_df` bound to the frame built in an earlier cell.
# Kept as-is here because later cells may reference either name; confirm.
# This cell also rebinds `train_df` and `vectorizer`, so earlier cells that
# used the POS vectorizer cannot be re-run after it without re-running from
# the top.
train_df, text_df = load_train_test_dataset_pd()

# max_df=0.4: per the scikit-learn docs, terms appearing in more than 40% of
# the documents are dropped from the vocabulary.
vectorizer = CountVectorizer(max_df=0.4)
train_texts = train_df["text"]
X = vectorizer.fit_transform(train_texts)

# Printing a sparse matrix lists its (row, column) coordinates and counts.
print(X)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 39134 stored elements and shape (2560, 8856)>
  Coords	Values
  (0, 6578)	1
  (0, 4219)	1
  (0, 2106)	1
  (0, 8000)	2
  (0, 717)	1
  (0, 42)	1
  (0, 1280)	1
  (0, 5260)	1
  (0, 1607)	1
  (0, 7889)	1
  (0, 3630)	1
  (0, 3406)	1
  (0, 4759)	1
  (0, 7345)	1
  (0, 2707)	1
  (0, 3476)	1
  (0, 7883)	1
  (0, 487)	1
  (0, 6769)	1
  (0, 4269)	1
  (0, 1441)	1
  (0, 8405)	1
  (0, 1889)	1
  (0, 5466)	1
  (0, 7461)	1
  :	:
  (2557, 5905)	1
  (2557, 3595)	1
  (2557, 6745)	1
  (2557, 1023)	1
  (2557, 5259)	1
  (2558, 8000)	1
  (2558, 5331)	1
  (2558, 6733)	1
  (2558, 970)	1
  (2558, 4150)	1
  (2558, 874)	1
  (2559, 4219)	1
  (2559, 5128)	1
  (2559, 285)	1
  (2559, 5292)	1
  (2559, 6278)	1
  (2559, 5326)	1
  (2559, 7929)	1
  (2559, 3336)	1
  (2559, 5622)	1
  (2559, 6748)	1
  (2559, 8629)	1
  (2559, 7915)	1
  (2559, 1679)	1
  (2559, 5904)	1


In [11]:
# Expand the sparse count matrix into a dense one for inspection only.
# With the (2560, 8856) int64 shape printed for X above, the dense copy holds
# about 22.7M cells (~181 MB), nearly all zeros -- avoid this on larger data.
# Per the SciPy docs, `todense()` returns a `numpy.matrix`; `toarray()` would
# give a plain ndarray instead.
dense_matrix = X.todense()
print(dense_matrix)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [12]:
# Vocabulary learned by the fitted CountVectorizer; NumPy abbreviates the
# printed array to its first and last few entries.
print(vectorizer.get_feature_names_out())

['10' '100' '101' ... 'zone' 'ótimo' 'últimos']


In [13]:
# Vocabulary size; this should equal the number of columns of X.
print(len(vectorizer.get_feature_names_out()))

8856


In [None]:
# Terms the vectorizer excluded while fitting. Per the scikit-learn docs,
# `stop_words_` holds terms dropped by max_df / min_df / max_features -- here
# that would be the words cut by max_df=0.4.
# NOTE(review): this cell has no execution count, so it has not been run;
# confirm the attribute exists in the installed scikit-learn version.
print(vectorizer.stop_words_)