In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertModel

In [3]:
# Load the SST-2 training split: a headerless, tab-separated file, so pandas
# assigns integer column labels.
sst2_train_url = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'
df = pd.read_csv(sst2_train_url, sep='\t', header=None)

In [4]:
# Work with only the first 2000 rows of the loaded frame.
df1 = df.head(2000)

In [5]:
# Column 1 holds the class labels; pull them out as a NumPy array.
labels = df1[1].to_numpy()

In [6]:
# Class balance check: how many rows carry each value of the label column.
df1[1].value_counts()

1    1041
0     959
Name: 1, dtype: int64

In [7]:
# Column 0 is handed to the tokenizer below as a plain Python list.
texts = df1[0].tolist()

In [8]:
# Checkpoint to load the tokenizer from. Larger variants:
#   xlarge  => "albert-xlarge-v2"
#   xxlarge => "albert-xxlarge-v2"
tokenizer_checkpoint = "albert-base-v2"
tokenizer = AlbertTokenizer.from_pretrained(tokenizer_checkpoint)

In [9]:
# Encode every sentence to exactly 30 token ids: longer inputs are truncated,
# shorter ones are padded, and the result is returned as NumPy arrays.
tokenized_data = tokenizer(
    texts,
    max_length=30,
    truncation=True,
    padding='max_length',
    return_tensors="np",
)

In [10]:
# Pretrained ALBERT encoder, configured to also return hidden states.
# Larger variants:
#   xlarge  => "albert-xlarge-v2"
#   xxlarge => "albert-xxlarge-v2"
encoder_checkpoint = 'albert-base-v2'
model = TFAlbertModel.from_pretrained(encoder_checkpoint, output_hidden_states=True)

Some layers from the model checkpoint at albert-base-v2 were not used when initializing TFAlbertModel: ['predictions']
- This IS expected if you are initializing TFAlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFAlbertModel were initialized from the model checkpoint at albert-base-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFAlbertModel for predictions without further training.


In [11]:
# Run all tokenized sentences through the encoder in a single call.
# NOTE(review): with output_hidden_states=True this keeps every returned
# hidden-state tensor for the whole dataset in memory at once; consider
# batching the forward pass if the row count grows.
outputs = model(tokenized_data)

In [12]:
# Number of fields in the returned output object.
len(outputs)

3

In [13]:
# Names of the fields available on the output object.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])

In [14]:
# How many hidden-state tensors came back (requested via output_hidden_states=True).
len(outputs.hidden_states)

13

In [14]:
# Shape check: the first two axes should match the 2000 rows kept above and
# the max_length=30 used when tokenizing.
outputs.last_hidden_state.shape

TensorShape([2000, 30, 768])

In [15]:
# Use the last-hidden-state vector at sequence position 0 as each sentence's
# feature vector (presumably the [CLS] token - confirm against the tokenizer),
# then convert it to a NumPy array for scikit-learn.
first_position_states = outputs.last_hidden_state[:, 0, :]
features = first_position_states.numpy()

In [17]:
# Sanity check: expect one feature row per sentence.
features.shape

(2000, 768)

In [17]:
# Should equal the number of feature rows so the two can be split together.
len(labels)

2000

In [25]:
# Peek at the label array.
labels

array([1, 0, 0, ..., 1, 0, 1], dtype=int64)

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_arrays = train_test_split(features, labels, test_size=0.2, random_state=0)
train_features, test_features, train_labels, test_labels = split_arrays

In [19]:
# Shape of the training portion of the feature matrix after the split.
train_features.shape

(1600, 768)

In [20]:
# Shape of the training labels; the row count should match train_features.
train_labels.shape

(1600,)

In [21]:
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression trained on the extracted sentence features.
# The 'saga' solver shuffles the data while fitting, so random_state is pinned
# to make the fit (and the reported accuracy) repeatable across runs; 0 matches
# the seed used for the train/test split.
lr2 = LogisticRegression(
    C=1,
    penalty='l2',
    solver='saga',
    max_iter=10000,
    random_state=0,
)
lr2.fit(train_features, train_labels)

# Predicted class for every held-out row.
pred_labels = lr2.predict(test_features)

In [22]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label equals the true label.
accuracy_score(y_true=test_labels, y_pred=pred_labels)

0.785