In [2]:
import datasets
from collections import  Counter
from datasets import load_dataset
import pandas as pd
import sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
import torch

In [3]:
dynasent_r1  = load_dataset("dynabench/dynasent", 'dynabench.dynasent.r1.all', trust_remote_code=True)
dynasent_r1

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['id', 'hit_ids', 'sentence', 'indices_into_review_text', 'model_0_label', 'model_0_probs', 'text_id', 'review_id', 'review_rating', 'label_distribution', 'gold_label', 'metadata'],
        num_rows: 80488
    })
    validation: Dataset({
        features: ['id', 'hit_ids', 'sentence', 'indices_into_review_text', 'model_0_label', 'model_0_probs', 'text_id', 'review_id', 'review_rating', 'label_distribution', 'gold_label', 'metadata'],
        num_rows: 3600
    })
    test: Dataset({
        features: ['id', 'hit_ids', 'sentence', 'indices_into_review_text', 'model_0_label', 'model_0_probs', 'text_id', 'review_id', 'review_rating', 'label_distribution', 'gold_label', 'metadata'],
        num_rows: 3600
    })
})

In [4]:
def print_label_dist(dataset, label_name="gold_label", split_names=("train", "validation")):
    for split_name in split_names:
        print(split_name)
        dist = sorted(Counter(dataset[split_name][label_name]).items())
        for k, v in dist:
            print(f"\t{k:>4s}: {v}")

print_label_dist(dynasent_r1)

train
	negative: 14021
	neutral: 45076
	positive: 21391
validation
	negative: 1200
	neutral: 1200
	positive: 1200


In [5]:
dynasent_r2 = load_dataset("dynabench/dynasent", 'dynabench.dynasent.r2.all')

Repo card metadata block was not found. Setting CardData to empty.


In [6]:
dynasent_r2['train'][25]

{'id': 'r2-0000042',
 'hit_ids': ['y22532', 'y22740'],
 'sentence': 'He kindly walked us through the menu through the specialties and the wine list that we did not want.',
 'sentence_author': 'w148',
 'has_prompt': True,
 'prompt_data': {'indices_into_review_text': [453, 525],
  'review_rating': 5,
  'prompt_sentence': 'He walked us through the menu through the specialties and the wine list.',
  'review_id': '7joEfaGr4aC0OrR9I7CJ8Q'},
 'model_1_label': 'positive',
 'model_1_probs': {'negative': 0.15193994343280792,
  'positive': 0.7109395265579224,
  'neutral': 0.1371205449104309},
 'text_id': 'r2-0000042',
 'label_distribution': {'positive': [],
  'negative': ['w516', 'w199', 'w294'],
  'neutral': ['w544'],
  'mixed': ['w527']},
 'gold_label': 'negative',
 'metadata': {'split': 'train',
  'round': 2,
  'subset': 'all',
  'model_in_the_loop': 'RoBERTa'}}

In [7]:
print_label_dist(dynasent_r2)

train
	negative: 4579
	neutral: 2448
	positive: 6038
validation
	negative: 240
	neutral: 240
	positive: 240


In [8]:
sst = load_dataset("SetFit/sst5")

Repo card metadata block was not found. Setting CardData to empty.


In [9]:
print_label_dist(sst, label_name='label_text')

train
	negative: 2218
	neutral: 1624
	positive: 2322
	very negative: 1092
	very positive: 1288
validation
	negative: 289
	neutral: 229
	positive: 279
	very negative: 139
	very positive: 165


In [10]:
sst["train"][35]

{'text': "a well-intentioned effort that 's still too burdened by the actor 's offbeat sensibilities for the earnest emotional core to emerge with any degree of accessibility .",
 'label': 1,
 'label_text': 'negative'}

reformart the SST dataset

In [11]:
def conver_sst_label(label):
    return label.split(" ")[-1]

for split_name in ("train", "test", "validation"):
    dist = [conver_sst_label(s) for s in sst[split_name]['label_text']]
    sst[split_name] = sst[split_name].add_column("gold_label", dist)
    sst[split_name] = sst[split_name].add_column("sentence", sst[split_name]["text"])

print_label_dist(sst)

train
	negative: 3310
	neutral: 1624
	positive: 3610
validation
	negative: 428
	neutral: 229
	positive: 444


## Linear Classifiers

In [12]:
def unigrams_phi(sentence: str):
    return Counter(sentence.lower().split(" "))

uni = unigrams_phi("Hello! I love Machine Learning!")
uni

Counter({'hello!': 1, 'i': 1, 'love': 1, 'machine': 1, 'learning!': 1})

In [13]:
train_feats = [
    {'a': 1, 'b': 1},
    {'b': 1, 'c': 2}
]

vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(train_feats)

df = pd.DataFrame(X_train, columns=vec.get_feature_names_out())
df

Unnamed: 0,a,b,c
0,1.0,1.0,0.0
1,0.0,1.0,2.0


In [14]:
test_feats = [
    {'a': 2, 'c': 1},
    {'a': 4, 'b': 2, 'd': 1}
]

X_test = vec.transform(test_feats)
df = pd.DataFrame(X_test, columns=vec.get_feature_names_out())
df

Unnamed: 0,a,b,c
0,2.0,0.0,1.0
1,4.0,2.0,0.0


The most common mistake with DictVectorizer is calling fit_transform on test examples. This will wipe out the existing representation scheme, replacing it with one that matches the test examples. That will happen silently, but then you'll find that the new representations are incompatible with the model you fit. This is likely to manifest itself as a ValueError relating to feature counts.

### Task 1: Tweetgrams

In [15]:
from nltk.tokenize import TweetTokenizer

def tweetgrams_phi(sentence: str, **kwargs):
    tk = TweetTokenizer(kwargs)
    tokens = tk.tokenize(sentence)
    return Counter(tokens)

tweet = "Let that SINK in :)"

grams = tweetgrams_phi(tweet, preserve_case=False)
grams

Counter({'Let': 1, 'that': 1, 'SINK': 1, 'in': 1, ':)': 1})

#### Test Function

In [16]:
def test_tweetgrams_phi(func):
    examples = [
        (
            "Here's an example with an emoticon :)", 
            Counter({'an': 2, "Here's": 1, 'example': 1, 'with': 1, 'emoticon': 1, ':)': 1})
        ),
        (
            "The URL is https://pytorch.org!", 
            Counter({'The': 1, 'URL': 1, 'is': 1, 'https://pytorch.org': 1, '!': 1})
        )
    ]
    errcount = 0
    for ex, expected in examples:
        result = func(ex, preserve_case=True)
        if result != expected:
            errcount += 1
            print(f"Error for `{func.__name__}`: For input {ex}, "
                  f"expected {expected} but got {result}")
    caps_ex = "CAPS"
    caps_result = func(caps_ex, preserve_case=False)
    caps_expected = Counter({"CAPS": 1})
    if caps_result != caps_expected:
        errcount += 1
        print(f"Error for `{func.__name__}`: For input {caps_ex}, "
              f"expected {caps_expected} but got {caps_result}")
    if errcount == 0:
        print(f"All tests passed for `{func.__name__}`")

In [17]:
test_tweetgrams_phi(tweetgrams_phi)

All tests passed for `tweetgrams_phi`


### Task 2: Model Training

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report

def train_linear_model(model, featfunc, train_dataset):
    """Train an sklearn classifier.

    Parameters
    ----------
    model : sklearn classifier model
    featfunc : func
        Maps strings to Counter instances
    train_dataset: dict
        Must have a key "sentence" containing strings that `featfunc`
        will process, and a key "gold_label" giving labels

    Returns
    -------
    tuple
        * A trained version of `model`
        * A fitted `vectorizer` for the train set

    """

    feats = [dict(featfunc(s)) for s in train_dataset['sentence']]
    vec = DictVectorizer()
    X_train = vec.fit_transform(feats)
    model.fit(X_train, train_dataset['gold_label'])
    return model, vec


train_dataset = {
    'sentence': ['A A', 'A B', 'B B', 'B A', 'B'],
    'gold_label': [0, 1, 0, 1, 1]
}
def featfunc(s):
    return Counter(s.split())

model = LogisticRegression()
train_linear_model(model, featfunc, train_dataset)

(LogisticRegression(), DictVectorizer())

In [22]:
def test_train_linear_model(func):
    train_dataset = {
        'sentence': ['A A', 'A B', 'B B', 'B A', 'B'],
        'gold_label': [0, 1, 0, 1, 1]}
    def featfunc(s):
        return Counter(s.split())
    model = LogisticRegression()
    result = func(model, featfunc, train_dataset)
    if not isinstance(result, tuple) or len(result) != 2:
        print(f"Error for `{func.__name__}`: Incorrect return type")
        return
    model, vectorizer = result
    if not hasattr(vectorizer, 'vocabulary_'):
        print(f"Error for `{func.__name__}`: "
              f"Second return value is not a trained vectorizer")
        return
    if not hasattr(model, 'classes_'):
        print(f"Error for `{func.__name__}`: "
              f"First return value is not a trained classifier")
        return
    print(f"No errors found for `{func.__name__}`")

_ =  test_train_linear_model(train_linear_model)

No errors found for `train_linear_model`


#### train model on dynasent R1

In [23]:
lr_unigrams, vec_unigrams = train_linear_model(
    LogisticRegression(max_iter=1000), 
    unigrams_phi, dynasent_r1['train'])

### Task 3: model assessment

In [24]:
def assess_linear_model(model, featfunc, vectorizer, assess_dataset):
    """Assess a trained sklearn model.

    Parameters
    ----------
    model: trained sklearn model
    featfunc : func
        Maps strings to count dicts
    vectorizer : fitted DictVectorizer
    assess_dataset: dict
        Must have a key "sentence" containing strings that `featfunc`
        will process, and a key "gold_label" giving labels

    Returns
    -------
    A classification report (multiline string)

    """
    feats = [featfunc(s) for s in assess_dataset['sentence']]
    X_test = vectorizer.transform(feats)
    preds = model.predict(X_test)
    
    return classification_report(assess_dataset['gold_label'], preds, digits=3)

In [25]:
def test_assess_linear_model(assessfunc, trainfunc):
    train_dataset = {
        'sentence': ['A A', 'A B', 'B B', 'B A', 'A', 'B'],
        'gold_label': [0, 1, 0, 1, 0, 1]}
    assess_dataset = {
        'sentence': ['A C', 'B A'],
        'gold_label': [0, 1]}
    def featfunc(s):
        return Counter(s.split())
    model = LogisticRegression()
    model, vectorizer = trainfunc(model, featfunc, train_dataset)
    result = assessfunc(model, featfunc, vectorizer, assess_dataset)
    errcount = 0
    if len(vectorizer.vocabulary_) != 2:
        print(f"Error for `{assessfunc.__name__}`: Unexpected feature count")
        errcount += 1
    if 'weighted avg' not in result:
        print(f"Error for `{assessfunc.__name__}`: Unexpected return value")
        errcount += 1
    if errcount == 0:
        print(f"No errors found for `{assessfunc.__name__}`")

test_assess_linear_model(assess_linear_model, train_linear_model)


No errors found for `assess_linear_model`


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
report = assess_linear_model(
    lr_unigrams,
    unigrams_phi,
    vec_unigrams,
    dynasent_r1['validation'])

print(report)

              precision    recall  f1-score   support

    negative      0.759     0.364     0.492      1200
     neutral      0.523     0.890     0.659      1200
    positive      0.699     0.572     0.629      1200

    accuracy                          0.609      3600
   macro avg      0.660     0.609     0.593      3600
weighted avg      0.660     0.609     0.593      3600

