In [2]:
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
# Make sure the Brown corpus is available locally before importing its reader.
nltk.download('brown')
from nltk.corpus import brown
# Take at most the first 5000 tagged sentences of the 'news' category.
# NOTE(review): if the category holds fewer than 5000 sentences the slice is
# simply shorter, and the test split below shrinks accordingly — confirm size.
corpus = brown.tagged_sents(categories='news')[:5000]
train_sents = corpus[:4500]  # First 4500 sentences are used for training
test_sents = corpus[4500:]  # Everything after the first 4500 (at most 500 sentences) is held out for testing


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Computer-_-\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [3]:
def extract_features(sentence, index):
    """
    Extract features for a given word in a sentence.

    Parameters
    ----------
    sentence : sequence of str
        The tokens of one sentence (words only, no tags).
    index : int
        Position in ``sentence`` of the token to describe.

    Returns
    -------
    dict
        Maps feature name to value: the token itself, positional flags,
        casing flags, 1-3 character prefixes/suffixes, the neighbouring
        tokens ('' at a sentence boundary), and hyphen/digit indicators.

    Raises
    ------
    IndexError
        If ``index`` is outside ``sentence``.
    """
    word = sentence[index]
    last_index = len(sentence) - 1
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last_index,
        # str.isupper()/islower() require at least one cased character, so
        # digits and punctuation (e.g. ',' or '42') are not reported as
        # capitalized / all-caps / all-lower the way `w.upper() == w` would.
        'is_capitalized': word[:1].isupper(),
        'is_all_caps': word.isupper(),
        'is_all_lower': word.islower(),
        # Slices instead of word[0] / word[-1] so an empty token yields ''
        # rather than raising IndexError.
        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == last_index else sentence[index + 1],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        # True when any character after the first is uppercase (e.g. "McDonald").
        'capitals_inside': word[1:].lower() != word[1:]
    }

In [4]:
def transform_to_dataset(tagged_sentences):
    """
    Transform tagged sentences into a dataset that can be used for training
    a machine learning model.

    Parameters
    ----------
    tagged_sentences : iterable of sequences of (word, tag) pairs
        One inner sequence per sentence.

    Returns
    -------
    (X, y) : tuple of two lists
        ``X`` holds one feature dict per token (from ``extract_features``),
        ``y`` holds the corresponding tag, in sentence/token order.
    """
    X, y = [], []
    for sentence in tagged_sentences:
        # Strip the tags once per sentence; rebuilding this list for every
        # token made the work quadratic in sentence length.
        words = [word for word, _tag in sentence]
        for index, (_word, tag) in enumerate(sentence):
            X.append(extract_features(words, index))
            y.append(tag)
    return X, y

In [5]:
# Build per-token feature dicts (X_*) and their tags (y_*) for the training
# and held-out sentence splits.
X_train, y_train = transform_to_dataset(train_sents)
X_test, y_test = transform_to_dataset(test_sents)

In [6]:
# Feature dicts -> vectorized matrix -> logistic-regression classifier.
# The vectorizer output is kept sparse: string-valued features (word, affixes,
# neighbouring words) are one-hot encoded, so a dense matrix with one row per
# training token can easily exhaust memory. LogisticRegression accepts sparse
# input directly.
# NOTE(review): if fitting emits a ConvergenceWarning, consider raising
# LogisticRegression's max_iter.
pipeline = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('classifier', LogisticRegression())
])

In [7]:
# Fit the whole pipeline (vectorizer, then classifier) on the training
# feature dicts and their tags.
pipeline.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out split. Each sample is a single
# token, so this is per-token tagging accuracy (score() on a classifier
# reports mean accuracy).
accuracy = pipeline.score(X_test, y_test)
print('Accuracy:', accuracy)