# In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

def extract_features(sentence, i):
    """Extract features for the word at index i in the sentence.

    Args:
        sentence: Sequence of string tokens.
        i: Index of the token to describe.

    Returns:
        A dict mapping feature names to string or boolean values: the token
        itself, position flags, casing/digit flags, prefixes and suffixes of
        up to 2 and 3 characters respectively, and the neighbouring tokens
        (``'<START>'`` / ``'<END>'`` at the sentence boundaries).

    Raises:
        IndexError: If ``i`` is outside the bounds of ``sentence``.
    """
    word = sentence[i]
    last_index = len(sentence) - 1

    features = {
        'word': word,
        'is_first': i == 0,
        'is_last': i == last_index,
        # word[:1] rather than word[0]: an empty token yields '' (and
        # ''.isupper() is False) instead of raising IndexError.
        'is_capitalized': word[:1].isupper(),
        'is_all_caps': word.isupper(),
        'is_numeric': word.isdigit(),
        'prefix_1': word[:1],
        # Longer affixes are only emitted when the token is long enough;
        # otherwise the feature is the empty string.
        'prefix_2': word[:2] if len(word) > 1 else '',
        'suffix_1': word[-1:],
        'suffix_2': word[-2:] if len(word) > 1 else '',
        'suffix_3': word[-3:] if len(word) > 2 else '',
    }

    # Features for the preceding token, or a sentinel at the sentence start.
    if i > 0:
        prev_word = sentence[i - 1]
        features['prev_word'] = prev_word
        features['prev_is_capitalized'] = prev_word[:1].isupper()
    else:
        features['prev_word'] = '<START>'
        features['prev_is_capitalized'] = False

    # Features for the following token, or a sentinel at the sentence end.
    if i < last_index:
        next_word = sentence[i + 1]
        features['next_word'] = next_word
        features['next_is_capitalized'] = next_word[:1].isupper()
    else:
        features['next_word'] = '<END>'
        features['next_is_capitalized'] = False

    return features

# Toy training corpus: each entry pairs a token list with its tag list.
# Placeholder only - replace with your actual data.
pos_data = [
    (
        ['This', 'is', 'a', 'sample', 'sentence', '.'],
        ['DT', 'VBZ', 'DT', 'NN', 'NN', '.'],
    ),
    (
        ['Another', 'example', 'here', '.'],
        ['DT', 'NN', 'RB', '.'],
    ),
]


# Flatten the corpus into one feature dict and one tag per token.
X_train, y_train = [], []

for sentence, tags in pos_data:
    positions = range(len(sentence))
    X_train.extend(extract_features(sentence, pos) for pos in positions)
    y_train.extend(tags[pos] for pos in positions)

# Fit the vectorizer on the feature dicts and encode them as a matrix
vectorizer = DictVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)

# Train the Logistic Regression model; max_iter is raised to 1000 to give
# the solver room to converge (fit() returns the fitted estimator itself)
lod_model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Define prediction function
def predict_tags_lod(sentence, model, vectorizer):
    """Predict one tag per token of a sentence with the trained LOD model.

    Args:
        sentence: Sequence of string tokens.
        model: Fitted estimator; its ``predict`` method is called on the
            vectorized features.
        vectorizer: Fitted vectorizer; its ``transform`` method is called on
            the list of per-token feature dicts.

    Returns:
        Whatever ``model.predict`` returns for the vectorized sentence.
    """
    token_features = [
        extract_features(sentence, position)
        for position, _ in enumerate(sentence)
    ]
    return model.predict(vectorizer.transform(token_features))

# Try the tagger on a sentence containing a word ('test') absent from pos_data
test_sentence = ['This', 'is', 'a', 'test', '.']
predicted_tags_lod = predict_tags_lod(
    test_sentence, model=lod_model, vectorizer=vectorizer
)

print(f"Test Sentence (LOD): {test_sentence}")
print(f"Predicted Tags (LOD): {predicted_tags_lod}")

# Try the tagger on the first sentence of the training data
test_sentence_trained = ['This', 'is', 'a', 'sample', 'sentence', '.']
predicted_tags_lod_trained = predict_tags_lod(
    test_sentence_trained, model=lod_model, vectorizer=vectorizer
)

print(f"Test Sentence (LOD, Trained): {test_sentence_trained}")
print(f"Predicted Tags (LOD, Trained): {predicted_tags_lod_trained}")

# Output:
# Test Sentence (LOD): ['This', 'is', 'a', 'test', '.']
# Predicted Tags (LOD): ['DT' 'VBZ' 'DT' 'NN' '.']
# Test Sentence (LOD, Trained): ['This', 'is', 'a', 'sample', 'sentence', '.']
# Predicted Tags (LOD, Trained): ['DT' 'VBZ' 'DT' 'NN' 'NN' '.']
