# Imports

In [2]:
import nltk
from nltk import word_tokenize
from nltk.corpus import treebank
import pprint
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

# Download datasets

In [3]:
# Fetch the 'treebank' corpus package; `treebank.tagged_sents()` is read from it below.
nltk.download('treebank')
# Fetch the 'punkt' package -- presumably the tokenizer model that `word_tokenize`
# relies on (TODO confirm for the installed NLTK version).
# As the last expression of the cell, this call's return value is what the cell displays.
nltk.download('punkt')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Tagged sentences

In [4]:
# Load the tagged treebank sentences; each sentence is a sequence of (word, tag) pairs.
tagged_sentences = treebank.tagged_sents()

# Sanity check: show how many tokens the first sentence contains.
first_sentence = tagged_sentences[0]
print(len(first_sentence))

18


In [6]:
from sklearn.model_selection import train_test_split

def get_word_feature(word, index):
  """Build the feature dictionary describing a single token.

  Args:
    word: The token text (a string).
    index: Zero-based position of the token inside its sentence.

  Returns:
    A dict mapping feature names to string or boolean values: the token
    itself, its position flag, casing flags, prefixes/suffixes of length
    1-3, and a few shape indicators (hyphen, digits, inner capitals, length).
  """
  # Slices (word[:1], word[-1:]) instead of word[0] / word[-1]: the result is
  # identical for non-empty tokens, but an empty token yields '' rather than
  # raising IndexError.
  first_char = word[:1]
  tail = word[1:]
  return {
      'word': word,
      'is_first': index == 0,
      # True for any token whose first character is unchanged by upper(),
      # which includes digits and punctuation as well as capital letters.
      'is_capitalized': first_char.upper() == first_char,
      'is_all_caps': word.upper() == word,
      'is_all_lower': word.lower() == word,
      'prefix-1': first_char,
      'prefix-2': word[:2],
      'prefix-3': word[:3],
      'suffix-1': word[-1:],
      'suffix-2': word[-2:],
      'suffix-3': word[-3:],
      'has_hyphen': '-' in word,
      'is_numeric': word.isdigit(),
      # Any uppercase character after the first position.
      'capitals_inside': tail.lower() != tail,
      'is_short': len(word) < 4,
      'is_long': len(word) > 10,
  }

# Create the dataset: one feature dict (X) and one gold tag (y) per token.
X, y = [], []

for tagged_sent in tagged_sentences:
    # Iterate the (word, tag) pairs directly; `index` is the token's position
    # in its sentence, which get_word_feature uses for the 'is_first' flag.
    for index, (word, tag) in enumerate(tagged_sent):
        X.append(get_word_feature(word, index))
        y.append(tag)

# Every feature dict must have exactly one label.
assert len(X) == len(y)

# Split data into 80% train / 20% test. A fixed random_state makes the shuffle
# (and therefore the reported accuracy) repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [10]:
# Create the classifier pipeline: feature dicts -> dense one-hot matrix -> decision tree.
classifier = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    # random_state is fixed so that fitting is deterministic; without it the
    # tree can pick different splits between runs when candidates tie.
    ('classifier', DecisionTreeClassifier(criterion='entropy', random_state=42)),
])

# Train on at most TRAIN_LIMIT samples.
# NOTE(review): the cap is presumably there to keep the dense feature matrix
# within memory -- confirm before raising it.
TRAIN_LIMIT = 50000
classifier.fit(X_train[:TRAIN_LIMIT], y_train[:TRAIN_LIMIT])

# Report accuracy on the held-out test split.
print("acc: ", classifier.score(X_test, y_test))

# predict tags
def pos_tag(tokens):
  """Predict a part-of-speech tag for every token of one sentence.

  Args:
    tokens: Sequence of token strings forming a single sentence.

  Returns:
    An iterator of (token, tag) pairs in sentence order, using the
    module-level `classifier` pipeline for prediction.
  """
  # Nothing to predict for an empty sentence; return an empty iterator
  # without calling the classifier.
  if len(tokens) == 0:
    return zip((), ())

  # get_word_feature takes exactly (word, index); the previous call passed a
  # third argument (the sentence length) and raised TypeError.
  features = [get_word_feature(token, index) for index, token in enumerate(tokens)]
  tags = classifier.predict(features)
  return zip(tokens, tags)
  


acc:  0.9241160111243544


# Sample output of the PoS tagger

In [17]:
# Tokenize a sample sentence, tag it, and materialize the (token, tag) pairs for display.
sample_tokens = word_tokenize('This is my friend, John.')
print(list(pos_tag(sample_tokens)))

[('This', 'DT'), ('is', 'VBZ'), ('my', 'NN'), ('friend', 'NN'), (',', ','), ('John', 'NNP'), ('.', '.')]
