# Imports

In [13]:
import nltk
from nltk import word_tokenize
from nltk.corpus import treebank
import pprint
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

# Datasets download

In [14]:
nltk.download('treebank')
nltk.download('punkt')
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Tagged sentences

In [33]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()

In [16]:
def features(sentence, index):
  return {  
  'word': sentence[index], 
  'is_first': index == 0, 
  'is_last': index == len(sentence) - 1,    
  'is_capitalized': sentence[index][0].upper() == sentence[index][0],     
   'is_all_caps': sentence[index].upper() == sentence[index],    
  'is_all_lower': sentence[index].lower() == sentence[index],    
  'prefix-1': sentence[index][0],       
  'prefix-2': sentence[index][:2],      
  'prefix-3': sentence[index][:3],       
  'suffix-1': sentence[index][-1],     
  'suffix-2': sentence[index][-2:],      
  'suffix-3': sentence[index][-3:],     
  'prev_word': '' if index == 0 else sentence[index - 1],    
  'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],   
  'has_hyphen': '-' in sentence[index],  
  'is_numeric': sentence[index].isdigit(),   
  'capitals_inside': sentence[index][1:].lower() != sentence[index][1:] 
   }  

In [45]:
def transform_to_dataset(sentences):  
  X, y = [],[]    
  for tagged in sentences:    
    i=0
    while(i<len(tagged)):   
      X.append(features([w for w , t in tagged],i))              
      y.append(tagged[i][1]) 
      i+=1      
  return X, y  

In [46]:
part=int(0.8 * len(tagged_sentences)) 
x_train, y_train = transform_to_dataset( tagged_sentences[:part])
x_test, y_test = transform_to_dataset( tagged_sentences[part:])

In [48]:
classifier=Pipeline([
                     ('vectorizer', DictVectorizer(sparse=False)),
                     ('classifier', DecisionTreeClassifier(criterion='entropy'))
                     ]) 
classifier.fit(x_train[:10000], y_train[:10000])

train acc: 0.8961270880613118
test acc 0.8937072708218973


In [50]:
def pos_tag(sentence): 
  tags = classifier.predict([features(sentence, index) for index in range(len(sentence))]) 
  return list(zip(sentence, tags) )

In [None]:
print("train acc:",classifier.score(x_train, y_train))
print("test acc",classifier.score(x_test, y_test))

# Sample for output of your PoS tagger

In [52]:
print(list(pos_tag(word_tokenize('This is my friend, John.'))))

[('This', 'DT'), ('is', 'VBZ'), ('my', 'NN'), ('friend', 'NN'), (',', ','), ('John', 'NNP'), ('.', '.')]


In [51]:
print(list(pos_tag(word_tokenize("let's go shopping"))))

[('let', 'VBD'), ("'s", 'POS'), ('go', 'NN'), ('shopping', 'VBG')]
