In [1]:
from nltk.tokenize import word_tokenize, PunktSentenceTokenizer
from nltk.corpus import state_union, treebank
# Tagged sentences of the NLTK treebank corpus; the rest of the script
# consumes each sentence as a sequence of (word, tag) pairs.
tagged_sentences = treebank.tagged_sents()
# print(tagged_sentences[:5])  # uncomment to inspect a few sentences
from sklearn import svm
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
import numpy as np
import scipy as sp

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn that only ships the legacy module
    from sklearn.cross_validation import train_test_split

#defining features

def features(sentence, index):
    """Build the feature dictionary describing one word of a sentence.

    Args:
        sentence: sequence of word strings.
        index: position in ``sentence`` of the word to describe.

    Returns:
        A dict mapping feature names to string or boolean values: the word
        itself, its position flags, casing flags, 1-3 character prefixes and
        suffixes, the neighbouring words ('' at a sentence boundary), and
        hyphen / digit / inner-capital flags.

    Raises:
        IndexError: if ``index`` is outside ``sentence``.
    """
    word = sentence[index]
    last_index = len(sentence) - 1
    # Slices (word[:1], word[-1:]) are used instead of word[0] / word[-1] so
    # that an empty token produces '' rather than raising IndexError; for a
    # non-empty token they yield exactly the same character.
    first_char = word[:1]
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last_index,
        'is_capitalized': first_char.upper() == first_char,
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        'prefix-1': first_char,
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == last_index else sentence[index + 1],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        # True when any character after the first is not lower-case.
        'capitals_inside': word[1:].lower() != word[1:]
    }

#function to extract the words, as a list, from a tagged sentence

def untag(tagged_sentence):
    """Return the words of a tagged sentence, in order, without their tags.

    Args:
        tagged_sentence: iterable of (word, tag) pairs.

    Returns:
        A list holding the first element of every pair.
    """
    words = []
    for word, _tag in tagged_sentence:
        words.append(word)
    return words

# Hold out part of the corpus for evaluation. The seed is fixed so the split
# is the same on every run; no test_size is passed, so the library's default
# split ratio applies.
training_sentences, test_sentences = train_test_split(
    tagged_sentences,
    random_state=7,
)
# print(training_sentences)  # uncomment to inspect the training split

#function to transform into feature and response matrix.

def transform_to_dataset(tagged_sentences):
    """Turn tagged sentences into per-word feature dicts and tag labels.

    Args:
        tagged_sentences: iterable of sentences, each a sequence of
            (word, tag) pairs.

    Returns:
        A tuple ``(X, y)`` of two parallel lists: ``X`` holds one feature
        dictionary (from ``features``) per word, ``y`` holds that word's tag.
    """
    X, y = [], []

    for tagged in tagged_sentences:
        # Strip the tags once per sentence; doing it inside the word loop
        # rebuilt the same word list for every single word.
        words = untag(tagged)
        for index, (_, pos_tag) in enumerate(tagged):
            X.append(features(words, index))
            y.append(pos_tag)

    return X, y
X_train, y_train = transform_to_dataset(training_sentences)
# X_train is a list with one feature dictionary per word of the training sentences.
# y_train is a list with the tag of each of those words, in the same order.

#training the classifier

# `sparse` must be a real boolean. The previous value was the string 'False',
# which is truthy (so the vectorizer produced sparse output anyway) and is
# rejected outright by scikit-learn versions that validate parameter types.
# True keeps the behaviour the script actually had.
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('classifier', svm.SVC(kernel='rbf', C=7000, random_state=7,
                           decision_function_shape='ovr')),
])
clf.fit(X_train, y_train)
# print(clf.named_steps['vectorizer'].feature_names_)  # uncomment to inspect features
X_test, y_test = transform_to_dataset(test_sentences)
# Report the pipeline's score on the held-out sentences.
print(clf.score(X_test, y_test))

#predicting the values

def tag(sentence):
    """Predict a tag for every word of a tokenized sentence.

    Args:
        sentence: sequence of word strings.

    Returns:
        A list of (word, predicted_tag) pairs in sentence order; an empty
        list when ``sentence`` is empty.
    """
    # Nothing to tag. Returning early also avoids handing the pipeline an
    # empty sample list, which scikit-learn rejects with a ValueError
    # (e.g. when the user just presses Enter at the prompt).
    if len(sentence) == 0:
        return []
    word_features = [features(sentence, index) for index in range(len(sentence))]
    tags = clf.predict(word_features)
    return list(zip(sentence, tags))

#using the POS tagger to predict

def _group_proper_nouns(words, tags):
    """Join each run of consecutive 'NNP'-tagged words into one name.

    Args:
        words: sequence of word strings.
        tags: sequence of tags, index-aligned with ``words``.

    Returns:
        A list of strings, one per maximal run of adjacent 'NNP' words, with
        the words of a run separated by single spaces (no leading space).
    """
    names = []
    current = []
    for word, pos_tag in zip(words, tags):
        if pos_tag == "NNP":
            current.append(word)
        elif current:
            # A non-NNP word closes the run collected so far.
            names.append(" ".join(current))
            current = []
    # Flush a run that reaches the end of the sentence.
    if current:
        names.append(" ".join(current))
    return names


# Read one line of text, tag it, and print the words, their tags, and the
# proper-noun groups found.
text = input()
token = word_tokenize(text)
tagged = tag(token)
# `ner` holds the words and `pos` their predicted tags, index-aligned.
ner = [word for word, _ in tagged]
pos = [pos_tag for _, pos_tag in tagged]
print(ner)
print(pos)

# Previously a single name was printed with a leading space, names of three or
# more words lost the spaces between their leading words, and a "!!!" sentinel
# was appended to `pos`; the helper avoids all three.
ans = _group_proper_nouns(ner, pos)
for name in ans:
    print(name)



0.957925440524
A prince named Abdullah whose wife was Chuchi had big boobs.
['A', 'prince', 'named', 'Abdullah', 'whose', 'wife', 'was', 'Chuchi', 'had', 'big', 'boobs', '.']
['DT', 'NN', 'VBD', 'NNP', 'WP$', 'NN', 'VBD', 'NNP', 'VBD', 'JJ', 'NNS', '.']
 Abdullah
 Chuchi
