# <span style='color:orange'> Applying Machine Learning Classifiers to problem of Chunk Prediction <span>

# Read the Chunking Training Dataset

In [1]:
import pandas as pd
from pandas.io.parsers import read_table

In [4]:
train_text_table = read_table('./data/train.txt', header=None, delimiter=' ')

In [5]:
train_text_table.head()

Unnamed: 0,0,1,2
0,Confidence,NN,B-NP
1,in,IN,B-PP
2,the,DT,B-NP
3,pound,NN,I-NP
4,is,VBZ,B-VP


# Generate the list which contains (word POS_tag) as strings

In [6]:
train_word_list = []
for i in range(len(train_text_table)):
    train_word_list.append(train_text_table[0][i] + ' ' + train_text_table[1][i])

In [14]:
train_word_list[1]

'in IN'

In [8]:
train_classes = train_text_table[2].tolist()

In [9]:
train_classes[0]

'B-NP'

# Transform string = 'word POS_tag' into feature vector using token counts

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_word_list)
X_train_counts.shape

(211727, 14867)

In [15]:
count_vect.vocabulary_.get(u'in')

6949

In [16]:
X_train_tfidf = X_train_counts

# Convert Occurences to Frequencies
Divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called tf for Term Frequencies.

In [42]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

In [23]:
import numpy as np

# Encode categorical labels (B-NP...) to integer values 

In [17]:
from sklearn.preprocessing import LabelEncoder

In [18]:
le = LabelEncoder()

In [19]:
int_train_targets = le.fit_transform(train_classes)

In [20]:
int_train_targets

array([ 5,  6,  5, ..., 15,  1, 21])

In [21]:
int_train_targets.shape

(211727,)

In [24]:
print('Training Data Ready...')
print('Shape of Testing Data: {}'.format(np.shape(X_train_tfidf)))
print('Shape of Testing Labels: {}'.format(np.shape(int_train_targets)))

Training Data Ready...
Shape of Testing Data: (211727, 14867)
Shape of Testing Labels: (211727,)


# Load the Testing Data and prepare it for testing

In [27]:
test_text_table = read_table('./data/reference250.txt', header=None, delimiter=' ')

In [28]:
test_text_table = test_text_table.dropna()

In [29]:
test_word_list = []
for i in range(len(test_text_table)):
    test_word_list.append(test_text_table[0][i] + ' ' + test_text_table[1][i])

In [30]:
test_word_list[0]

'Rockwell NNP'

In [31]:
X_test_counts = count_vect.transform(test_word_list)
X_test_counts.shape

(6002, 14867)

In [33]:
X_test_tfidf = X_test_counts

In [None]:
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
X_test_tfidf.shape

In [34]:
test_classes = test_text_table[2].tolist()

In [35]:
int_test_targets = le.transform(test_classes)

In [36]:
print('Testing Data Ready...')
print('Shape of Testing Data: {}'.format(np.shape(X_test_tfidf)))
print('Shape of Testing Labels: {}'.format(np.shape(int_test_targets)))

Testing Data Ready...
Shape of Testing Data: (6002, 14867)
Shape of Testing Labels: (6002,)


# <span style='color:Green'> Training Classifiers </span>

## <span style='color:red'> Naive Bayes </span>

In [37]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, int_train_targets)

In [38]:
predicted = clf.predict(X_test_tfidf)


In [39]:
from sklearn import metrics
print(metrics.classification_report(int_test_targets, predicted,
    target_names=le.classes_))
print('Accuracy: {}%'.format(metrics.accuracy_score(int_test_targets, predicted)*100))

             precision    recall  f1-score   support

     B-ADJP       0.80      0.08      0.14        53
     B-ADVP       0.47      0.83      0.60        96
    B-CONJP       0.00      0.00      0.00         2
     B-INTJ       0.89      0.69      0.78      1583
      B-LST       0.84      0.99      0.91       626
       B-NP       1.00      0.10      0.18        10
       B-PP       0.83      0.68      0.75        50
      B-PRT       0.89      0.83      0.86       577
     B-SBAR       0.00      0.00      0.00        27
      B-UCP       0.00      0.00      0.00        12
       B-VP       0.00      0.00      0.00         1
     I-ADJP       0.58      0.89      0.70      1877
     I-ADVP       0.00      0.00      0.00         5
    I-CONJP       0.00      0.00      0.00         2
     I-INTJ       0.73      0.67      0.69       273
       I-NP       0.62      0.13      0.22       808

avg / total       0.73      0.71      0.68      6002

Accuracy: 71.0763078974%


  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)


## <span style='color:red'> Support Vector Machine </span>

In [43]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-4, random_state=25,
                                           max_iter=4, tol=None, n_jobs=8)),
])
text_clf.fit(train_word_list, int_train_targets)  

predicted = text_clf.predict(test_word_list)
np.mean(predicted == int_test_targets)            
print(metrics.classification_report(int_test_targets, predicted,
    target_names=le.classes_))
metrics.confusion_matrix(int_test_targets, predicted)

print('Accuracy: {}%'.format(metrics.accuracy_score(int_test_targets, predicted)*100))

             precision    recall  f1-score   support

     B-ADJP       0.25      0.04      0.07        53
     B-ADVP       0.63      0.67      0.65        96
    B-CONJP       0.00      0.00      0.00         2
     B-INTJ       0.92      0.64      0.75      1583
      B-LST       0.85      0.98      0.91       626
       B-NP       0.45      0.50      0.48        10
       B-PP       0.79      0.68      0.73        50
      B-PRT       0.91      0.84      0.87       577
     B-SBAR       0.00      0.00      0.00        27
      B-UCP       0.50      0.17      0.25        12
       B-VP       0.00      0.00      0.00         1
     I-ADJP       0.75      0.89      0.81      1877
     I-ADVP       0.00      0.00      0.00         5
    I-CONJP       0.00      0.00      0.00         2
     I-INTJ       0.70      0.72      0.71       273
       I-NP       0.82      0.97      0.89       808

avg / total       0.82      0.81      0.80      6002

Accuracy: 81.1729423525%


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)

grid = GridSearchCV(SVC(), param_grid=param_grid)
grid.fit(X_train_tfidf, int_train_targets)



In [None]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10, 20]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train_tfidf, int_train_targets)
sorted(clf.cv_results_.keys())

In [45]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=10)
clf.fit(X_train_tfidf, int_train_targets)

print(clf.feature_importances_)
predicted = clf.predict(X_test_tfidf)
print(metrics.classification_report(int_test_targets, predicted,
    target_names=le.classes_))
metrics.confusion_matrix(int_test_targets, predicted)

print('Accuracy: {}%'.format(metrics.accuracy_score(int_test_targets, predicted)*100))

[  5.35817177e-06   2.57644814e-04   5.46220577e-06 ...,   6.06758083e-07
   3.90054266e-06   8.98451057e-06]
             precision    recall  f1-score   support

     B-ADJP       0.45      0.38      0.41        53
     B-ADVP       0.59      0.72      0.65        96
    B-CONJP       0.00      0.00      0.00         2
     B-INTJ       0.00      0.00      0.00         0
      B-LST       0.82      0.74      0.78      1583
       B-NP       0.85      0.98      0.91       626
       B-PP       0.45      0.50      0.48        10
      B-PRT       0.79      0.68      0.73        50
     B-SBAR       0.93      0.82      0.87       577
      B-UCP       0.00      0.00      0.00        27
       B-VP       0.33      0.08      0.13        12
     I-ADJP       0.00      0.00      0.00         1
     I-ADVP       0.79      0.81      0.80      1877
    I-CONJP       0.00      0.00      0.00         5
     I-INTJ       0.00      0.00      0.00         2
       I-NP       0.73      0.62      0.6

## Perform sliding window operation on data for trigrams

In [None]:
train_word_list.insert(0, '* TP')
train_word_list.append('* TP')

test_word_list.insert(0, '* TP')
test_word_list.append('* TP')

In [None]:
train_classes.insert(0, 'O')
train_classes.append('O')

test_classes.insert(0, 'O')
test_classes.append('O')

In [None]:
def nGramSequenceGenerator(labelledlist, labels, n):
    """
    Takes as input the label list of tuples generated by the code above. 
    The function generates sequence of size "n" from the given list. 
    """
    count = len(labelledlist)/n
    ngrammedlist = []
    ngrammedlabels = []
    for i in range(count):
        ngrammedlist.append( ' '.join(labelledlist[i : i + n]))
        ngrammedlabels.append(labels[ int(np.floor((2.0*i + n)/2.0))])
    return ngrammedlist, ngrammedlabels

In [None]:
train_ng_list, train_ng_labels = nGramSequenceGenerator(train_word_list,train_classes, 3)
test_ng_list, test_ng_labels = nGramSequenceGenerator(test_word_list,test_classes, 3)

In [None]:
train_ng_list[0]

In [None]:
le = LabelEncoder()
int_train_ng_labels = le.fit_transform(train_ng_labels)
int_test_ng_labels = le.transform(test_ng_labels)

In [None]:
test_ng_list[0]

### This is the original

In [None]:
for i in range(0,20):
    print(train_word_list[i], train_classes[i])

### This after ngramming

In [None]:
for i in range(0,20):
    print(ng_list[i], ng_labels[i])

In [None]:
text_clf.fit(train_ng_list, int_train_ng_labels) 

In [None]:
predicted = text_clf.predict(test_ng_list)
np.mean(predicted == int_test_ng_labels)            
print(metrics.classification_report(int_test_ng_labels, predicted,
    target_names=le.classes_))
metrics.confusion_matrix(int_test_ng_labels, predicted)

print('Accuracy: {}%'.format(metrics.accuracy_score(int_test_ng_labels, predicted)*100))

In [None]:
text_clf.named_steps['vect']

In [None]:
for i in range(0,20):
    print(predicted[i])