# <span style='color:orange'> Applying Machine Learning Classifiers to the Problem of Chunk Prediction </span>

# Read the Chunking Training Dataset

In [1]:
import pandas as pd
from pandas.io.parsers import read_table

In [3]:
# Load the training file as a headerless, space-delimited table; the columns
# are addressed by integer position (0, 1, 2) in the cells below.
train_text_table = read_table('./data/train.txt', header=None, delimiter=' ')

IOError: File ./data/train.txt does not exist

In [None]:
# Preview the first rows of the loaded training table.
train_text_table.head()

TODO

# Generate the list which contains (word POS_tag) as strings

In [None]:
# Pair every entry of column 0 with the matching entry of column 1 and glue
# the two together into a single 'word POS_tag' string.
train_word_list = [
    word + ' ' + pos_tag
    for word, pos_tag in zip(train_text_table[0], train_text_table[1])
]

In [None]:
# Inspect the third generated 'word POS_tag' string.
train_word_list[2]

In [None]:
# Column 2 supplies the class label of each row, as a plain Python list.
train_classes = train_text_table[2].tolist()

In [None]:
# Inspect the first training label.
train_classes[0]

# Transform string = 'word POS_tag' into feature vector using token counts

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training strings and encode every string as a
# vector of token counts.
# NOTE(review): with its default settings CountVectorizer lowercases the input
# and its token pattern ignores single-character tokens and punctuation, so
# such words contribute no features -- confirm this is intended.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_word_list)
X_train_counts.shape

In [None]:
# Look up the vocabulary entry for the token 'new' (None when it is absent).
count_vect.vocabulary_.get(u'new')

# Convert Occurrences to Frequencies
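Divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called tf, for Term Frequencies. Note that `TfidfTransformer`, used below with its default settings, additionally down-weights terms that occur in many documents (the idf, Inverse Document Frequency, factor), so the resulting features are tf-idf values.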

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Fit the transformer on the training count matrix and re-weight those counts
# in the same step; the fitted transformer is reused for the test data later.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

In [None]:
import numpy as np

# Encode categorical labels (B-NP...) to integer values 

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder used to turn the string class labels into integer ids.
le = LabelEncoder()

In [None]:
# Fit the encoder on the training labels and convert them to integer ids.
int_train_targets = le.fit_transform(train_classes)

In [None]:
# Display the integer-encoded training labels.
int_train_targets

In [None]:
# Display the shape of the encoded training-label array.
int_train_targets.shape

In [None]:
# Report the shapes of the training features and labels.  The captions used to
# say "Testing" although the values printed here are the training ones.
print('Training Data Ready...')
print('Shape of Training Data: {}'.format(np.shape(X_train_tfidf)))
print('Shape of Training Labels: {}'.format(np.shape(int_train_targets)))

# Load the Testing Data and prepare it for testing

In [None]:
# Load the test file with the same headerless, space-delimited layout that is
# used for the training file.
test_text_table = read_table('./reference250.txt', header=None, delimiter=' ')

In [None]:
# Discard rows that contain a missing value.
# NOTE: dropna() keeps the original index labels of the surviving rows, so the
# index may no longer be the contiguous range 0..len-1 afterwards.
test_text_table = test_text_table.dropna()

In [None]:
# Build the 'word POS_tag' strings by walking the two columns in row order.
# The table went through dropna(), which keeps the original index labels;
# looking rows up by label with range(len(...)) could therefore hit a removed
# label (KeyError) or never reach the trailing rows.  Iterating the column
# values directly is independent of the index.
test_word_list = [
    word + ' ' + pos_tag
    for word, pos_tag in zip(test_text_table[0], test_text_table[1])
]

In [None]:
# Inspect the first generated test string.
test_word_list[0]

In [None]:
# Encode the test strings with the vectorizer already fitted on the training
# strings (transform only, no re-fitting).
X_test_counts = count_vect.transform(test_word_list)
X_test_counts.shape

In [None]:
# Re-weight the test counts with the transformer fitted on the training counts.
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
X_test_tfidf.shape

In [None]:
# Column 2 supplies the class label of each test row, as a plain Python list.
test_classes = test_text_table[2].tolist()

In [None]:
# Encode the test labels with the encoder fitted on the training labels.
# NOTE(review): this presumably fails if the test file contains a label that
# never occurs in the training labels -- confirm against the data.
int_test_targets = le.transform(test_classes)

In [None]:
# Report the shapes of the prepared test features and labels.
print('Testing Data Ready...')
test_features_shape = np.shape(X_test_tfidf)
print('Shape of Testing Data: {}'.format(test_features_shape))
test_labels_shape = np.shape(int_test_targets)
print('Shape of Testing Labels: {}'.format(test_labels_shape))

# <span style='color:Green'> Training Classifiers </span>

## <span style='color:red'> Naive Bayes </span>

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes model on the tf-idf training features and the
# integer-encoded training labels.
clf = MultinomialNB().fit(X_train_tfidf, int_train_targets)

In [None]:
# Predict integer class ids for the test features.
predicted = clf.predict(X_test_tfidf)


In [None]:
from sklearn import metrics
# Print the per-class classification report (class names taken from the label
# encoder), followed by the overall accuracy as a percentage.
nb_report = metrics.classification_report(
    int_test_targets, predicted, target_names=le.classes_)
print(nb_report)
nb_accuracy = metrics.accuracy_score(int_test_targets, predicted)
print('Accuracy: {}%'.format(nb_accuracy*100))

## <span style='color:red'> Support Vector Machine </span>

In [None]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
# Pipeline that goes from the raw 'word POS_tag' strings to a prediction:
# token counts -> tf-idf weighting -> linear classifier trained with SGD on
# the hinge loss.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=4, tol=None, n_jobs=5)),
])
text_clf.fit(train_word_list, int_train_targets)

predicted = text_clf.predict(test_word_list)
print(metrics.classification_report(int_test_targets, predicted,
    target_names=le.classes_))
# The confusion matrix is printed explicitly: a bare expression in the middle
# of a cell is evaluated and then thrown away, so it was never shown.  For the
# same reason the bare np.mean(predicted == int_test_targets) expression was
# dropped; the accuracy is printed below.
print(metrics.confusion_matrix(int_test_targets, predicted))

print('Accuracy: {}%'.format(metrics.accuracy_score(int_test_targets, predicted)*100))

In [None]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
# Grid-search an SVC over two kernels and three values of C on the tf-idf
# training features, then list the names of the recorded result entries.
parameters = dict(kernel=('linear', 'rbf'), C=[1, 10, 20])
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train_tfidf, int_train_targets)
sorted(clf.cv_results_)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 50 trees trained on the tf-idf training features.
clf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=10)
clf.fit(X_train_tfidf, int_train_targets)

print(clf.feature_importances_)
predicted = clf.predict(X_test_tfidf)
print(metrics.classification_report(int_test_targets, predicted,
    target_names=le.classes_))
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# and then thrown away, so the confusion matrix was never shown.
print(metrics.confusion_matrix(int_test_targets, predicted))

print('Accuracy: {}%'.format(metrics.accuracy_score(int_test_targets, predicted)*100))

## Perform sliding window operation on data for trigrams

In [None]:
# Add a '* TP' sentinel entry at both ends of the training and test word
# lists -- presumably so the first and last real entries can sit in the
# middle of a trigram window.  Re-running this cell pads the lists again.
for padded_words in (train_word_list, test_word_list):
    padded_words.insert(0, '* TP')
    padded_words.append('* TP')

In [None]:
# Add an 'O' label at both ends of the training and test label lists so they
# stay aligned with the padded word lists.  Re-running this cell pads again.
for padded_labels in (train_classes, test_classes):
    padded_labels.insert(0, 'O')
    padded_labels.append('O')

In [None]:
def nGramSequenceGenerator(labelledlist, labels, n):
    """
    Build overlapping n-gram windows over a sequence of strings.

    A window of ``n`` consecutive items is slid over ``labelledlist`` one
    position at a time; the items of each window are joined with a single
    space.  The window starting at index ``i`` is labelled with
    ``labels[i + n // 2]``, i.e. the label at the centre of the window
    (the right-of-centre item when ``n`` is even).

    Parameters
    ----------
    labelledlist : list of str
        Items to group into windows.
    labels : sequence
        Labels aligned index-by-index with ``labelledlist``.
    n : int
        Window size; must be at least 1.

    Returns
    -------
    (list of str, list)
        The joined windows and their labels, each of length
        ``len(labelledlist) - n + 1`` (both empty when the input holds
        fewer than ``n`` items).

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer, got {!r}'.format(n))
    # One window per valid start position.  The earlier ``len(...) / n`` is a
    # float on Python 3 (``range`` then raises TypeError) and, under integer
    # division, stopped after the first len/n windows instead of sliding over
    # the whole sequence.
    count = max(len(labelledlist) - n + 1, 0)
    # Same index as floor((2*i + n) / 2), written with integer arithmetic.
    centre = n // 2
    ngrammedlist = [' '.join(labelledlist[i:i + n]) for i in range(count)]
    ngrammedlabels = [labels[i + centre] for i in range(count)]
    return ngrammedlist, ngrammedlabels

In [None]:
# Build trigram (n = 3) sequences and their labels from the padded training
# and test lists.
train_ng_list, train_ng_labels = nGramSequenceGenerator(train_word_list,train_classes, 3)
test_ng_list, test_ng_labels = nGramSequenceGenerator(test_word_list,test_classes, 3)

In [None]:
# Inspect the first training trigram string.
train_ng_list[0]

In [None]:
# Rebind `le` to a fresh encoder fitted on the trigram training labels, then
# encode the trigram test labels with that same encoder.
le = LabelEncoder()
int_train_ng_labels = le.fit_transform(train_ng_labels)
int_test_ng_labels = le.transform(test_ng_labels)

In [None]:
# Inspect the first test trigram string.
test_ng_list[0]

### This is the original

In [None]:
# Show the first 20 entries of the word list next to their labels.
for row in range(20):
    print(train_word_list[row], train_classes[row])

### This is after n-gramming

In [None]:
# Show the first 20 training trigrams next to their labels.  The names
# `ng_list` / `ng_labels` used here before are never defined; the generator
# results above are bound to `train_ng_list` / `train_ng_labels`.
for i in range(0,20):
    print(train_ng_list[i], train_ng_labels[i])

In [None]:
# Fit the existing `text_clf` pipeline object again, this time on the trigram
# strings and their encoded labels.
text_clf.fit(train_ng_list, int_train_ng_labels) 

In [None]:
# Evaluate the pipeline on the trigram test strings.
predicted = text_clf.predict(test_ng_list)
print(metrics.classification_report(int_test_ng_labels, predicted,
    target_names=le.classes_))
# The confusion matrix is printed explicitly: a bare expression in the middle
# of a cell is evaluated and then thrown away, so it was never shown.  For the
# same reason the bare np.mean(predicted == int_test_ng_labels) expression was
# dropped; the accuracy is printed below.
print(metrics.confusion_matrix(int_test_ng_labels, predicted))

print('Accuracy: {}%'.format(metrics.accuracy_score(int_test_ng_labels, predicted)*100))

In [None]:
# Display the pipeline step registered under the name 'vect'.
text_clf.named_steps['vect']

In [None]:
# Print the first 20 predicted class ids, one per line.
for position in range(20):
    print(predicted[position])