# Creation and Formatting of Data 

Here we take data and spllit it acording to the tags, Now in case of multiple tags we only consider thetag that is given first.

In [42]:
def read_data(train_data):
    all_sentences = []
    for line in train_data:
        sentence = []
        for tup in line:
            word=tup[0].lower()
            tag=tup[1].lower()
            tag=tag.split('+')[0]
            tag=tag.split('-')[0]
            sentence.append((word, tag))
        all_sentences.append(sentence)

    return all_sentences

# Feature Extraction

As no features were given we took features that could be found in the text. These features were : Is it the first or last word, or was it fully or partially capitalized or was it a number or was it a prefix or a suffix

In [43]:
def get_feature(token, token_index, sent,vocab_idx):
    token_feature = {    
        'token'             : token,                                    # Token itself
        'is_first'          : token_index == 0,                         # Is token at the beginning of the sentence
        'is_last'           : token_index == len(sent)-1,               # Is token at the end of the sentence
        'is_capitalized'    : token[0].upper() == token[0],             # Is first letter of token a capital letter
        'is_all_capitalized': token.upper() == token,                   # Are all letters of token capital letters
        'is_capitals_inside': token[1:].lower() != token[1:],           # Is there any capital letters in the token
        'is_numeric'        : token.isdigit(),                          # Is there any digits in the token
        'prefix-1'          : token[0],                                 # Token prefix containing only one letter
        'prefix-2'          : '' if len(token) < 2  else token[:1],     # Token prefix containing two letters
        'suffix-1'          : token[-1],                                # Token suffix containing only one letter
        'suffix-2'          : '' if len(token) < 2  else token[-2:],    # Token suffix containing two letters
        'prev-token'        : '' if token_index == 0     else sent[token_index - 1][0],     # Previous token in the sentence
        '2-prev-token'      : '' if token_index <= 1     else sent[token_index - 2][0],     # Two previous token in the sentence
        'next-token'        : '' if token_index == len(sent) - 1     else sent[token_index + 1][0],     # Next token in the sentence
        '2-next-token'      : '' if token_index >= len(sent) - 2     else sent[token_index + 2][0]      # Two next token in the sentence
        }
    
    if token not in vocab.keys():       # Add token to vocabulary
        vocab[token] = vocab_idx
        vocab_idx += 1
    
    # Adding features for a token 
    
    
    if token_feature['prefix-1'] not in vocab.keys():
        vocab[token_feature['prefix-1']] = vocab_idx
        vocab_idx += 1

    if token_feature['prefix-2'] not in vocab.keys():
        vocab[token_feature['prefix-2']] = vocab_idx
        vocab_idx += 1

    if token_feature['suffix-1'] not in vocab.keys():
        vocab[token_feature['suffix-2']] = vocab_idx
        vocab_idx += 1

    if token_feature['suffix-2'] not in vocab.keys():
        vocab[token_feature['suffix-2']] = vocab_idx
        vocab_idx += 1

    if token_feature['prev-token'] not in vocab.keys():
        vocab[token_feature['prev-token']] = vocab_idx
        vocab_idx += 1

    if token_feature['2-prev-token'] not in vocab.keys():
        vocab[token_feature['2-prev-token']] = vocab_idx
        vocab_idx += 1

    if token_feature['next-token'] not in vocab.keys():
        vocab[token_feature['next-token']] = vocab_idx
        vocab_idx += 1

    if token_feature['2-next-token'] not in vocab.keys():
        vocab[token_feature['2-next-token']] = vocab_idx
        vocab_idx += 1

    return token_feature, vocab_idx


# Making X and Y for logestic regression

Now we need to make features and poition labels for the logestic regression ie the X and Y axis for it. Therefore for all words we extract features. We also need to transform the features to make it fit for logestic regression.

In [44]:
def form_data(all_sentences):
    features   = []                                                     # X
    pos_labels = []                                                     # Y
    vocab_idx = 0
    
    for sent in all_sentences:
        for token_index, token_pair in enumerate(sent):                 # Pick the index and token together 
            token = token_pair[0]                                       
            f,vocab_idx=get_feature(token, token_index, sent,vocab_idx)# Extract features from token and append it to features list (x_train)
            features.append(f)
            pos_label = token_pair[1]                                   
            pos_labels.append(pos_label)                                # Append pos label to pos_labels list (y_train) 
    
    x_train = transform(features,vocab_idx) # Convert the X to vector form

    
    x_train=np.array(x_train)
    return x_train, pos_labels 

# Tranformation of the features 

Now the X is in form of a dictionary with true and false values and some suffix prefix and other data thus we need to convert it into a vector form to be understood by logestic regression.

In [45]:
def transform(x_train,vocab_idx):
    X = []
    for feature in x_train:
        x_features = []
        for key in feature:
            if feature[key] is True:
                x_features.append(1)
            elif feature[key] is False:
                x_features.append(0)
            else:
                if feature[key] not in vocab:
                    x_features.append(vocab_idx)
                    vocab_idx += 1
                else:
                    x_features.append(vocab[feature[key]])

        X.append(x_features)
    return X

# Logistic Regression

The basic logestic regression model

In [50]:
import numpy as np
class LogisticRegression:
    def __init__(self, learning_rate=0.01, num_iter=10, fit_intercept=True, verbose=False):
        self.learning_rate = learning_rate
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
        self.verbose=verbose
    
    def __add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.hstack((intercept, X))
        
        
    def __sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def __loss(self, h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
    
    def fit(self, X, y):
        if self.fit_intercept:
            X = self.__add_intercept(X)
        
        # weights initialization
        self.w = np.zeros(X.shape[1])
        
        for i in range(self.num_iter):
            z = np.dot(X, self.w)
            h = self.__sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / len(y)
            self.w -= self.learning_rate * gradient
            
            if(self.verbose == True and i % 10 == 0):
                z = np.dot(X, self.w)
                h = self.__sigmoid(z)
        return self.w
    
    def predict_prob(self,X,wt):
        if self.fit_intercept:
            X = self.__add_intercept(X)
    
        return self.__sigmoid(np.dot(X,wt))


# Calculating Accuracy

Getting prediction and actual labels and testing how many of them are right and how many are wrong

In [47]:
def calc_accuracy(y_test, pred):
    score=0
    for i in range(0,len(y_test)):
        if y_test[i]==pred[i]:
            score+=1
    
    return score*100.0/len(y_test)

# Driver Code  : Training

In [55]:
from nltk.corpus import brown 
length = len(brown.tagged_sents())

train_data = brown.tagged_sents()[:1000]            # Training Data
all_sentences = read_data(train_data)               # Read all sentences
x_train,y_train = form_data(all_sentences)          # making X and Y for logestic regression

classes= list(set(y_train))                         # Getting classes for making models

W=[]                                                # Weight list for all tags

model = LogisticRegression()                        # Model : Logestic Regression

for c in classes:                                   # One vs All : All classes have a model fitted with all other 0
    y_label=[]
    for y in y_train:
        if y == c:
            y_label.append(1)     # Current
        else:
            y_label.append(0)     # All Others
            
    weight=model.fit(x_train, y_label)
    W.append(weight)              # All class models here



  from ipykernel import kernelapp as app


# Driver Code : Testing

In [56]:
test_data = brown.tagged_sents()[-200:]            # Data for testing
all_sentence = read_data(test_data)                # Fetching sentences
x_test, y_test = form_data(all_sentence)           # Making X and Y

result=[]
for i in range(0,len(classes)):
    p = model.predict_prob(x_test,W[i])            # Testing and fetching prediction
    result.append(p)

result = np.array(result).T

pred=[]
for r in result:
    index=np.argmax(r,axis=0)                     # As it is one vs all : Argmax is the actual prediction
    pred.append(classes[index])

# Measure accuracy
accuracy   = calc_accuracy(y_test, pred)         # Test with actual data
print "Accuracy : ",accuracy

Accuracy :  10.3852263701


  from ipykernel import kernelapp as app
