In [22]:
import pandas as pd
import numpy as np
import pickle
import re
import warnings
import os
warnings.filterwarnings('ignore')

from nltk.tag import CRFTagger

from stemmer import Stemmer
from tokenizer import WordTokenizer
from alphabet import numbers, punctuations
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


In [23]:
def get_traindata(path):
    
    f = open(path)
    
    lines= f.readlines()
    
    i=0
    data = []
    while i < len(lines):
        temp = lines[i].split()

        stmt = []
        if temp[0] == '<s>':
            isend = i+1
            i+=1
            
            while lines[isend].split()[0] != '</s>':
                t = lines[isend].split()
                stmt.append( tuple(t))
                isend +=1
                i+=1
                
        
        i+=1 
        data.append(stmt)

    return data

In [24]:
def feat(idx , tokens ):

    stemmer = Stemmer()
    numbs = numbers.values()
    puncts = punctuations.values()

    token = stemmer.stem(tokens[idx])
    feature_list = []

    if not token:
        return feature_list

    for number in numbs:
        if number in list(token):
            feature_list.append("HAS_NUM")

    for punctuation in puncts:
        if punctuation in list(token):
            feature_list.append("PUNCTUATION")

    feature_list.append("WORD_" + token)

    if len(token) > 1:
        feature_list.append("SUF_" + token[-1:])
        feature_list.append("PRE_" + token[:1])
    if len(token) > 2:
        feature_list.append("SUF_" + token[-2:])
        feature_list.append("PRE_" + token[:2])
    if len(token) > 3:
        feature_list.append("SUF_" + token[-3:])
        feature_list.append("PRE_" + token[:3])

    if idx >= 1:
        previous_token = stemmer.stem(tokens[idx-1])
        if not previous_token:
            return feature_list

        for number in numbs:
            if number in list(previous_token):
                feature_list.append("HAS_NUM")

        for punctuation in puncts:
            if punctuation in list(previous_token):
                feature_list.append("PUNCTUATION")

        if len(previous_token) > 1:
            feature_list.append("SUF_" + previous_token[-1:])
            feature_list.append("PRE_" + previous_token[:1])
        if len(previous_token) > 2:
            feature_list.append("SUF_" + previous_token[-2:])
            feature_list.append("PRE_" + previous_token[:2])
        if len(previous_token) > 3:
            feature_list.append("SUF_" + previous_token[-3:])
            feature_list.append("PRE_" + previous_token[:3])

        feature_list.append("PREV_WORD_" + previous_token)

    if idx >= 2:
        previous_token = stemmer.stem(tokens[idx-2])
        if not previous_token:
            return feature_list

        for number in numbs:
            if number in list(previous_token):
                feature_list.append("HAS_NUM")

        for punctuation in puncts:
            if punctuation in list(previous_token):
                feature_list.append("PUNCTUATION")

        if len(previous_token) > 1:
            feature_list.append("SUF_" + previous_token[-1:])
            feature_list.append("PRE_" + previous_token[:1])
        if len(previous_token) > 2:
            feature_list.append("SUF_" + previous_token[-2:])
            feature_list.append("PRE_" + previous_token[:2])
        if len(previous_token) > 3:
            feature_list.append("SUF_" + previous_token[-3:])
            feature_list.append("PRE_" + previous_token[:3])

        feature_list.append("PREV_PREV_WORD_" + previous_token)



    if idx < len(tokens)-1:
        next_token = stemmer.stem(tokens[idx+1])
        if not next_token:
            return feature_list

        for number in numbs:
            if number in list(next_token):
                feature_list.append("HAS_NUM")

        for punctuation in puncts:
            if punctuation in list(next_token):
                feature_list.append("PUNCTUATION")

        if len(next_token) > 1:
            feature_list.append("SUF_" + next_token[-1:])
            feature_list.append("PRE_" + next_token[:1])
        if len(next_token) > 2:
            feature_list.append("SUF_" + next_token[-2:])
            feature_list.append("PRE_" + next_token[:2])
        if len(next_token) > 3:
            feature_list.append("SUF_" + next_token[-3:])
            feature_list.append("PRE_" + next_token[:3])

        feature_list.append("NEXT_WORD_" + next_token)

    if idx < len(tokens)-2:
        next_token = stemmer.stem(tokens[idx+2])
        if not next_token:
            return feature_list

        for number in numbs:
            if number in list(next_token):
                feature_list.append("HAS_NUM")

        for punctuation in puncts:
            if punctuation in list(next_token):
                feature_list.append("PUNCTUATION")

        if len(next_token) > 1:
            feature_list.append("SUF_" + next_token[-1:])
            feature_list.append("PRE_" + next_token[:1])
        if len(next_token) > 2:
            feature_list.append("SUF_" + next_token[-2:])
            feature_list.append("PRE_" + next_token[:2])
        if len(next_token) > 3:
            feature_list.append("SUF_" + next_token[-3:])
            feature_list.append("PRE_" + next_token[:3])

        feature_list.append("NEXT_NEXT_WORD_" + next_token)

    return feature_list


In [25]:
def get_features(tokens):

    for stmt in tokens:

        for i in range(len(stmt)):

            feature = feat(i ,stmt)
            print( stmt[i] )
            print(feature)
            break
        break


def pos_tag(sentence):
    stemmer = Stemmer()
    sent = stemmer.stem(sentence)
    tokens = WordTokenizer(sent)
    return tokens

In [26]:
def stem_tok(data):
    stemmer = Stemmer()
    new_train = []
    
    for stmt in data :
        temp = []

        for word in stmt :

            sent = stemmer.stem(word[0])
            toks = WordTokenizer(sent)

            if len(toks) == 0:
                continue

            temp.append(tuple( [toks[0] ,word][1] ))
        new_train.append(temp)
    
    return new_train

In [27]:
def train(train_path ):
    train = get_traindata(train_path)
    
    train = stem_tok(train)

    ct = CRFTagger()
    ct.train(train,'model.crf.tagger')
    
    return ct

In [28]:
def stem_tokenize(data ,tag):
    stemmer = Stemmer()
    new_str = []
    new_tags = []
    
    for i in range(len(data)) :
        sent = stemmer.stem(data[i])
        toks = WordTokenizer(sent)

        if len(toks) == 0:
            continue

        new_str.append(toks[0])
        new_tags.append(tag[i])
    
    return new_str ,new_tags

In [29]:
def get_testdata(path):
    
    f = open(path)
    
    lines= f.readlines()
    
    test = []
    tags = []
    
    for stmt in lines:
        temp = stmt.split()
                
        string = []
        tag = []
        
        for j in range(len(temp)):
            
            t = temp[j].split('_')

            if j!=len(temp)-1:
                string.append(t[0])
                
                if len(t) == 2:
                    tag.append(t[1])
                else:
                    tag.append(t[1]+'_'+t[2])

        
        string ,tag = stem_tokenize(string ,tag)
        test.append(string)
        tags.append(tag)

    
    return test , tags
        

In [30]:
def get_tags(data):
    pred = []
    
    for i in data:
        temp= []
        for j in i:
             temp.append(j[1])
        
        pred.append(temp)
        
    return pred 

In [31]:
def test_fun(model ,test_path):
    
    test,labels = get_testdata(test_path)
    
    output = []
    
    for i in test:
        output.append(model.tag(i))
    
    pred_tags = get_tags(output)
    
    label = []
    predict = []
    for i in range(len(labels)):
        for j in range( len(labels[i])):

            label.append(labels[i][j])
            predict.append(pred_tags[i][j])
    
    le = preprocessing.LabelEncoder()

    full = label + predict
    le.fit(full)

    labels = le.transform(label)
    preds = le.transform(predict)
    
    print('Accuracy : '+ str(accuracy_score(labels ,preds)))
    print('F1 Score : '+ str(f1_score(labels ,preds ,average='macro')))

In [32]:
print('Art and Culture')
model = train('train_data/train_guj_art and culture_sample1.txt')
test_fun(model ,'test_data/guj_art and culture_sample1_tags.txt')


Art and Culture
Accuracy : 0.685150955021565
F1 Score : 0.4594274470259343


In [33]:
print('Economy')
model = train('train_data/train_guj_economy_sample2.txt')
test_fun(model ,'test_data/guj_economy_sample2_tags.txt')


Economy
Accuracy : 0.734910277324633
F1 Score : 0.5310406967351758


In [16]:
print('Entertainment')
model = train('train_data/train_guj_entertainment_sample3.txt')
test_fun(model ,'test_data/guj_entertainment_sample3_tags.txt')

Entertainment
Accuracy : 0.6373008434864105
F1 Score : 0.4300666018153045


In [17]:
print('Philosophy')
model = train('train_data/train_guj_philosophy_sample4.txt')
test_fun(model ,'test_data/guj_philosophy_sample4_tags.txt')

Philosophy
Accuracy : 0.7438345266507558
F1 Score : 0.5563661500554326


In [18]:
print('Religion')
model = train('train_data/train_guj_religion_sample5.txt')
test_fun(model ,'test_data/guj_religion_sample5_tags.txt')

Religion
Accuracy : 0.6538461538461539
F1 Score : 0.4877179531472362


In [19]:
print('Science And Tech')
model = train('train_data/train_guj_science and technology_sample6.txt')
test_fun(model ,'test_data/guj_science and technology_sample6_tags.txt')

Science And Tech
Accuracy : 0.6975512905360688
F1 Score : 0.49164867291251735


In [20]:
print('Sports')
model = train('train_data/train_guj_sports_sample7.txt')
test_fun(model ,'test_data/guj_sports_sample7_tags.txt')

Sports
Accuracy : 0.6161971830985915
F1 Score : 0.4914905327674904
