# NLP HW3


Name    : Thamme Gowda   
USCID : 2074-6694-39   


In [76]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite as crf
import os, sys, glob, csv
from collections import namedtuple
import pandas as pd
import numpy as np
from collections import defaultdict as ddict

In [91]:
# Code given by TAs 
def get_utterances_from_file(dialog_csv_file):
    """Parse an already-open dialog CSV into a list of DialogUtterances.

    dialog_csv_file - any iterable of CSV lines (typically an open file)
    whose first line is the header row.
    """
    utterances = []
    for row in csv.DictReader(dialog_csv_file):
        utterances.append(_dict_to_dialog_utterance(row))
    return utterances

def get_utterances_from_filename(dialog_csv_filename):
    """Open the dialog CSV at the given path and return its DialogUtterances.

    The file is closed again before the list is returned.
    """
    with open(dialog_csv_filename, "r") as csv_handle:
        utterances = get_utterances_from_file(csv_handle)
    return utterances

def get_data(data_dir):
    """Lazily yield (filename, utterances) for every dialog CSV in a directory.

    Files are visited in sorted filename order, and each one is parsed only
    when the generator reaches it. Use list(get_data(data_dir)) to load
    every dialog at once.

    data_dir - a dir with csv files containing dialogs
    """
    csv_pattern = os.path.join(data_dir, "*.csv")
    for csv_path in sorted(glob.glob(csv_pattern)):
        yield csv_path, get_utterances_from_filename(csv_path)

# A single token paired with its part-of-speech tag.
PosTag = namedtuple("PosTag", ["token", "pos"])

# One utterance of a dialog: act tag (label), speaker, POS-tagged tokens, raw text.
DialogUtterance = namedtuple(
    "DialogUtterance", ["act_tag", "speaker", "pos", "text"])

def _dict_to_dialog_utterance(du_dict):
    """Private method for converting a dict to a DialogUtterance.

    du_dict - one CSV row as a dict; it must contain a "pos" key and its keys
    must match the DialogUtterance field names.

    Blank or missing values become None. A non-empty "pos" value is split on
    whitespace into "token/POS" pairs, each of which becomes a PosTag.
    The input dict is left unmodified.
    """
    # Replace blank fields with None. Values can already be None when a CSV
    # row is shorter than the header, so guard before calling strip().
    fields = {}
    for key, value in du_dict.items():
        if value is None or not value.strip():
            fields[key] = None
        else:
            fields[key] = value

    # Extract tokens and POS tags. Split on the LAST slash only, so a token
    # that itself contains "/" (e.g. "1/2/CD") still yields exactly two parts.
    if fields["pos"]:
        fields["pos"] = [
            PosTag(*token_pos_pair.rsplit("/", 1))
            for token_pos_pair in fields["pos"].split()]
    return DialogUtterance(**fields)

In [102]:
class CrfClassifier(object):
    """Dialogue-act sequence tagger backed by a pycrfsuite CRF.

    Terminology:
        a directory has dialogues (one per file)
            dialogues have utterances (one per line)
            utterances have a label, speaker, tokens and the text
    """

    def __init__(self, max_iters=50, l1_reg=1.0, l2_reg=1e-3):
        """Create the trainer.

        max_iters - value passed as the trainer's 'max_iterations'
        l1_reg    - coefficient for the L1 penalty ('c1')
        l2_reg    - coefficient for the L2 penalty ('c2')
        """
        self.trainer = crf.Trainer(verbose=False)
        self.trainer.set_params({
            'c1': l1_reg,   # coefficient for L1 penalty
            'c2': l2_reg,  # coefficient for L2 penalty
            'max_iterations': max_iters,  # stop earlier
            # include transitions that are possible, but not observed
            'feature.possible_transitions': True
        })

    def featurize(self, idx, dialog):
        """Return the list of feature strings for utterance `idx` of `dialog`.

        Features produced:
            BOD        - the utterance is the first one of the dialogue
            SPKRCNG    - the speaker differs from the previous utterance's
            TOK[i]=... - i-th token (1-based), when POS tags are available
            POS[i]=... - i-th POS tag (1-based), when POS tags are available
            ACTN[i]=.. - i-th whitespace token of the raw text with '<', '>'
                         and '.' removed, used only when there are no POS tags
        """
        utterance = dialog[idx]
        feats = []
        if idx == 0:
            feats.append("BOD")  # beginning of dialogue
        elif utterance.speaker != dialog[idx - 1].speaker:
            feats.append('SPKRCNG')  # speaker change

        if utterance.pos is not None:
            for i, pos in enumerate(utterance.pos, 1):
                feats.append("TOK[%d]=%s" % (i, pos.token))
                feats.append("POS[%d]=%s" % (i, pos.pos))
        else:
            # The text may be None as well (blank CSV fields are read as
            # None); treat that as an utterance with no tokens.
            text = utterance.text or ""
            for marker in ('>', '<', '.'):
                text = text.replace(marker, '')
            for i, tok in enumerate(text.split(), 1):
                feats.append("ACTN[%d]=%s" % (i, tok))
        return feats

    def transform(self, dialog):
        """Convert a dialogue into (X, Y): per-utterance features and act tags."""
        X, Y = [], []
        for idx, utterance in enumerate(dialog):
            Y.append(utterance.act_tag)
            X.append(self.featurize(idx, dialog))
        return X, Y

    def train(self, train_dir, model_path):
        """Train on every dialogue CSV in `train_dir`; save the model to `model_path`.

        Each dialogue file is appended to the trainer as one sequence.
        """
        for _, dialog in get_data(train_dir):
            X, Y = self.transform(dialog)
            self.trainer.append(X, Y)
        print("Training and saving model to %s" % model_path)
        self.trainer.train(model_path)
        print("Done")

    def load_model(self, model_file):
        """Open a trained model file; required before test/evaluate/predict."""
        self.tagger = crf.Tagger()
        self.tagger.open(model_file)

    def test(self, test_dir):
        """Yield (actual, predicted) tag pairs for every utterance in `test_dir`."""
        for _, dialog in get_data(test_dir):
            X, Y = self.transform(dialog)
            predY = self.tagger.tag(X)
            assert len(Y) == len(predY)
            for pair in zip(Y, predY):
                yield pair

    def evaluate(self, dev_dir):
        """Tag every dialogue in `dev_dir` and score against the gold tags.

        Returns (accuracy, confusion), where accuracy is the fraction of
        utterances whose predicted tag equals the actual tag (0.0 when there
        are no utterances) and confusion is a pandas DataFrame of counts with
        actual tags as columns and predicted tags as rows; pairs that never
        occurred are NaN.
        """
        print("Evaluating %s" % dev_dir)
        matrix = ddict(lambda: ddict(int))
        total, correct = 0, 0
        for actual, predicted in self.test(dev_dir):
            matrix[actual][predicted] += 1
            total += 1
            # Count matches directly rather than reading the DataFrame's
            # diagonal: a tag that occurs as an actual label but is never
            # predicted has a column but no row, so df[c][c] would fail.
            if actual == predicted:
                correct += 1
        df = pd.DataFrame(dict((tag, dict(row)) for tag, row in matrix.items()))
        # float() keeps this a true division on Python 2 as well.
        accuracy = float(correct) / total if total else 0.0
        return accuracy, df

    def predict(self, data_dir, out_file):
        """Tag every dialogue in `data_dir` and write the tags to `out_file`.

        For each dialogue the output holds a "Filename=<name>" line, one
        predicted tag per utterance, and a blank line.
        """
        with open(out_file, 'w') as out:
            for f_name, dialog in get_data(data_dir):
                # basename() copes with the platform's path separator.
                out.write("Filename=%s\n" % os.path.basename(f_name))
                X, _ = self.transform(dialog)
                predY = self.tagger.tag(X)
                assert len(predY) == len(X)
                out.write("\n".join(predY))
                out.write("\n\n")
        print("Output stored at %s" % out_file)

In [104]:
train_dir = "../data/train"
dev_dir = "../data/dev"
model_file = "crfmodel.data"

# Train the baseline tagger, reload the saved model, and tag the dev set.
clsf = CrfClassifier()
clsf.train(train_dir, model_file)
clsf.load_model(model_file)
clsf.predict(dev_dir, "output1.out")

# Bound to `confusion_df` so the result does not shadow the
# sklearn.metrics.confusion_matrix function imported at the top.
perf, confusion_df = clsf.evaluate(dev_dir)
print("Performance: %f" % perf)
confusion_df

Training and saving model to crfmodel.data
Done
Output stored at output1.out
Evaluating ../data/
Performance: 0.721460


Unnamed: 0,%,+,^2,^g,^h,^q,aa,aap_am,ad,ar,...,qrr,qw,qw^d,qy,qy^d,sd,sv,t1,t3,x
%,2670.0,77.0,12.0,,8.0,3.0,63.0,5.0,8.0,5.0,...,1.0,23.0,,27.0,9.0,466.0,228.0,3.0,1.0,5.0
+,99.0,3402.0,24.0,,1.0,11.0,20.0,,21.0,1.0,...,3.0,7.0,,18.0,17.0,516.0,254.0,1.0,1.0,35.0
^2,2.0,13.0,44.0,,,,4.0,1.0,1.0,,...,1.0,,,1.0,4.0,10.0,11.0,,,
^g,,,,11.0,,,,,,,...,,,,2.0,1.0,,,,,
^h,2.0,1.0,,,43.0,,1.0,,2.0,,...,,1.0,,,,8.0,1.0,5.0,,
^q,,4.0,,,2.0,6.0,,,6.0,,...,,,,,,18.0,13.0,,,
aa,80.0,21.0,1.0,1.0,1.0,,1194.0,5.0,,18.0,...,,,,1.0,1.0,68.0,80.0,,,
aap_am,,,,,,,2.0,,,,...,,,,,,,1.0,,,
ad,1.0,2.0,,,1.0,3.0,,,41.0,,...,,,,,2.0,22.0,10.0,,3.0,
ar,1.0,,,,,,2.0,,,8.0,...,,,,,,,1.0,,,


52671.0 38004.0 0.721535569858


# Advanced CRF

In [None]:
class AdvancedCRF(CrfClassifier):
    """CrfClassifier variant intended to add features from neighbouring utterances.

    The neighbour features are still placeholders, so featurize currently
    returns exactly the base-class feature list.
    """

    def featurize(self, idx, dialog):
        """Return the feature strings for utterance `idx` of `dialog`."""
        feats = super(AdvancedCRF, self).featurize(idx, dialog)
        # more features here
        if idx > 0:
            # TODO: add features from the previous utterance
            # TODO: add the last token of the previous utterance
            pass
        # A next utterance exists only when idx is not the last index.
        if idx < len(dialog) - 1:
            # TODO: add features from the next utterance
            pass
        # transform() stores this value as the utterance's feature list.
        return feats