## Loyal assignment


## Environment Setup

In [1]:
import numpy as np
import time
import argparse
import json
import random
from nltk.tokenize import regexp_tokenize
import numpy as np
from nltk.corpus import stopwords

ModuleNotFoundError: No module named 'nltk'

### Define utility functions to tokenize text and extract unigram, bigram, and customized features.

In [None]:
# Default verbose-mode pattern for tokenization; substitute your own if needed.
# Alternatives, in priority order: abbreviations (e.g. U.S.A.), currency /
# numbers / percentages, words with internal hyphens or apostrophes, ellipsis,
# and single punctuation marks.
# In the punctuation class the hyphen is placed last so it is a literal "-";
# written as ":-_" it formed a range from ":" to "_" that matched characters
# such as "@", "=", "<", ">" and never matched a lone "-".
default_pattern =  r"""(?x)
                        (?:[A-Z]\.)+
                        |\$?\d+(?:\.\d+)?%?
                        |\w+(?:[-']\w+)*
                        |\.\.\.
                        |(?:[.,;"'?():_`-])
                    """

def tokenize(text, pattern=default_pattern):
    """Lowercase a sentence and split it into tokens with a regular expression.

    The text is lowercased before the pattern is applied, so every returned
    token is lowercase and any uppercase-only part of the pattern cannot match.

    Arguments:
        text {str} -- sentence to be tokenized, such as "I love NLP"

    Keyword Arguments:
        pattern {str} -- regular expression handed to regexp_tokenize (default: {default_pattern})

    Returns:
        list -- list of lowercase tokens, such as ['i', 'love', 'nlp']
    """
    lowered = text.lower()
    tokens = regexp_tokenize(lowered, pattern)
    return tokens


class FeatureExtractor(object):
    """Common interface for the feature extractors in this file.

    Every method here is a placeholder that does nothing and returns None;
    subclasses override them with real behaviour.
    """

    def __init__(self):
        """Create an extractor; the base class keeps no state."""
        return None

    def fit(self, text_set):
        """Placeholder for learning from a list of tokenized sentences."""
        return None

    def transform(self, text):
        """Placeholder for converting one tokenized sentence to a vector."""
        return None

    def transform_list(self, text_set):
        """Placeholder for converting many tokenized sentences to vectors."""
        return None



class UnigramFeature(FeatureExtractor):
    """Unigram (bag-of-words) count feature extractor.

    Tokens are lowercased before lookup, so matching is case-insensitive.
    """

    def __init__(self):
        # Maps each lowercased token to its column index in the feature vector.
        self.unigram = {}

    def fit(self, text_set: list):
        """Build (or extend) the vocabulary from tokenized sentences.

        Calling fit again keeps the existing columns and appends new tokens
        after them, so previously assigned indices never collide.

        Arguments:
            text_set {list} -- list of tokenized sentences, such as [["I", "love", "nlp"], ["I", "like", "python"]]
        """
        for sentence in text_set:
            for token in sentence:
                # len(self.unigram) is always the next unused column index.
                self.unigram.setdefault(token.lower(), len(self.unigram))

    def transform(self, text: list):
        """Transform one tokenized sentence into a count vector.

        Tokens that were not seen during fit are ignored.

        Arguments:
            text {list} -- a tokenized sentence (list of words), such as ["I", "love", "nlp"]

        Returns:
            array -- unigram count array of length len(self.unigram), such as array([1,1,1,0,0,0])
        """
        feature = np.zeros(len(self.unigram))
        for token in text:
            column = self.unigram.get(token.lower())
            if column is not None:
                feature[column] += 1

        return feature

    def transform_list(self, text_set: list):
        """Transform a list of tokenized sentences into a 2-D count matrix.

        Arguments:
            text_set {list} -- a list of tokenized sentences, such as [["I", "love", "nlp"], ["I", "like", "python"]]

        Returns:
            array -- array of shape (len(text_set), len(self.unigram)), such as array([[1,1,1,0,0], [1,0,0,1,1]])
        """
        rows = [self.transform(sentence) for sentence in text_set]
        # The reshape is a no-op for non-empty input and keeps an empty input
        # two-dimensional: (0, n_features) instead of (0,).
        return np.array(rows).reshape(len(rows), len(self.unigram))


class BigramFeature(FeatureExtractor):
    """Bigram count feature extractor analogous to the unigram one.

    A bigram is keyed as the two lowercased tokens joined by "_".
    """

    def __init__(self):
        # Maps each "first_second" bigram key to its column index.
        self.bigram = {}

    @staticmethod
    def _bigram_keys(tokens):
        """Return the "a_b" keys for every pair of adjacent tokens."""
        lowered = [token.lower() for token in tokens]
        return [first + "_" + second for first, second in zip(lowered, lowered[1:])]

    def fit(self, text_set):
        """Build (or extend) the bigram vocabulary from tokenized sentences.

        Calling fit again keeps the existing columns and appends new bigrams
        after them, so previously assigned indices never collide.

        Arguments:
            text_set {list} -- list of tokenized sentences
        """
        for sentence in text_set:
            for key in self._bigram_keys(sentence):
                # len(self.bigram) is always the next unused column index.
                self.bigram.setdefault(key, len(self.bigram))

    def transform(self, text):
        """Transform one tokenized sentence into a bigram count vector.

        Bigrams that were not seen during fit are ignored.

        Arguments:
            text {list} -- a tokenized sentence (list of words)

        Returns:
            array -- count array of length len(self.bigram)
        """
        feature = np.zeros(len(self.bigram))
        for key in self._bigram_keys(text):
            column = self.bigram.get(key)
            if column is not None:
                feature[column] += 1

        return feature

    def transform_list(self, text_set):
        """Transform a list of tokenized sentences into a 2-D count matrix.

        Arguments:
            text_set {list} -- a list of tokenized sentences

        Returns:
            array -- array of shape (len(text_set), len(self.bigram))
        """
        rows = [self.transform(sentence) for sentence in text_set]
        # The reshape keeps an empty input two-dimensional: (0, n_features).
        return np.array(rows).reshape(len(rows), len(self.bigram))

class CustomFeature(FeatureExtractor):
    """Customized extractor: combined unigram, bigram and trigram counts.

    English stop words are dropped from the unigram features only; bigrams
    and trigrams keep them. N-grams are keyed as the lowercased tokens joined
    by "_". (This is a count-based extractor, not TF-IDF.)
    """

    # N-gram sizes, in the order their columns are allocated during fit.
    _NGRAM_SIZES = (1, 2, 3)

    def __init__(self):
        # Maps each n-gram key to its column index in the feature vector.
        self.custom = {}

    @staticmethod
    def _ngram_keys(tokens, n):
        """Return the "_"-joined keys of every run of n adjacent tokens."""
        lowered = [token.lower() for token in tokens]
        return ["_".join(lowered[i:i + n]) for i in range(len(lowered) - n + 1)]

    def fit(self, text_set):
        """Build (or extend) the n-gram vocabulary from tokenized sentences.

        Columns are allocated for all unigrams first, then all bigrams, then
        all trigrams. Calling fit again keeps the existing columns and appends
        new n-grams after them, so assigned indices never collide.

        Arguments:
            text_set {list} -- list of tokenized sentences
        """
        # Requires the nltk "stopwords" corpus to be available.
        stop_words = set(stopwords.words('english'))

        for n in self._NGRAM_SIZES:
            for sentence in text_set:
                for key in self._ngram_keys(sentence, n):
                    # Stop words are filtered out of the unigram features only.
                    if n == 1 and key in stop_words:
                        continue
                    self.custom.setdefault(key, len(self.custom))

    def transform(self, text):
        """Transform one tokenized sentence into an n-gram count vector.

        N-grams that were not registered during fit are ignored.

        Arguments:
            text {list} -- a tokenized sentence (list of words)

        Returns:
            array -- count array of length len(self.custom)
        """
        feature = np.zeros(len(self.custom))
        for n in self._NGRAM_SIZES:
            for key in self._ngram_keys(text, n):
                column = self.custom.get(key)
                if column is not None:
                    feature[column] += 1

        return feature

    def transform_list(self, text_set):
        """Transform a list of tokenized sentences into a 2-D count matrix.

        Arguments:
            text_set {list} -- a list of tokenized sentences

        Returns:
            array -- array of shape (len(text_set), len(self.custom))
        """
        rows = [self.transform(sentence) for sentence in text_set]
        # The reshape keeps an empty input two-dimensional: (0, n_features).
        return np.array(rows).reshape(len(rows), len(self.custom))

### Naive Bayes classification method, including the data-loading helper functions.

In [None]:
# Module-level list of the labels currently selected for the experiment.
# random_select_labels() overwrites it with a random sample; the commented-out
# list below is a manually chosen alternative of 20 labels.
choosen = []
# choosen = ["direct_deposit","carry_on","whisper_mode","text","recipe",
#            "smart_home","who_do_you_work_for","rewards_balance","restaurant_reservation","travel_notification",
#            "update_playlist","change_volume","routing","mpg","bill_balance",
#            "do_you_have_pets","cook_time","what_song","new_card","todo_list_update"]

def random_select_labels(Y, k=20):
    """Randomly pick up to k distinct labels and store them in the global ``choosen``.

    Arguments:
        Y {array-like} -- labels, possibly with repeats

    Keyword Arguments:
        k {int} -- number of distinct labels to sample (default: {20}); when
            fewer than k distinct labels exist, all of them are selected
            instead of raising ValueError

    Returns:
        list -- the selected labels (the same list that is stored in ``choosen``)
    """
    global choosen
    unique_labels = np.unique(Y).tolist()
    # random.sample raises if asked for more items than the population holds.
    choosen = random.sample(unique_labels, min(k, len(unique_labels)))
    return choosen

def accuracy(pred, labels):
    """Print and return the fraction of predictions that equal their labels.

    Arguments:
        pred {array-like} -- predicted labels
        labels {array-like} -- true labels, aligned with pred

    Returns:
        float -- correct / len(pred), or 0.0 when pred is empty
    """
    total = len(pred)
    correct = int((np.array(pred) == np.array(labels)).sum())
    # Guard the empty case so it reports 0.0 rather than dividing by zero.
    score = correct / total if total else 0.0
    print("Accuracy: %i / %i = %.4f " %(correct, total, score))
    return score


def read_data(path):
    """Load data_full.json from a directory and keep only randomly selected labels.

    The label of each row is read from column 1. random_select_labels() is
    called on the training labels and the resulting global ``choosen`` list is
    used to filter both splits.

    Arguments:
        path {str} -- directory containing 'data_full.json' (a trailing slash is optional)

    Returns:
        tuple -- (train_frame, test_frame) arrays restricted to the selected labels
    """
    import os

    # os.path.join works whether or not path ends with a separator.
    with open(os.path.join(path, 'data_full.json'), encoding='utf-8') as f:
        data = json.load(f)
    train_frame = np.array(data['train'])
    train_label = train_frame[:, 1]

    # Select the random labels; this updates the module-level `choosen`.
    random_select_labels(train_label)
    print("Select labels:{}".format(choosen))

    # Filter both splits down to the selected labels.
    selected = np.array(choosen)
    train_frame = train_frame[np.isin(train_label, selected)]
    test_frame = np.array(data['test'])
    test_label = test_frame[:, 1]
    test_frame = test_frame[np.isin(test_label, selected)]

    return train_frame, test_frame


class NaiveBayesClassifier:
    """Multinomial Naive Bayes classifier with add-one smoothing.

    The set of classes is taken from the labels passed to fit, so the model
    does not depend on any module-level state.
    """

    def __init__(self):
        # Distinct labels seen in fit (sorted); row i of the arrays below
        # belongs to classes_[i].
        self.classes_ = np.array([])
        # Per-class smoothed feature probabilities, shape (n_classes, n_features).
        self.features_prob = []
        # Per-class prior probabilities, shape (n_classes,).
        self.label_prob = []

    def fit(self, X, Y):
        """Estimate class priors and smoothed per-class feature probabilities.

        Any previously fitted state is replaced, so fit may be called again.

        Arguments:
            X {array} -- count feature matrix of shape (n_samples, n_features)
            Y {array} -- labels of length n_samples
        """
        X = np.asarray(X, dtype=float)
        Y = np.asarray(Y)
        classes = np.unique(Y)

        features_prob = np.empty((len(classes), X.shape[1]))
        label_prob = np.empty(len(classes))
        for idx, label in enumerate(classes):
            mask = Y == label
            # Add-one smoothing keeps every probability strictly positive.
            counts = X[mask].sum(axis=0) + 1.0
            features_prob[idx] = counts / counts.sum()
            label_prob[idx] = mask.sum() / len(Y)

        self.classes_ = classes
        self.features_prob = features_prob
        self.label_prob = label_prob

    def predict(self, X):
        """Predict the most probable label for every row of X.

        Scores are computed in log space as log(prior) + counts . log(prob),
        which equals log(prior * prod(prob ** count)) without underflowing
        for large counts.

        Arguments:
            X {array} -- count feature matrix of shape (n_samples, n_features)

        Returns:
            array -- predicted labels, one per row of X

        Raises:
            ValueError -- if the classifier has not been fitted
        """
        if len(self.label_prob) == 0:
            raise ValueError("fit must be called before predict")

        X = np.asarray(X, dtype=float)
        if X.shape[0] == 0:
            return np.array([])

        log_prior = np.log(self.label_prob)
        log_likelihood = np.log(self.features_prob)
        scores = log_prior + X @ log_likelihood.T
        return self.classes_[np.argmax(scores, axis=1)]

In [None]:
def run(feature="unigram", path='./data/'):
    """Train and evaluate the Naive Bayes classifier with the chosen features.

    Prints the chosen feature, the train and test accuracy, and the elapsed
    time for training plus both prediction passes.

    Keyword Arguments:
        feature {str} -- one of 'unigram', 'bigram', 'customized' (default: {"unigram"})
        path {str} -- data directory handed to read_data (default: {'./data/'})

    Raises:
        ValueError -- if feature is not one of the supported names
    """
    print("Chosen feature is: {}".format(feature))

    # Validate the feature name before loading any data so a typo fails fast.
    extractors = {
        "unigram": UnigramFeature,
        "bigram": BigramFeature,
        "customized": CustomFeature,
    }
    if feature not in extractors:
        raise ValueError("Pass unigram, bigram or customized to --feature")
    feat_extractor = extractors[feature]()

    train_frame, test_frame = read_data(path)

    # Tokenize the training text (column 0) and learn the vocabulary from it.
    train_tokens = [tokenize(row[0]) for row in train_frame]
    feat_extractor.fit(train_tokens)

    # Form the train set; labels are in column 1.
    X_train = feat_extractor.transform_list(train_tokens)
    Y_train = train_frame[:, 1]

    # Form the test set with the vocabulary learned from the training data.
    test_tokens = [tokenize(row[0]) for row in test_frame]
    X_test = feat_extractor.transform_list(test_tokens)
    Y_test = test_frame[:, 1]

    model = NaiveBayesClassifier()

    start_time = time.perf_counter()
    model.fit(X_train, Y_train)
    print("===== Train Accuracy =====")
    accuracy(model.predict(X_train), Y_train)

    print("===== Test Accuracy =====")
    accuracy(model.predict(X_test), Y_test)

    print("Time for training and test: %.2f seconds" % (time.perf_counter() - start_time))



# Run the full experiment with unigram features on the data in ./data/.
run(feature="unigram", path="./data/")