# Assignment 3 - Sentiment Analysis
## Taha Salman (260721174)
***

### Q1

In [1]:
import string

def read_data(data_path, encoding='utf-8'):
    """
    Read a text file and return its content as a list of lines.

    data_path: path of the file to read.
    encoding: text encoding used to decode the file.
    Returns one string per line; line terminators are kept.
    """
    with open(data_path, mode='r', encoding=encoding) as source:
        lines = list(source)
    return lines

def pre_process_data(data):
    '''
    Tokenise and normalise raw data lines.

    Each non-blank line is expected to be the review text, a TAB, the class
    and a line terminator. HTML line-break tags (<br />) are removed, the
    text is split on spaces, punctuation is stripped from every word and
    every word is lowercased. Words that end up empty are dropped.

    data: list of raw lines.
    Returns a list with one entry per non-blank line. Each entry is a list
    of the form [word_1, ..., word_n, last_word, class]. The last word of
    the text always keeps the second-to-last slot (even when it is empty
    after cleaning) so the class is always the last element and the final
    word is always the one right before it.
    Raises IndexError when a non-blank line has no TAB in its last
    space-separated token.
    '''
    import re  # local import: only this function needs it

    # Remove the <br /> markup as a unit. Deleting the characters "b" and
    # "r" together with the punctuation would also strip them from ordinary
    # words (e.g. "bright" -> "ight").
    line_break_tag = re.compile(r"<br\s*/?>", re.IGNORECASE)
    translator = str.maketrans("", "", string.punctuation)

    def clean(word):
        # strip punctuation, then lowercase
        return word.translate(translator).lower()

    output = []
    for line in data:
        if not line.strip():
            # a blank line carries neither text nor a class
            continue

        word_list = line_break_tag.sub(" ", line).split(" ")
        final_word_list = [word for word in map(clean, word_list[:-1]) if word]

        # the last space-separated token holds the last word, a TAB and
        # the class; the word gets the same cleaning as all the others
        last_words = word_list[-1].split('\t')
        final_word_list.append(clean(last_words[0]))
        final_word_list.append(last_words[1].strip('\n'))

        output.append(final_word_list)

    return output

def build_vocab(data):
    """
    Count how often each word occurs in the pre-processed lines.

    data: list of lines, each a list whose last element is the class;
    every element before the class is counted as a word.
    Returns a list of (word, count) tuples sorted by decreasing count.
    Words with the same count stay in the order they were first seen.
    """
    counts = {}
    for tokens in data:
        for token in tokens[:-1]:
            counts[token] = counts.get(token, 0) + 1

    return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)


def save_vocab_file(vocab_list, saving_path, encoding='utf-8'):
    """
    Write the vocabulary to a tab-separated text file.

    vocab_list: list of (word, count) entries.
    saving_path: path of the file to write (overwritten if it exists).
    encoding: text encoding of the written file.
    Each entry becomes one line: word, TAB, 1-based position of the entry
    in vocab_list, TAB, count.
    """
    rows = [
        entry[0] + "\t" + str(position) + "\t" + str(entry[1]) + "\n"
        for position, entry in enumerate(vocab_list, start=1)
    ]

    with open(saving_path, 'w', encoding=encoding) as vocab_file:
        vocab_file.write("".join(rows))


def create_vocab_file(vocab_size,reading_path,saving_path):
    """
    Build the vocabulary of a raw data file and save it to disk.

    vocab_size: maximum number of words to keep (the most frequent ones).
    reading_path: path of the raw data file to read.
    saving_path: path of the vocabulary file to write.
    """
    data = read_data(reading_path)
    processed_data = pre_process_data(data)

    # Keep only the vocab_size most frequent words, as requested by the
    # caller; the argument must not be replaced by a hard-coded size.
    vocab_list = build_vocab(processed_data)[:vocab_size]
    save_vocab_file(vocab_list, saving_path)


def read_vocab(vocab_path):
    """
    Load a tab-separated vocabulary file into a dictionary.

    vocab_path: path of the vocabulary file (read as utf-8).
    Returns a dict mapping the first field of every line to its second
    field; both are kept as strings.
    """
    with open(vocab_path, "r", encoding="utf-8") as vocab_file:
        entries = vocab_file.readlines()

    vocab_dict = {}
    for entry in entries:
        fields = entry.split("\t")
        vocab_dict[fields[0]] = fields[1]
    return vocab_dict


def code_data(data, vocab):
    """
    Turn pre-processed lines into their coded text representation.

    data: list of lines, each a list of tokens with at least two elements.
    vocab: dict mapping a word to the string that replaces it.
    For every line, each token except the last two is replaced by its
    vocab entry followed by a space; tokens missing from vocab are dropped.
    The second-to-last token is then written unchanged, followed by a TAB,
    the last token and a newline.
    Returns all coded lines concatenated into one string.
    """
    pieces = []
    for tokens in data:
        pieces.extend(
            vocab[token] + " " for token in tokens[:-2] if token in vocab
        )
        pieces.append(tokens[-2] + "\t")
        pieces.append(tokens[-1] + "\n")

    return "".join(pieces)

def create_coded_file(reading_path, saving_path, vocab_path):
    """
    Code a raw data file with a saved vocabulary and write the result.

    reading_path: path of the raw data file.
    saving_path: path of the coded file to write (utf-8).
    vocab_path: path of the vocabulary file used for coding.
    """
    tokenised = pre_process_data(read_data(reading_path))
    coded = code_data(tokenised, read_vocab(vocab_path))

    with open(saving_path, "w", encoding="utf-8") as coded_file:
        coded_file.write(coded)


def prepare_data():
    """
    Create the vocabulary file and the coded test/train/valid files for
    the IMDB and yelp datasets.

    Raw files are read from Data/Raw and results are written to
    Data/Processed. The vocabulary is built from the training file.
    A message is printed after each file is written.
    """
    for dataset in ("IMDB", "yelp"):
        vocab_file = 'Data/Processed/{}-vocab.txt'.format(dataset)

        create_vocab_file(
            vocab_size=10000,
            reading_path='Data/Raw/{}-train.txt'.format(dataset),
            saving_path=vocab_file,
        )
        print("{}-vocab.txt file has been created!".format(dataset))

        for dataclass in ("test", "train", "valid"):
            raw_file = "Data/Raw/{}-{}.txt".format(dataset, dataclass)
            coded_file = "Data/Processed/{}-{}.txt".format(dataset, dataclass)
            create_coded_file(
                reading_path=raw_file,
                saving_path=coded_file,
                vocab_path=vocab_file,
            )
            print("{}-{}.txt file has ben created!".format(dataset, dataclass))


# Build the vocabulary and coded files only when this code runs as the
# main module, not when it is imported.
if __name__=="__main__":
    prepare_data()


IMDB-vocab.txt file has been created!
IMDB-test.txt file has ben created!
IMDB-train.txt file has ben created!
IMDB-valid.txt file has ben created!
yelp-vocab.txt file has been created!
yelp-test.txt file has ben created!
yelp-train.txt file has ben created!
yelp-valid.txt file has ben created!


***
### Q2

#### Part (a)

In [3]:
import random
from sklearn.metrics import f1_score

def read_data(data_path, encoding="utf-8"):
    """
    Return the lines of the text file at data_path.

    data_path: path of the file to read.
    encoding: text encoding used to decode the file.
    Line terminators are kept on the returned strings.
    """
    with open(data_path, "r", encoding=encoding) as data_file:
        return data_file.readlines()

class Classifier():
    """
    Base class that stores the class labels a classifier can predict.

    The labels are kept in self.classes, either as given by the caller or
    as collected from the training data file.
    """

    def __init__(self,training_data_path,classes=None):
        """
        training_data_path: path of the training data file; only read when
            no labels are supplied.
        classes: optional collection of class labels. When omitted (or
            empty) the labels are collected from the training data.
        """
        # None is used as the "not supplied" default so that no mutable
        # list object is shared between calls.
        if classes:
            self.classes = classes
        else:
            self.classes = self.set_classes(training_data_path)

    def set_classes(self,data_path):
        """
        Collect the distinct class labels found in a data file.

        The label of a line is the text after the TAB in its last
        space-separated token, without the trailing newline.
        Returns the labels in the order they are first encountered.
        """
        classes = []
        seen = set()  # constant-time membership test; the list keeps order
        for line in read_data(data_path):
            cl = line.split(" ")[-1].split("\t")[1].strip('\n')
            if cl not in seen:
                seen.add(cl)
                classes.append(cl)
        return classes


class RandomClassifier(Classifier):
    """Classifier that picks one of the known classes at random."""

    def classify(self, data_path):
        """
        Predict a class for every line of the file at data_path.

        The content of the lines is not inspected; one random prediction
        is made per line. Returns the list of predictions.
        """
        return [self.predict_class() for _ in read_data(data_path)]

    def predict_class(self):
        """Return a class drawn at random from self.classes."""
        last_index = len(self.classes) - 1
        return self.classes[random.randint(0, last_index)]



class MajorityClassClassifier(Classifier):
    """
    Classifier that always predicts the most frequent class of the
    training data.
    """

    def __init__(self, training_data_path, classes=None):
        """
        training_data_path: path of the training data file.
        classes: optional collection of class labels, forwarded to the
            base class; when omitted the base class collects them from
            the training data.
        """
        # None (not a shared mutable list) marks "labels not supplied";
        # the base class treats any falsy value as "collect from file".
        super().__init__(training_data_path, classes)
        self.class_frequencies = self.set_class_frequencies(training_data_path)

    def classify(self,data_path):
        """
        Predict a class for every line of the file at data_path.

        The content of the lines is not inspected; the same prediction is
        returned once per line.
        """
        return [self.predict_class() for _ in read_data(data_path)]

    def set_class_frequencies(self,data_path):
        """
        Count how often each class label occurs in a data file.

        The label of a line is the text after the TAB in its last
        space-separated token, without the trailing newline.
        Returns a list of (label, count) tuples sorted by decreasing
        count; labels with equal counts keep first-seen order.
        """
        class_dict = {}
        for line in read_data(data_path):
            cl = line.split(" ")[-1].split("\t")[1].strip('\n')
            class_dict[cl] = class_dict.get(cl, 0) + 1

        return sorted(class_dict.items(), key=lambda pair: pair[1], reverse=True)

    def predict_class(self):
        """Return the label with the highest training frequency."""
        return self.class_frequencies[0][0]


class PerformanceTester():
    """Scores a list of predictions against the labels stored in a data file."""

    def __init__(self, predictions):
        """predictions: predicted labels, one per line of the data file."""
        self.predictions = predictions

    def get_F1_score(self, data_path):
        """
        Return the micro-averaged F1 score of the stored predictions
        against the labels read from the file at data_path.
        """
        actual = PerformanceTester.get_actual_results(data_path)
        return f1_score(
            y_true=actual,
            y_pred=self.predictions,
            average="micro",
        )

    @staticmethod
    def get_actual_results(data_path):
        """
        Read the label of every line of a data file.

        The label of a line is the text after the TAB in its last
        space-separated token, without the trailing newline.
        Returns the labels in file order.
        """
        return [
            line.split(" ")[-1].split("\t")[1].strip('\n')
            for line in read_data(data_path)
        ]


def run_random_classifier():
    """
    Run the random classifier on the yelp test set and print its F1 score.

    The classes are taken from the processed yelp training file; the
    known classes and the resulting score are printed.
    """
    train_path = "Data/Processed/yelp-train.txt"
    test_path = "Data/Processed/yelp-test.txt"

    print("Initializing Random Classifier with training data from the path {}".format(train_path))
    classifier = RandomClassifier(train_path)
    labels = classifier.classes
    print("There are {} possible classes:".format(len(labels)))
    for label in labels:
        print("Class {}".format(label))

    print("Making Predictions on the data from the path {}".format(test_path))
    tester = PerformanceTester(classifier.classify(test_path))
    score = tester.get_F1_score(test_path)
    print("The F1 score for this random classifier is {}".format(score))


def run_majority_class_classifier():
    """
    Run the majority class classifier on the yelp test set and print its
    F1 score.

    The classes and their frequencies are taken from the processed yelp
    training file; the known classes and the resulting score are printed.
    """
    train_path = "Data/Processed/yelp-train.txt"
    test_path = "Data/Processed/yelp-test.txt"

    print("Initializing Majority Class Classifier with training data from the path {}".format(train_path))
    classifier = MajorityClassClassifier(train_path)
    labels = classifier.classes
    print("There are {} possible classes:".format(len(labels)))
    for label in labels:
        print("Class {}".format(label))

    print("Making Predictions on the data from the path {}".format(test_path))
    tester = PerformanceTester(classifier.classify(test_path))
    score = tester.get_F1_score(test_path)
    print("The F1 score for this Majority Class Classifier is {}".format(score))


# Evaluate both baseline classifiers when this code runs as the main
# module; the empty print separates the two reports with a blank line.
if __name__ == "__main__":
    run_random_classifier()
    print()
    run_majority_class_classifier()

Initializing Random Classifier with training data from the path Data/Processed/yelp-train.txt
There are 5 possible classes:
Class 5
Class 3
Class 2
Class 1
Class 4
Making Predictions on the data from the path Data/Processed/yelp-test.txt
The F1 score for this random classifier is 0.2105

Initializing Majority Class Classifier with training data from the path Data/Processed/yelp-train.txt
There are 5 possible classes:
Class 5
Class 3
Class 2
Class 1
Class 4
Making Predictions on the data from the path Data/Processed/yelp-test.txt
The F1 score for this Majority Class Classifier is 0.351
