In [117]:
import sys
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [162]:
def read_data():
    """Load the stop-word list and the train/test sentences with their labels.

    Reads five text files from the ``data/`` directory, resolved relative to
    the current working directory. Every file is read line by line with the
    newline character removed from each line.

    Label files are decoded line by line: a line that is exactly ``'1'``
    becomes ``'future'``; any other line becomes ``'saying'``.

    Returns:
        tuple: ``(stop_words, train_data, train_label, test_data, test_label)``,
        each a list of strings in file order.

    Raises:
        OSError: if any of the five files cannot be opened
            (e.g. ``FileNotFoundError``).
    """
    def _read_lines(path):
        # The context manager closes the handle even if reading fails, so no
        # file descriptor is leaked.
        with open(path) as handle:
            return [line.replace('\n', '') for line in handle]

    def _decode_labels(raw_labels):
        # Only an exact '1' maps to 'future'; everything else is 'saying'.
        return ['future' if raw == '1' else 'saying' for raw in raw_labels]

    stop_words = _read_lines('data/stoplist.txt')
    train_data = _read_lines('data/traindata.txt')
    train_label = _decode_labels(_read_lines('data/trainlabels.txt'))
    test_data = _read_lines('data/testdata.txt')
    test_label = _decode_labels(_read_lines('data/testlabels.txt'))

    return (stop_words, train_data, train_label, test_data, test_label)

In [120]:
def preprocess(stop_words, train_data):
    """Build the sorted vocabulary of the training sentences.

    Each sentence is stripped of newline characters and split on single
    spaces. A token enters the vocabulary when it is non-empty and is not a
    stop word; duplicates are collapsed.

    Args:
        stop_words: iterable of words to exclude.
        train_data: iterable of sentence strings.

    Returns:
        list: the unique remaining tokens in ascending sorted order.
    """
    # Sets give constant-time membership tests; the original list scans made
    # this quadratic in the vocabulary size.
    excluded = set(stop_words)
    unique_words = set()

    for sentence in train_data:
        for token in sentence.replace('\n', '').split(' '):
            # Consecutive spaces yield empty tokens, which are skipped.
            if token and token not in excluded:
                unique_words.add(token)

    return sorted(unique_words)

In [166]:
def _binary_bag_of_words(sentences, word_to_column, n_columns):
    """Return a (len(sentences), n_columns) 0/1 matrix of word presence."""
    matrix = np.zeros((len(sentences), n_columns))
    for row, sentence in enumerate(sentences):
        for token in sentence.replace('\n', '').split(' '):
            column = word_to_column.get(token)
            # Tokens outside the vocabulary are ignored.
            if column is not None:
                matrix[row][column] = 1
    return matrix


def convert_to_feature(vocabulary, train_data, train_label, test_data, test_label):
    """Encode sentences as binary bag-of-words frames and wrap the labels.

    Every sentence becomes one row with one column per vocabulary word; a cell
    is 1.0 when the word occurs in the sentence (split on single spaces after
    newline removal) and 0.0 otherwise.

    Args:
        vocabulary: list of words; defines the feature columns and their order.
        train_data: list of training sentence strings.
        train_label: list of training labels, one per training sentence.
        test_data: list of test sentence strings.
        test_label: list of test labels, one per test sentence.

    Returns:
        tuple: ``(train_x, train_y, test_x, test_y)`` where the ``*_x`` values
        are DataFrames with ``vocabulary`` as columns and the ``*_y`` values
        are single-column DataFrames named ``'label'``.
    """
    # One dictionary lookup per token replaces the two linear scans
    # (`in` and `.index`) over the vocabulary list. `setdefault` keeps the
    # first position of a repeated word, matching `list.index`.
    word_to_column = {}
    for position, word in enumerate(vocabulary):
        word_to_column.setdefault(word, position)

    n_columns = len(vocabulary)
    train_matrix = _binary_bag_of_words(train_data, word_to_column, n_columns)
    test_matrix = _binary_bag_of_words(test_data, word_to_column, n_columns)

    train_x = pd.DataFrame(train_matrix, columns = vocabulary)
    test_x = pd.DataFrame(test_matrix, columns = vocabulary)

    train_y = pd.DataFrame(train_label, columns = ['label'])
    test_y = pd.DataFrame(test_label, columns = ['label'])

    return (train_x, train_y, test_x, test_y)

In [172]:
class NaiveBayes():
    """Frequency-table estimator over discrete feature columns.

    ``fit`` tabulates label frequencies; ``score`` and ``predict`` are
    placeholders that currently do nothing.

    NOTE(review): ``attribute_estimates`` holds the share of each label among
    the rows that have a given attribute value, i.e. P(label | value). A
    textbook Naive Bayes uses the likelihood P(value | label) instead, and no
    smoothing is applied here (a label never seen with a value is simply
    absent from the inner dict). Confirm this is intended before implementing
    ``predict``.
    """

    def __init__(self):
        # attribute name -> {attribute value -> {label -> fraction of the rows
        # with that value that carry the label}}
        self.attribute_estimates = {}
        # label -> fraction of all training rows that carry the label
        self.class_estimates = {}

    def fit(self, x, y):
        """Tabulate label frequencies overall and per attribute value.

        Any estimates from an earlier ``fit`` call are discarded first.

        Args:
            x: DataFrame of features, one column per attribute.
            y: DataFrame of labels. Labels are read from the flattened values
                of ``y``, so this presumably expects exactly one label column
                -- TODO confirm.

        Returns:
            None. Results are stored in ``class_estimates`` and
            ``attribute_estimates``.
        """
        # Start from empty tables so a refit cannot keep stale classes or
        # attributes from a previous training set.
        self.attribute_estimates = {}
        self.class_estimates = {}

        n_rows = y.shape[0]
        for label_column in y.columns.values:
            classes, class_counts = np.unique(y[label_column], return_counts = True)
            for cls, n_cls in zip(classes, class_counts):
                self.class_estimates[cls] = n_cls / n_rows

        # Loop-invariant: flattened label array, indexed by row position.
        flat_labels = y.values.ravel()

        for attribute in x.columns.values:
            column = x[attribute]
            per_value = {}

            for v in np.unique(column):
                # Row positions where this attribute takes the value v.
                positions = np.flatnonzero((column == v).values)
                labels, label_counts = np.unique(flat_labels[positions], return_counts = True)
                total = label_counts.sum()
                per_value[v] = {
                    label: n_label / total
                    for label, n_label in zip(labels, label_counts)
                }

            self.attribute_estimates[attribute] = per_value

    def score(self, x, y):
        """Placeholder: not implemented yet, returns None."""
        pass

    def predict(self):
        """Placeholder: not implemented yet, returns None."""
        pass

In [174]:
def main():
    """Run the pipeline end to end: load, build vocabulary, encode, fit.

    Nothing is returned or printed; the fitted classifier is local to this
    function.
    """
    # Load stop words plus the raw train/test sentences and labels.
    stop_words, train_data, train_label, test_data, test_label = read_data()

    # Vocabulary comes from the training sentences only.
    vocabulary = preprocess(stop_words, train_data)

    # Encode both splits; only the training frames are needed for fitting.
    train_x, train_y, _test_x, _test_y = convert_to_feature(
        vocabulary, train_data, train_label, test_data, test_label
    )

    # Train the classifier on the encoded training split.
    classifier = NaiveBayes()
    classifier.fit(train_x, train_y)
    
    

In [175]:
# Entry point: run the pipeline only when this module is executed directly
# (as a script or notebook cell), not when it is imported.
if __name__ == '__main__':
    main()

{'ability': {0.0: {'future': 0.4748427672955975, 'saying': 0.5251572327044025}, 1.0: {'future': 0.25, 'saying': 0.75}}, 'absent': {0.0: {'future': 0.470404984423676, 'saying': 0.5295950155763239}, 1.0: {'future': 1.0}}, 'accept': {0.0: {'future': 0.471875, 'saying': 0.528125}, 1.0: {'future': 0.5, 'saying': 0.5}}, 'accomplished': {0.0: {'future': 0.475, 'saying': 0.525}, 1.0: {'saying': 1.0}}, 'achieve': {0.0: {'future': 0.470404984423676, 'saying': 0.5295950155763239}, 1.0: {'future': 1.0}}, 'act': {0.0: {'future': 0.4735202492211838, 'saying': 0.5264797507788161}, 1.0: {'saying': 1.0}}, 'action': {0.0: {'future': 0.4735202492211838, 'saying': 0.5264797507788161}, 1.0: {'saying': 1.0}}, 'active': {0.0: {'future': 0.4735202492211838, 'saying': 0.5264797507788161}, 1.0: {'saying': 1.0}}, 'added': {0.0: {'future': 0.4735202492211838, 'saying': 0.5264797507788161}, 1.0: {'saying': 1.0}}, 'advance': {0.0: {'future': 0.46875, 'saying': 0.53125}, 1.0: {'future': 1.0}}, 'advanced': {0.0: {'fu