In [1]:
import nltk
#nltk.download('punkt')
#import sklearn
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import apply_features
import string
import pickle	# this is for saving and loading your trained classifiers.
import numpy as np
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
##############################################################
#		If you need, you may import other packages etc.      #
##############################################################

In [2]:
def preprocess(filename, data_type):
    """Read one genre file and return its lines tokenized, filtered and stemmed.

    The file is read from ``463_A1_TASK1_data/data/<data_type>/<filename>``.
    Each line is tokenized with ``word_tokenize``, tokens that are not purely
    alphabetic are discarded, and the remaining tokens are stemmed with a
    Porter stemmer and re-joined with single spaces.

    Args:
        filename: Name of the data file, e.g. ``"sports_train.txt"``. The part
            before the first underscore is used as the label.
        data_type: Sub-directory name under the data folder (the callers in
            this file pass ``"train"`` or ``"test"``).

    Returns:
        A list of ``(preprocessed_line, label)`` tuples, one per input line.
    """
    filepath = "463_A1_TASK1_data/data/" + data_type + "/" + filename

    # Context manager guarantees the handle is closed even if reading fails.
    with open(filepath, 'r') as file:
        lines = file.read().splitlines()

    # Both of these are the same for every line, so build them once.
    label = filename.split("_")[0]
    stemmer = PorterStemmer()

    processed = []
    for line in lines:
        # Keep alphabetic tokens only (drops punctuation and numbers), then stem.
        tokens = [
            stemmer.stem(word)
            for word in word_tokenize(line)
            if word.isalpha()
        ]
        processed.append((' '.join(tokens), label))

    return processed

In [3]:
def create_training_megadoc():
    """Preprocess every training file and collect the results.

    Returns:
        A list with one entry per training file, in the order listed below;
        each entry is whatever ``preprocess(filename, "train")`` returns.
        Nothing is written to disk here.
    """
    training_documents = [
        "philosophy_train.txt",
        "sports_train.txt",
        "mystery_train.txt",
        "religion_train.txt",
        "science_train.txt",
        "romance_train.txt",
        "horror_train.txt",
        "science-fiction_train.txt",
    ]
    training_megadoc = [
        preprocess(document, "train") for document in training_documents
    ]
    return training_megadoc

In [4]:
def create_test_megadoc():
    """Preprocess every test file and collect the results.

    Returns:
        A list with one entry per test file, in the order listed below;
        each entry is whatever ``preprocess(filename, "test")`` returns.
    """
    test_documents = [
        "philosophy_test.txt",
        "sports_test.txt",
        "mystery_test.txt",
        "religion_test.txt",
        "science_test.txt",
        "romance_test.txt",
        "horror_test.txt",
        "science-fiction_test.txt",
    ]
    test_megadoc = [
        preprocess(document, "test") for document in test_documents
    ]
    return test_megadoc

In [5]:
def extract_features(megadoc):
    """Extract the TF-IDF vocabulary of each document in a megadoc.

    ``megadoc`` can be either the training megadoc (training phase) or the
    test megadoc (testing phase).

    A separate ``TfidfVectorizer`` (ASCII accent stripping, English stop words)
    is fitted on each document's lines. Only the learned feature names are
    kept; the TF-IDF weights themselves are not returned.

    Args:
        megadoc: Iterable of documents, where each document is a non-empty
            sequence of ``(text, label)`` pairs. The label of the first pair
            is used as the label for the whole document.

    Returns:
        A list of ``(feature_names, label)`` tuples, one per document, where
        ``feature_names`` is the value of ``get_feature_names_out()``.
    """
    all_features = []

    for doc in megadoc:
        texts = [entry[0] for entry in doc]
        label = doc[0][1]

        vectorizer = TfidfVectorizer(strip_accents='ascii', stop_words='english')
        # Only the vocabulary is needed, so fit without keeping the
        # transformed matrix (it was previously computed and discarded).
        vectorizer.fit(texts)

        all_features.append((vectorizer.get_feature_names_out(), label))

    return all_features

In [6]:
def train(classifier, training_set, model):
    """Train a classifier using the API that matches ``model``.

    Args:
        classifier: Either ``nltk.NaiveBayesClassifier`` (for ``'NB'``) or an
            estimator exposing ``fit`` such as ``SklearnClassifier(SVC())``
            (for ``'SVC'``).
        training_set: For ``'NB'``, passed as-is to ``classifier.train``.
            For ``'SVC'``, an indexable pair whose element 0 and element 1
            are passed as the two arguments of ``classifier.fit``.
        model: ``'NB'`` or ``'SVC'``.

    Returns:
        Whatever ``classifier.train(...)`` / ``classifier.fit(...)`` returns.

    Raises:
        ValueError: If ``model`` is neither ``'NB'`` nor ``'SVC'``. Previously
            this case fell through and silently returned ``None``.
    """
    if model == 'NB':
        return classifier.train(training_set)

    if model == 'SVC':
        return classifier.fit(training_set[0], training_set[1])

    raise ValueError("Unknown model %r; expected 'NB' or 'SVC'" % (model,))
    
    

In [7]:
def test(classifier, test_set, classify_type):
    if(classify_type == 'NB'):
        nb = classifier.classify(test_set)
        return nb
    
    if(classify_type == 'SVC'):
        svc = classifier.predict(test_set)
        return svc
    
   

In [11]:
def save_classifier(classifier, filename):
    """Pickle ``classifier`` to the file at ``filename``.

    ``filename`` should be a string ending with ``.pickle``. The file is
    opened in binary write mode, so an existing file is overwritten.
    Returns ``None``.
    """
    with open(filename, "wb") as sink:
        pickle.dump(classifier, sink)
	
	
def load_classifier(filename):
    """Load and return a pickled classifier from ``filename``.

    ``filename`` should be a string ending with ``.pickle``.

    Returns:
        The object stored in the pickle file.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load classifier files that you created yourself or fully trust.
    # The context manager closes the handle even if unpickling raises.
    with open(filename, "rb") as classifier_file:
        return pickle.load(classifier_file)




if __name__ == "__main__":
    # Driver: train a Naive Bayes classifier on the training megadoc and
    # print its prediction for each test document. The SVC path is only
    # partially wired up (see the commented-out calls below), so the SVC
    # prediction list stays empty.

    # You may add or delete global variables.
    training_set = []
    test_set = []
    dev_set = []

    ##### TRAINING
    training_set = create_training_megadoc()
    training = extract_features(training_set)

    nb_going_train = []
    svc_set = []
    svc_labels = []

    for feat in training:
        names, label = feat[0], feat[1]

        # NLTK's Naive Bayes expects a {feature_name: value} dict per sample;
        # every extracted feature name is marked as present.
        nb_feature_set = dict.fromkeys(names, True)
        nb_going_train.append((nb_feature_set, label))

        # Flat, parallel lists for the SVC path: one label per feature name.
        svc_set.extend(names)
        svc_labels.extend([label] * len(names))

    nb_classifier = train(nltk.NaiveBayesClassifier, nb_going_train, 'NB')

    svc_model = SVC()
    # SVC training is currently disabled:
    #svc_classifier = train(svc_model, (svc_set, svc_labels),'SVC')

    #### DEVELOPMENT
    # A development-set evaluation pass was sketched here but is disabled.

    ##### TESTING
    test_set = create_test_megadoc()
    testing = extract_features(test_set)

    nb_going_test = []
    svc_going_test = []

    for feat in testing:
        feature_set = dict.fromkeys(feat[0], True)
        svc_test_set = list(feat[0])

        nb_predicted = test(nb_classifier, feature_set, 'NB')
        nb_going_test.append(nb_predicted)

        # SVC prediction is currently disabled:
        #svc_predicted = test(svc_classifier, svc_test_set, 'SVC')
        #svc_going_test.append(svc_predicted)

    print("NAIVE BAYES PREDICTIONS:", nb_going_test)
    print("SVC PREDICTIONS", svc_going_test)
    ############## SVC APPROACH
    

NAIVE BAYES PREDICTIONS: ['philosophy', 'sports', 'mystery', 'religion', 'science', 'romance', 'horror', 'science-fiction']
SVC PREDICTIONS []
