# Lab 3: Text Classification

Text classification is the process of sorting text documents into a set of predefined categories.

In this lab we'll train machine learning models to classify the sentiment of tweets as **positive, negative, or neutral**.


In [None]:
import collections
from collections import Counter
import re
import string
from scipy import sparse 
import sklearn
from sklearn import svm, ensemble, naive_bayes, linear_model
import numpy as np
import Stemmer
import re

In [None]:
# Load the English stopword list.
# The file 'datasets/stopwords_en.txt' contains a list of stopwords - one per line.
with open("datasets/stopwords_en.txt") as f:
    # Stored as a set so that membership tests during token filtering are cheap
    enStopWords = set(f.read().splitlines())

# Initialize the Snowball stemmer for English
enStemmer = Stemmer.Stemmer('english')

def preprocess_line_en(line: str) -> list[str]:
    """Tokenize a line of English text into lower-cased word tokens.

    The text is lower-cased and every maximal run of word characters
    (letters, digits and underscore) becomes one token. Punctuation and
    whitespace act purely as separators, so they never appear in the
    output and no empty tokens are produced.

    Stopword removal and stemming are initially disabled; uncomment the
    marked line in the body to enable them.

    Args:
        line: The raw text to tokenize.

    Returns:
        The tokens in their original order; an empty list if the text
        contains no word characters.
    """
    # Match the tokens themselves rather than splitting on the separators:
    # splitting yields empty strings for leading, trailing or adjacent
    # punctuation, which would end up in the vocabulary as a bogus "" term.
    tokens = re.findall(r"\w+", line.lower())

    # Remove stop words and apply the stemmer
    # Initially disable stopping and stemming
    # tokens = [enStemmer.stemWord(x) for x in tokens if x not in enStopWords]

    # Return the tokens
    return tokens

In [None]:
def preprocess_dataset(lines):
    """Parse tab-separated tweet records and tokenize their text.

    Every non-blank line must have the form
    ``tweet_id<TAB>category<TAB>tweet``. Blank lines are ignored. No
    header row is skipped here, so callers should remove it beforehand.

    Args:
        lines: Iterable of raw record strings.

    Returns:
        A tuple ``(documents, categories, vocab)``: ``documents`` is a
        list with one token list per tweet, ``categories`` is the parallel
        list of category strings, and ``vocab`` is the set of every token
        seen in any tweet.

    Raises:
        ValueError: If a non-blank line has fewer than three
            tab-separated fields.
    """
    documents = []
    categories = []
    vocab = set()

    # Optional step (disabled): strip hyperlinks before tokenizing
    # url_regex = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Split on the first two tabs only, so that a tab inside the tweet
        # text stays part of the tweet instead of breaking the unpacking.
        _tweet_id, category, tweet = line.split('\t', 2)

        # tweet = url_regex.sub('', tweet)

        # Preprocess the text
        terms = preprocess_line_en(tweet)

        # Record the terms, the document and its category
        vocab.update(terms)
        documents.append(terms)
        categories.append(category)

    return documents, categories, vocab

In [None]:
def read_dataset(train_test):
    """Return the data lines of ``datasets/tweets_<train_test>.txt``.

    The file is decoded as latin-1. Its first line (the header) is
    dropped and the remaining lines are returned as a list, each with its
    line ending intact.
    """
    dataset_path = f"datasets/tweets_{train_test}.txt"
    with open(dataset_path, encoding="latin-1") as dataset_file:
        # Consume the header; the default keeps an empty file from raising
        next(dataset_file, None)
        return list(dataset_file)

Load the dataset and split it 90%/10% into a training set and a validation set.

In [None]:
def get_train_val_data(seed):
    # Load the corpus

    # Randomly permute the corpus
    
    # Split the data into a 90% training set and a 10% validation set
        


1. Print the number of documents and size of vocabulary in each dataset.
2. Print the distribution of categories in the training set

In [None]:
# Print some info about the dataset


# Print the distribution of categories in the training set


Convert the dataset into a bag-of-words representation

In [None]:
# Every word in the vocabulary is given an index


# Every category is given an index too


# Convert to bag of words
# This is a count matrix where each row is a document and each column is a word. 
# Each element of the matrix is the number of times that word appears in that document.
def convert_to_bow_matrix(data, word2id):

    # Matrix size is number of docs, vocab size + 1 (for OOV)

    # Matrix indexed by [doc_id, token_id]
    
    # Loop over documents and insert word counts
    

Train the model

In [None]:
# Answer
X_train = convert_to_bow_matrix(train_docs, word2id)
y_train = [cat2id[cat] for cat in train_categories]

# Train the model
model = sklearn.svm.SVC(C=1000)
model.fit(X_train, y_train)

Compute the accuracy of the model by evaluating predictions on the training set

In [None]:
def compute_accuracy(predictions, targets):
    """Return the fraction of predictions that equal their target.

    Args:
        predictions: Sized sequence of predicted labels.
        targets: Sized sequence of true labels, parallel to
            ``predictions``.

    Returns:
        The number of positions where prediction and target are equal,
        divided by the number of predictions. Returns 0.0 when there are
        no predictions, since nothing was predicted correctly.

    Raises:
        ValueError: If ``predictions`` and ``targets`` differ in length.
    """
    num_total = len(predictions)

    # zip() silently truncates to the shorter input, which would otherwise
    # produce a plausible-looking but wrong ratio.
    if num_total != len(targets):
        raise ValueError(
            f"predictions and targets differ in length: "
            f"{num_total} != {len(targets)}"
        )

    # Avoid a ZeroDivisionError on empty input
    if num_total == 0:
        return 0.0

    num_correct = sum(
        1 for predicted, target in zip(predictions, targets)
        if predicted == target
    )
    return num_correct / num_total

In [None]:
# Predict labels for the same documents the model was trained on and
# report the fraction that match the training labels
y_train_predictions = model.predict(X_train)
accuracy = compute_accuracy(y_train_predictions, y_train)
print("Training accuracy: ", accuracy)

Compute the accuracy of the model on the validation set

In [None]:
# Calculate accuracy on the validation set
X_val = convert_to_bow_matrix(val_docs, word2id)
y_val = [cat2id[cat] for cat in val_categories]

y_val_predictions = model.predict(X_val)
accuracy = compute_accuracy(y_val_predictions, y_val)

print("Validation accuracy: ", accuracy)

### Exercises

Can we improve the performance of the model? Try some of the following ideas and experiment with any of your own:
1. Modify preprocessing to remove hyperlinks
2. Modify preprocessing to keep hashtags (don't remove the # character)
3. Try using another model instead of SVM, for example `sklearn.linear_model.LogisticRegression` or `sklearn.ensemble.RandomForestClassifier`.


Once all optimizations to the model have been made, evaluate it on the test set.

In [None]:
# Train and calculate accuracy on the test set
# X_test = convert_to_bow_matrix(test_docs, word2id)
# y_test = [cat2id[cat] for cat in test_categories]

# y_test_predictions = model.predict(X_test)
# accuracy = compute_accuracy(y_test_predictions, y_test)
# print("Test accuracy: ", accuracy)