# Lesson 3: Text Classification

Text classification is the process of sorting documents of text into a set of predefined categories. 

In this lab we'll train machine learning models to classify the sentiment of Twitter tweets as either **positive, negative, or neutral**.


In [1]:
import collections
from collections import Counter
import re
import string
from scipy import sparse 
import sklearn
from sklearn import svm, ensemble, naive_bayes, linear_model
import numpy as np
import Stemmer
import re

## Inspect the dataset

Each line of the file `tweets_train.txt` contains a tweet and its sentiment.

In [12]:
with open("datasets/tweets_train.txt") as f:
    for _ in range(2):
        f.readline()
    for _ in range(9):
        print(f.readline())

640169120600862720	neutral	Haruna Lukmon may av just played himself out of d Super Eagles under coach Sunday Oliseh

635946254254624769	neutral	Zach Putnam will be unavailable for the White Sox again tonight after experiencing right groin soreness while warming up Saturday night.

667121920333258752	negative	"""""""@daithimckay what about the victims of IRA terrorism David, do you ever think of them? Kingsmill, Le Mon or even Kevin McGuigan"""""""

628637519643586560	neutral	"""""""LHP Matt Boyd, traded to @tigers in David Price deal, will start tomorrow vs. the @Royals. #BlueJays"""""""

622623084395114496	neutral	What should I wear to the Taylor Swift concert tomorrow? What would my Bad Blood character name be? These questions will keep me up tonight.

638281382184288256	neutral	"""""""@HighOnVibe Depends what she said though, Nicki may have had a reason to start! Have you seen the video?"""""""

624067647529947136	positive	"""""""Jake and I will be in Sharknado 4. We may die in the 

In [2]:
# The file 'datasets/stopwords_en.txt' contains a list of stopwords - one per line.
with open("datasets/stopwords_en.txt") as f:
    enStopWords = set(f.read().splitlines())

# Initialze the SnowballStemmer
enStemmer = Stemmer.Stemmer('english')

def preprocess_line_en(line: str) -> list[str]:
    # Convert to lower case
    tokens = line.lower()
    
    # Split into tokens with no punctuation
    tokens = re.split("[^\w]", tokens)
    
    # Remove empty strings and stop words and apply the stemmer
    # Initially disable stopping and stemming
    #tokens = [enStemmer.stemWord(x) for x in tokens if x and x not in enStopWords]
    
    # Return the tokens
    return tokens

In [3]:
def preprocess_dataset(lines):
    # Preprocess data for classification
    documents = []
    categories = []
    vocab = set()
    
    # url_regex = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
        
    for line in lines:
        line = line.strip()
        if line:
            tweet_id, category, tweet = line.split('\t')

            # tweet = url_regex.sub('', tweet)
                
            # Preprocess the text
            terms = preprocess_line_en(tweet)

            # Add terms to the vocab
            for term in terms:
                vocab.add(term)
           
            # Add document to the collection of docs
            documents.append(terms)

            # Add category to the list of categories
            categories.append(category)
    
    return documents, categories, vocab

In [10]:
def read_dataset(train_test):
        with open(f"datasets/tweets_{train_test}.txt", encoding="latin-1") as f:
            # Skip the header 
            return f.readlines()[1:]

Load the dataset, splitting it into a 90-10% train/validation test set split

In [None]:
def get_train_val_data(seed):
    # Load the corpus

    # Randomly permute the corpus
    
    # Split the data into 90% training set 10% development set
        


In [4]:
# Answer      
def get_train_val_data(seed):
    # Load the corpus
    data_lines = read_dataset("train")

    # Randomly permute the corpus
    rnd = np.random.RandomState(seed=seed)
    rnd.shuffle(data_lines)

    # Split the data into 90% training set 10% development set
    split_idx = int(len(data_lines) * 0.9)
    train_data, val_data = data_lines[:split_idx], data_lines[split_idx:]
    return train_data, val_data
        
seed = 1234
training_data, val_data = get_train_val_data(seed)
test_data = read_dataset("test")

train_docs, train_categories, train_vocab = preprocess_dataset(training_data)
val_docs, val_categories, val_vocab = preprocess_dataset(val_data)
test_docs, test_categories, test_vocab = preprocess_dataset(test_data)

1. Print the number of documents and size of vocabulary in each dataset.
2. Print the distribution of categories in the training set

In [None]:
# Print some info about the dataset


# Print the distribution of categories in the training set


In [5]:
# Answer
# Print some info about the dataset
print(f"Training dataset has {len(train_docs)} documents and vocab size {len(train_vocab)}")
print(f"Validation dataset has {len(val_docs)} documents and vocab size {len(val_vocab)}")
print(f"Test dataset has {len(test_docs)} documents and vocab size {len(test_vocab)}")

# Print the distribution of categories in the training set
print()
print("Distribution of categories:")
print(collections.Counter(train_categories).most_common())

Training dataset has 16781 documents and vocab size 36791
Validation dataset has 1865 documents and vocab size 8221
Test dataset has 4662 documents and vocab size 15268

Distribution of categories:
[('neutral', 7915), ('positive', 5354), ('negative', 3512)]


Convert the dataset into a bag-of-words representation

In [None]:
# Every word in the vocabulary is given an index


# Every category is given an index too


# Convert to bag of words
# This is a count matrix where each row is a document and each column is a word. 
# The elements of the matrix are the frequencies of how often that word appears in the dataset.
def convert_to_bow_matrix(data, word2id):

    # Matrix size is number of docs, vocab size + 1 (for OOV)

    # Matrix indexed by [doc_id, token_id]
    
    # Loop over documents and insert word counts
    

In [11]:
# Answer
# Every word in the vocabulary is given an index
word2id = {}
for word_id, word in enumerate(train_vocab):
    word2id[word] = word_id

# Every category is given an index too
cat2id = {}
for cat_id, cat in enumerate(set(train_categories)):
    cat2id[cat] = cat_id

# Convert to bag of words
# This is a count matrix where each row is a document and each column is a word. 
# The elements of the matrix are the frequencies of how often that word appears in the dataset.
def convert_to_bow_matrix(data, word2id):

    # Matrix size is number of docs, vocab size + 1 (for OOV)
    matrix_size = (len(data), len(word2id) + 1)
    oov_index = len(word2id)

    # Matrix indexed by [doc_id, token_id]
    X = sparse.dok_matrix(matrix_size)
    
    # Loop over documents and insert word counts
    for doc_id, doc in enumerate(data):
        for word in doc:
            # default is 0. Add the count for this word in this doc
            # if the word is oov, then increment the oov index
            X[doc_id, word2id.get(word, oov_index)] += 1
    
    return X

Train the model

In [7]:
# Answer
X_train = convert_to_bow_matrix(train_docs, word2id)
y_train = [cat2id[cat] for cat in train_categories]

# Train the model
model = sklearn.svm.SVC(C=1000)
model.fit(X_train, y_train)

Compute the accuracy of the model by evaluating predictions on the training set

In [8]:
def compute_accuracy(predictions, targets):
    num_correct = 0
    num_total = len(predictions)
    for predicted, target in zip(predictions, targets):
        if predicted == target:
            num_correct += 1
    return num_correct / num_total

In [9]:
y_train_predictions = model.predict(X_train)
accuracy = compute_accuracy(y_train_predictions, y_train)
print("Training accuracy: ", accuracy)

Training accuracy:  0.9994040879566176


Compute the accuracy of the model on the validation set

In [None]:
# Calculate accuracy on the validation set
X_val = convert_to_bow_matrix(val_docs, word2id)
y_val = [cat2id[cat] for cat in val_categories]

y_val_predictions = model.predict(X_val)
accuracy = compute_accuracy(y_val_predictions, y_val)

print("Validation accuracy: ", accuracy)

### Exercises

Can we improve the performance of the model? Try some of the following ideas and experiment with any of your own:
1. Modify preprocessing to remove hyperlinks
2. Modify preprocessing to keep hashtags (don't remove the # character)
3. Try using another model instead of SVM, for example `sklearn.linear_model.LogisticRegression` or `sklearn.ensemble.RandomForestClassifier`.


Once all optimizations to the model have been made evaluate it on the test set.

In [None]:
# Train and calculate accuracy on the test set
# X_test = convert_to_bow_matrix(test_docs, word2id)
# y_test = [cat2id[cat] for cat in test_categories]

# y_test_predictions = model.predict(X_test)
# accuracy = compute_accuracy(y_test_predictions, y_test)
# print("Test accuracy: ", accuracy)