# NLP Assignment 1 (40% of grade): Sentiment Analysis from Tweets

This coursework will involve you implementing functions for a text classifier, which you will train to identify the **sentiment expressed in a text** in a dataset of approx. 27,000 entries, which will be split into a 80%/20% training/test split. 

In this template you are given the basis for that implementation, though some of the functions are missing, which you have to fill in.

Follow the instructions file **NLP_Assignment_1_Instructions.pdf** for details of each question - the outline of what needs to be achieved for each question is as below.

You must submit all **ipython notebooks and extra resources you need to run the code if you've added them** in the code submission, and a **2 page report (pdf)** in the report submission on QMPlus where you report your methods and findings according to the instructions file for each question.

In [1]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
import numpy as np

In [2]:
def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            (label, text) = parse_data_line(line[0])
            raw_data.append((text, label))

def split_and_preprocess_data(percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    num_training_samples = int((percentage * num_samples))
    for (text, label) in raw_data[:num_training_samples]:
        train_data.append((to_feature_vector(pre_process(text)),label))
    for (text, label) in raw_data[num_training_samples:]:
        test_data.append((to_feature_vector(pre_process(text)),label))

# Question 1: Input and Basic preprocessing (10 marks)

In [3]:
def parse_data_line(data_line):
    # Should return a tuple of the label as just positive or negative and the statement
    # e.g. (label, statement)
    
    #Splits the data using tab
    parts = data_line.strip().split('\t')
    
    # Extract label and statement
    label = parts[0].lower()  # Assuming labels are in lowercase
    statement = parts[1]
    return (label, statement)

In [4]:

# Input: a string of one statement
def pre_process(text):
    # Should return a list of tokens
    # DESCRIBE YOUR METHOD IN WORDS
    tokens = text.split()
    return tokens

# Question 2: Basic Feature Extraction (20 marks)

In [5]:
global_feature_dict = {}

def to_feature_vector(tokens):
    # Initialize an empty dictionary for the feature vector
    feature_vector = {}

    # Iterate through each token in the list of tokens
    for token in tokens:
        # Use binary feature values: 1 if the feature is present, 0 if it's not
        feature_vector[token] = 1

        # Incrementally build up the global feature dictionary
        if token in global_feature_dict:
            global_feature_dict[token] += 1
        else:
            global_feature_dict[token] = 1

    return feature_vector


In [6]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

def train_classifier(data):
    print("Training Classifier...")
    
    # Extract features from the data using to_feature_vector()
    feature_sets = [(to_feature_vector(tokens), label) for tokens, label in data]

    # Train the classifier using SklearnClassifier
    pipeline = Pipeline([('svc', LinearSVC())])
    classifier = SklearnClassifier(pipeline).train(feature_sets)

    return classifier


# Question 3: Cross-validation (20 marks)

In [7]:
from sklearn.metrics import precision_recall_fscore_support

def cross_validate(dataset, folds):
    results = []
    fold_size = int(len(dataset) / folds) + 1

    for i in range(0, len(dataset), fold_size):
        print("Fold start on items %d - %d" % (i, i + fold_size))

        # Split the data into training and testing sets for the current fold
        train_set = dataset[:i] + dataset[i + fold_size:]
        test_set = dataset[i:i + fold_size]

        # Train the classifier on the training set
        classifier = train_classifier(train_set)

        # Extract features and labels from the test set
        test_features = [to_feature_vector(tokens) for tokens, label in test_set]
        gold_labels = [label for tokens, label in test_set]

        # Predict labels using the trained classifier
        predicted_labels = predict_labels(test_features, classifier)

        # Evaluate precision, recall, f1 score, and accuracy
        metrics = precision_recall_fscore_support(gold_labels, predicted_labels, average='weighted')
        accuracy = sum(1 for true, pred in zip(gold_labels, predicted_labels) if true == pred) / len(gold_labels)

        # Store the results for this fold
        results.append({
            'precision': metrics[0],
            'recall': metrics[1],
            'f1_score': metrics[2],
            'accuracy': accuracy
        })

    # Calculate average scores for all folds
    cv_results = {
        'average_precision': sum(result['precision'] for result in results) / folds,
        'average_recall': sum(result['recall'] for result in results) / folds,
        'average_f1_score': sum(result['f1_score'] for result in results) / folds,
        'average_accuracy': sum(result['accuracy'] for result in results) / folds,
    }

    return cv_results

In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predict_labels(samples, classifier):
    """Assuming preprocessed samples, return their predicted labels from the classifier model."""
    return classifier.classify_many(samples)

def predict_label_from_raw(sample, classifier):
    """Assuming raw text, return its predicted label from the classifier model."""
    return classifier.classify(to_feature_vector(preProcess(reviewSample)))

In [9]:
# MAIN

# loading reviews
# initialize global lists that will be appended to by the methods below
raw_data = []          # the filtered data from the dataset file
train_data = []        # the pre-processed training data as a percentage of the total dataset
test_data = []         # the pre-processed test data as a percentage of the total dataset


# references to the data files
data_file_path = 'sentiment-dataset.tsv'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing the dataset...",sep='\n')

load_data(data_file_path) 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing training and test data...",sep='\n')

split_and_preprocess_data(0.8)

# We print the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Training Samples: ", len(train_data), "Features: ", len(global_feature_dict), sep='\n')


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...


IndexError: list index out of range

In [None]:
cross_validate(train_data, 10)  # will work and output overall performance of p, r, f-score when cv implemented


# Question 4: Error Analysis (20 marks)

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt
# a function to make the confusion matrix readable and pretty
def confusion_matrix_heatmap(y_test, preds, labels):
    """Function to plot a confusion matrix"""
    # pass labels to the confusion matrix function to ensure right order
    # cm = metrics.confusion_matrix(y_test, preds, labels)
    cm = metrics.confusion_matrix(y_test, preds, labels=labels)
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    plt.title('Confusion matrix of the classifier')
    fig.colorbar(cax)
    ax.set_xticks(np.arange(len(labels)))
    ax.set_yticks(np.arange(len(labels)))
    ax.set_xticklabels( labels, rotation=45)
    ax.set_yticklabels( labels)

    for i in range(len(cm)):
        for j in range(len(cm)):
            text = ax.text(j, i, cm[i, j],
                           ha="center", va="center", color="w")

    plt.xlabel('Predicted')
    plt.ylabel('True')
    
    # fix for mpl bug that cuts off top/bottom of seaborn viz:
    b, t = plt.ylim() # discover the values for bottom and top
    b += 0.5 # Add 0.5 to the bottom
    t -= 0.5 # Subtract 0.5 from the top
    plt.ylim(b, t) # update the ylim(bottom, top) values
    plt.show() # ta-da!
    plt.show()

# Questions 5: Optimising pre-processing and feature extraction (30 marks)

**Note:** it is advisable to implement question 5 in a separate notebook where you further develop the pre-processing and feature extraction functions you implemented above.

In [None]:
# Finally, check the accuracy of your classifier by training on all the traning data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = False  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(test_data[0])   # have a look at the first test data instance
    classifier = train_classifier(train_data)  # train the classifier
    test_true = [t[1] for t in test_data]   # get the ground-truth labels from the data
    test_pred = predict_labels([x[0] for x in test_data], classifier)  # classify the test data to get predicted labels
    final_scores = precision_recall_fscore_support(test_true, test_pred, average='weighted') # evaluate
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % final_scores[:3])