In [1]:
from collections import defaultdict
import math
import numpy as np
import nltk
import sklearn
import os
import operator
import ssl
import warnings

warnings.filterwarnings("ignore")
ssl._create_default_https_context = ssl._create_unverified_context

#### loading the dataset

This notebook should be exactly placed in the directory of the folder 'datasets_coursework1'

In [2]:
dataset_path = './datasets_coursework1/bbc'
def read_class_data(dataset_path, class_name):
    path = dataset_path + '/' + class_name
    files = os.listdir(path)
    data = []
    for file in files:
        if not os.path.isdir(file):
            f = open(path + '/' + file, encoding='latin')
            content = f.read()
            data.append(content)
    return data
    
business_data = read_class_data(dataset_path, 'business')
entertainment_data = read_class_data(dataset_path, 'entertainment')
politics_data = read_class_data(dataset_path, 'politics')
sport_data = read_class_data(dataset_path, 'sport')
tech_data = read_class_data(dataset_path, 'tech')

#### Pre-processing the data

In [3]:
# download all the dependancies of nltk
# nltk.download('all')

In [4]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()

def get_used_words(data):
    words = []
    for passage in data:
        word_tokens = nltk.tokenize.word_tokenize(passage)
        words += word_tokens
    words = [word.lower() for word in words] # to lowercase
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english') and word.isalpha()]
    return words

# obatin the words used by each class of documents
business_words = get_used_words(business_data)
entertainment_words = get_used_words(entertainment_data)
politics_words = get_used_words(politics_data)
sport_words = get_used_words(sport_data)
tech_words = get_used_words(tech_data)

In [5]:
# obtain the tf-idf values of the used words
def get_tf_idf(list_words):
    doc_frequency = defaultdict(int)
    for word_list in list_words:
        for i in word_list:
            doc_frequency[i] += 1

    # compute the term frequency of each word
    word_tf = {}
    for i in doc_frequency:
        word_tf[i] = doc_frequency[i] / sum(doc_frequency.values())

    # compute the inverse document frequency of each word
    doc_num = len(list_words)
    word_idf = {}  
    word_doc = defaultdict(int)
    for i in doc_frequency:
        for j in list_words:
            if i in j:
                word_doc[i] += 1
    for i in doc_frequency:
        word_idf[i] = math.log(doc_num / (word_doc[i] + 1))

    # compute the value of TF * IDF
    word_tf_idf = {}
    for i in doc_frequency:
        word_tf_idf[i] = word_tf[i] * word_idf[i]

    return word_tf_idf

dictionary = get_tf_idf([business_words, entertainment_words, politics_words, sport_words, tech_words])

In [6]:
# obtain the top 2000 words as default
def find_topwords(word_tf_idf, num=2000):
    dict_feature_select = sorted(word_tf_idf.items(), key=operator.itemgetter(1), reverse=True)
    return dict_feature_select[:num]

list_vocabulary = [pair[0] for pair in find_topwords(dictionary)]

In [7]:
def get_list_tokens(string):
    sentence_split = nltk.tokenize.sent_tokenize(string)
    list_tokens = []
    sentence_tokens = []
    for sentence in sentence_split:
        list_tokens_sentence = nltk.tokenize.word_tokenize(sentence)
        sentence_tokens.append(list_tokens_sentence)
        for token in list_tokens_sentence:
            list_tokens.append(lemmatizer.lemmatize(token).lower())
    num_sentence = len(sentence_tokens)
    num_vocab = len(list(set(list_tokens)))
    return list_tokens, np.array(num_sentence), np.array(num_vocab)

def get_vector_text(list_vocab,string):
    vector_text = np.zeros(len(list_vocab))
    list_tokens_string, num_sentence, num_vocab = get_list_tokens(string)
    for i, word in enumerate(list_vocab):
        if word in list_tokens_string:
            vector_text[i] = list_tokens_string.count(word)
    return vector_text, num_sentence, num_vocab

In [8]:
# three features are used: top 2000 words frequency vector, number of sentence of the passage and number of used words
def creating_features(list_vocabulary, data):
    features = []
    for passage in data:
        vector, num_sentence, num_vocab = get_vector_text(list_vocabulary, passage)
        vector = np.append(vector, num_sentence)
        vector = np.append(vector, num_vocab)
        features.append(vector)
    return features

business_features = creating_features(list_vocabulary, business_data)
entertainment_features = creating_features(list_vocabulary, entertainment_data)
politics_features = creating_features(list_vocabulary, politics_data)
sport_features = creating_features(list_vocabulary, sport_data)
tech_features = creating_features(list_vocabulary, tech_data)

In [9]:
# mix different class of documents
all_features = np.array(
    business_features + entertainment_features + politics_features + sport_features + tech_features)
all_labels = [0 for i in range(len(business_features))] + [1 for i in range(len(entertainment_features))] + [2 for i in range(len(politics_features))] + [3 for i in range(len(sport_features))] + [4 for i in range(len(tech_features))]

vector_features = all_features[:, 0:-2]
length_features = all_features[:, -2:]

# dimension reduction on the word vector features
from sklearn.decomposition import PCA
pca = PCA(n_components=100)
pca.fit(vector_features)
newX = pca.fit_transform(vector_features)
print('Explained variance: ' + str(sum(pca.explained_variance_ratio_)))

# normalizing the length_features
min_value = np.min(length_features, 0)
max_value = np.max(length_features, 0)
length_features = (length_features - min_value) / max_value

# concatenate all the features
newX = np.concatenate((newX, length_features), axis=1)

Explained variance: 0.517129351031216


#### Creating SVM classifier and using stratified cross validation

In [10]:
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score


stratified_folder = StratifiedKFold(n_splits=10, random_state=0, shuffle=False)

accuracy = []
f1_scores = []
precision = []
recall = []

for train_index, test_index in stratified_folder.split(newX, all_labels):
    model = svm.SVC(decision_function_shape='ovo')
    model.fit([newX[i] for i in train_index], [all_labels[i] for i in train_index])
    Y_pred = model.predict([newX[i] for i in test_index])
    Y_true = np.array([all_labels[i] for i in test_index])
    acc = accuracy_score(Y_true, Y_pred)
    f1_scores = f1_score(Y_true, Y_pred, average='macro')
    pr = precision_score(Y_true, Y_pred, average='macro')
    re = recall_score(Y_true, Y_pred, average='macro')
    accuracy.append(acc)
    precision.append(pr)
    recall.append(re)
print('Accuracy using cross validation: '+ str(np.mean(accuracy)))
print('F1 score using cross validation: '+ str(np.mean(f1_scores)))
print('Precision using cross validation: '+ str(np.mean(precision)))
print('Recall using cross validation: '+ str(np.mean(recall)))

Accuracy using cross validation: 0.8426921013093527
F1 score using cross validation: 0.8209264929716177
Precision using cross validation: 0.8981623168666774
Recall using cross validation: 0.8356586826297617


#### Error Analysis

In [11]:
# obatin the index where the instance is misclassified
misclassified_index = np.where(Y_true != Y_pred)[0]
misclassified_pairs = [(Y_true[i], Y_pred[i]) for i in list(misclassified_index)]

# calculate the statistics
# (a) how many instances are misclassified in total
# (b) what is the distribution of the misclassified instances for each class
print('Misclassified instances in total of '+ str(len(Y_true)) + ': ' + str(len(misclassified_index)) + '\n')
test_number_of_each_class = np.zeros((5,))
for i in range(len(Y_true)):
    test_number_of_each_class[Y_true[i]] += 1
misclassified_distr = np.zeros((5, 5))
for i in range(len(misclassified_pairs)):
    misclassified_distr[misclassified_pairs[i][0]][misclassified_pairs[i][1]] += 1
for i in range(len(misclassified_distr)):
    print('The misclassified instances of class ' + str(i) + ' is: ' + str(sum(misclassified_distr[i])) + ' out of ' + str(test_number_of_each_class[i]) + ' instances.')
    print('The misclassified distribution of class ' + str(i) + ' is: ' + str(misclassified_distr[i]))
    print()

Misclassified instances in total of 221: 42

The misclassified instances of class 0 is: 1.0 out of 51.0 instances.
The misclassified distribution of class 0 is: [0. 0. 0. 0. 1.]

The misclassified instances of class 1 is: 5.0 out of 38.0 instances.
The misclassified distribution of class 1 is: [4. 0. 0. 0. 1.]

The misclassified instances of class 2 is: 14.0 out of 41.0 instances.
The misclassified distribution of class 2 is: [11.  0.  0.  1.  2.]

The misclassified instances of class 3 is: 13.0 out of 51.0 instances.
The misclassified distribution of class 3 is: [13.  0.  0.  0.  0.]

The misclassified instances of class 4 is: 9.0 out of 40.0 instances.
The misclassified distribution of class 4 is: [7. 2. 0. 0. 0.]



#### Model Comparison
In this part, we use several machine learning models to perform classification. They are SVM with kernals of 'rbf' (this is implemented in skearn SVC by default), 'linear' and 'poly' (the default is degree=3), Logistic Regression and Decision Tree.

In [12]:
# (a) kernel='linear'
accuracy = []
f1_scores = []
precision = []
recall = []

for train_index, test_index in stratified_folder.split(newX, all_labels):
    model = svm.SVC(decision_function_shape='ovo', kernel='linear')
    model.fit([newX[i] for i in train_index], [all_labels[i] for i in train_index])
    Y_pred = model.predict([newX[i] for i in test_index])
    Y_true = np.array([all_labels[i] for i in test_index])
    acc = accuracy_score(Y_true, Y_pred)
    f1_scores = f1_score(Y_true, Y_pred, average='macro')
    pr = precision_score(Y_true, Y_pred, average='macro')
    re = recall_score(Y_true, Y_pred, average='macro')
    accuracy.append(acc)
    precision.append(pr)
    recall.append(re)
print('Accuracy using cross validation: '+ str(np.mean(accuracy)))
print('F1 score using cross validation: '+ str(np.mean(f1_scores)))
print('Precision using cross validation: '+ str(np.mean(precision)))
print('Recall using cross validation: '+ str(np.mean(recall)))

Accuracy using cross validation: 0.9316836879896758
F1 score using cross validation: 0.9249669564776598
Precision using cross validation: 0.9394650628399048
Recall using cross validation: 0.9291714144925276


In [13]:
# (b) kernel='poly'
accuracy = []
f1_scores = []
precision = []
recall = []

for train_index, test_index in stratified_folder.split(newX, all_labels):
    model = svm.SVC(decision_function_shape='ovo', kernel='poly')
    model.fit([newX[i] for i in train_index], [all_labels[i] for i in train_index])
    Y_pred = model.predict([newX[i] for i in test_index])
    Y_true = np.array([all_labels[i] for i in test_index])
    acc = accuracy_score(Y_true, Y_pred)
    f1_scores = f1_score(Y_true, Y_pred, average='macro')
    pr = precision_score(Y_true, Y_pred, average='macro')
    re = recall_score(Y_true, Y_pred, average='macro')
    accuracy.append(acc)
    precision.append(pr)
    recall.append(re)
print('Accuracy using cross validation: '+ str(np.mean(accuracy)))
print('F1 score using cross validation: '+ str(np.mean(f1_scores)))
print('Precision using cross validation: '+ str(np.mean(precision)))
print('Recall using cross validation: '+ str(np.mean(recall)))

Accuracy using cross validation: 0.3950309113163241
F1 score using cross validation: 0.3799190532398679
Precision using cross validation: 0.8356691632098755
Recall using cross validation: 0.3689921253580922


In [14]:
# (c) Logistic Regression
from sklearn.linear_model import LogisticRegression

accuracy = []
f1_scores = []
precision = []
recall = []

for train_index, test_index in stratified_folder.split(newX, all_labels):
    model = LogisticRegression(multi_class='ovr')
    model.fit([newX[i] for i in train_index], [all_labels[i] for i in train_index])
    Y_pred = model.predict([newX[i] for i in test_index])
    Y_true = np.array([all_labels[i] for i in test_index])
    acc = accuracy_score(Y_true, Y_pred)
    f1_scores = f1_score(Y_true, Y_pred, average='macro')
    pr = precision_score(Y_true, Y_pred, average='macro')
    re = recall_score(Y_true, Y_pred, average='macro')
    accuracy.append(acc)
    precision.append(pr)
    recall.append(re)
print('Accuracy using cross validation: '+ str(np.mean(accuracy)))
print('F1 score using cross validation: '+ str(np.mean(f1_scores)))
print('Precision using cross validation: '+ str(np.mean(precision)))
print('Recall using cross validation: '+ str(np.mean(recall)))

Accuracy using cross validation: 0.9397716696466366
F1 score using cross validation: 0.9277827363943573
Precision using cross validation: 0.9464378877092294
Recall using cross validation: 0.937115492489086


In [15]:
# (d) Decision Tree
from sklearn import tree

accuracy = []
f1_scores = []
precision = []
recall = []

for train_index, test_index in stratified_folder.split(newX, all_labels):
    model = tree.DecisionTreeClassifier(criterion="entropy")
    model.fit([newX[i] for i in train_index], [all_labels[i] for i in train_index])
    Y_pred = model.predict([newX[i] for i in test_index])
    Y_true = np.array([all_labels[i] for i in test_index])
    acc = accuracy_score(Y_true, Y_pred)
    f1_scores = f1_score(Y_true, Y_pred, average='macro')
    pr = precision_score(Y_true, Y_pred, average='macro')
    re = recall_score(Y_true, Y_pred, average='macro')
    accuracy.append(acc)
    precision.append(pr)
    recall.append(re)
print('Accuracy using cross validation: '+ str(np.mean(accuracy)))
print('F1 score using cross validation: '+ str(np.mean(f1_scores)))
print('Precision using cross validation: '+ str(np.mean(precision)))
print('Recall using cross validation: '+ str(np.mean(recall)))

Accuracy using cross validation: 0.8849864419133742
F1 score using cross validation: 0.8745450376569437
Precision using cross validation: 0.8874458868921268
Recall using cross validation: 0.8853233752076916
