# Multi-Class Text Classification using Scikit-learn OneVsRestClassifier

## Import the required libraries

In [None]:
import re
import string
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import word_tokenize

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron

import os
import random
import operator

import matplotlib.pyplot as plt

## Data Import and Cleanup

In [2]:
# Load the labelled spreadsheet and replace every missing cell with an empty
# string, so the following cells can filter on '' instead of NaN.
fname = os.path.join("<<your path and input excel file name with extension>>")
df = pd.read_excel(fname).fillna('')

In [None]:
# Count the rows per distinct label value (missing labels were replaced by ''
# on load, so they appear as their own entry).
df['label'].value_counts()

In [5]:
# Remove rows that have no comment text, then rows that have no label.
empty_comment_rows = df.index[df['Comments'] == '']
df = df.drop(index=empty_comment_rows)
empty_label_rows = df.index[df['label'] == '']
df = df.drop(index=empty_label_rows)

In [None]:
# Re-count the rows per label now that rows with an empty comment or an empty
# label have been dropped.
df['label'].value_counts()

## If the label counts above are imbalanced, balance the data first; otherwise the precision, recall and F1 scores for the minority classes will be low. Try the following steps, in this order:
## 1. Label more data, especially records whose label belongs to a minority class.
## 2. Augment data using machine translation.
## 3. Augment data using BERT contextual embeddings.

In [None]:
# Distinct label values present in the cleaned data.
classes = set(df['label'])
print(classes)
print(len(classes))

# Assign integer ids in sorted label order.  Enumerating the set directly
# would follow its iteration order, which for strings changes from one
# interpreter run to the next (hash randomisation), so the same label could
# get a different id after every kernel restart.  key=str keeps the sort
# usable even if the column mixes strings with other cell types.
class_to_index = {label: idx for idx, label in enumerate(sorted(classes, key=str))}
# Inverse lookup: integer id -> original label.
index_to_class = {idx: label for label, idx in class_to_index.items()}

In [None]:
# Display the label -> integer id mapping.
class_to_index

In [None]:
# Display the inverse mapping, integer id -> label.
index_to_class

## Create another column to get the integer representation of the label. 

In [None]:
# Translate every label to its integer id, then keep only the text column and
# the new integer target column.
label_ids = df['label'].apply(lambda label: class_to_index[label])
df = df.assign(label_int=label_ids)[['Comments', 'label_int']]
df

## Train test split

In [12]:
# Plain Python lists: comment texts as features, integer class ids as targets.
X = df['Comments'].tolist()
y = df['label_int'].tolist()

In [15]:
# Stratify on y so every class keeps its share in both parts, and fix the seed
# so the split (and therefore every metric reported below) is the same on
# each run instead of changing with every execution of this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

## Vectorize the text and use scikit-learn OneVsRestClassifier to make predictions.

In [41]:
# Learn the TF-IDF vocabulary and weights from the training texts only, and
# expand the resulting matrix into a dense array.
# NOTE(review): a dense TF-IDF matrix can be very large; the estimators used
# below presumably accept the sparse matrix directly -- confirm before
# dropping .toarray().
tf_with_aug = TfidfVectorizer()
X_train_vect = tf_with_aug.fit_transform(X_train).toarray()

In [42]:
# Train a one-vs-rest ensemble with a Perceptron as the per-class binary
# estimator.  fit() returns the fitted classifier itself, so the result can be
# bound directly; after fitting, ovr.estimators_ holds the per-class models.
ovr = OneVsRestClassifier(estimator=Perceptron()).fit(X_train_vect, y_train)

In [43]:
# Vectorise the test texts with the vectoriser already fitted on the training
# texts (transform only, no re-fit), then densify like the training matrix.
X_test_vect = tf_with_aug.transform(X_test).toarray()

In [45]:
# Predict one class id per vectorised test document.
y_preds = ovr.predict(X_test_vect)

## Evaluate the results

In [None]:
%matplotlib inline

def show_confusion_matrix(y_true, y_pred, classes, figsize=(50, 50), fontsize=50):
    """Plot a row-normalised confusion matrix with one tick label per class.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        classes: Tick labels, one per matrix row/column, given in the order
            scikit-learn arranges the matrix (sorted labels that occur in
            ``y_true`` or ``y_pred``).
        figsize: Figure size in inches, forwarded to ``plt.figure``.
        fontsize: Font size used for the tick labels.

    Raises:
        ValueError: If the number of entries in ``classes`` differs from the
            number of rows of the confusion matrix.
    """
    from sklearn.metrics import confusion_matrix

    # normalize='true' scales each row by the number of true samples of that
    # class, so the diagonal reads as per-class recall.
    cm = confusion_matrix(y_true, y_pred, normalize='true')

    # Derive the tick positions from the labels instead of assuming exactly
    # eight classes, and refuse to draw a plot whose labels cannot line up.
    n_classes = len(classes)
    if n_classes != cm.shape[0]:
        raise ValueError(
            f"got {n_classes} class labels for a "
            f"{cm.shape[0]}x{cm.shape[1]} confusion matrix"
        )
    tick_positions = list(range(n_classes))

    plt.figure(figsize=figsize)
    sp = plt.subplot(1, 1, 1)
    ctx = sp.matshow(cm)
    plt.xticks(tick_positions, labels=classes, fontsize=fontsize, rotation=90)
    plt.yticks(tick_positions, labels=classes, fontsize=fontsize)
    plt.colorbar(ctx)
    plt.show()

In [None]:
# Integer class ids that actually occur in the test targets.
classes_cf_int = [*set(y_test)]
# Original label names whose id occurs in the test targets.
index_to_class_cf = [
    name for name, idx in class_to_index.items() if idx in classes_cf_int
]
# The metric helpers below receive the targets as an integer NumPy array.
y_test = np.asarray(y_test).astype(np.int_)

In [None]:
# Build one "<id> - <name>" label for every class that occurs in the test
# targets or in the predictions, in sorted id order: scikit-learn arranges the
# confusion matrix and the classification report over exactly those labels.
# The name is looked up by the label VALUE (not by its position in the list),
# so the labels stay aligned even when a class is missing from the test split.
cf_label_ids = sorted(set(y_test) | set(y_preds))
cf_display_classes = [
    f"{int(label_id)} - {index_to_class[int(label_id)]}"
    for label_id in cf_label_ids
]

show_confusion_matrix(y_test, y_preds, cf_display_classes)

In [None]:
from sklearn.metrics import classification_report

# Text summary of the per-class metrics, with the display names standing in
# for the raw integer class ids.
report = classification_report(y_test, y_preds, target_names=cf_display_classes)
print(report)