<a href="https://colab.research.google.com/github/vinejain/multilingual_feedback_multilabel_classification/blob/master/Experiments_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

REF: https://github.com/nkartik94/Multi-Label-Text-Classification/blob/master/Mark_6.ipynb

In [32]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))


In [33]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
data_path = "/content/drive/My Drive/thesis_data/multilabelDF.csv"
data_raw = pd.read_csv(data_path)
#data_raw = data_raw.loc[np.random.choice(data_raw.index, size=2000)]
data_raw.shape
print("Number of rows in data =",data_raw.shape[0])
print("Number of columns in data =",data_raw.shape[1])
print("\n")
printmd("**Sample data:**")
data_raw.head()

missing_values_check = data_raw.isnull().sum()
print(missing_values_check)

##### Calculating number of comments under each label¶
rowSums = data_raw.iloc[:,2:].sum(axis=1)
clean_comments_count = (rowSums==0).sum(axis=0)
print("Total number of comments = ",len(data_raw))
print("Number of clean comments = ",clean_comments_count)
print("Number of comments with labels =",(len(data_raw)-clean_comments_count))
categories = list(data_raw.columns.values)
categories = categories[6:]
print(categories)

# Calculating number of comments in each category

counts = []
for category in categories:
    counts.append((category, data_raw[category].sum()))
df_stats = pd.DataFrame(counts, columns=['category', 'number of comments'])
df_stats

Number of rows in data = 3065
Number of columns in data = 12




**Sample data:**

ID                 0
reviewText         0
Label              0
cleaned_text       4
Lab1               0
Lab2            2919
bug                0
comment            0
complaint          0
meaningless        0
request            0
undetermined       0
dtype: int64
Total number of comments =  3065
Number of clean comments =  0
Number of comments with labels = 3065
['bug', 'comment', 'complaint', 'meaningless', 'request', 'undetermined']


Unnamed: 0,category,number of comments
0,bug,72
1,comment,1758
2,complaint,950
3,meaningless,306
4,request,103
5,undetermined,22


**Data Pre-Processing**

In [36]:
data = data_raw
data = data_raw.loc[np.random.choice(data_raw.index, size=2000)]
data.shape
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re

import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

# Cleaning DATA

def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext


def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned


def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

data['reviewText'] = data['reviewText'].str.lower()
data['reviewText'] = data['reviewText'].apply(cleanHtml)
data['reviewText'] = data['reviewText'].apply(cleanPunc)
data['reviewText'] = data['reviewText'].apply(keepAlpha)
#data.head()


In [37]:
########### Stemming
stemmer = SnowballStemmer("english")
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

data['reviewText'] = data['reviewText'].apply(stemming)
#data.head()

########## Train-Test Split

test = pd.read_csv('/content/drive/My Drive/thesis_data/multilabelTestDF.csv')
train = train[['ID', 'reviewText', 'bug', 'comment', 'complaint', 'meaningless', 'request', 'undetermined']]
test = test[['ID', 'reviewText', 'bug', 'comment', 'complaint', 'meaningless', 'request', 'undetermined']]

print(train.shape)
print(test.shape)
train_text = train['reviewText']
test_text = test['reviewText']

(1400, 8)
(500, 8)


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer.fit(train_text)
vectorizer.fit(test_text)
x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['ID','reviewText'], axis=1)
x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['ID','reviewText'], axis=1)

**Multi-Label Classification**

In [39]:
# Multiple Binary Classifications - (One Vs Rest Classifier)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
#%%time

# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
            ])

for category in categories:
    printmd('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")


**Processing bug comments...**

Test accuracy is 0.98




**Processing comment comments...**

Test accuracy is 0.704




**Processing complaint comments...**

Test accuracy is 0.708




**Processing meaningless comments...**

Test accuracy is 0.876




**Processing request comments...**

Test accuracy is 0.974




**Processing undetermined comments...**

Test accuracy is 0.992




In [45]:
!pip install scikit-multilearn
# Multiple Binary Classifications - (Binary Relevance)
# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")


Accuracy =  0.39




In [47]:
# using classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression
# Classifier Chains¶
# initialize classifier chains multi-label classifier
classifier = ClassifierChain(LogisticRegression())
# Training logistic regression model on train data
classifier.fit(x_train, y_train)
# predict
predictions = classifier.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

Accuracy =  0.578




In [48]:
# Label Powerset¶
# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset
# initialize label powerset multi-label classifier
classifier = LabelPowerset(LogisticRegression())
# train
classifier.fit(x_train, y_train)
# predict
predictions = classifier.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

Accuracy =  0.56




In [49]:
# Adapted Algorithm¶
# http://scikit.ml/api/api/skmultilearn.adapt.html#skmultilearn.adapt.MLkNN

from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix
classifier_new = MLkNN(k=10)
# Note that this classifier can throw up errors when handling sparse matrices.
x_train = lil_matrix(x_train).toarray()
y_train = lil_matrix(y_train).toarray()
x_test = lil_matrix(x_test).toarray()
# train
classifier_new.fit(x_train, y_train)
# predict
predictions_new = classifier_new.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions_new))
print("\n")

Accuracy =  0.228


