In [78]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from datasets import load_dataset
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from keras.preprocessing.sequence import pad_sequences
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection, naive_bayes
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier


In [2]:
dataset = load_dataset("lex_glue",'ecthr_a')

  from .autonotebook import tqdm as notebook_tqdm
Reusing dataset lex_glue (/Users/ytkd/.cache/huggingface/datasets/lex_glue/ecthr_a/1.0.0/c3c0bd7433b636dc39ae49a84dc401190c73156617efc415b04e9835a93a7043)
100%|██████████| 3/3 [00:00<00:00, 229.36it/s]


In [3]:
train = dataset['train']
test = dataset['test']

In [67]:
def pre_process(data):
    lemmatizer = WordNetLemmatizer()
    new_data = []
    new_labels = []
    for i in range(len(data)):
        tokenized = [word_tokenize(entry.lower()) for entry in data[i]['text'] if entry not in stopwords.words('english')]
        lemmatized = [[lemmatizer.lemmatize(token) for token in sent] for sent in tokenized]
        new_data.append(lemmatized)
        # new_data.append([word_tokenize(entry.lower()) for entry in data[i]['text'] if entry not in stopwords.words('english')])
        new_labels.append(data[i]['labels'])
    return new_data, new_labels

def create_corpus(data):
    corpus = []
    for doc in data:
        for sen in doc:
            for word in sen:
                corpus.append(word)
    return corpus   

def convert_2d(data):
    lst2 = []
    for i in data:
        lst1 = []
        for j in i:
            for k in j:
                lst1.append(k)
        lst2.append(lst1)
    return lst2


In [68]:
train_X, train_y  = pre_process(train) 
test_X, test_y = pre_process(test)

In [73]:
corpus_train  = create_corpus(train_X)
corpus_test = create_corpus(test_X)


In [74]:
train_X_2d = convert_2d(train_X)
test_X_2d = convert_2d(test_X)

In [75]:
def identity_tokenizer(text):
    return text

tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, stop_words='english', lowercase=False)
tfidf.fit(corpus_train+corpus_test)
vectors_train = tfidf.fit_transform(train_X_2d)
vectors_test = tfidf.transform(test_X_2d)



In [76]:
train_y_enc = MultiLabelBinarizer().fit_transform(train_y)
test_y_enc = MultiLabelBinarizer().fit_transform(test_y)

In [88]:
classification_models = [
                         OneVsRestClassifier(KNeighborsClassifier(n_neighbors=3)),
                         OneVsRestClassifier(SVC(kernel = "linear" ,C=1)),
                         OneVsRestClassifier(SVC(kernel='rbf', C=1)),
                         OneVsRestClassifier(naive_bayes.MultinomialNB()),
                         OneVsRestClassifier(DecisionTreeClassifier()),
                         OneVsRestClassifier(RandomForestClassifier(n_estimators=500)),
                         ]

model_scores = []
for model in classification_models:
  # Pipeline object is created to perform model training and evaluate the performance of each model.
  model_pipeline = Pipeline([('model_training', model)])
  model_pipeline.fit(vectors_train, train_y_enc)

  model_name = model
  if model_name=='SVC' and model.kernel=='rbf': 
    model_name+='RBF kernel'
  
  model_scores.append((model_name,(f'{100*model_pipeline.score(vectors_test, test_y_enc):.2f}%')))

# Create the dataframe for score of each model
df_model_scores = pd.DataFrame(model_scores,columns=['Classification Model','Accuracy Score'])
df_model_scores.sort_values(by='Accuracy Score',axis=0,ascending=False)

Unnamed: 0,Classification Model,Accuracy Score
1,"OneVsRestClassifier(estimator=SVC(C=1, kernel=...",52.30%
2,OneVsRestClassifier(estimator=SVC(C=1)),49.10%
0,OneVsRestClassifier(estimator=KNeighborsClassi...,38.60%
5,OneVsRestClassifier(estimator=RandomForestClas...,34.40%
4,OneVsRestClassifier(estimator=DecisionTreeClas...,32.30%
3,OneVsRestClassifier(estimator=MultinomialNB()),25.30%
