In [1]:
import pandas as pd

df = pd.read_csv('LOCAL_PATH_TO_DATASET')
df = df[['Emotion','Statement']]
display(df.head())

Unnamed: 0,Emotion,Statement
0,guilt,Once when I was in the cell group (religious a...
1,shame,When I overslept for the second time on the da...
2,shame,I had not punched a ticket in the bus because ...
3,disgust,When a man spoke very sexistly in the company ...
4,shame,About a dozen girls laughed at me and I was su...


In [2]:
def process_text(document):
     
    # Remove extra white space from text
    document = re.sub(r'\s+', ' ', document, flags=re.I)
         
    # Remove all the special characters from text
    document = re.sub(r'\W', ' ', str(document))
 
    # Remove all single characters from text
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
 
    # Converting to Lowercase
    document = document.lower()
 
    # Word tokenization       
    tokens = document.split()
      
    lemma_txt = [stemmer.lemmatize(word) for word in tokens]

    # Remove Drop words 
    lemma_no_stop_txt = [word for word in lemma_txt if word not in en_stop]
        
    tokens = [word for word in tokens if len(word) > 3]
                 
    clean_txt = ' '.join(lemma_no_stop_txt)
 
    return clean_txt

In [3]:
from tqdm import tqdm
import numpy as np
import nltk
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))
import re
from nltk import WordNetLemmatizer
stemmer = WordNetLemmatizer()

nltk.download('wordnet')
nltk.download('stopwords')  

df['preprocessedStatement'] = df.Statement.apply(process_text)
df['preprocessedStatement'][:5]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    wa cell group religious activity found almost ...
1                overslept second time day examination
2    punched ticket bus card ticket collector came ...
3               man spoke sexistly company friend mine
4          dozen girl laughed wa sure wa nothing wrong
Name: preprocessedStatement, dtype: object

In [4]:
clean_corpus = df.preprocessedStatement.tolist()
clean_corpus[:5]

['wa cell group religious activity found almost everyone group read bible daily felt guilty heart',
 'overslept second time day examination',
 'punched ticket bus card ticket collector came turned forgotten shame felt wa great though wa done purpose',
 'man spoke sexistly company friend mine',
 'dozen girl laughed wa sure wa nothing wrong']

In [5]:
y = df.pop('Emotion')

## For the next 3 cells, run only one of them depending on whether you want to use 1) CountVectorizer, 2) TfidfVectorizer, or 3) TfidfVectorizer ngrams

In [6]:
# 1) Only run this cell if you want to use CountVectorizer. Skip the 2 cells below.

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

X_train,X_test, y_train, y_test = train_test_split(clean_corpus,y, stratify=y, test_size=0.05, random_state=0)

count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(clean_corpus)

#transform the training and validation data using count vectorizer object
X_train =  count_vect.transform(X_train)
X_test =  count_vect.transform(X_test)

In [8]:
# 2) Only run this cell if you want to use TfidfVectorizer. Skip the cell above and below.

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X_train,X_test, y_train, y_test = train_test_split(clean_corpus,y, stratify=y, test_size=0.05, random_state=0)

# unigram/word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}')
tfidf_vect.fit(clean_corpus)

X_train =  tfidf_vect.transform(X_train)
X_test =  tfidf_vect.transform(X_test)

In [6]:
# 3) Only run this cell if you want to use Ngrams. Skip the 2 cells above.

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X_train,X_test, y_train, y_test = train_test_split(clean_corpus,y, stratify=y, test_size=0.05, random_state=0)

# bigram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,2))
tfidf_vect_ngram.fit(clean_corpus)
X_train =  tfidf_vect_ngram.transform(X_train)
X_test =  tfidf_vect_ngram.transform(X_test)

In [7]:
from sklearn import preprocessing

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.utils import class_weight

class_weight = class_weight.compute_class_weight('balanced'
                                               ,np.unique(y)
                                               ,y)

model_lr = LogisticRegression(multi_class='ovr', max_iter=1000) #Insert class_weight = class_weight if training on Meld-dd dataset
model_lr.fit(X_train, y_train)

y_pred_lr = model_lr.predict(X_test)

print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.51      0.50      0.50        54
           1       0.52      0.60      0.56        53
           2       0.79      0.81      0.80        54
           3       0.54      0.52      0.53        52
           4       0.60      0.80      0.69        55
           5       0.73      0.61      0.67        54
           6       0.68      0.46      0.55        54

    accuracy                           0.62       376
   macro avg       0.62      0.62      0.61       376
weighted avg       0.62      0.62      0.61       376



In [9]:
from sklearn.multiclass import OneVsRestClassifier

model_lr2 = LogisticRegression(max_iter=1000) #Insert class_weight = class_weight if training on Meld-dd dataset

model_lr2 = OneVsRestClassifier(model_lr2)

model_lr2.fit(X_train, y_train)

y_pred_lr2 = model_lr2.predict(X_test)

print(classification_report(y_test, y_pred_lr2))

              precision    recall  f1-score   support

           0       0.51      0.50      0.50        54
           1       0.52      0.60      0.56        53
           2       0.79      0.81      0.80        54
           3       0.54      0.52      0.53        52
           4       0.60      0.80      0.69        55
           5       0.73      0.61      0.67        54
           6       0.68      0.46      0.55        54

    accuracy                           0.62       376
   macro avg       0.62      0.62      0.61       376
weighted avg       0.62      0.62      0.61       376



In [None]:
from sklearn.svm import SVC

model_svm = SVC(decision_function_shape='ovo') #insert class_weight = 'balanced' if training on Meld-dd dataset
model_svm.fit(X_train, y_train)

y_pred_svm = model_svm.predict(X_test)

print(classification_report(y_test, y_pred_svm))

In [None]:
from sklearn.multiclass import OneVsOneClassifier

model_svm2 = SVC() #insert class_weight = 'balanced' if training on Meld-dd dataset

model_svm2 = OneVsOneClassifier(model_svm2)

model_svm2.fit(X_train, y_train)
  
y_pred_svm2 = model_svm2.predict(X_test) 

print(classification_report(y_test, y_pred_svm2))

In [None]:
list(encoder.inverse_transform([0,1,2,3,4,5,6]))

['anger', 'disgust', 'fear', 'guilt', 'joy', 'sadness', 'shame']