## Similar to the LR & SVC (CountVectorizer, TF-IDF, N-grams) notebook, but with two additional engineered features — the number of adjectives and the number of adverbs in each statement — for the LR and SVC models to train on.

In [None]:
import pandas as pd

# Read the dataset and keep only the label column ('Emotion') and the text
# column ('Statement'), in that order, then preview the first rows.
df = pd.read_csv('LOCAL_PATH_TO_DATASET').loc[:, ['Emotion', 'Statement']]
display(df.head())

In [None]:
def process_text(document):
    """Normalise one raw statement into a cleaned, lemmatised string.

    Steps, in order: coerce to ``str``, collapse whitespace runs, replace
    every non-word character with a space, drop single ASCII letters that are
    surrounded by whitespace, lowercase, split on whitespace, lemmatise each
    token with the module-level ``stemmer`` and drop every lemma found in the
    module-level ``en_stop`` collection.

    Parameters
    ----------
    document : object
        The raw text. Non-string values are converted with ``str()`` before
        any regex runs (so e.g. a float NaN becomes the text ``'nan'``)
        instead of raising ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; may be empty.
    """
    # Coerce first: re.sub only accepts str/bytes, so coercing later would be
    # too late to protect the whitespace step.
    document = str(document)

    # Collapse runs of whitespace into a single space.
    document = re.sub(r'\s+', ' ', document)

    # Replace every non-word character (punctuation, symbols) with a space.
    document = re.sub(r'\W', ' ', document)

    # Drop single letters that stand alone between whitespace.
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Lowercase, then tokenise on whitespace.
    tokens = document.lower().split()

    # Lemmatise first, then filter stop words, so the stop-word check is
    # applied to the lemma rather than the surface form.
    lemma_txt = [stemmer.lemmatize(word) for word in tokens]
    lemma_no_stop_txt = [word for word in lemma_txt if word not in en_stop]

    return ' '.join(lemma_no_stop_txt)

In [None]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# "Balanced" weight for each emotion class, returned as an array ordered like
# np.unique(df['Emotion']) (i.e. sorted class labels).
# The helper is imported by name so that binding the result to `class_weight`
# no longer overwrites the imported module, and the arguments are passed by
# keyword because current scikit-learn releases reject positional use.
# NOTE(review): scikit-learn estimators document their `class_weight` option as
# a dict or 'balanced' -- confirm how this array is meant to be passed on.
class_weight = compute_class_weight(class_weight='balanced',
                                    classes=np.unique(df['Emotion']),
                                    y=df['Emotion'])

# Split off the target. `pop` removes 'Emotion' from `df` in place, so `X`
# (an alias of `df`, not a copy) no longer contains the label column.
y = df.pop('Emotion')
X = df

In [None]:
from tqdm import tqdm
import nltk
import re
from nltk import WordNetLemmatizer

# Fetch the corpora before touching them: reading the stop-word list on a
# machine that does not have the 'stopwords' corpus yet raises LookupError.
nltk.download('wordnet')
nltk.download('stopwords')

# Globals read by process_text().
en_stop = set(nltk.corpus.stopwords.words('english'))
stemmer = WordNetLemmatizer()  # a lemmatizer, despite the variable name

# Cleaned text goes into a new column; the raw 'Statement' column is kept.
df['preprocessedStatement'] = df.Statement.apply(process_text)

In [None]:
# Gather the cleaned statements into a plain Python list and preview the
# first five entries.
clean_corpus = df['preprocessedStatement'].tolist()
clean_corpus[:5]

## Run only one of the next 3 cells, depending on whether you want to use 1) CountVectorizer, 2) TfidfVectorizer, or 3) TfidfVectorizer with n-grams

In [None]:
# 1) Only run this cell if you want to use CountVectorizer. Skip the 2 cells below.

from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary and build the term-count matrix in a single call.
# The token pattern keeps every run of one or more word characters.
# `clean_corpus` is rebound from the list of strings to the resulting matrix.
count_vect = CountVectorizer(token_pattern=r'\w{1,}', analyzer='word')
clean_corpus = count_vect.fit_transform(clean_corpus)

In [None]:
# 2) Only run this cell if you want to use TfidfVectorizer. Skip the cell above and below.

from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level (unigram) TF-IDF: fit the vocabulary/IDF weights and produce the
# weighted matrix in a single call.
# `clean_corpus` is rebound from the list of strings to the resulting matrix.
tfidf_vect = TfidfVectorizer(token_pattern=r'\w{1,}', analyzer='word')
clean_corpus = tfidf_vect.fit_transform(clean_corpus)

In [None]:
# 3) Only run this cell if you want to use Ngrams. Skip the 2 cells above

from sklearn.feature_extraction.text import TfidfVectorizer

# Bigram-only TF-IDF (ngram_range=(2, 2)): fit and transform in a single call.
# `clean_corpus` is rebound from the list of strings to the resulting matrix.
tfidf_vect_ngram = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(2, 2),
)
clean_corpus = tfidf_vect_ngram.fit_transform(clean_corpus)

In [None]:
# Wrap the vectorizer's sparse matrix in a sparse-backed DataFrame so it can
# be concatenated with other columns; no column labels are supplied, so the
# frame gets default ones. Then preview it.
clean_corpus = pd.DataFrame.sparse.from_spmatrix(data=clean_corpus)
clean_corpus.head()

In [None]:
import textblob
# Tokenizer and POS-tagger models needed for part-of-speech tagging.
# Both the legacy resource names and the names used by newer NLTK releases
# ('punkt_tab', 'averaged_perceptron_tagger_eng') are requested, so tagging
# works regardless of the installed version. This matters because
# check_pos_tag() returns 0 when tagging fails, so a missing resource would
# otherwise show up only as all-zero feature columns.
for resource in ('punkt', 'punkt_tab',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

# Coarse part-of-speech families, each mapped to the list of tagger labels
# that count as a member of that family.
pos_family = dict([
    ('noun', ['NN', 'NNS', 'NNP', 'NNPS']),
    ('pron', ['PRP', 'PRP$', 'WP', 'WP$']),
    ('verb', ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']),
    ('adj', ['JJ', 'JJR', 'JJS']),
    ('adv', ['RB', 'RBR', 'RBS', 'WRB']),
])

# Count how many words of a sentence carry a part-of-speech tag from one family.
def check_pos_tag(x, flag):
    """Return the number of words in ``x`` tagged with a tag of family ``flag``.

    Parameters
    ----------
    x : str
        Text to tag with ``textblob.TextBlob``.
    flag : str
        Key into the module-level ``pos_family`` mapping (e.g. ``'adj'``).

    Returns
    -------
    int
        Number of ``(word, tag)`` pairs whose tag is listed under
        ``pos_family[flag]``. If tagging or the lookup fails with an ordinary
        exception, a ``RuntimeWarning`` is emitted and ``0`` is returned, so
        one bad row does not abort a whole ``apply``.

    Notes
    -----
    Only ``Exception`` subclasses are handled; ``KeyboardInterrupt`` and
    ``SystemExit`` propagate so a long run can still be interrupted.
    """
    import warnings  # local import: only needed on the failure path

    try:
        wanted = set(pos_family[flag])
        return sum(1 for _word, tag in textblob.TextBlob(x).tags if tag in wanted)
    except Exception as exc:
        # The message carries only the exception type, so the default warning
        # filter shows it once instead of once per failing row.
        warnings.warn(
            "check_pos_tag: tagging failed (%s); counting 0" % type(exc).__name__,
            RuntimeWarning,
            stacklevel=2,
        )
        return 0

# Two handcrafted features per row, computed on the raw 'Statement' text
# (not on the preprocessed column): adjective count and adverb count.
df['adj_count'] = df['Statement'].apply(check_pos_tag, args=('adj',))
df['adv_count'] = df['Statement'].apply(check_pos_tag, args=('adv',))

In [None]:
# Place the remaining df columns next to the vectorizer features (aligned on
# the row index) and drop the two text columns, which are not model inputs.
result = (
    pd.concat([df, clean_corpus], axis=1)
      .drop(columns=['Statement', 'preprocessedStatement'])
)

# df contributes string column labels while the vectorizer frame contributes
# integer ones; recent scikit-learn releases refuse to fit on a DataFrame whose
# column labels mix types, so make every label a string.
result.columns = result.columns.astype(str)

result.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Stratified 95/5 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    result, y, stratify=y, test_size=0.05, random_state=0
)

# Fit the label encoder once, on the full label column, and only *transform*
# the two splits. Refitting on y_test would rebuild the mapping from whatever
# classes happen to be in the test split, so the same integer could stand for
# different emotions in train and test.
encoder = preprocessing.LabelEncoder()
encoder.fit(y)
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Logistic regression with the estimator's built-in one-vs-rest option.
# Insert class_weight = class_weight if training on Meld-dd dataset
model_lr = LogisticRegression(max_iter=1000, multi_class='ovr').fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred_lr = model_lr.predict(X_test)
print(classification_report(y_test, y_pred_lr))

In [None]:
from sklearn.multiclass import OneVsRestClassifier

# Logistic regression wrapped in an explicit one-vs-rest meta-estimator.
# Insert class_weight = class_weight if training on Meld-dd dataset
model_lr2 = OneVsRestClassifier(LogisticRegression(max_iter=1000))
model_lr2.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred_lr2 = model_lr2.predict(X_test)
print(classification_report(y_test, y_pred_lr2))

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with decision_function_shape set to 'ovo'.
# Insert class_weight = 'balanced' if training on Meld-dd dataset
model_svm = SVC(decision_function_shape='ovo').fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred_svm = model_svm.predict(X_test)
print(classification_report(y_test, y_pred_svm))

In [None]:
from sklearn.multiclass import OneVsOneClassifier

# Support-vector classifier wrapped in an explicit one-vs-one meta-estimator.
# Insert class_weight = 'balanced' if training on Meld-dd dataset
model_svm2 = OneVsOneClassifier(SVC())
model_svm2.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred_svm2 = model_svm2.predict(X_test)
print(classification_report(y_test, y_pred_svm2))

In [None]:
# Class names in encoded order: position i is the emotion encoded as integer i.
# Reading classes_ works for any number of classes, whereas decoding a
# hard-coded 0..6 raises when the dataset has fewer than seven emotions.
list(encoder.classes_)