## Use Liu Hu Lexicon

Using the Liu and Hu opinion lexicon from NLTK, determine the proportion of positive, negative, and neutral words in each training and testing sample, derive an overall disposition (1 = positive, -1 = negative, 0 = neutral), and add these values as features to both the training and the testing data.

Creating these features took so long that I decided to run the feature creation once on the raw training and testing data and then write new CSV files containing the original text PLUS the new features.  These data sets were used during training and testing.



In [16]:
import numpy as np
import time
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from stemming.porter2 import stem
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline, make_union
from sentiment_analysis.transformers import RemoveEllipseTransformer, RemoveHtmlEncodedTransformer, RemoveNumbersTransformer, RemoveSpecialCharactersTransformer, RemoveUsernameTransformer, RemoveUrlsTransformer
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.model_selection import cross_val_score
from sklearn.base import TransformerMixin
import re
import nltk.stem
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from nltk.sentiment.util import mark_negation

In [17]:
# Load the raw train and test CSVs; the `content` column of each one is
# scored with the opinion lexicon further down.
training_data = pd.read_csv(
    filepath_or_buffer='../data/kaggle/sa-emotions/train_data.csv',
)
testing_data = pd.read_csv(
    filepath_or_buffer='../data/kaggle/sa-emotions/test_data.csv',
)

In [18]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _opinion_word_sets():
    """
    Load the Liu and Hu opinion lexicon once and return it as two sets.

    Asking the corpus reader for its word lists on every token and scanning
    them linearly is very slow; materializing them as frozensets a single
    time makes every later membership test a hash lookup.

    :return: tuple ``(positive_words, negative_words)`` of frozensets.
    """
    from nltk.corpus import opinion_lexicon

    return (
        frozenset(opinion_lexicon.positive()),
        frozenset(opinion_lexicon.negative()),
    )


def liu_hu_lexicon(sentence, verbose=False):
    """
    Score a sentence against the Liu and Hu opinion lexicon.

    The sentence is tokenized and lower-cased, and every token is counted as
    positive, negative or neutral (tokens missing from the lexicon are
    neutral).  The overall disposition is the sign of
    ``positive count - negative count``.

    :param sentence: the text whose polarity has to be classified.
    :param verbose: if True, print the sentence prefixed with its disposition.
    :return: ``pd.Series`` with the keys ``pos_words``, ``neg_words`` and
        ``neu_words`` (each the fraction of tokens in that class) and
        ``disposition`` (1 = positive, 0 = neutral, -1 = negative).  A
        sentence without any tokens yields 0.0 for every fraction and a
        neutral disposition instead of dividing by zero.
    """
    tokens = []
    # A blank sentence has no tokens; skip the tokenizer entirely.
    if sentence.strip():
        from nltk.tokenize import treebank

        tokenizer = treebank.TreebankWordTokenizer()
        tokens = [word.lower() for word in tokenizer.tokenize(sentence)]

    pos_words = 0
    neg_words = 0
    if tokens:
        positive, negative = _opinion_word_sets()
        for word in tokens:
            # Positive is checked first, so a word listed under both
            # polarities counts as positive.
            if word in positive:
                pos_words += 1
            elif word in negative:
                neg_words += 1
    neu_words = len(tokens) - pos_words - neg_words

    balance = pos_words - neg_words
    if balance > 0:
        disposition = 1
    elif balance < 0:
        disposition = -1
    else:
        disposition = 0

    if verbose:
        if disposition == 1:
            print('Positive: {}'.format(sentence))
        elif disposition == -1:
            print('Negative: {}'.format(sentence))
        else:
            print('Neutral: {}'.format(sentence))

    total = len(tokens)
    if total:
        pos_ratio = pos_words / total
        neg_ratio = neg_words / total
        neu_ratio = neu_words / total
    else:
        # No tokens: report zero fractions rather than raising ZeroDivisionError.
        pos_ratio = neg_ratio = neu_ratio = 0.0

    return pd.Series({'pos_words': pos_ratio, 'neg_words': neg_ratio, 'neu_words': neu_ratio, 'disposition': disposition})


In [19]:
# THIS TAKES A VERY VERY LONG TIME
#anger_df = training_data[(training_data.sentiment == 'anger')]
#anger_df.head()
#liu_hu_lexicon_series = training_data.content.apply(liu_hu_lexicon)

In [20]:
# Score every training text; liu_hu_lexicon returns a Series per sample, and
# the result is joined column-wise onto the training data below.
df2 = training_data.content.apply(liu_hu_lexicon)

In [21]:
# Put the lexicon features next to the original training columns and persist
# the combined table.
result = pd.concat(objs=[training_data, df2], axis='columns')
result.to_csv(path_or_buf='../data/kaggle/sa-emotions/train_data_lexicon.csv')

In [22]:
# Same steps for the test split: score each text, attach the lexicon features
# column-wise, and write the combined table to its own CSV.
df3 = testing_data.content.apply(liu_hu_lexicon)
result = pd.concat(objs=[testing_data, df3], axis='columns')
result.to_csv(path_or_buf='../data/kaggle/sa-emotions/test_data_lexicon.csv')