<div style="font-size:30pt">LDA for Sentiment Analysis</div>

# Imports

In [1]:
# Standards
import pandas as pd
import numpy as np

#LDA
from sklearn.decomposition import LatentDirichletAllocation

# Bag of words
from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd
import numpy as np
import seaborn as sns
import unicodedata
import re
import string
from nltk.corpus import stopwords
import nltk
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
%matplotlib inline
nltk.download('stopwords')
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from sklearn import metrics

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/victor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/victor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Import data

In [2]:
# Shell escape: list the raw data directory to confirm the CSV files are present.
!ls ../raw_data

Corona_NLP_test.csv  Corona_NLP_train.csv


In [3]:
# Read the training tweets from the raw data directory (path relative to the notebook).
train_csv_path = "../raw_data/Corona_NLP_train.csv"
df_train = pd.read_csv(train_csv_path)

# Cleaning

In [4]:
def lowerize(df, label):
    """Lowercase a text column and flatten its line breaks and tabs.

    Each newline, carriage return and tab character is replaced by one
    space. The column is overwritten on ``df`` itself.

    inputs:
     - df: DataFrame holding the text column
     - label: name of the text column (string values)
    returns:
     - the same DataFrame
    """
    # One translation table handles the three control characters in a single pass.
    control_to_space = str.maketrans({"\n": " ", "\r": " ", "\t": " "})
    lowered = df[label].str.lower()
    df[label] = lowered.apply(lambda text: text.translate(control_to_space))
    return df

def remove_emails(df, label):
    """Blank out e-mail addresses in a text column.

    Every substring matching the address pattern below is replaced by a
    single space. The column is overwritten on ``df`` itself.

    inputs:
     - df: DataFrame holding the text column
     - label: name of the text column (string values)
    returns:
     - the same DataFrame
    """
    # The pattern only lists lowercase letters, so it expects lowercased text.
    email_pattern = re.compile(r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""")
    df[label] = df[label].apply(lambda text: email_pattern.sub(" ", text))
    return df

def remove_mentions(df, label):
    """Blank out Twitter-style mentions in a text column.

    A mention is an ``@`` followed by 1 to 100 letters, digits,
    underscores, dots or hyphens; each one is replaced by a single space.
    The column is overwritten on ``df`` itself.

    inputs:
     - df: DataFrame holding the text column
     - label: name of the text column (string values)
    returns:
     - the same DataFrame
    """
    mention_pattern = re.compile(r"@([a-zA-Z0-9_.-]{1,100})")

    def _strip_mentions(text):
        return mention_pattern.sub(" ", text)

    df[label] = df[label].apply(_strip_mentions)
    return df

def remove_hyperlinks(df, label):
    """Blank out hyperlinks in a text column.

    Anything starting with ``http`` and running up to the next whitespace
    is replaced by a single space. The column is overwritten on ``df``
    itself.

    inputs:
     - df: DataFrame holding the text column
     - label: name of the text column (string values)
    returns:
     - the same DataFrame
    """
    link_pattern = re.compile(r"http\S+")

    def _strip_links(text):
        return link_pattern.sub(" ", text)

    df[label] = df[label].apply(_strip_links)
    return df

def remove_hashtags(df, label):
    """Blank out hashtags in a text column.

    A ``#`` together with the word characters that follow it is replaced
    by a single space, so the tagged word is dropped too. The column is
    overwritten on ``df`` itself.

    inputs:
     - df: DataFrame holding the text column
     - label: name of the text column (string values)
    returns:
     - the same DataFrame
    """
    hashtag_pattern = re.compile(r"#\w+")

    def _strip_hashtags(text):
        return hashtag_pattern.sub(" ", text)

    df[label] = df[label].apply(_strip_hashtags)
    return df

def remove_html_tags(df, label):
    """Blank out HTML tags in a text column.

    Each shortest ``<...>`` span is replaced by a single space; the text
    between an opening and a closing tag is kept. The column is
    overwritten on ``df`` itself.

    inputs:
     - df: DataFrame holding the text column
     - label: name of the text column (string values)
    returns:
     - the same DataFrame
    """
    tag_pattern = re.compile(r"<.*?>")

    def _strip_tags(text):
        return tag_pattern.sub(" ", text)

    df[label] = df[label].apply(_strip_tags)
    return df

def remove_numbers(df, label):
    """Blank out digit runs in a text column.

    Every run of one or more digits is replaced by a single space. The
    column is overwritten on ``df`` itself.

    inputs:
     - df: DataFrame holding the text column
     - label: name of the text column (string values)
    returns:
     - the same DataFrame
    """
    digits_pattern = re.compile(r"\d+")

    def _strip_digits(text):
        return digits_pattern.sub(" ", text)

    df[label] = df[label].apply(_strip_digits)
    return df

def encode_unknown(df, label):
    """Reduce a text column to plain ASCII.

    Each text is NFD-normalized (accented letters split into base letter
    plus combining mark), encoded to ASCII while ignoring whatever cannot
    be encoded, and decoded back to ``str``. The column is overwritten on
    ``df`` itself.

    inputs:
     - df: DataFrame holding the text column
     - label: name of the text column (string values)
    returns:
     - the same DataFrame
    """
    def _to_ascii(text):
        decomposed = unicodedata.normalize("NFD", text)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode("utf-8")

    df[label] = df[label].apply(_to_ascii)
    return df

def clean_punctuation_no_accent(df, label):
    """Blank out punctuation in a text column.

    Every character that is neither a word character nor whitespace is
    replaced by a single space; underscores count as word characters and
    are kept. Meant for text without accents, e.g. English. The column is
    overwritten on ``df`` itself.

    inputs:
     - df: DataFrame holding the text column
     - label: name of the text column (string values)
    returns:
     - the same DataFrame
    """
    non_word_pattern = re.compile(r'[^\w\s]')

    def _strip_punctuation(text):
        return non_word_pattern.sub(' ', text)

    df[label] = df[label].apply(_strip_punctuation)
    return df

def remove_stop_words(text, stopwords=set(stopwords.words('english'))):
    """Drop stop words from a single text.

    The text is split on single spaces, tokens found in ``stopwords`` are
    discarded, and the rest is joined back with single spaces.

    inputs:
     - text: string to filter
     - stopwords: collection of words to drop (defaults to NLTK's English
       list, built once when the function is defined)
    returns:
     - the filtered text
    """
    # The stop word set could be extended with domain terms before filtering, e.g.
    # {"grocery store", "covid", "supermarket", "people", "grocery", "store", "price", "time"}.
    kept_tokens = [token for token in text.split(" ") if token not in stopwords]
    return " ".join(kept_tokens)

def clean_stopwords(df, label):
    """Drop stop words from every text of a column.

    Runs ``remove_stop_words`` with its default stop word list on each
    value. The column is overwritten on ``df`` itself.

    inputs:
     - df: DataFrame holding the text column
     - label: name of the text column (string values)
    returns:
     - the same DataFrame
    """
    filtered = df[label].apply(remove_stop_words)
    df[label] = filtered
    return df

def more_cleaning(df, label):
    """Final tidy-up of a cleaned text column.

    For every text:
     1) one- and two-character words are replaced by a space
     2) runs of two or more spaces/tabs are collapsed into one space
     3) texts left empty or one character long become NaN
    Rows whose text became NaN are then dropped and the index is reset.

    The column is overwritten on the ``df`` passed in; the returned frame
    is a fresh copy without the dropped rows.

    inputs:
     - df: DataFrame holding the text column
     - label: name of the text column (string values)
    returns:
     - a new DataFrame with the empty rows removed
    """
    short_word_pattern = re.compile(r'\b\w{1,2}\b')
    space_run_pattern = re.compile(r"[ \t]{2,}")

    def _tidy(text):
        text = short_word_pattern.sub(" ", text)
        text = space_run_pattern.sub(" ", text)
        # A one-character leftover (typically a lone space) counts as empty.
        if text == '' or len(text) == 1:
            return np.nan
        return text

    df[label] = df[label].apply(_tidy)
    non_empty = df.dropna(subset=[label], axis=0)
    return non_empty.reset_index(drop=True).copy()

def lemmatize_one_text(text):
    """Lemmatize every word of a single text with WordNet.

    The text is split on single spaces. Each word is tried as adjective,
    adverb, noun and verb; when several part-of-speech tags change the
    word, the one latest in that order wins (so verb beats noun, and so
    on). Words that no tag changes are kept unchanged.

    inputs:
     - text: string to lemmatize
    returns:
     - the lemmatized text, words joined by single spaces
    """
    wordnet = WordNetLemmatizer()

    # Priority order: the tag listed last wins when several tags apply.
    pos_tags = ('a', 'r', 'n', 'v')

    lemmas = []
    for token in text.split(" "):
        lemma = token
        # Scan from the highest-priority tag down and stop at the first change.
        for pos in reversed(pos_tags):
            candidate = wordnet.lemmatize(token, pos)
            if candidate != token:
                lemma = candidate
                break
        lemmas.append(lemma)

    return " ".join(lemmas)

def lemmatize(df, label):
    """Lemmatize every text of a column.

    Runs ``lemmatize_one_text`` on each value. The column is overwritten
    on ``df`` itself.

    inputs:
     - df: DataFrame holding the text column
     - label: name of the text column (string values)
    returns:
     - the same DataFrame
    """
    lemmatized = df[label].apply(lemmatize_one_text)
    df[label] = lemmatized
    return df

In [5]:
# Work on a copy of the raw tweets so the original text stays available.
df_train["CleanTweet"] = df_train["OriginalTweet"]

# Cleaning steps, applied in this exact order to the "CleanTweet" column.
cleaning_steps = (
    lowerize,
    remove_emails,
    remove_mentions,
    remove_hyperlinks,
    remove_hashtags,
    remove_html_tags,
    remove_numbers,
    encode_unknown,
    clean_punctuation_no_accent,
    clean_stopwords,
    more_cleaning,
    lemmatize,
)
for cleaning_step in cleaning_steps:
    df_train = cleaning_step(df_train, "CleanTweet")

In [6]:
df_train.sample(3)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,CleanTweet
17469,21296,66248,"Lansing, MI",23-03-2020,"Order on food: \r\r\n\r\r\n""As needed, however...",Positive,order food needed however individual may leave...
14205,18025,62977,,21-03-2020,Can you imagine doing this in todays climate?...,Neutral,imagine today climate
39009,42864,87816,EMEIA,12-04-2020,We're in the 'hair color' phase of panic buyi...,Neutral,hair color phase panic buying first went hand...


# Sentiment column preprocessing

In [7]:
def change_sen(sentiment):
    """Collapse a sentiment label into 'positive', 'negative' or 'neutral'.

    "Extremely Positive" and "Positive" map to 'positive', "Extremely
    Negative" and "Negative" map to 'negative', and anything else
    (including non-string values such as NaN) maps to 'neutral'.

    Matching ignores case and surrounding whitespace, so labels that are
    already collapsed pass through unchanged. This keeps the mapping
    idempotent: re-running the cell that applies it does not turn every
    label into 'neutral'.

    inputs:
     - sentiment: raw or already-collapsed sentiment label
    returns:
     - 'positive', 'negative' or 'neutral'
    """
    if not isinstance(sentiment, str):
        return 'neutral'

    normalized = sentiment.strip().lower()
    if normalized in ("extremely positive", "positive"):
        return 'positive'
    if normalized in ("extremely negative", "negative"):
        return 'negative'
    return 'neutral'

In [8]:
# Collapse the five sentiment levels into positive / neutral / negative.
sentiment_collapsed = df_train["Sentiment"].map(change_sen)
df_train["Sentiment"] = sentiment_collapsed

# Preprocessing

In [9]:
# Learn the bag-of-words vocabulary from the positive tweets only.
is_positive = df_train["Sentiment"] == "positive"
vectorizer_positive = CountVectorizer()
vectorizer_positive.fit(df_train.loc[is_positive, "CleanTweet"])
vectorizer_positive

CountVectorizer()

In [10]:
# Learn the bag-of-words vocabulary from the neutral tweets only.
is_neutral = df_train["Sentiment"] == "neutral"
vectorizer_neutral = CountVectorizer()
vectorizer_neutral.fit(df_train.loc[is_neutral, "CleanTweet"])
vectorizer_neutral

CountVectorizer()

In [11]:
# Learn the bag-of-words vocabulary from the negative tweets only.
is_negative = df_train["Sentiment"] == "negative"
vectorizer_negative = CountVectorizer()
vectorizer_negative.fit(df_train.loc[is_negative, "CleanTweet"])
vectorizer_negative

CountVectorizer()

In [13]:
# Build one document-term matrix per sentiment with that sentiment's own vectorizer.
# The vocabulary size is the number of columns of the sparse matrix; reading it
# from .shape avoids converting the whole matrix to a dense array just to count.
data_vectorized_positive = vectorizer_positive.transform(df_train[df_train["Sentiment"] == "positive"]["CleanTweet"])
print(data_vectorized_positive.shape[1])
data_vectorized_neutral = vectorizer_neutral.transform(df_train[df_train["Sentiment"] == "neutral"]["CleanTweet"])
print(data_vectorized_neutral.shape[1])
data_vectorized_negative = vectorizer_negative.transform(df_train[df_train["Sentiment"] == "negative"]["CleanTweet"])
print(data_vectorized_negative.shape[1])

18858
10530
17581


In [14]:
# Two-topic LDA on the positive tweets; the fixed random_state makes the
# fitted topics (and their order) reproducible across kernel restarts.
lda_model_positive = LatentDirichletAllocation(n_components=2, random_state=42).fit(data_vectorized_positive)

In [15]:
# Two-topic LDA on the neutral tweets; the fixed random_state makes the
# fitted topics (and their order) reproducible across kernel restarts.
lda_model_neutral = LatentDirichletAllocation(n_components=2, random_state=42).fit(data_vectorized_neutral)

In [16]:
# Two-topic LDA on the negative tweets; the fixed random_state makes the
# fitted topics (and their order) reproducible across kernel restarts.
lda_model_negative = LatentDirichletAllocation(n_components=2, random_state=42).fit(data_vectorized_negative)

# Get Topic Function

In [17]:
def print_topics(model, vectorizer, n_top_words=10):
    """Print the highest-weighted words of every topic of a fitted model.

    For each row of ``model.components_`` a "Topic <idx>:" line is printed,
    followed by a list of ``(word, weight)`` tuples sorted by decreasing
    weight.

    inputs:
     - model: fitted topic model exposing ``components_`` (one row per topic)
     - vectorizer: fitted vectorizer whose feature names label the columns
       of ``components_``
     - n_top_words: how many words to print per topic (default 10)
    """
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back to the old name for older versions.
    # The lookup is done once here instead of once per printed word.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so walk it backwards to get the largest weights first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        # Cast to plain str/float so the printed tuples look the same on every numpy version.
        print([(str(feature_names[i]), float(topic[i])) for i in top_indices])

In [18]:
# Top words of the two topics found in the positive tweets.
print_topics(model=lda_model_positive, vectorizer=vectorizer_positive)

Topic 0:
[('store', 3943.802621528201), ('grocery', 3158.314807422102), ('supermarket', 3085.8866654171984), ('people', 2099.529586992987), ('worker', 1665.7969837901258), ('like', 1439.171590656096), ('covid', 1304.8686831002797), ('amp', 1300.1123559260461), ('shopping', 1256.5253306073187), ('get', 1161.1337772308116)]
Topic 1:
[('covid', 3356.1313168996394), ('price', 3117.554737199577), ('consumer', 2311.428356038424), ('food', 1728.1020196157497), ('amp', 1356.8876440738834), ('help', 1111.4249687769986), ('hand', 1105.3576395263813), ('pandemic', 984.9920017503847), ('sanitizer', 973.2630721734497), ('demand', 972.4222005026666)]


In [19]:
# Top words of the two topics found in the neutral tweets.
print_topics(model=lda_model_neutral, vectorizer=vectorizer_neutral)

Topic 0:
[('covid', 1490.4487364796548), ('price', 1305.5495105754226), ('consumer', 1048.2785332285998), ('online', 664.2248034675271), ('shopping', 629.985066016503), ('pandemic', 415.60525174371116), ('food', 353.88362973066603), ('coronavirus', 341.1404325738438), ('amp', 331.1292988706779), ('toilet', 322.7646644921277)]
Topic 1:
[('store', 1487.9476269625882), ('supermarket', 1329.7501345168455), ('grocery', 1188.1814376525217), ('people', 467.243087204377), ('get', 395.0277400937095), ('covid', 300.55126352032414), ('food', 291.11637026931214), ('need', 254.12215823356323), ('time', 249.5531315003127), ('worker', 244.3341702714723)]


In [20]:
# Top words of the two topics found in the negative tweets.
print_topics(model=lda_model_negative, vectorizer=vectorizer_negative)

Topic 0:
[('price', 4303.336242017323), ('covid', 2397.4091800709566), ('consumer', 1658.1439892806056), ('demand', 1184.7285845538202), ('crisis', 1054.3879496766315), ('oil', 995.4921654198821), ('amp', 892.5351563760489), ('pandemic', 830.240891890073), ('due', 626.9841450919477), ('market', 584.1787023808745)]
Topic 1:
[('food', 3329.498560123918), ('supermarket', 2873.7807816967047), ('store', 2669.4946683459966), ('people', 2603.6929040523346), ('grocery', 2176.4932411969867), ('panic', 2064.2479776662212), ('covid', 1814.590819928993), ('buying', 1255.231402359625), ('get', 1171.1369366868016), ('need', 1157.8618067195448)]
