# LDA - Sentimento em relação ao aquecimento global



## Leitura e análise inicial dos dados

Inicializar um DataFrame a partir do arquivo `../data/twitter/twitter_sentiment_data_cleaned.csv`

In [9]:
# Resposta:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from lda_over_time.lda_over_time import LdaOverTime
from lda_over_time.models.lda_seq_model import LdaSeqModel

import lda_over_time

# Location of the cleaned tweet dataset, relative to this notebook.
tweets_path = "../data/twitter/twitter_sentiment_data_cleaned.csv"

# Load only the timestamp and tweet-text columns.
wanted_columns = ["DATE", "MESSAGE"]
data = pd.read_csv(tweets_path, usecols=wanted_columns, encoding="latin-1")

# Show the loaded DataFrame as the cell output.
data

Unnamed: 0,MESSAGE,DATE
0,@tiniebeany climate change is an interesting h...,2016-10-31 03:13:07.701
1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,2016-10-31 16:15:22.218
2,Fabulous! Leonardo #DiCaprio's film on #climat...,2016-10-31 16:16:07.725
3,RT @Mick_Fanning: Just watched this amazing do...,2016-10-31 16:17:03.392
4,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",2016-10-31 16:19:07.444
...,...,...
43938,"Dear @realDonaldTrump,\r\nYeah right. Human Me...",2016-10-26 15:54:32.840
43939,What will your respective parties do to preven...,2016-10-26 16:33:35.418
43940,RT @MikkiL: UN Poll Shows Climate Change Is th...,2016-10-26 19:15:07.538
43941,RT @taehbeingextra: i still can$q$t believe th...,2016-10-26 21:24:24.018


Utilizar o *spaCy* para filtrar e limpar os textos

In [1]:
# Import the spaCy package itself; `from spacy import spacy` fails because the
# package exposes no attribute or submodule named `spacy`.
import spacy

# Load the small English pipeline; it is used below to obtain each token's
# part-of-speech tag and lemma.
# Requires spaCy to be installed (`pip install spacy`) and the model to be
# downloaded (`python -m spacy download en_core_web_sm`).
nlp = spacy.load("en_core_web_sm")

ModuleNotFoundError: No module named 'spacy'

In [None]:
# Part-of-speech tags whose tokens are kept during cleaning, each mapped to a
# human-readable label. Only the keys are consulted by the cleaning step.
filters = dict(
    ADJ="adjective",
    ADV="adverb",
    NOUN="noun",
    PROPN="proper noun",
    VERB="verb",
)

In [None]:
tweets = data['MESSAGE']

# Clean every tweet: run it through the spaCy pipeline, keep only the tokens
# whose part-of-speech tag is one of the keys of `filters`, and replace each
# kept token by its lemma. The surviving lemmas are joined with single spaces,
# producing one cleaned string per tweet (in the same order as `tweets`).
clean_tweets = [
    ' '.join(
        token.lemma_
        for token in nlp(text)
        if token.pos_ in filters
    )
    for text in tweets
]

In [None]:
# Build the sequential LDA model over the cleaned tweets.
#
# Two fixes relative to the previous version of this cell:
#   * the timestamp column is named "DATE" (see the `usecols` passed to
#     `pd.read_csv`), so `data["Date"]` raised a KeyError;
#   * the timestamps look like "2016-10-31 03:13:07.701", which the old
#     "%Y/%m/%d" format could not parse.
# NOTE(review): the format below assumes every row carries fractional seconds,
# as all rows shown in the DataFrame preview do -- confirm on the full file.
model = LdaSeqModel(
    corpus=clean_tweets,                  # list of texts to be analysed
    dates=data["DATE"].values,            # timestamp of each tweet
    date_format="%Y-%m-%d %H:%M:%S.%f",   # e.g. 2016-10-31 03:13:07.701
    freq="6M",                            # frequency of one semester
    n_topics=5,                           # we want to get 5 topics
    aggregator="average",                 # calculate average of topics
)

In [None]:
# Wrap the configured model in LdaOverTime.
# NOTE(review): presumably this object is the entry point for training and for
# inspecting/plotting topic evolution -- confirm against the lda_over_time docs.
main = LdaOverTime(model)