In [137]:
import pandas as pd
# Lift the row-display cap so DataFrames are rendered without truncation.
pd.set_option('display.max_rows', None)
from textblob import TextBlob
import cufflinks as cf
# cufflinks configuration for the .iplot() histogram drawn further down.
# NOTE(review): go_offline() followed by set_config_file(offline=False, ...)
# looks contradictory — confirm which plotting mode is actually intended.
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
from nltk import FreqDist
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import unidecode

In [138]:
# Load the pre-processed corpus and discard the CSV's 'index' column in one
# chained expression (no in-place mutation of the freshly read frame).
df = pd.read_csv('../pre-processed.csv').drop(columns=['index'])

# Preview five random rows. Sampling five directly replaces the redundant
# sample(20).head(), and the fixed seed keeps the preview stable across reruns.
df.sample(5, random_state=42)

Unnamed: 0,label,preprocessed_news
5542,true,presidente afastada dilma rousseff tentou cond...
6978,true,cuba comemorara primeiro aniversario morte emb...
3376,fake,passagem bahamas furacao irma tao poderoso sug...
6194,true,moro ouve youssef cervero baiano acao contra l...
2159,fake,ator petista podera interpretar tucano aecio n...


In [139]:
def remove_accents(input_str):
    """Return ``input_str`` passed through ``unidecode.unidecode``.

    Used here to strip accents so that stopwords can be compared against the
    already accent-free news text.
    """
    return unidecode.unidecode(input_str)

# Portuguese stopwords, de-duplicated first and then stripped of accents.
stops = [remove_accents(word) for word in set(stopwords.words('portuguese'))]

def remove_stopwords(text, stop_words=None):
    """Drop stopwords from a whitespace-separated string.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace.
    stop_words : iterable of str, optional
        Words to remove. When omitted, the module-level ``stops`` list is used.

    Returns
    -------
    str
        The remaining words, in their original order, joined by single spaces.
    """
    if stop_words is None:
        stop_words = stops
    # Hash-based membership test: checking against the list directly scans
    # the whole stopword list once for every token in the text.
    blocked = frozenset(stop_words)
    return ' '.join(word for word in text.split() if word not in blocked)

# Strip stopwords from every article, overwriting the column in place.
df['preprocessed_news'] = df['preprocessed_news'].map(remove_stopwords)

In [140]:
def _polarity_of(text):
    """Return the TextBlob sentiment polarity score for ``text``."""
    return TextBlob(text).sentiment.polarity


# NOTE(review): TextBlob's default sentiment analyzer appears to rely on an
# English lexicon, while this corpus is Portuguese — scores may cluster at 0.
# Confirm the analyzer is appropriate before interpreting the distribution.
df['polarity'] = df['preprocessed_news'].map(_polarity_of)

In [None]:
# Interactive 50-bin histogram of the per-article polarity scores.
df['polarity'].iplot(
    kind='hist',
    bins=50,
    title='Sentiment Polarity Distribution',
    xTitle='polarity',
    yTitle='count',
    linecolor='black',
)

In [None]:
# Gather every token from the articles labelled 'fake'.
words = df.loc[df['label'] == 'fake', 'preprocessed_news']
allwords = [token for article in words for token in article.split()]

# (word, count) pairs for the 100 most frequent tokens.
mostcommon = FreqDist(allwords).most_common(100)

# Build the cloud straight from the counts. Passing str(mostcommon) to
# generate() would make WordCloud re-tokenise the list's repr, in which every
# word occurs exactly once, so the real frequencies would never affect sizes.
wordcloud = WordCloud(
    width=1600, height=800, background_color='white'
).generate_from_frequencies(dict(mostcommon))

fig = plt.figure(figsize=(30,10), facecolor='white')
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('Top 100 Words in Fake News', fontsize=100)
plt.tight_layout(pad=0)
plt.show()


In [None]:
# Gather every token from the articles labelled 'true'.
words_true = df.loc[df['label'] == 'true', 'preprocessed_news']
allwords_true = [token for article in words_true for token in article.split()]

# (word, count) pairs for the 100 most frequent tokens.
mostcommon_true = FreqDist(allwords_true).most_common(100)

# Build the cloud straight from the counts. Passing str(mostcommon_true) to
# generate() would make WordCloud re-tokenise the list's repr, in which every
# word occurs exactly once, so the real frequencies would never affect sizes.
wordcloud_true = WordCloud(
    width=1600, height=800, background_color='white'
).generate_from_frequencies(dict(mostcommon_true))

fig = plt.figure(figsize=(30,10), facecolor='white')
plt.imshow(wordcloud_true, interpolation="bilinear")
plt.axis('off')
plt.title('Top 100 Words in True News', fontsize=100)
plt.tight_layout(pad=0)
plt.show()
