In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Apply the 'ggplot' matplotlib style to every figure drawn in this notebook.
plt.style.use('ggplot')

import nltk

In [2]:
from func.utils import save_processed_csv
from func.utils import read_data

In [3]:
# NLTK resources needed by the tokenizing, POS-tagging, NE-chunking and
# sentiment cells below.
NLTK_PACKAGES = [
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'vader_lexicon',
]

# quiet=True suppresses the per-package download log, which otherwise stores
# the absolute local nltk_data path (including the user name) in the notebook.
# nltk.download returns False on failure, so collect and report those instead
# of silently ignoring them; a failure is not fatal because the package may
# already be installed locally.
failed_packages = [pkg for pkg in NLTK_PACKAGES if not nltk.download(pkg, quiet=True)]
if failed_packages:
    print(f'Could not download NLTK packages (they may already be installed): {failed_packages}')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mikeiasoliveira/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mikeiasoliveira/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/mikeiasoliveira/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/mikeiasoliveira/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mikeiasoliveira/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
# Load the tweets dataframe through the project helper; the cells below read
# its 'textID' and 'text' columns.
df = read_data()

# Exploring NLTK

In [5]:
# Take the first tweet by position (.iloc) instead of by index label 0, so the
# cell does not depend on the dataframe having a default 0..n-1 index; there is
# also no need to build df.head() just to read a single value.
sentence_0 = df['text'].iloc[0]

# Split the sentence into word/punctuation tokens and display them.
tokens_0 = nltk.word_tokenize(sentence_0)
tokens_0

['I', '`', 'd', 'have', 'responded', ',', 'if', 'I', 'were', 'going']

In [6]:
# Part-of-speech tag the tokens of the first tweet.
tagged_0 = nltk.pos_tag(tokens_0)

In [7]:
# Display the resulting (token, POS tag) pairs.
tagged_0

[('I', 'PRP'),
 ('`', '``'),
 ('d', 'NNS'),
 ('have', 'VBP'),
 ('responded', 'VBN'),
 (',', ','),
 ('if', 'IN'),
 ('I', 'PRP'),
 ('were', 'VBD'),
 ('going', 'VBG')]

In [8]:
# Run NLTK's named-entity chunker over the POS-tagged tokens; the result is a
# tree whose root is labelled S.
ents = nltk.ne_chunk(tagged_0)

# Pretty-print the chunk tree, one token/tag pair per line.
ents.pprint()

(S
  I/PRP
  `/``
  d/NNS
  have/VBP
  responded/VBN
  ,/,
  if/IN
  I/PRP
  were/VBD
  going/VBG)


# SentimentIntensityAnalyzer

Creating a first analysis using a relatively simple, rule-based algorithm: NLTK's VADER `SentimentIntensityAnalyzer`.  

In [9]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

In [10]:
# Create the sentiment analyzer once; the same instance is reused for every
# polarity_scores() call in the cells below.
sia = SentimentIntensityAnalyzer()

In [11]:
# Sanity-check the analyzer on one example: the second tweet, selected by
# position (.iloc) so the lookup does not depend on the index labels.
temp_sent = df['text'].iloc[1]
print(f'Sentence: {temp_sent}')
# polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound' keys.
print(f'Polarity: {sia.polarity_scores(temp_sent)}')

# Show the first two rows to compare the scores with the labelled sentiment.
df.head(2)

Sentence:  Sooo SAD I will miss you here in San Diego!!!
Polarity: {'neg': 0.474, 'neu': 0.526, 'pos': 0.0, 'compound': -0.7437}


Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative


In [12]:
# Getting the polarity of each row in the dataframe.
#
# `res` maps textID -> polarity dict ('neg', 'neu', 'pos', 'compound').
# Zipping the two columns avoids the per-row Series that iterrows() builds,
# and the comprehension neither shadows the builtin `id` nor leaks loop
# variables into the notebook namespace.
# str() is kept so that non-string values in 'text' can still be scored.
res = {
    text_id: sia.polarity_scores(str(text))
    for text_id, text in tqdm(zip(df['textID'], df['text']), total=len(df))
}

  0%|          | 0/27481 [00:00<?, ?it/s]

In [13]:
 # Creating a DataFrame with one row per textID and one column per score.
 # from_dict(orient='index') builds the long frame directly, instead of first
 # creating a frame with one column per tweet and transposing it.
 # The index (the dict keys) is then named 'textID' and turned into a column.
df_polatity = (
    pd.DataFrame.from_dict(res, orient='index')
    .rename_axis('textID')
    .reset_index()
)
df_polatity

Unnamed: 0,textID,neg,neu,pos,compound
0,cb774db0d1,0.000,1.000,0.000,0.0000
1,549e992a42,0.474,0.526,0.000,-0.7437
2,088c60f138,0.494,0.506,0.000,-0.5994
3,9642c003ef,0.538,0.462,0.000,-0.3595
4,358bd9e861,0.000,1.000,0.000,0.0000
...,...,...,...,...,...
27476,4eac33d1c0,0.128,0.722,0.150,0.1027
27477,4f4c4fc327,0.000,0.890,0.110,0.3818
27478,f67aae2310,0.000,0.572,0.428,0.9136
27479,ed167662a5,0.000,0.680,0.320,0.3291


In [14]:
# Attach the polarity scores to the tweets. With no explicit key, pandas joins
# on the columns the two frames have in common.
df = pd.merge(df, df_polatity)

# Persist the enriched frame through the project helper.
# NOTE(review): presumably writes a CSV named from these two arguments; confirm in func.utils.
save_processed_csv(df, 'twitter', 'df_sia')

# Tokenizing

# Stop-Words removal

# Stemming

# Lemmatization