**Classificação de Sentimentos:** Nosso modelo tem que ser capaz de identificar se um tweet é POSITIVO, NEGATIVO ou NEUTRO.

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Palavras-chave para cada categoria de sentimento [Positivo, Neutro, Negativo]
positive_words = ['love', 'great', 'fantastic', 'happy', 'amazing', 'good', 'wonderful', 'yes']
negative_words = ['hate', 'terrible', 'bad', 'horrible', 'sad', 'awful', "don't", 'no', 'dont', 'stupid']
neutral_words = ['ok', 'fine', 'average', 'neutral']

def detect_label(text, label_type):
  return any(word in text for word in label_type)

In [3]:
def get_sentiment_label(text):
    text = text.lower()
    if detect_label(text, neutral_words): return 1      # Neutro
    elif detect_label(text, negative_words): return 0   # Negativo
    elif detect_label(text, positive_words): return 2   # Positivo
    return 1                                            # Neutro caso não dê match com as palavras-chave

In [4]:
df = pd.read_csv('./sentiment140_10k.csv')
df.head()

Unnamed: 0,author,text
0,scotthamilton,is upset that he can't update his Facebook by ...
1,mattycus,@Kenichan I dived many times for the ball. Man...
2,ElleCTF,my whole body feels itchy and like its on fire
3,Karoli,"@nationwideclass no, it's not behaving at all...."
4,joy_wolf,@Kwesidei not the whole crew


In [5]:
df.shape

(10000, 2)

In [6]:
# adicionando label de sentimento manualmente
df['label'] = df['text'].apply(get_sentiment_label)
df.tail()

# NEGATIVO  = 0
# NEUTRO    = 1
# POSITIVO  = 2

Unnamed: 0,author,text,label
9995,gia_revenge,stupid dvds stuffing up the good bits in jaws.,0
9996,matmurray,@Dandy_Sephy No. Only close friends and family...,0
9997,lexabuckets,CRAP! After looking when I last tweeted... WHY...,1
9998,AmberKarley,Its Another Rainboot day,0
9999,ARoadRetraveled,I think there's a problem with the ISP in this...,2


In [7]:
# Dividindo em treino e teste
texts = df['text'].values
labels = df['label'].values
x_train, x_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [8]:
MAX_WORDS = 30000

In [9]:
vectorizer = TfidfVectorizer(max_features=MAX_WORDS)

# Ajuste e transformação dos dados de treino
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

In [10]:
# Iniciando a regressão logística
model = LogisticRegression()
model.fit(x_train_tfidf, y_train)

# Fazendo previsões nos dados de teste
y_pred = model.predict(x_test_tfidf)

In [11]:
f"Accuracy score: {accuracy_score(y_test, y_pred):.4f}"

'Accuracy score: 0.8900'

In [12]:
def predict_sentiment(model, text, vectorizer, sentiment_labels):
    processed_text = vectorizer.transform([text])
    prediction = model.predict(processed_text)[0]
    sentiment = sentiment_labels[prediction]
    return sentiment, prediction

In [13]:
# Fazendo a previsões
LABELS = ["Negativa", "Neutra", "Positiva"]

In [23]:
predicted_sentiment = predict_sentiment(
    model,
    'This movie was good',
    vectorizer,
    LABELS
)
predicted_sentiment

('Positiva', 2)

In [15]:
predicted_sentiment = predict_sentiment(
    model,
    'I dont like this movie!',
    vectorizer,
    LABELS
)
predicted_sentiment

('Negativa', 0)

In [16]:
predicted_sentiment = predict_sentiment(
    model,
    'The bull market is over!',
    vectorizer,
    LABELS
)
predicted_sentiment

('Neutra', 1)

In [19]:
import pickle
pickle.dump(model, open('model.pkl','wb'))
pickle.dump(vectorizer, open('model-vectorizer.pkl','wb'))