# Logistic Regression for Sentiment Analysis on Tweets

In [1]:
import re
import string
import nltk
import numpy as np
nltk.download('twitter_samples')
from nltk.corpus import twitter_samples
from nltk.tokenize import TweetTokenizer
from tqdm import tqdm

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


In [2]:
# T·∫≠p d·ªØ li·ªáu v√≠ d·ª•

train_x = [
           'just plain boring',
           'entirely predictable and lacks energy',
           'no surprises and very few laughs',
           'very powerful',
           'the most fun film of the summer'
]
train_y = [0, 0, 0, 1, 1]

**Download Dataset**

In [3]:
# T·∫£i v·ªÅ t·∫≠p d·ªØ li·ªáu tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# Chia th√†nh 2 t·∫≠p train v√† test
# train: 4000 samples, test: 1000 samples
train_pos = all_positive_tweets[:4000]
test_pos = all_positive_tweets[4000:]

train_neg = all_negative_tweets[:4000]
test_neg = all_negative_tweets[4000:]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

# T·∫°o nh√£n negative: 0, positive: 1
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

**Ti·ªÅn x·ª≠ l√Ω d·ªØ li·ªáu cho t·∫≠p Tweets**

In [4]:
def basic_preprocess(text):
    '''
    Args:
        text: c√¢u ƒë·∫ßu v√†o
    Output:
        text_clean: danh s√°ch c√°c t·ª´ (token) sau khi chuy·ªÉn sang ch·ªØ th∆∞·ªùng v√†
            ƒë∆∞·ª£c ph√¢n t√°ch b·ªüi kho·∫£ng tr·∫Øng
    '''
    # x√≥a b·ªè stock market tickers like $GE
    text = re.sub(r'\$\w*', '', text)

    # x√≥a b·ªè old style retweet text "RT"
    text = re.sub(r'^RT[\s]+', '', text)

    # x√≥a b·ªè hyperlinks
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)

    # x√≥a b·ªè hashtags
    text = re.sub(r'#', '', text)

    # tokenize
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    text_tokens = tokenizer.tokenize(text)

    text_clean = []
    for word in text_tokens:
        if word not in string.punctuation:  # remove punctuation
            text_clean.append(word)

    return text_clean

In [5]:
# Ki·ªÉm tra k·∫øt qu·∫£
example_sentence = "RT @Twitter @chapagain Hello There! Have a great day. #good #morning http://chapagain.com.np"
basic_preprocess(example_sentence)

['hello', 'there', 'have', 'a', 'great', 'day', 'good', 'morning']

**X√¢y d·ª±ng b·ªô t·ª´ ƒëi·ªÉn**

In [6]:
def count_freq_words(corpus, labels):
    """ X√¢y d·ª±ng b·ªô t·ª´ ƒëi·ªÉn t·∫ßn su·∫•t xu·∫•t hi·ªán c·ªßa c√°c t·ª´
    Args:
        corpus: t·∫≠p danh s√°ch c√°c c√¢u
        labels: t·∫≠p nh√£n t∆∞∆°ng ·ª©ng v·ªõi c√°c c√¢u trong corpus (0 ho·∫∑c 1)
    Output:
        model: b·ªô t·ª´ ƒëi·ªÉn √°nh x·∫° m·ªói t·ª´ v√† t·∫ßn su·∫•t xu·∫•t hi·ªán c·ªßa t·ª´ ƒë√≥ trong corpus
            key: (word, label)
            value: frequency
            VD: {('boring', 0): 2} => t·ª´ boring xu·∫•t hi·ªán 2 l·∫ßn trong c√°c sample thu·ªôc class 0
    """
    model = {}
    for label, sentence in zip(labels, corpus):
        for word in basic_preprocess(sentence):
            # ƒê·ªãnh nghƒ©a key c·ªßa model l√† tuple (word, label)
            pair = (word, label)
            # N·∫øu key ƒë√£ t·ªìn t·∫°i trong model th√¨ tƒÉng value l√™n 1
            if pair in model:
                model[pair] += 1
            # N·∫øu key ch∆∞a t·ªìn t·∫°i trong model th√¨ b·ªï sung key v√†o model v·ªõi value =1
            else:
                model[pair] = 1
    return model

In [7]:
#Ki·ªÉm tra k·∫øt qu·∫£
freqs = count_freq_words(train_x, train_y)
freqs

{('followfriday', 1.0): 23,
 ('for', 1.0): 606,
 ('being', 1.0): 49,
 ('top', 1.0): 29,
 ('engaged', 1.0): 7,
 ('members', 1.0): 11,
 ('in', 1.0): 381,
 ('my', 1.0): 441,
 ('community', 1.0): 25,
 ('this', 1.0): 242,
 ('week', 1.0): 61,
 (':)', 1.0): 2847,
 ('hey', 1.0): 60,
 ('james', 1.0): 7,
 ('how', 1.0): 60,
 ('odd', 1.0): 1,
 (':/', 1.0): 5,
 ('please', 1.0): 77,
 ('call', 1.0): 21,
 ('our', 1.0): 111,
 ('contact', 1.0): 4,
 ('centre', 1.0): 1,
 ('on', 1.0): 242,
 ('02392441234', 1.0): 1,
 ('and', 1.0): 553,
 ('we', 1.0): 182,
 ('will', 1.0): 150,
 ('be', 1.0): 198,
 ('able', 1.0): 6,
 ('to', 1.0): 836,
 ('assist', 1.0): 1,
 ('you', 1.0): 1187,
 ('many', 1.0): 28,
 ('thanks', 1.0): 311,
 ('had', 1.0): 35,
 ('a', 1.0): 725,
 ('listen', 1.0): 8,
 ('last', 1.0): 36,
 ('night', 1.0): 50,
 ('as', 1.0): 82,
 ('bleed', 1.0): 2,
 ('is', 1.0): 354,
 ('an', 1.0): 99,
 ('amazing', 1.0): 39,
 ('track', 1.0): 5,
 ('when', 1.0): 69,
 ('are', 1.0): 152,
 ('scotland', 1.0): 2,
 ('congrats', 1.0)

In [8]:
# H√†m l·∫•y ra t·∫ßn su·∫•t xu·∫•t hi·ªán l√† value trong `freq` d·ª±a v√†o key (word, label)
def lookup(freqs, word, label):
    '''
    Args:
        freqs: m·ªôt t·ª´ ƒëi·ªÉn v·ªõi t·∫ßn s·ªë c·ªßa m·ªói c·∫∑p
        word: t·ª´ ƒë·ªÉ tra c·ª©u
        label: nh√£n t∆∞∆°ng ·ª©ng v·ªõi t·ª´
    Output:
        count: s·ªë l·∫ßn t·ª´ c√≥ nh√£n t∆∞∆°ng ·ª©ng xu·∫•t hi·ªán.
    '''
    count = 0

    pair = (word, label)
    if pair in freqs:
        count = freqs[pair]

    return count

In [9]:
#Ki·ªÉm tra k·∫øt qu·∫£
lookup(freqs, "just", 0)

197

**Logistic Regression**

In [10]:
def sigmoid(z): 
    '''
    Args:
        z: is the input (can be a scalar or an array)
    Output:
        h: the sigmoid of z
    '''
    # calculate the sigmoid of z
    h = 1 / (1 + np.exp(-z))

    return h

In [11]:
def gradient_descent(x, y, theta, alpha, num_iters):
    '''
    Args:
        x: matrix of features, c√≥ chi·ªÅu (m,n+1)
        y: label t∆∞∆°ng ·ª©ng (m,1)
        theta: vector tr·ªçng s·ªë (n+1,1)
        alpha: t·ªëc ƒë·ªô h·ªçc
        num_iters: s·ªë v√≤ng l·∫∑p
    Output:
        J: final cost
        theta: vector tr·ªçng s·ªë
    '''
    # l·∫•y m s·ªë l∆∞·ª£ng c√°c sample trong matrix x
    m = len(x)
    
    for i in tqdm(range(num_iters)):
        
        # T√≠nh z, ph√©p dot product: x v√† theta
        z = np.dot(x, theta)
        
        # T√≠nh h: sigmoid c·ªßa z
        y_hat = sigmoid(z)
        
        # T√≠nh cost function
        J = (-1 / m) * (np.dot(y.T, np.log(y_hat)) + np.dot((1 - y).T, np.log(1 - y_hat)))

        # C·∫≠p nh·∫≠t tr·ªçn s·ªë theta
        theta = theta - (alpha / m) * (np.dot(x.T, (y_hat - y)))
        
    return J, theta

In [12]:
# Ki·ªÉm qua k·∫øt qu·∫£
np.random.seed(1)

# X input: 10 x 3, bias l√† 1
tmp_X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)

# Y label: 10 x 1
tmp_Y = (np.random.rand(10, 1) > 0.5).astype(float)

# Apply gradient descent
tmp_J, tmp_theta = gradient_descent(tmp_X, tmp_Y, np.zeros((3, 1)), 1e-8, 100)
print(f"\nCost {tmp_J.item()}")
print(f"Weight {tmp_theta}")

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 13012.05it/s]


Cost 0.6860551249930995
Weight [[8.95483666e-08]
 [7.01794701e-05]
 [4.66109371e-05]]





**Tr√≠ch xu·∫•t c√°c feature**

In [13]:
def extract_features(text, freqs):
    '''
    Args: 
        text: tweet
        freqs: b·ªô t·ª´ ƒëi·ªÉn t·∫ßn su·∫•t xu·∫•t hi·ªán c·ªßa t·ª´ theo label (word, label)
    Output: 
        x: vector feature c√≥ chi·ªÅu (1,3)
    '''
    # ti·ªÅn x·ª≠ l√Ω
    word_l = basic_preprocess(text)
    
    # 3 th√†nh ph·∫ßn: bias, feature 1 v√† feature 2
    x = np.zeros((1, 3)) 
    
    # bias
    x[0,0] = 1 

    for word in word_l:
        x[0,1] += lookup(freqs, word, 1)
        
        x[0,2] += lookup(freqs, word, 0)

    assert(x.shape == (1, 3))
    return x

In [14]:
# Ki·ªÉm tra
freqs = count_freq_words(train_x, train_y)
print(train_x[0])
extract_features(train_x[0], freqs)

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)


array([[1.000e+00, 4.722e+03, 1.612e+03]])

In [15]:
# Ki·ªÉm tra
# VD: c√°c t·ª´ kh√¥ng c√≥ trong b·ªô `freq`
x_test = "vi·ªát nam"
extract_features(x_test, freqs)

array([[1., 0., 0.]])

**Hu·∫•n luy·ªán m√¥ h√¨nh Logistic Regression**

In [16]:
# T·∫°o ma tr·∫≠n X c√≥ k√≠ch th∆∞·ªõc mxn v·ªõi n=3 (s·ªë features)
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :]= extract_features(train_x[i], freqs)

Y = np.expand_dims(train_y, 1)

# Hu·∫•n luy·ªán v·ªõi s·ªë v√≤ng l·∫∑p 1500, t·ªëc ƒë·ªô h·ªçc 1e-6
J, theta = gradient_descent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"Cost {J.item()}.")
print(f"Weight {theta}")

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1500/1500 [00:05<00:00, 254.35it/s]

Cost 0.23335802523493712.
Weight [[ 5.85783372e-08]
 [ 5.70185881e-04]
 [-5.08632054e-04]]





**D·ª± ƒëo√°n**

In [17]:
def predict_tweet(text, freqs, theta):
    '''
    Args: 
        text: tweet
        freqs: b·ªô t·ª´ ƒëi·ªÉn t·∫ßn su·∫•t xu·∫•t hi·ªán c·ªßa t·ª´ theo label (word, label)
        theta: (3,1) vector tr·ªçng s·ªë
    Output: 
        y_pred: x√°c su·∫•t d·ª± ƒëo√°n
    '''
  
    # extract features
    x = extract_features(text, freqs)
    
    # d·ª± ƒëo√°n
    y_pred = sigmoid(np.dot(x, theta))  
    
    return y_pred

In [18]:
tests = ["happy", "sad"]
for t in tests:
    pred = predict_tweet(t, freqs, theta)
    print(f'{t} -> {pred}')

happy -> [[0.51894153]]
sad -> [[0.48785679]]


**ƒê√°nh gi√° ƒë·ªô ch√≠nh x√°c tr√™n t·∫≠p test**

In [19]:
acc = 0
for sentence, label in zip(test_x, test_y):

    # d·ª± ƒëo√°n t·ª´ng c√¢u trong t·∫≠p test
    pred = predict_tweet(sentence, freqs, theta)

    if pred > 0.5:
        pred_l = 1
    else:
        pred_l = 0

    # so s√°nh nh√£n d·ª± ƒëo√°n v·ªõi nh√£n th·ª±c t·∫ø
    if int(pred_l) == int(label):
        acc += 1

print('Accuracy: ', acc/len(test_x))

Accuracy:  0.967
