In [1]:
import nltk                                  
from nltk.corpus import twitter_samples      
import matplotlib.pyplot as plt              
import numpy as np
import pandas as pd
import random
import re                               
import string                             
from nltk.corpus import stopwords       
from nltk.stem import PorterStemmer        
from nltk.tokenize import TweetTokenizer 
from os import getcwd

In [2]:
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
def process_tweet(tweet):
 
    tweet2 = re.sub(r'^RT[\s]+', '', tweet)
    tweet2 = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet2)
    tweet2 = re.sub(r'#', '', tweet2)
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet2)
    stopwords_english = stopwords.words('english') 
    tweets_clean = []
 
    for word in tweet_tokens: 
        if (word not in stopwords_english and  
            word not in string.punctuation):  
            tweets_clean.append(word)
 
    stemmer = PorterStemmer() 
 
 
    tweets_stem = [] 
 
    for word in tweets_clean:
        stem_word = stemmer.stem(word)  
        tweets_stem.append(stem_word)
 
    return tweets_stem

In [4]:
def build_freqs(tweets, ys):
    yslist = np.squeeze(ys).tolist()
 
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1    
    return freqs

In [5]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')
 
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]
 
train_x = train_pos + train_neg 
test_x = test_pos + test_neg

In [6]:
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)
 
 
freqs = build_freqs(train_x, train_y)

In [7]:
def sigmoid(z): 
 
    
    h = 1 / (1 + np.exp(-z))
    
    
    return h

In [8]:
def gradientDescent(x, y, theta, alpha, num_iters):
 
    m = x.shape[0]
    
    for i in range(0, num_iters):
        
        z = np.dot(x,theta)
        h = sigmoid(z)
        J = -1./m * (np.dot(y.transpose(), np.log(h)) + np.dot((1-y).transpose(),np.log(1-h)))
        theta = theta = theta - (alpha/m) * np.dot(x.transpose(),(h-y))
        
    J = float(J)
    return J, theta

In [9]:
def extract_features(tweet, freqs):
 
    word_l = process_tweet(tweet)
    x = np.zeros((1, 3)) 
    x[0,0] = 1 
    
    for word in word_l:
 
        x[0,1] += freqs.get((word, 1.0),0)
        x[0,2] += freqs.get((word, 0.0),0)
    
    assert(x.shape == (1, 3))
    return x

In [10]:
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :]= extract_features(train_x[i], freqs)
Y = train_y
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)

In [11]:
def predict_tweet(tweet, freqs, theta):
    x = extract_features(tweet,freqs)
    y_pred = sigmoid(np.dot(x,theta))
    
    return y_pred

In [12]:
def test_logistic_regression(test_x, test_y, freqs, theta):
    
    y_hat = []
    
    for tweet in test_x:
        y_pred = predict_tweet(tweet, freqs, theta)
        
        if y_pred > 0.5:
            y_hat.append(1)
        else:
            y_hat.append(0)
 
    accuracy = (y_hat==np.squeeze(test_y)).sum()/len(test_x)
    
    return accuracy

In [13]:
my_tweet = input('comments your thoughts: \n')
print(process_tweet(my_tweet))
y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)
if y_hat > 0.5:
    print('Positive sentiment')
else: 
    print('Negative sentiment')

comments your thoughts: 
it was my worst experience
['worst', 'experi']
[[0.49951328]]
Negative sentiment
