<a href="https://colab.research.google.com/github/tejas2008/Sentiment-analysis/blob/master/sentiment_logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import re
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(strip_handles=True)
from nltk.stem import PorterStemmer 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples 
from nltk.corpus import stopwords 
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

In [5]:
len(positive_tweets)

5000

In [51]:

test_pos = positive_tweets[4000:]
train_pos = positive_tweets[:4000]
test_neg = negative_tweets[4000:]
train_neg = negative_tweets[:4000]
train_x = train_pos + train_neg 
test_x = test_pos + test_neg

In [7]:
len(train_x)

8000

In [8]:
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [9]:
train_y

array([[1.],
       [1.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]])

In [10]:
stemmer = PorterStemmer()
def preprocess(tweet):
  punc = '''!()-[]{};:'"\, <>./?@#$%^&*'''
  tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
  tweet = re.sub(r'#', '', tweet)
  tokn_word = tknzr.tokenize(tweet)
  stp_words = set(stopwords.words('english')) 
  out = []
  for i in tokn_word:
    if i not in stp_words and i not in punc:
      j = stemmer.stem(i)
      out.append(j)
  return out


In [12]:
for i in range(len(train_x)):
  train_x[i] = preprocess(train_x[i])
print(train_x[0])

['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


In [13]:
fre = {}
def frequency(l):
  freqs = {}
  for i in range(4000):
    for word in l[i]:
      if (word,1) in freqs:
        freqs[(word,1)] += 1
      else:
        freqs[(word,1)] = 1 
  for i in range(4000,8000):
    for word in l[i]:
      if (word,0) in freqs:
        freqs[(word,0)] += 1
      else:
        freqs[(word,0)] = 1 
  return freqs
fre = frequency(train_x)
# print(fre)

In [36]:
fre[('it',0)]

17

In [38]:
def vec_for_model(fre,l):
  x = []
  for i in range(len(l)):
    temp = []
    temp.append(1)
    pos_count = 0
    neg_count = 0
    for word in l[i]:
      pos_count += fre.get((word,1),0)
      neg_count += fre.get((word,0),0)
    temp.append(pos_count/100)
    temp.append(neg_count/100)
    x.append(temp)
  return x
X = np.array(vec_for_model(fre,train_x))
print(X[:10])
print(X.shape)

[[1.000e+00 3.020e+01 6.100e-01]
 [1.000e+00 3.591e+01 4.620e+00]
 [1.000e+00 3.113e+01 2.190e+00]
 [1.000e+00 2.862e+01 4.000e-02]
 [1.000e+00 3.116e+01 2.240e+00]
 [1.000e+00 2.986e+01 1.570e+00]
 [1.000e+00 4.064e+01 6.120e+00]
 [1.000e+00 3.205e+01 3.290e+00]
 [1.000e+00 6.200e+00 1.830e+00]
 [1.000e+00 2.670e+00 1.170e+00]]
(8000, 3)


In [39]:
print(train_x[0])

['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


In [43]:

def gradient(x,y,no_itr):
  eta = 0.001
  wt = np.zeros((3,1))
  # wt = np.random.rand(3,1)
  for i in range(no_itr):
    z = np.dot(x,wt)
    h = 1.0/(1 + np.exp(-1.0*z))
    j=(-1/len(x))*(np.dot(y.T,np.log(h))+np.dot((1-𝐲).T,np.log(1-𝐡)))
    wt=wt-(eta/len(x))*np.dot(𝐱.T,(𝐡-𝐲))
    # print('z: ',z)
    # print('h: ',h)
    # print('j: ',j)
    # print('wt: ',wt)
  return j,wt

cost,weights = gradient(X,train_y,10)
print(cost)
print(weights)


[[0.30470152]]
[[ 0.00051714]
 [ 0.04217543]
 [-0.045004  ]]


In [54]:
def pre_sentiment(tweet,weights,fre):
  l = preprocess(tweet)
  l1 = []
  l1.append(l)
  x = np.array(vec_for_model(fre,l1),dtype=np.float128)
  z = np.dot(x,weights)
  # print(x)
  # print(z)
  y_pr = 1/(1+np.exp(-z))
  # print(y_pr)
  if y_pr >= 0.5 :
    return 'Positive'
  else:
    return 'Negative'

pre_sentiment('bad tweet',weights,fre)


'Negative'

In [55]:
pre_sentiment('bad movie',weights,fre)

'Negative'

In [57]:
predictions = []
for i in range(len(test_x)):
  p = pre_sentiment(test_x[i],weights,fre)
  if p == 'Negative' :
    predictions.append(0.from_bytes)
  else:
    predictions.append(1.)
print(predictions[:10])
print(test_y[:10])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[[1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]]


In [65]:
c = 0
for i in range(len(predictions)):
  if predictions[i] == test_y[i][0]:
    c += 1
print('Accuracy on testing data: ',(c/len(predictions)*100))


Accuracy on testing data:  97.39999999999999
