In [6]:
import numpy as np
import pandas as pd
import re
import string
import nltk
nltk.download('stopwords')
nltk.download('twitter_samples')
from nltk.corpus import twitter_samples,stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


In [7]:
def process_tweet(tweet):
    #remove
    stop_words=stopwords.words('english')
    tweet=re.sub(r'\$\w*','',tweet)
    tweet=re.sub(r'^RT[\s]+','',tweet)
    tweet=re.sub(r'https?:\/\/.*[\r\n]*','',tweet)
    tweet=re.sub(r'#','',tweet)
    #stem
    clear_word=[]
    stemmer=PorterStemmer()
    tokenizer=TweetTokenizer(preserve_case=False,strip_handles=True,reduce_len=True)
    tweet_tokens=tokenizer.tokenize(tweet)
    for word in tweet_tokens:
        if word not in stop_words and word not in string.punctuation:
            stem_word=stemmer.stem(word)
            clear_word.append(stem_word)
    return clear_word

In [8]:
def build_freq(tweets,sentiment):
    sen_list=np.squeeze(sentiment).tolist()
    freq={}
    for x,tweet in zip(sentiment,tweets):
        for word in process_tweet(tweet):
            pair=(word,x)
            if pair in freq:
                freq[pair]+=1
            else:
                freq[pair]=1
    return freq


In [9]:
pos_tweet=twitter_samples.strings('positive_tweets.json')
neg_tweet=twitter_samples.strings('negative_tweets.json')
pos_train=pos_tweet[:4000]
neg_train=neg_tweet[:4000]
pos_test=pos_tweet[4000:]
neg_test=neg_tweet[4000:]
train_data=pos_train+neg_train
test_data=pos_test+neg_test

In [10]:
y_pos_train=np.ones(len(pos_train))
y_neg_train=np.zeros(len(neg_train))
y_pos_test=np.ones(len(pos_test))
y_neg_test=np.zeros(len(neg_test))
label=np.append(y_pos_train,y_neg_train)
y_test=np.append(y_pos_test,y_neg_test)

In [11]:
freqs=build_freq(train_data,label)
def extract_features(tweet,freqs):
  words=process_tweet(tweet)
  x=np.zeros((1,3))
  x[0,0]=1
  for word in words:
    x[0,1]=x[0,1]+freqs.get((word, 1.0),0)
    x[0,2]=x[0,2]+freqs.get((word, 0.0),0)
  assert(x.shape==(1,3))
  return x


In [12]:
train_x=np.zeros((len(train_data),3))
for i in range(len(train_data)):
  train_x[i]=extract_features(train_data[i],freqs)
train_x

array([[1.000e+00, 3.020e+03, 6.100e+01],
       [1.000e+00, 3.573e+03, 4.440e+02],
       [1.000e+00, 3.005e+03, 1.150e+02],
       ...,
       [1.000e+00, 1.440e+02, 7.830e+02],
       [1.000e+00, 2.050e+02, 3.890e+03],
       [1.000e+00, 1.890e+02, 3.974e+03]])

In [None]:
train_x.shape,label.shape


((8000, 3), (8000,))

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
model=LogisticRegression()
model.fit(train_x,label)


In [14]:
test_x=np.zeros((len(test_data),3))
for i in range(len(test_data)):
  test_x[i]=extract_features(test_data[i],freqs)
predict=model.predict(test_x)
acc_score=accuracy_score(predict,y_test)
acc_score


0.9915

Check model


In [25]:
from os import preadv
test1='Hope you are safe'
test2='I am boring'
test3='It is raining heavily'
test4='The food smells good'
test5='Sorry I am late'
test6='I hate you'
test7='Tom is accused of cheating'
test8=':(((('

tests=[test1,test2,test3,test4,test5,test6,test7,test8]
test_x=[]
for test in tests:
  test_x.append(extract_features(test,freqs))
test_x=np.array(test_x)
test_x=test_x.reshape(8,3)
pre=model.predict(test_x)
for i in range(len(tests)):
  if pre[i]==1:
    print(f'test{i+1} is positive')
  else:
    print(f'test{i+1} is negative')


test1 is positive
test2 is positive
test3 is positive
test4 is positive
test5 is negative
test6 is positive
test7 is positive
test8 is negative
