In [1]:
import nltk
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer        
from nltk.tokenize import TweetTokenizer
import string
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from functions import process_tweet,build_freqs

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vidit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

X = all_positive_tweets + all_negative_tweets
Y = np.append(np.ones((len(all_positive_tweets), 1)), np.zeros((len(all_negative_tweets), 1)), axis=0)

In [3]:
freqs = build_freqs(X,Y)

print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 13067


In [4]:
def frequency(tweets,labels,freqs):
    word_list = []
    for tweet in tweets:
        cleaned_tweet = process_tweet(tweet)
        for word in cleaned_tweet:
            if word not in word_list:
                word_list.append(word)
                
    pos_freq = []
    neg_freq = []
    for i in range(len(word_list)):
        pos_freq.append(freqs.get((word_list[i],1.0),0) + 1)
        neg_freq.append(freqs.get((word_list[i],0.0),0) + 1)
        
    data = {
        'Word':word_list,
        'Positive Frequency':pos_freq,
        'Negative Frequency':neg_freq
    }
    dataset = pd.DataFrame(data)
    return dataset,word_list

In [5]:
dataset,word_list = frequency(X,Y,freqs)
dataset.head(10)

Unnamed: 0,Word,Positive Frequency,Negative Frequency
0,followfriday,26,1
1,top,33,7
2,engag,8,1
3,member,17,7
4,commun,34,3
5,week,84,57
6,:),3569,3
7,hey,77,27
8,jame,8,5
9,odd,3,4


In [6]:
total_words = len(dataset['Word'])

n_positive = dataset['Positive Frequency'].sum()
n_negative = dataset['Negative Frequency'].sum()

In [7]:
dataset['Positive'] = dataset['Positive Frequency']/(n_positive+total_words)
dataset['Negative'] = dataset['Negative Frequency']/(n_negative+total_words)
pos_freq = dataset['Positive'].tolist()
neg_freq = dataset['Negative'].tolist()

In [8]:
del dataset['Positive Frequency']
del dataset['Negative Frequency']

In [9]:
dataset['Lambda'] = np.log(dataset['Positive']/dataset['Negative'])
del dataset['Positive']
del dataset['Negative']

In [10]:
dataset.head()

Unnamed: 0,Word,Lambda
0,followfriday,3.256136
1,top,1.548637
2,engag,2.077481
3,member,0.885343
4,commun,2.425788


In [11]:
lambda_list = dataset['Lambda'].tolist() 

In [12]:
def predict(tweet,word_list,lambda_list):
    lambda_sum = 0
    cleaned_tweet = process_tweet(tweet)
    for word in cleaned_tweet:
        try:
            index = word_list.index(word)
        except:
            index = -1
        
        if(index!=-1):
            lambda_sum += lambda_list[index]
        else:
            lambda_sum += 0
    return lambda_sum

In [13]:
lambda_sum_list = []

for i in range(len(X)):
    lambda_sum_list.append(predict(X[i],word_list,lambda_list))

In [14]:
Y_pred = []
for value in lambda_sum_list:
    if value>0:
        Y_pred.append(1)
    else:
        Y_pred.append(0)

In [15]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [16]:
confusion_matrix(Y,Y_pred)

array([[4989,   11],
       [  24, 4976]], dtype=int64)

In [17]:
print(str(accuracy_score(Y,Y_pred)*100)+"%")

99.65%


In [18]:
custom_tweet = "I am having a lovely day"
if(predict(custom_tweet,word_list,lambda_list)>0):
    print('Positive Sentiment')
else:
    print('Negative Sentiment')

Positive Sentiment
