# ML Based Sentiment Analysis

## Imports

In [2]:
import nltk


# file paths

In [3]:
pos_reviews_path = "rt-polaritydata/rt-polaritydata/rt-polarity.pos"
neg_reviews_path = "rt-polaritydata/rt-polaritydata/rt-polarity.neg"

### Constants

In [6]:
split_index = 2500

## Opening and reading lines

In [4]:
with open(pos_reviews_path, 'r', encoding='latin-1') as pos_file :
    pos_reviews = pos_file.readlines()

In [5]:
with open(neg_reviews_path, 'r', encoding='latin-1') as neg_file :
    neg_reviews = neg_file.readlines()

### Using Naive Bayes from NLTK 

In [8]:
test_negative_reviews = neg_reviews[split_index+1:]
test_positive_reviews = pos_reviews[split_index+1:]
training_negative_reviews = neg_reviews[:split_index]
training_positive_reviews = pos_reviews[:split_index]

In [9]:
def get_vocabulary() :
    pos_word_list = [word for line in training_positive_reviews for word in line.split()]
    neg_word_list = [word for line in training_negative_reviews for word in line.split()]
    all_word_list = [item for sublist in [pos_word_list, neg_word_list] for item in sublist]
    all_word_set = list(set(all_word_list))
    vocabulary = all_word_set
    return vocabulary
    

In [10]:
vocabulary = get_vocabulary()

In [13]:
len(vocabulary)

14094

In [18]:
def get_training_data():
    neg_train_tagged_list = [{'review': one_review.split(), 'label' : 'negative'} for one_review in training_negative_reviews]
    pos_train_tagged_list = [{'review': one_review.split(), 'label' : 'positive'} for one_review in training_positive_reviews]
    full_tagged_list = [item for sublist in [neg_train_tagged_list, pos_train_tagged_list] for item in sublist]
    training_data = [(review['review'],review['label']) for review in full_tagged_list]
    return training_data
    

In [19]:
training_data = get_training_data()

In [20]:
training_data[0]

(['simplistic', ',', 'silly', 'and', 'tedious', '.'], 'negative')

In [21]:
len(training_data)

5000

### Feature Extraction 

In [25]:
def extract_features(review):
    review_words = set(review)
    features = {}
    for word in vocabulary :
        features[word] = (word in review_words)
    return features
        

### Using the Naive Bayes Classifier

In [26]:
def get_trained_naive_bayes_classifier(extract_features, training_data):
    training_features = nltk.classify.apply_features(extract_features, training_data)
    trained_nb_classifier = nltk.NaiveBayesClassifier.train(training_features)
    return trained_nb_classifier


In [27]:
trained_nb_classifier = get_trained_naive_bayes_classifier(extract_features, training_data)


In [28]:
def naive_bayes_sentiment_calculator(review):
    test_instance = review.split()
    test_features = extract_features(test_instance)
    return trained_nb_classifier.classify(test_features)

## Testing the Naive Bayes Sentiment Calculator

In [30]:
naive_bayes_sentiment_calculator("What an amazing movie")

'positive'

In [33]:
naive_bayes_sentiment_calculator("I hated that movie ")

'negative'

In [34]:
def get_review_sentiment(senti_calculator):
    neg_reviews_result = [senti_calculator(review) for review in test_negative_reviews]
    pos_reviews_result = [senti_calculator(review) for review in test_positive_reviews]
    label_to_num = {'positive' : 1, 'negative' : -1}
    neg_results_score = [label_to_num[x] for x in neg_reviews_result]
    pos_results_score = [label_to_num[x] for x in pos_reviews_result]
    return {'results-on-positive' : pos_results_score, 'results-on-negative' : neg_results_score}
    

In [37]:
naive_bayes_results = get_review_sentiment(naive_bayes_sentiment_calculator)

## Checking the accuracy of Naive Bayes

In [44]:
sum(naive_bayes_results['results-on-positive'])


1324

In [45]:
sum(naive_bayes_results['results-on-negative'])


-1530

In [47]:
pos_reviews_results = naive_bayes_results['results-on-positive']

pct_true_positives = sum(x>0 for x in pos_reviews_results)/len(pos_reviews_results)

In [48]:
pct_true_positives

0.7339222614840989

In [51]:
neg_reviews_results = naive_bayes_results['results-on-negative']

pct_true_negatives = sum(x>0 for x in neg_reviews_results)/len(neg_reviews_results)

In [52]:
pct_true_negatives

0.22968197879858657