## Sentiment analysis

Natural language processing is evolving constantly in the real world. One of its classic tasks is sentiment analysis, a form of text classification. This task is still very important nowadays...

# 1) Import dependencies

In [1]:
import pyspark

# Configure a local Spark application; 'local[*]' is the master URL for
# running Spark locally with one worker thread per logical core.
conf = pyspark.SparkConf()
conf.setAppName('Minha aplicação')
conf.setMaster('local[*]')

# getOrCreate reuses an already-running context, so re-executing this cell
# does not fail with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(conf=conf)

In [2]:
# Echo the SparkContext so the notebook renders its summary.
sc

# 1.1) Load data

We will use the reviews file to train our model.

In [3]:
def parse_line(line):
    """Parse one quoted CSV record into ``(sentiment, title, body)``.

    The record is expected to look like ``"2","Some title","Some body"``,
    with literal double quotes inside a field escaped by doubling (``""``).
    Parsing goes through the standard ``csv`` reader so that a field which
    itself contains the sequence ``","`` (for example right after an escaped
    quote) is kept whole instead of being cut at that point.

    Args:
        line: A single line of the reviews CSV file, without the newline.

    Returns:
        A tuple ``(sentiment, title, body)`` where ``sentiment`` is an int
        and the other two items are the unescaped field texts.

    Raises:
        IndexError: If the line has fewer than three fields.
        ValueError: If the first field is not an integer.
    """
    # Imported locally so the function does not depend on a module-level
    # import living in another notebook cell.
    import csv

    fields = next(csv.reader([line]))
    sentiment = int(fields[0])
    return (sentiment, fields[1], fields[2])

# Read the raw CSV as lines of text and parse every line into a
# (sentiment, title, body) tuple.
rdd = sc.textFile('train.csv').map(parse_line)

In [4]:
# Count the parsed records (this triggers a full pass over the file).
print('The len of data is: ', rdd.count())

The len of data is:  3600000


In [5]:
# Sanity check: inspect the first parsed record.
rdd.take(1)

[(2,
  'Stuning even for the non-gamer',
  'This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^')]

Yay! We have a dataset with 3600000 labeled instances!

In [6]:
import os

# Cache the parsed RDD on disk: write it the first time, and on later runs
# load the cached copy instead of re-parsing the CSV.
pickle_filename = 'reviews.pickle'
if not os.path.exists(pickle_filename):
    print('Saving as pickle file...')
    rdd.saveAsPickleFile(pickle_filename)
else:
    print('Getting pickle file...')
    # Load from the same name that was checked above, so changing
    # pickle_filename cannot make the check and the load disagree.
    rdd = sc.pickleFile(pickle_filename)

Getting pickle file...


# 2) Let's create our first classifier!

Here we will create a Naive Bayes classifier. So, let's get started!

Remember, every data science project follows these steps:

- i)   Data acquisition and sanity check
- ii)  Explore data
- iii) Pre-processing
- iv)  Create model
- v)   Train model
- vi)  Evaluate model
- vii) Deploy model

In our case, we have just completed i) and ii) (exploration is not required in this text mode), so let's pre-process the data!

## 2.2) Pre-processing

To process all this data, let's build a class and discuss it step by step:

In [7]:
import numpy as np

def train_test_split(rdd, test_size=0.2, random_seed=42):
    """Slice a dataset into a train part and a test part.

    Args:
        rdd: Object exposing ``randomSplit(weights, seed=...)``.
        test_size: Weight given to the test part; the train part gets
            ``1 - test_size``.
        random_seed: Seed forwarded to ``randomSplit``.

    Returns:
        A ``(train, test)`` pair, in that order.
    """
    weights = [1 - test_size, test_size]
    pieces = rdd.randomSplit(weights, seed=random_seed)
    return pieces[0], pieces[1]

In [8]:
# Keep the 0.45-weight share of the full dataset (the 0.55 share is
# discarded), then split what is kept with weights 0.8 train / 0.2 test.
rdd_train_full, _ = train_test_split(rdd, test_size=0.55)
rdd_train, rdd_test = train_test_split(rdd_train_full, test_size=0.2)

In [9]:
# Number of records that ended up in the test split.
rdd_test.count()

324448

In [10]:
class BaseModel:
    """Skeleton of the helpers used by the word-count based classifier.

    Only ``get_words`` is implemented. Every other method is a placeholder
    that raises ``NotImplementedError`` until it is filled in, so that a
    missing piece is reported where it is called instead of surfacing later
    as an unrelated error on a ``None`` value.
    """

    def __init__(self):
        # All of this state starts empty and is filled in during training.
        self.n_total_words = None
        self.rdd_label_1 = None
        self.rdd_label_2 = None
        self.rdd_full_temp = None
        self.rdd_full = None
        self.total_bag_1 = None
        self.total_bag_2 = None
        self.total_bag = None
        self.log_prob_C_1 = None
        self.log_prob_C_2 = None

    def tokenizer(self, X):
        """TODO: tokenize the input ``X``.

        NOTE(review): presumably turns raw text into a list of tokens that
        ``get_log_probs`` accepts - confirm when implementing.
        """
        raise NotImplementedError('BaseModel.tokenizer is not implemented')

    def get_log_probs(self, tokens, label=1):
        """TODO: return the log-probabilities of ``tokens`` under ``label``.

        NOTE(review): presumably an iterable of numbers, one per known
        token - confirm when implementing.
        """
        raise NotImplementedError('BaseModel.get_log_probs is not implemented')

    def get_words(self, rdd):
        """Count the occurrences of every word in the records of ``rdd``.

        Items 1 and 2 of each record are stripped, lower-cased and joined
        with a space; the result is split on whitespace and the words are
        counted.

        Args:
            rdd: RDD whose records carry text at index 1 and index 2.

        Returns:
            An RDD of ``(word, count)`` pairs.
        """
        texts = rdd.map(
            lambda x: x[1].strip().lower() + ' ' + x[2].strip().lower()
        )
        ones = texts.flatMap(lambda x: x.split()).map(lambda x: (x, 1))
        return ones.reduceByKey(lambda x, y: x + y)

    def get_tokens(self, rdd):
        """TODO: extract tokens from ``rdd``."""
        raise NotImplementedError('BaseModel.get_tokens is not implemented')

    def get_count(self, rdd):
        """TODO: compute word counts from ``rdd``."""
        raise NotImplementedError('BaseModel.get_count is not implemented')

    def total_words(self, rdd):
        """TODO: compute the total number of words from an RDD of frequencies."""
        raise NotImplementedError('BaseModel.total_words is not implemented')

    def to_log(self, rdd):
        """TODO: convert an RDD of counts to log values."""
        raise NotImplementedError('BaseModel.to_log is not implemented')

    def get_label(self, rdd, label=1):
        """TODO: select the records of ``rdd`` that belong to ``label``."""
        raise NotImplementedError('BaseModel.get_label is not implemented')

    def full_outer_join(self, rdd_label_2, rdd_label_1):
        """TODO: full outer join of the two per-label frequency RDDs."""
        raise NotImplementedError('BaseModel.full_outer_join is not implemented')

    def get_total_words(self, rdd_label):
        """TODO: compute the total number of words in ``rdd_label``."""
        raise NotImplementedError('BaseModel.get_total_words is not implemented')

    def laplace(self, rdd, alpha=1.0):
        """TODO: apply Laplace smoothing with strength ``alpha`` to ``rdd``."""
        raise NotImplementedError('BaseModel.laplace is not implemented')
    

In [11]:
class NaiveBayesClassifier(BaseModel):
    """Two-class Naive Bayes sentiment classifier built on ``BaseModel``.

    Label 1 is scored with the ``log_prob_negative`` entries of the word
    table and label 2 with the ``log_prob_positive`` entries.
    """

    def __init__(self):
        super().__init__()
        # Notebook-level SparkContext, used to broadcast the trained model.
        self.sc = sc
        self.rdd_full_temp = None
        self.bag = None
        self.broadcast_bag = None

    def train(self, X, y=None):
        """Fit the model on ``X`` and store the result on the instance.

        Args:
            X: RDD of training records; it is handed to the ``BaseModel``
                helpers (``get_count``, ``get_label``, ...).
            y: Unused; kept for interface compatibility.

        Returns:
            None. After the call ``self.bag`` maps each word to a dict with
            the keys ``'log_prob_positive'`` and ``'log_prob_negative'``, and
            ``self.log_prob_C_1`` / ``self.log_prob_C_2`` hold the log10
            class terms.
        """
        rdd = X

        self.n_total_words = self.total_words(self.get_count(rdd))
        print('Get total words...')
        self.rdd_label_1 = self.get_count(self.get_label(rdd, label=1))
        print('Get label 1 words...')
        self.rdd_label_2 = self.get_count(self.get_label(rdd, label=2))
        print('Get label 2 words...')
        self.rdd_full_temp = self.full_outer_join(self.rdd_label_2, self.rdd_label_1)
        print('Get all log probs...')

        self.rdd_full = self.to_log(self.laplace(self.rdd_full_temp))

        self.total_bag_1 = self.get_total_words(self.rdd_label_1)
        print('Get bag 1...')
        self.total_bag_2 = self.get_total_words(self.rdd_label_2)
        print('Get bag 2...')

        self.total_bag = self.get_total_words(self.rdd_full)
        print('Get total bag...')

        self.log_prob_C_1 = np.log10(self.total_bag_1 / self.total_bag)
        self.log_prob_C_2 = np.log10(self.total_bag_2 / self.total_bag)

        print('Creating hashtable...')
        # Each collected row is unpacked as (word, positive, negative).
        self.bag = {
            k: {'log_prob_positive': lp, 'log_prob_negative': ln}
            for k, lp, ln in self.rdd_full.collect()
        }
        print('Done!')

        return None

    def predict(self, X):
        """Classify a single input.

        Args:
            X: Input handed to ``self.tokenizer``.

        Returns:
            A tuple ``(y_hat, probs, entropy)``: the predicted label (1 or 2,
            with ties going to 2), a length-2 array of normalised scores for
            labels 1 and 2, and the entropy (natural log) of that array.
        """
        tokens = self.tokenizer(X)
        log_probs_1 = self.log_prob_C_1 + sum(self.get_log_probs(tokens, label=1))
        log_probs_2 = self.log_prob_C_2 + sum(self.get_log_probs(tokens, label=2))

        y_hat = 1 if log_probs_1 > log_probs_2 else 2

        scores = np.array([log_probs_1, log_probs_2], dtype=float)
        # Shift by the maximum before exponentiating: the normalised result
        # is mathematically unchanged, but long inputs no longer underflow to
        # 0 / 0 = NaN.
        # NOTE(review): train() stores the class terms with np.log10 while
        # np.exp is used here; if the per-word values are base 10 as well,
        # these are not true posteriors - confirm the base used by to_log.
        buffer = np.exp(scores - scores.max())
        probs = buffer / buffer.sum()
        # Skip zero probabilities: 0 * log(0) would evaluate to NaN, while
        # its contribution to the entropy is 0.
        nonzero = probs[probs > 0]
        entropy = -np.sum(nonzero * np.log(nonzero))
        return y_hat, probs, entropy

    def predict_rdd(self, rdd):
        """Classify every record of ``rdd``.

        Args:
            rdd: RDD of records carrying the label at index 0 and text at
                indexes 1 and 2.

        Returns:
            An RDD of ``(text, label, prediction)`` where ``text`` is the two
            text items joined by a space and ``prediction`` is 1 or 2 (ties
            go to 2). Words missing from the trained table are ignored.
        """
        # Broadcast through the context stored on the instance instead of the
        # notebook global. Only these local handles are captured by the
        # functions below, so ``self`` is never shipped to the workers.
        local_broadcast_bag = self.sc.broadcast(self.bag)
        local_broadcast_log_prob_C_1 = self.sc.broadcast(self.log_prob_C_1)
        local_broadcast_log_prob_C_2 = self.sc.broadcast(self.log_prob_C_2)

        def predict_review_(review):
            tokens = review.lower().split()
            bag = local_broadcast_bag.value
            log_prob_C_1 = local_broadcast_log_prob_C_1.value
            log_prob_C_2 = local_broadcast_log_prob_C_2.value
            known = [bag[token] for token in tokens if token in bag]
            log_probs_1 = sum(entry['log_prob_negative'] for entry in known) + log_prob_C_1
            log_probs_2 = sum(entry['log_prob_positive'] for entry in known) + log_prob_C_2
            return log_probs_1, log_probs_2

        def score_record_(record):
            # Build the review text once and reuse it for scoring and output.
            text = record[1] + ' ' + record[2]
            log_probs_1, log_probs_2 = predict_review_(text)
            return (text, record[0], 1 if log_probs_1 > log_probs_2 else 2)

        return rdd.map(score_record_)
    

In [12]:
# Create an untrained classifier.
model = NaiveBayesClassifier()

In [13]:
# Fit the classifier on the training split; progress is printed per stage.
model.train(rdd_train)

Get total words...
Get label 1 words...
Get label 2 words...
Get all log probs...
Get bag 1...
Get bag 2...
Get total bag...
Creating hashtable...
Done!


In [14]:
# Body of the first dataset record, lower-cased, used as a quick manual test.
sample = '''
    This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! 
    I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! 
    It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
'''.lower() # True label = 2 (positive)

In [15]:
# Classify the sample; the result is (label, normalised scores, entropy).
model.predict(sample)

(2, array([4.02157975e-04, 9.99597842e-01]), 0.0035464158168687465)

## Word Cloud

In [16]:
def get_biggest_gradient(rdd, get_postive=True):
    """Rank words by the gap between their two exponentiated scores.

    Args:
        rdd: RDD of rows ``(word, a, b)``.
        get_postive: When True the largest ``exp(a) - exp(b)`` comes first;
            when False the smallest comes first.

    Returns:
        An RDD of ``(word, exp(a) - exp(b))`` pairs in the requested order.
    """
    def _gap(row):
        return (row[0], np.exp(row[1]) - np.exp(row[2]))

    smallest_first = not get_postive
    return rdd.map(_gap).sortBy(lambda pair: pair[1], ascending=smallest_first)

In [17]:
# Take the 100 words at each end of the ranking: largest score gap first
# for the positive list, smallest first for the negative list.
more_positive = get_biggest_gradient(model.rdd_full).take(100)
more_negative = get_biggest_gradient(model.rdd_full, get_postive=False).take(100)

In [18]:
# Display both rankings.
more_positive, more_negative

([('great', 0.04609083261618893),
  ('love', 0.030412711580681845),
  ('best', 0.030378116910397587),
  ('excellent', 0.029755241562465895),
  ('easy', 0.024180575634912264),
  ('wonderful', 0.021751140528508545),
  ('highly', 0.021678981523263384),
  ('perfect', 0.020493107449738836),
  ('favorite', 0.01939627030347198),
  ('awesome', 0.01813581253778119),
  ('and', 0.017785920342953537),
  ('loves', 0.017529420057399943),
  ('amazing', 0.01737859773901529),
  ('well', 0.016853610504812225),
  ('fun', 0.01604903104016305),
  ('beautiful', 0.015247899011071701),
  ('enjoyed', 0.01498688512503896),
  ('loved', 0.014938995709557142),
  ('great!', 0.014261138635231107),
  ('good', 0.014249626752536118),
  ('fantastic', 0.01366770029757496),
  ('works', 0.013426625874115895),
  ('album', 0.013422995585168691),
  ('must', 0.012977586349611034),
  ('classic', 0.01238458955769707),
  ('nice', 0.012141623682665817),
  ('life', 0.012135232271361762),
  ('it!', 0.011789879595124904),
  ('helps',

In [19]:
# !pip install wordcloud

In [20]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Turn the (word, score) lists into dicts for WordCloud.
# Note: despite its name, more_positives_n holds the *negative* ranking.
more_positives_d = dict(more_positive)
more_positives_n = dict(more_negative)

In [None]:
# Crie uma instância do WordCloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(more_positives_d)

# Plote a Word Cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Title: More positive')
plt.show()

In [None]:
# Crie uma instância do WordCloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(more_positives_n)

# Plote a Word Cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Title: More negative')
plt.show()

## Evaluate model

In [24]:
def _confusion_counts(predictions, positive_label=2):
    """Return ``(true_pos, false_pos, false_neg)`` for ``positive_label``."""
    true_pos = false_pos = false_neg = 0
    for row in predictions:
        actual, predicted = row[1], row[2]
        if predicted == positive_label:
            if actual == positive_label:
                true_pos += 1
            else:
                false_pos += 1
        elif actual == positive_label:
            false_neg += 1
    return true_pos, false_pos, false_neg


def _safe_ratio(numerator, denominator):
    """Divide, returning 0.0 instead of failing on a zero denominator."""
    return numerator / denominator if denominator else 0.0


def get_accuracy(predictions):
    """Return the fraction of rows whose prediction equals the true label.

    Args:
        predictions: Iterable of rows with the true label at index 1 and the
            predicted label at index 2.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when there are no rows.
    """
    rows = list(predictions)
    correct = sum(1 for row in rows if row[1] == row[2])
    return _safe_ratio(correct, len(rows))


def get_precision(predicitons, positive_label=2):
    """Return the share of positive predictions that are truly positive.

    Args:
        predicitons: Iterable of rows with the true label at index 1 and the
            predicted label at index 2.
        positive_label: Label treated as the positive class.

    Returns:
        ``TP / (TP + FP)``, or ``0.0`` when nothing was predicted positive.
    """
    true_pos, false_pos, _ = _confusion_counts(predicitons, positive_label)
    return _safe_ratio(true_pos, true_pos + false_pos)


def get_recall(predictions, positive_label=2):
    """Return the share of truly positive rows that were predicted positive.

    Args:
        predictions: Iterable of rows with the true label at index 1 and the
            predicted label at index 2.
        positive_label: Label treated as the positive class.

    Returns:
        ``TP / (TP + FN)``, or ``0.0`` when there are no positive rows.
    """
    true_pos, _, false_neg = _confusion_counts(predictions, positive_label)
    return _safe_ratio(true_pos, true_pos + false_neg)


def evaluate(predictions, positive_label=2):
    """Compute accuracy, precision, recall and F1 in one call.

    Args:
        predictions: Iterable of rows with the true label at index 1 and the
            predicted label at index 2.
        positive_label: Label treated as the positive class.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)`` of floats in
        ``[0, 1]``. Undefined ratios are reported as ``0.0``.
    """
    # Materialise once so a one-shot iterable can be scanned several times.
    rows = list(predictions)
    accuracy = get_accuracy(rows)
    precision = get_precision(rows, positive_label)
    recall = get_recall(rows, positive_label)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)
    return accuracy, precision, recall, f1

In [25]:
# Classify every record of the test split.
rdd_predictions = model.predict_rdd(rdd_test)

In [None]:
# Bring all predictions back to the driver as a Python list.
predictions = rdd_predictions.collect()

In [None]:
# evaluate is expected to return the four metrics in this order.
accuracy, precision, recall, f1 = evaluate(predictions)

In [None]:
# Report each metric as a percentage with two decimal places.
print(f'Accuracy: {accuracy * 100 :.2f} %')
print(f'Precision: {precision * 100 :.2f} %')
print(f'Recall: {recall * 100 :.2f} %')
print(f'F1: {f1 * 100 :.2f} %')