## Sentiment analysis: speed and parallelization

Import Libraries and Json articles

In [1]:
import re
import numpy as np
import pandas as pd
from pandarallel import pandarallel

from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

import datetime
import pytz

pd.set_option('display.max_rows', 10)
pd.set_option('display.max_colwidth', 200)

In [2]:
# !pip install vaderSentiment

In [3]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [4]:
import multiprocessing

num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

Available CPUs: 8


In [5]:
%%time

tweets_df = pd.read_json('https://storage.googleapis.com/msca-bdp-data-open/tweets/tweets_ai_ml.json', orient='records', lines=True)
print(f'Rows: {tweets_df.shape[0]}, Columns: {tweets_df.shape[1]}')

Rows: 99787, Columns: 8
CPU times: user 784 ms, sys: 208 ms, total: 992 ms
Wall time: 1.71 s


In [6]:
tweets_df.head(5)

Unnamed: 0,id,lang,created_at,screen_name,name,location,retweet_count,text
0,1450733866327498752,en,2021-10-20 08:01:21,rahul05ranjan,Rahul,"Chandigarh, India",14.0,RT @its___Denise: https://t.co/u0ULJ5X61F\n\n#WomenWhoCode Lakers #javascript #GirlsWhoCode #Ronaldo Liverpool #PataChanjoKaaChonjo #mydonkey…
1,1450733877501308933,en,2021-10-20 08:01:24,WomenCodersBot,Women Coders Bot,"Hamburg, Germany",14.0,RT @its___Denise: https://t.co/u0ULJ5X61F\n\n#WomenWhoCode Lakers #javascript #GirlsWhoCode #Ronaldo Liverpool #PataChanjoKaaChonjo #mydonkey…
2,1450733902025400331,en,2021-10-20 08:01:30,WomenCodersBot,Women Coders Bot,"Hamburg, Germany",14.0,RT @its___Denise: https://t.co/u0ULJ5X61F\n\n#WomenWhoCode Lakers #javascript #GirlsWhoCode #Ronaldo Liverpool #PataChanjoKaaChonjo #mydonkey…
3,1450733918706184192,en,2021-10-20 08:01:34,appthisway,Appthisway®,United Kingdom,,RT @IainLJBrown: UC adopts recommendations for the responsible use of Artificial Intelligence - Preuss School Ucsd\n\nRead more here: https:/…
4,1450733927929454593,en,2021-10-20 08:01:36,aProgrammerBot,Another Programmer Bot,,14.0,RT @its___Denise: https://t.co/u0ULJ5X61F\n\n#WomenWhoCode Lakers #javascript #GirlsWhoCode #Ronaldo Liverpool #PataChanjoKaaChonjo #mydonkey…


In [7]:
tweets_df['lang'].value_counts()

en    99787
Name: lang, dtype: int64

In [8]:
%%time

# Remove special characters to avoid problems with analysis
tweets_df['text_clean'] = tweets_df['text'].map(lambda x: re.sub('[^a-zA-Z0-9 @ . , : - _]', '', str(x)))

# Remove initial "RT" from the tweets
tweets_df['text_clean'] = tweets_df['text_clean'].apply(lambda x: re.compile('\#').sub('', re.compile('RT @').sub('@', x, count=1).strip()))

CPU times: user 1.16 s, sys: 3.81 ms, total: 1.16 s
Wall time: 1.16 s


In [9]:
tweets_df[['text', 'text_clean']].head(5)

Unnamed: 0,text,text_clean
0,RT @its___Denise: https://t.co/u0ULJ5X61F\n\n#WomenWhoCode Lakers #javascript #GirlsWhoCode #Ronaldo Liverpool #PataChanjoKaaChonjo #mydonkey…,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey
1,RT @its___Denise: https://t.co/u0ULJ5X61F\n\n#WomenWhoCode Lakers #javascript #GirlsWhoCode #Ronaldo Liverpool #PataChanjoKaaChonjo #mydonkey…,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey
2,RT @its___Denise: https://t.co/u0ULJ5X61F\n\n#WomenWhoCode Lakers #javascript #GirlsWhoCode #Ronaldo Liverpool #PataChanjoKaaChonjo #mydonkey…,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey
3,RT @IainLJBrown: UC adopts recommendations for the responsible use of Artificial Intelligence - Preuss School Ucsd\n\nRead more here: https:/…,@IainLJBrown: UC adopts recommendations for the responsible use of Artificial Intelligence Preuss School UcsdRead more here: https:
4,RT @its___Denise: https://t.co/u0ULJ5X61F\n\n#WomenWhoCode Lakers #javascript #GirlsWhoCode #Ronaldo Liverpool #PataChanjoKaaChonjo #mydonkey…,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey


### Vader

#### Timing Vader with native Python Pandas single-threading

In [10]:
%%time

tweets = tweets_df.copy()


analyzer = SentimentIntensityAnalyzer()

sentiment_vader = pd.DataFrame()
sentiment_vader['vader'] = tweets['text_clean'].apply(analyzer.polarity_scores)
sentiment_vader = pd.DataFrame(sentiment_vader['vader'].tolist())

# Create a positive / negative sentiment variable
sentiment_vader['sentiment'] = np.where(sentiment_vader['compound'].astype(float) > 0, 'Positive', 'Negative')
sentiment_vader['sentiment'] = np.where(sentiment_vader['compound'].astype(float) == 0, 'Neutral', sentiment_vader['sentiment'])

# Merge Vader values with Sentiments
tweets = pd.merge(tweets, sentiment_vader, left_index=True, right_index=True)
tweets[['text_clean', 'sentiment', 'compound']].head(5)

CPU times: user 12.3 s, sys: 28.7 ms, total: 12.4 s
Wall time: 12.4 s


Unnamed: 0,text_clean,sentiment,compound
0,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey,Neutral,0.0
1,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey,Neutral,0.0
2,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey,Neutral,0.0
3,@IainLJBrown: UC adopts recommendations for the responsible use of Artificial Intelligence Preuss School UcsdRead more here: https:,Positive,0.7269
4,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey,Neutral,0.0


#### Timing Vader with parallelization

In [11]:
pandarallel.initialize(nb_workers=num_processors-1, use_memory_fs=False)

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [12]:
%%time

tweets = tweets_df.copy()


analyzer = SentimentIntensityAnalyzer()

sentiment_vader = pd.DataFrame()
sentiment_vader['vader'] = tweets['text_clean'].parallel_apply(analyzer.polarity_scores)
sentiment_vader = pd.DataFrame(sentiment_vader['vader'].tolist())

# Create a positive / negative sentiment variable
sentiment_vader['sentiment'] = np.where(sentiment_vader['compound'].astype(float) > 0, 'Positive', 'Negative')
sentiment_vader['sentiment'] = np.where(sentiment_vader['compound'].astype(float) == 0, 'Neutral', sentiment_vader['sentiment'])

# Merge Vader values with Sentiments
tweets = pd.merge(tweets, sentiment_vader, left_index=True, right_index=True)
tweets[['text_clean', 'sentiment', 'compound']].head(5)

CPU times: user 582 ms, sys: 193 ms, total: 775 ms
Wall time: 3.17 s


Unnamed: 0,text_clean,sentiment,compound
0,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey,Neutral,0.0
1,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey,Neutral,0.0
2,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey,Neutral,0.0
3,@IainLJBrown: UC adopts recommendations for the responsible use of Artificial Intelligence Preuss School UcsdRead more here: https:,Positive,0.7269
4,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey,Neutral,0.0


In [13]:
tweets['sentiment'].value_counts()

Neutral     46524
Positive    44009
Negative     9254
Name: sentiment, dtype: int64

### TextBlob PatternAnalyzer

#### Timing TextBlob PatternAnalyzer with native Python Pandas single-threading

In [14]:
%%time

tweets = tweets_df.copy()

tweets['polarity'] = tweets.apply(lambda x: TextBlob(x['text_clean']).sentiment.polarity, axis=1)
tweets['subjectivity'] = tweets.apply(lambda x: TextBlob(x['text_clean']).sentiment.subjectivity, axis=1)

# Create a positive / negative sentiment variable
tweets['sentiment'] = np.where(tweets['polarity'].astype(float) > 0, 'Positive', 'Negative')
tweets['sentiment'] = np.where(tweets['polarity'].astype(float) == 0, 'Neutral', tweets['sentiment'])

tweets[['text_clean', 'sentiment', 'polarity', 'subjectivity']].head(5)

CPU times: user 1min 1s, sys: 79.3 ms, total: 1min 1s
Wall time: 1min 1s


Unnamed: 0,text_clean,sentiment,polarity,subjectivity
0,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey,Neutral,0.0,0.0
1,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey,Neutral,0.0,0.0
2,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey,Neutral,0.0,0.0
3,@IainLJBrown: UC adopts recommendations for the responsible use of Artificial Intelligence Preuss School UcsdRead more here: https:,Positive,0.033333,0.683333
4,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey,Neutral,0.0,0.0


#### Timing TextBlob PatternAnalyzer with parallelization

In [15]:
pandarallel.initialize(nb_workers=num_processors-1, use_memory_fs=False)

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [16]:
%%time

tweets = tweets_df.copy()

tweets['polarity'] = tweets.parallel_apply(lambda x: TextBlob(x['text_clean']).sentiment.polarity, axis=1)
tweets['subjectivity'] = tweets.parallel_apply(lambda x: TextBlob(x['text_clean']).sentiment.subjectivity, axis=1)

# Create a positive / negative sentiment variable
tweets['sentiment'] = np.where(tweets['polarity'].astype(float) > 0, 'Positive', 'Negative')
tweets['sentiment'] = np.where(tweets['polarity'].astype(float) == 0, 'Neutral', tweets['sentiment'])

tweets[['text_clean', 'sentiment', 'polarity', 'subjectivity']].head(5)

CPU times: user 562 ms, sys: 540 ms, total: 1.1 s
Wall time: 13.3 s


Unnamed: 0,text_clean,sentiment,polarity,subjectivity
0,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey,Neutral,0.0,0.0
1,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey,Neutral,0.0,0.0
2,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey,Neutral,0.0,0.0
3,@IainLJBrown: UC adopts recommendations for the responsible use of Artificial Intelligence Preuss School UcsdRead more here: https:,Positive,0.033333,0.683333
4,@its___Denise: https:t.cou0ULJ5X61FWomenWhoCode Lakers javascript GirlsWhoCode Ronaldo Liverpool PataChanjoKaaChonjo mydonkey,Neutral,0.0,0.0


In [17]:
tweets['sentiment'].value_counts()

Neutral     53915
Positive    36746
Negative     9126
Name: sentiment, dtype: int64

### NaiveBayesAnalyzer

#### Timing TextBlob NaiveBayesAnalyzer with native Python Pandas single-threading

In [18]:
%%time

# NaiveBayesAnalyzer is SLOW, taking a small sample of Tweets
tweets = tweets_df.copy()
tweets = tweets.sample(n=100, random_state=12345)

tweets['sentiment_class'] = tweets.apply(lambda x: TextBlob(x['text_clean'], analyzer=NaiveBayesAnalyzer()).sentiment.classification, axis=1)
tweets['sentiment_prob_pos'] = tweets.apply(lambda x: TextBlob(x['text_clean'], analyzer=NaiveBayesAnalyzer()).sentiment.p_pos, axis=1)
tweets['sentiment_prob_neg'] = tweets.apply(lambda x: TextBlob(x['text_clean'], analyzer=NaiveBayesAnalyzer()).sentiment.p_neg, axis=1)
tweets['sentiment_class'] = np.where(tweets['sentiment_prob_pos']==0.5, 'neu', tweets['sentiment_class'])
tweets[['text_clean', 'sentiment_class', 'sentiment_prob_pos', 'sentiment_prob_neg']].head(5)

CPU times: user 24min 36s, sys: 1min, total: 25min 37s
Wall time: 25min 37s


Unnamed: 0,text_clean,sentiment_class,sentiment_prob_pos,sentiment_prob_neg
25006,@hsianghui: 5 python Productivity Tips With Pydash coding https:t.cozbMOvJ7709,pos,0.508044,0.491956
7603,"@essay_help2: Legit, reliable, and quality work guaranteed inFinancepythonPaper pay Case study. pythonhomeworkOnlineclass",pos,0.805737,0.194263
23988,@Vecto_Mobile: Scale and Success with IoT Via @Vecto_Mobile @GersonRolimAI ML DigitalTransformation BigData DataScience Pytho,pos,0.731928,0.268072
77949,Credit Card Fraud Detection ArtificialIntelligence learning machinelearning https:t.coXN0e4HzNS0,pos,0.784486,0.215514
75626,What is machine learning https:t.coVv1838xnEj MachineLearning,pos,0.591783,0.408217


#### Timing TextBlob NaiveBayesAnalyzer with parallelization

In [19]:
pandarallel.initialize(nb_workers=num_processors-1, use_memory_fs=False)

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [20]:
%%time

# NaiveBayesAnalyzer is SLOW, taking a small sample of Tweets
tweets = tweets_df.copy()
tweets = tweets.sample(n=100, random_state=12345)

tweets['sentiment_class'] = tweets.parallel_apply(lambda x: TextBlob(x['text_clean'], analyzer=NaiveBayesAnalyzer()).sentiment.classification, axis=1)
tweets['sentiment_prob_pos'] = tweets.parallel_apply(lambda x: TextBlob(x['text_clean'], analyzer=NaiveBayesAnalyzer()).sentiment.p_pos, axis=1)
tweets['sentiment_prob_neg'] = tweets.parallel_apply(lambda x: TextBlob(x['text_clean'], analyzer=NaiveBayesAnalyzer()).sentiment.p_neg, axis=1)
tweets['sentiment_class'] = np.where(tweets['sentiment_prob_pos']==0.5, 'neu', tweets['sentiment_class'])
tweets[['text_clean', 'sentiment_class', 'sentiment_prob_pos', 'sentiment_prob_neg']].head(5)

CPU times: user 369 ms, sys: 772 ms, total: 1.14 s
Wall time: 5min 11s


Unnamed: 0,text_clean,sentiment_class,sentiment_prob_pos,sentiment_prob_neg
25006,@hsianghui: 5 python Productivity Tips With Pydash coding https:t.cozbMOvJ7709,pos,0.508044,0.491956
7603,"@essay_help2: Legit, reliable, and quality work guaranteed inFinancepythonPaper pay Case study. pythonhomeworkOnlineclass",pos,0.805737,0.194263
23988,@Vecto_Mobile: Scale and Success with IoT Via @Vecto_Mobile @GersonRolimAI ML DigitalTransformation BigData DataScience Pytho,pos,0.731928,0.268072
77949,Credit Card Fraud Detection ArtificialIntelligence learning machinelearning https:t.coXN0e4HzNS0,pos,0.784486,0.215514
75626,What is machine learning https:t.coVv1838xnEj MachineLearning,pos,0.591783,0.408217


In [21]:
tweets['sentiment_class'].value_counts()

pos    69
neg    28
neu     3
Name: sentiment_class, dtype: int64

In [22]:
datetime.datetime.now(pytz.timezone('US/Central')).strftime("%a, %d %B %Y %H:%M:%S")

'Wed, 26 October 2022 10:49:14'