IMPLEMENTING BAG OF WORDS

In [2]:
# Load the three JSON data files into DataFrames and flatten each one into a
# single text document.

import pandas as pd
import numpy as np

# File names are resolved relative to the working directory; change them to
# your own local paths before running.
review_pos, review_neg, review_tip = (
    pd.read_json(json_file)
    for json_file in ('review_pos.json', 'review_neg.json', 'tip.json')
)

# Join every non-null 'text' entry of a frame with single spaces so that each
# source becomes one long string.
pos, neg, tip = (
    ' '.join(frame['text'].dropna())
    for frame in (review_pos, review_neg, review_tip)
)

In [4]:
# Fit one TF/IDF model over the three concatenated documents.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# One document per source; row k of `x` corresponds to corpus[k]
# (0 = positive, 1 = negative, 2 = tip).
corpus = [pos, neg, tip]
x = vectorizer.fit_transform(corpus)

# NOTE(review): the vectorizer is built with default settings, so no stop-word
# filtering is configured here -- confirm that is intended for the rankings
# computed from `x` further down.

# Display the learned vocabulary as the cell output.
vectorizer.get_feature_names_out()

array(['00', '000', '0000', ..., 'ﾟﾍﾟ', 'ﾟﾟ__ー_ーoー_ー_メ', 'ﾟﾟﾟﾟ〆_メ__メ'],
      dtype=object)

In [None]:
# listing words with highest TF/IDF score

n = 10


def top_tfidf_terms(tfidf_matrix, feature_names, row, n):
    """Pick the ``n`` highest-scoring terms from one row of a TF/IDF matrix.

    Parameters
    ----------
    tfidf_matrix : scipy sparse matrix
        TF/IDF scores; one row per document, one column per term.
    feature_names : numpy.ndarray
        Term belonging to each column of ``tfidf_matrix``.
    row : int
        Index of the document (row) to rank.
    n : int
        Number of terms to keep. Values <= 0 give empty results.

    Returns
    -------
    tuple of numpy.ndarray
        ``(indices, words, scores)``, each ordered from the highest score
        to the lowest.
    """
    # `.toarray()` is the explicit densifying call; it is used instead of the
    # `.A` shorthand, which not every SciPy sparse container provides.
    scores = tfidf_matrix[row, :].toarray().ravel()
    # Sort once and reuse the ordering for both the words and the scores.
    # Reversing before slicing keeps n == 0 from selecting the whole array
    # (which `[-n:]` would do).
    order = np.argsort(scores)[::-1][:max(n, 0)]
    return order, feature_names[order], scores[order]


# Fetch the vocabulary once instead of once per document.
feature_names = vectorizer.get_feature_names_out()

# Rows of `x` follow the order in which the documents were vectorized:
# 0 = positive reviews, 1 = negative reviews, 2 = tips.
i_pos, pos_words, pos_scores = top_tfidf_terms(x, feature_names, 0, n)
i_neg, neg_words, neg_scores = top_tfidf_terms(x, feature_names, 1, n)
i_tip, tip_words, tip_scores = top_tfidf_terms(x, feature_names, 2, n)

for heading, words, scores in (
    ('Positive words with highest TF/IDF score:', pos_words, pos_scores),
    ('\nNegative words with highest TF/IDF score:', neg_words, neg_scores),
    ('\nTip words with highest TF/IDF score:', tip_words, tip_scores),
):
    print(heading)
    # `.tolist()` turns the scores into plain Python floats so the printed
    # dict shows bare numbers rather than NumPy scalar reprs.
    print(dict(zip(words, scores.tolist())))

In [None]:
# generating word clouds w/ n words

n = 50

# Fetch the vocabulary once instead of once per document.
feature_names = vectorizer.get_feature_names_out()

# Rank the terms of each document row of `x`
# (0 = positive reviews, 1 = negative reviews, 2 = tips).
top_terms = []
for row in range(3):
    # `.toarray()` is the explicit densifying call; it is used instead of the
    # `.A` shorthand, which not every SciPy sparse container provides.
    row_scores = x[row, :].toarray().ravel()
    # Sort once, highest score first, and reuse the ordering for both the
    # words and their scores.
    order = np.argsort(row_scores)[::-1][:n]
    top_terms.append((order, feature_names[order], row_scores[order]))

# Unpack into the per-source names used by the plotting cell.
(
    (i_pos, pos_words, pos_scores),
    (i_neg, neg_words, neg_scores),
    (i_tip, tip_words, tip_scores),
) = top_terms

In [None]:
# plotting word clouds

import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud
from os import path
import PIL.ImageOps


def plot_wordcloud(words, scores, title, mask):
    """Draw one word cloud in its own figure and return the WordCloud object.

    Parameters
    ----------
    words : sequence of str
        Terms to draw.
    scores : sequence of float
        Weight of each term, aligned with ``words``; larger weights are
        drawn larger.
    title : str
        Figure title.
    mask : numpy.ndarray
        Mask image passed to ``WordCloud``; it determines the canvas shape.

    Returns
    -------
    wordcloud.WordCloud
        The generated cloud.
    """
    # No width/height are passed: per the wordcloud documentation they are
    # ignored when a mask is supplied, and the mask's shape is used instead.
    cloud = WordCloud(background_color="white", scale=4, mask=mask)
    cloud.generate_from_frequencies(dict(zip(words, scores)))
    plt.figure(figsize=(5, 5))
    plt.imshow(cloud, interpolation="bilinear")
    plt.title(title)
    plt.axis("off")
    plt.show()
    return cloud


# circular mask: 255 outside a circle of radius `mask_radius` centred on a
# square canvas, 0 inside it (WordCloud treats pure-white entries as masked
# out, so words are drawn inside the circle only).
mask_size = 1000
mask_radius = 400
mask_center = mask_size // 2
i, j = np.ogrid[:mask_size, :mask_size]
mask = (i - mask_center) ** 2 + (j - mask_center) ** 2 > mask_radius ** 2
mask = 255 * mask.astype(int)

# One figure per source, in the order positive, negative, tip.
for title, words, scores in (
    ("Positive wordcloud", pos_words, pos_scores),
    ("Negative wordcloud", neg_words, neg_scores),
    ("Tip wordcloud", tip_words, tip_scores),
):
    wordcloud = plot_wordcloud(words, scores, title, mask)