# **Exploratory Data Analysis**

In this notebook, we analyze comments about banking services by visualizing word frequencies as word clouds. Three word clouds are displayed: one each for positive, negative, and neutral comments.

In [1]:
!pip3 install pythainlp # install the pythainlp library



In [2]:
# Import python libraries used in this program
import string
import unicodedata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.probability import FreqDist
from pythainlp.tokenize import word_tokenize
from pythainlp.util import normalize
from pythainlp.tag import pos_tag
from pythainlp.corpus.common import thai_stopwords
from pythainlp import thai_punctuations

ModuleNotFoundError: No module named 'pandas'

In [0]:
# Define preprocessing function
def preprocessing(text):
  """Turn a raw comment into a list of content tokens.

  The text is Unicode-normalized (NFKD), tokenized with the PyThaiNLP
  "longest" engine, each token is normalized with PyThaiNLP ``normalize``,
  and then Thai stop words and punctuation-only tokens are removed.

  Args:
    text: The comment to process, as a ``str``.

  Returns:
    A list of the remaining tokens, in their original order.
  """
  text = unicodedata.normalize("NFKD", text)
  # NFKD decomposes SARA AM (U+0E33) into NIKHAHIT + SARA AA (U+0E4D U+0E32).
  # Recompose it before tokenizing; otherwise words spelled with SARA AM no
  # longer look like their usual spelling to the tokenizer and stop-word list.
  text = text.replace("\u0e4d\u0e32", "\u0e33")
  # step 1: word tokenization
  tokens = word_tokenize(text, engine="longest", keep_whitespace=False)
  # step 2: word normalization
  normalized_tokens = [normalize(tok) for tok in tokens]
  # step 3: remove stop words
  stopwords = thai_stopwords()
  content_tokens = [tok for tok in normalized_tokens if tok not in stopwords]
  # step 4: remove punctuation
  # Compare character by character against a set: a substring test against the
  # concatenated punctuation string would keep runs such as "!!" or "...".
  punctuation_chars = set(string.punctuation) | set(thai_punctuations)
  return [
    tok for tok in content_tokens
    if tok and not all(ch in punctuation_chars for ch in tok)
  ]

In [0]:
def filteringPOS(tokens, poslist):
  """Keep only the tokens whose part-of-speech tag is listed in ``poslist``.

  Tags are obtained from PyThaiNLP ``pos_tag`` called with the perceptron
  engine and the ``orchid`` corpus.

  Args:
    tokens: The word tokens to tag.
    poslist: Collection of part-of-speech tags to keep.

  Returns:
    A list of the tokens whose tag is in ``poslist``, in their original order.
  """
  tagged_tokens = pos_tag(tokens, engine='perceptron', corpus='orchid')
  # Each entry holds the word at index 0 and its tag at index 1.
  return [entry[0] for entry in tagged_tokens if entry[1] in poslist]

In [0]:
# Load the comment dataset from an Excel workbook into a DataFrame.
# You may need to change the file name below; take it from the previous cell.
dataset_file = 'FB180_Social_Dataset_Classification.xlsx'
data = pd.read_excel(dataset_file)
# Preview the first rows to check that the file was read as expected
data.head()

In [0]:
# Explore positive comments
# Step 1: select records of positive comments from data
possitive_comments = data[(data.Sentiment == 'positive')]

# Step 2: collect the words extracted from positive comments
possitive_words = []
for item in possitive_comments.itertuples():
  # Skip rows whose Text is not a string (e.g. an empty Excel cell is read as
  # NaN); preprocessing() can only normalize str values.
  if not isinstance(item.Text, str):
    continue
  token = preprocessing(item.Text)
  # Optional: keep only selected parts of speech (uncomment to enable)
  #token = filteringPOS(token,["ADVN","ADV","ADVI","ADVP","ADVS","ADJ","NONM","VATT","DONM","PART","FIXN","FIXV","EAFF","EITT","AITT","NEG"])
  # extend() appends in place instead of copying the whole list on every row
  possitive_words.extend(token)

# Step 3: count word frequency
fdist = FreqDist(possitive_words)

# Step 4: visualize wordcloud
# NOTE: regexp is only used when WordCloud tokenizes raw text itself; it does
# not filter the precomputed frequencies passed to generate_from_frequencies().
wc = WordCloud(font_path='THSarabunNew.ttf',background_color="white", regexp = r"[ก-๙a-zA-Z']+",)
# generate word cloud
wc.generate_from_frequencies(fdist)
# show
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title("Positive comments")
plt.show()

In [0]:
# Explore negative comments
# Step 1: select records of negative comments from data
negative_comments = data[(data.Sentiment == 'negative')]

# Step 2: collect the words extracted from negative comments
negative_words = []
for item in negative_comments.itertuples():
  # Skip rows whose Text is not a string (e.g. an empty Excel cell is read as
  # NaN); preprocessing() can only normalize str values.
  if not isinstance(item.Text, str):
    continue
  token = preprocessing(item.Text)
  # Optional: keep only selected parts of speech (uncomment to enable)
  #token = filteringPOS(token,["ADVN","ADV","ADVI","ADVP","ADVS","ADJ","NONM","VATT","DONM","PART","FIXN","FIXV","EAFF","EITT","AITT","NEG"])
  # extend() appends in place instead of copying the whole list on every row
  negative_words.extend(token)

# Step 3: count word frequency
fdist = FreqDist(negative_words)

# Step 4: visualize wordcloud
# NOTE: regexp is only used when WordCloud tokenizes raw text itself; it does
# not filter the precomputed frequencies passed to generate_from_frequencies().
wc = WordCloud(font_path='THSarabunNew.ttf',background_color="white", regexp = r"[ก-๙a-zA-Z']+",)
# generate word cloud
wc.generate_from_frequencies(fdist)
# show
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title("Negative comments")
plt.show()

In [0]:
# Explore neutral comments
# Step 1: select records of neutral comments from data
neutral_comments = data[(data.Sentiment == 'neutral')]

# Step 2: collect the words extracted from neutral comments
neutral_words = []
for item in neutral_comments.itertuples():
  # Skip rows whose Text is not a string (e.g. an empty Excel cell is read as
  # NaN); preprocessing() can only normalize str values.
  if not isinstance(item.Text, str):
    continue
  token = preprocessing(item.Text)
  # Optional: keep only selected parts of speech (uncomment to enable)
  #token = filteringPOS(token,["ADVN","ADV","ADVI","ADVP","ADVS","ADJ","NONM","VATT","DONM","PART","FIXN","FIXV","EAFF","EITT","AITT","NEG"])
  # extend() appends in place instead of copying the whole list on every row
  neutral_words.extend(token)

# Step 3: count word frequency
fdist = FreqDist(neutral_words)

# Step 4: visualize wordcloud
# NOTE: regexp is only used when WordCloud tokenizes raw text itself; it does
# not filter the precomputed frequencies passed to generate_from_frequencies().
wc = WordCloud(font_path='THSarabunNew.ttf',background_color="white", regexp = r"[ก-๙a-zA-Z']+",)
# generate word cloud
wc.generate_from_frequencies(fdist)
# show
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title("Neutral comments")
plt.show()