# **Sentiment Analysis for Thai Texts**
The process of sentiment analysis is given as follows:
1. Load the dataset into the program and split it into training and test sets
2. Prepare training set: extract tf-idf and word2vec feature vectors from sentences
3. Prepare test set: extract tf-idf and word2vec feature vectors from sentences
4. Construct classifiers (i.e. SVM and kNN) using training set
5. Evaluate the classifiers using the test set



In [0]:
!pip install pythainlp # install the pythainlp library

In [0]:
# Import python libraries used in this program
import string
import unicodedata
import numpy as np
import pandas as pd

from nltk.probability import FreqDist
from gensim.models import KeyedVectors

from pythainlp.tokenize import word_tokenize
from pythainlp.util import normalize
from pythainlp.tag import pos_tag
from pythainlp.corpus.common import thai_stopwords
from pythainlp import thai_punctuations
import pythainlp.word_vector as thword2vec

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [0]:
# Define preprocessing function
def preprocessing(text):
  """Turn a raw sentence into a list of cleaned tokens.

  The pipeline is: Unicode NFKD normalization, word tokenization with the
  pythainlp "longest" engine (whitespace tokens dropped), per-token
  normalization with ``pythainlp.util.normalize``, stop-word removal, and
  removal of tokens made up solely of punctuation characters.

  Args:
    text: The sentence to process, as a ``str``.

  Returns:
    A list of token strings in their original order, without stop words,
    empty tokens, or punctuation-only tokens.
  """
  # NOTE(review): NFKD also splits Thai SARA AM (U+0E33) into NIKHAHIT +
  # SARA AA, which may affect dictionary-based tokenization - confirm that
  # this normalization form is the intended one.
  text = unicodedata.normalize("NFKD", text)

  # step 1: word tokenization
  tokens = word_tokenize(text, engine="longest", keep_whitespace=False)

  # step 2: word normalization
  normalized_tokens = [normalize(token) for token in tokens]

  # step 3: remove stop words
  stopwords = thai_stopwords()
  content_tokens = [token for token in normalized_tokens if token not in stopwords]

  # step 4: remove punctuation
  # Compare character by character against a set. A substring test against
  # the concatenated punctuation string would keep runs such as "!!" or
  # "..." because they do not occur verbatim in that string.
  punctuation_chars = set(string.punctuation + thai_punctuations)
  # all() is True for an empty token, so empty strings are dropped as well.
  return [
      token for token in content_tokens
      if not all(char in punctuation_chars for char in token)
  ]

In [0]:
_WORD2VEC_MODEL = None  # word-vector model, loaded lazily and shared by all calls


def _get_word2vec_model():
  """Return the pythainlp word-vector model, fetching it on first use only."""
  global _WORD2VEC_MODEL
  if _WORD2VEC_MODEL is None:
    # get_model() used to be called once per sentence; keep a single instance
    # so the (potentially expensive) load is not repeated.
    _WORD2VEC_MODEL = thword2vec.get_model()
  return _WORD2VEC_MODEL


def sentence2vec(text):
  """Represent a sentence as the mean of its tokens' word vectors.

  The sentence is tokenized with ``preprocessing``; tokens that the
  word-vector model does not know are skipped.

  Args:
    text: The raw sentence, as a ``str``.

  Returns:
    A 1-D float array of length ``model.vector_size``. If no token of the
    sentence is in the model's vocabulary, a zero vector is returned instead
    of the all-NaN result that averaging an empty array would produce.
  """
  model = _get_word2vec_model()
  # `token in model` uses the model's own vocabulary lookup rather than a
  # linear scan of the index2word list.
  vectors = [model[token] for token in preprocessing(text) if token in model]
  if not vectors:
    return np.zeros(model.vector_size, dtype=float)
  # Average in float64, matching the dtype of the accumulator arrays used by
  # the callers.
  return np.mean(np.asarray(vectors, dtype=float), axis=0)

In [0]:
# Step 1: read the labelled dataset and hold out a random test subset
data = pd.read_excel('FB180_Social_Dataset_Classification.xlsx')
# 'Text' holds the sentences, 'Sentiment' the class labels.
x, y = data['Text'], data['Sentiment']
# 80/20 train/test split; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=42,
)

In [0]:
# Step 2: prepare training dataset - using tf-idf and word2vec as feature representation
# One averaged word vector per training sentence, stacked row-wise. Building
# the array once avoids re-copying it on every np.append call; the reshape
# keeps the (0, 300) shape when there are no sentences.
x_train_word2vec = np.array(
    [sentence2vec(item) for item in x_train], dtype=float
).reshape(-1, 300)

vectorizer = TfidfVectorizer(tokenizer=preprocessing, ngram_range=(1,1))
# fit_transform learns the vocabulary/idf weights and transforms in one pass,
# so each training sentence is tokenized only once.
# toarray() yields a plain ndarray; todense() yields np.matrix, which recent
# scikit-learn releases no longer accept as estimator input.
x_train_tfidf = vectorizer.fit_transform(x_train).toarray()
vectorizer_model = vectorizer # This fitted model will be applied to test data

In [0]:
# Step 3: prepare test dataset
# One averaged word vector per test sentence, stacked row-wise. Building the
# array once avoids re-copying it on every np.append call; the reshape keeps
# the (0, 300) shape when there are no sentences.
x_test_word2vec = np.array(
    [sentence2vec(item) for item in x_test], dtype=float
).reshape(-1, 300)

# Reuse the vectorizer fitted on the training set so that test features share
# its vocabulary and idf weights.
# toarray() yields a plain ndarray; todense() yields np.matrix, which recent
# scikit-learn releases no longer accept as estimator input.
x_test_tfidf = vectorizer_model.transform(x_test).toarray()

In [0]:
# Step 4: Build a SVM classifier and a kNN classifier
# Candidate values 2^-10, 2^-8, ..., 2^8, 2^10, used for both C and gamma.
power_of_two_grid = [2**exponent for exponent in range(-10, 11, 2)]
tuned_parameters = [
    # RBF kernel: search over gamma and C.
    {'kernel': ['rbf'], 'gamma': list(power_of_two_grid), 'C': list(power_of_two_grid)},
    # Linear kernel: only C applies.
    {'kernel': ['linear'], 'C': list(power_of_two_grid)},
]

# SVM: hyper-parameters selected by 5-fold cross-validated grid search on the
# tf-idf training features.
svc_model = GridSearchCV(estimator=SVC(), param_grid=tuned_parameters, cv=5)
svc_model.fit(x_train_tfidf, y_train)

# kNN: fixed k = 3 on the same tf-idf training features.
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(x_train_tfidf, y_train)

In [0]:
# Step 5: evaluate the classifiers
# Predict the test labels with each classifier from the tf-idf test features.
y_predict_svm = svc_model.predict(x_test_tfidf)
# The kNN predictions must come from knn_model; using svc_model here would
# make the kNN report a copy of the SVM results.
y_predict_knn = knn_model.predict(x_test_tfidf)

# Per-class precision/recall/F1 followed by the confusion matrix.
print("Classification Performance for SVM\n")
print(classification_report(y_test, y_predict_svm))
print(confusion_matrix(y_test, y_predict_svm))

print("\nClassification Performance for kNN\n")
print(classification_report(y_test, y_predict_knn))
print(confusion_matrix(y_test, y_predict_knn))
