In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the model archives are read from this mount below.
drive.mount('/content/drive')

# Load pre-trained model

In [2]:
import gzip
import shutil

# Gzipped model on the mounted Drive, and the path under /content it is unpacked to.
compressed_file = "/content/drive/MyDrive/FundFlow/cc.th.300.bin.gz"
extracted_file = "/content/cc.th.300.bin"

# Copy the decompressed stream to disk in chunks rather than reading it all into memory.
with gzip.open(compressed_file, 'rb') as gz_stream, open(extracted_file, 'wb') as raw_out:
    shutil.copyfileobj(gz_stream, raw_out)

print("Extraction completed!")

Extraction completed!


In [None]:
!pip install fasttext

Load the pre-trained FastText model `cc.th.300.bin`, which was pre-trained on 2 million Thai words.

In [4]:
import fasttext

# Load the extracted binary FastText model; it serves as the "Pre-trained Model" baseline in the evaluation.
pre_trained_model = fasttext.load_model('/content/cc.th.300.bin')

# Load dataset

Load the dataset, which was collected using web crawling and web scraping techniques and has already been cleaned and preprocessed.

In [7]:
import pandas as pd

# Read the corpus CSV and print its schema (columns, non-null counts, dtypes).
dataset = pd.read_csv('/content/thai_texts.csv')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48918 entries, 0 to 48917
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   texts   48918 non-null  object
dtypes: object(1)
memory usage: 382.3+ KB


In [8]:
import ast


def _tokens_to_sentence(serialized_tokens):
    """Turn a stringified token list (e.g. "['a', 'b']") into one space-separated line.

    ast.literal_eval only parses Python literals, so unlike eval() a malformed
    or malicious CSV cell cannot execute code. Whitespace is collapsed so that
    every sample occupies exactly one line of the training corpus.
    """
    tokens = ast.literal_eval(serialized_tokens)
    return ' '.join(' '.join(tokens).split())


dataset['formatted_text'] = dataset['texts'].apply(_tokens_to_sentence)

output_path = '/content/formatted_sentences.txt'
# Write raw lines instead of Series.to_csv: the CSV writer wraps any sentence
# containing a comma or a double quote in quotes, which would leak quote
# characters into the plain-text corpus that fastText tokenises.
with open(output_path, 'w', encoding='utf-8') as corpus_file:
    corpus_file.writelines(f"{sentence}\n" for sentence in dataset['formatted_text'])

print(f"Formatted dataset saved to {output_path}")

Formatted dataset saved to /content/formatted_sentences.txt


# Fine-tune model

Load pre-trained vectors `cc.th.300.vec` from FastText

In [9]:
# Gzipped text-format vectors on the mounted Drive, and the path they are unpacked to.
compressed_file = "/content/drive/MyDrive/FundFlow/cc.th.300.vec.gz"
extracted_file = "/content/cc.th.300.vec"

# Decompress chunk by chunk straight into the destination file.
with gzip.open(compressed_file, 'rb') as archive, open(extracted_file, 'wb') as destination:
    shutil.copyfileobj(archive, destination)

print("Extraction completed!")

Extraction completed!


Fine-tune the pre-trained vectors

In [10]:
# Hyper-parameters for the skip-gram run on the formatted corpus.
# dim=300 matches the dimensionality implied by the cc.th.300.vec file name.
fine_tune_options = dict(
    model='skipgram',
    dim=300,
    lr=0.05,
    epoch=200,
    verbose=1,
)

# Training starts from the pre-trained vectors instead of a random initialisation.
fine_tuned_model = fasttext.train_unsupervised(
    input='/content/formatted_sentences.txt',
    pretrainedVectors='/content/cc.th.300.vec',
    **fine_tune_options
)

Save the model

In [11]:
# Write the fine-tuned model under /content; model_path is reused by the download cell.
model_path = '/content/fine_tuned_model.bin'
fine_tuned_model.save_model(model_path)
print(f"Fine-tuned model saved to {model_path}")

Fine-tuned model saved to /content/fine_tuned_model.bin


In [12]:
from google.colab import files

# Hand the saved model file to the Colab download helper.
files.download(model_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Evaluate the model

## Load validation data

- **TH-SimLex-999**: The dataset contains 666 noun, 222 verb and 111 adjective pairs. The word pairs were rated by 16 native Thai speakers.

- **TH-WordSim-353**: The dataset contains 353 word pairs and measures semantic relatedness on a scale from 0 to 10. The word pairs were rated by 13 annotators.

In [17]:
# Read both human-rated benchmarks; the WordSim workbook is read from its 'combined' sheet.
simlex = pd.read_excel('/content/th-simlex-999-details.xlsx')
wordsim = pd.read_excel('/content/th-wordsim-353.details.xlsx', sheet_name='combined')

In [18]:
# Drop everything after the first 999 rows, divide the mean rating by 6
# (presumably the top of the rating scale: the identical pair in the output
# below comes out as 1.0), and keep only the word pair plus the rescaled score.
simlex = (
    simlex
    .drop(simlex.index[999:])
    .assign(average=lambda frame: frame['average'] / 6)
    .loc[:, ['Thaiword1', 'Thaiword2', 'average']]
)

In [19]:
# Preview the cleaned SimLex frame.
simlex

Unnamed: 0,Thaiword1,Thaiword2,average
0,เก่า,ใหม่,0.218750
1,หลักแหลม,ฉลาด,0.843750
2,ยาก,ยาก,1.000000
3,สุข,ร่าเริง,0.666667
4,ยาก,ง่าย,0.229167
...,...,...,...
994,เข้าร่วม,ได้มา,0.135417
995,ส่ง,เข้าประชุม,0.010417
996,รวบรวม,เข้าประชุม,0.041667
997,ซึมซับ,ถอน,0.000000


In [20]:
# The first data row holds the real column names: promote it to the header
# and remove it from the data.
header_row = wordsim.iloc[0]
wordsim = wordsim.rename(columns=header_row).drop(wordsim.index[0])

# Discard the last five rows (presumably footer/summary rows -- confirm against the sheet).
wordsim = wordsim.drop(wordsim.index[-5:])

# Keep the word pair and the mean rating, divided by 10 to bring the 0-10 scale to 0-1.
# .assign() builds a new frame, so the rescaling does not write into a column
# slice (the pattern that triggers pandas' SettingWithCopyWarning).
wordsim = (
    wordsim[['Thai-word-1-1', 'Thai-word-2-1', 'Mean Similarity Rating']]
    .assign(**{'Mean Similarity Rating': lambda frame: frame['Mean Similarity Rating'] / 10})
)

In [21]:
# Preview the cleaned WordSim frame.
wordsim

Unnamed: 0,Thai-word-1-1,Thai-word-2-1,Mean Similarity Rating
1,ความรัก,เพศ,0.677
2,เสือ,แมวป่า,0.735
3,เสือ,เสือ,1.0
4,หนังสือ,หนังสือพิมพ์,0.746
5,คอมพิวเตอร์,แป้นพิมพ์,0.762
...,...,...,...
349,ฝนตกปรอยๆ,น้ำท่วม,0.603
350,สภาพอากาศ,พยากรณ์,0.834
351,ความหายนะ,พื้นที่,0.625
352,ผู้ว่าการรัฐ,สำนักงาน,0.634


In [22]:
# Give WordSim the same column labels as SimLex so the two frames stack cleanly.
wordsim = wordsim.set_axis(simlex.columns, axis=1)

# Stack both benchmarks into one evaluation set with a fresh 0..n-1 index.
benchmark_frames = [simlex, wordsim]
similarity_dataset = pd.concat(benchmark_frames, ignore_index=True)

similarity_dataset

Unnamed: 0,Thaiword1,Thaiword2,average
0,เก่า,ใหม่,0.21875
1,หลักแหลม,ฉลาด,0.84375
2,ยาก,ยาก,1.0
3,สุข,ร่าเริง,0.666667
4,ยาก,ง่าย,0.229167
...,...,...,...
1347,ฝนตกปรอยๆ,น้ำท่วม,0.603
1348,สภาพอากาศ,พยากรณ์,0.834
1349,ความหายนะ,พื้นที่,0.625
1350,ผู้ว่าการรัฐ,สำนักงาน,0.634


## Spearman Correlation

**Spearman’s rank correlation** is a nonparametric measure of statistical dependence between the rankings of two variables. It assesses how well the relationship between two variables can be described using a monotonic function. Spearman’s rank correlation coefficient works by ranking the observations in a dataset and calculating the correlation between the ranks rather than the raw values themselves.

In [23]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr
import numpy as np

def get_cosine_similarity(word1, word2, model):
    """Return the cosine similarity between the embeddings of two words.

    Parameters
    ----------
    word1, word2 : str
        Words to compare.
    model : object
        Anything exposing ``get_word_vector(word)`` that returns a 1-D
        numeric vector (e.g. a loaded fastText model).

    Returns
    -------
    float or None
        Cosine similarity in [-1, 1], or ``None`` when it cannot be computed:
        a word is not a string (e.g. NaN from an empty spreadsheet cell), the
        model raises ``KeyError`` for a word, or either vector has zero norm.
    """
    # Non-string inputs cannot be embedded; report "not computable" instead
    # of letting the model raise and abort the caller's evaluation loop.
    if not isinstance(word1, str) or not isinstance(word2, str):
        return None

    try:
        vec1 = np.asarray(model.get_word_vector(word1), dtype=np.float64).ravel()
        vec2 = np.asarray(model.get_word_vector(word2), dtype=np.float64).ravel()
    except KeyError:
        return None

    # The cosine is undefined for a zero vector; returning None keeps such
    # pairs out of the correlation instead of scoring them as 0.0.
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        return None

    return float(np.dot(vec1, vec2) / norm_product)

In [24]:
# Both models are scored against the same human-rated word pairs.
models = {
    "Pre-trained Model": pre_trained_model,
    "Fine-tuned Model": fine_tuned_model
}

for model_name, model in models.items():
    model_scores, human_scores = [], []

    # Walk the three columns in lock-step instead of materialising a Series per row.
    rated_pairs = zip(
        similarity_dataset['Thaiword1'],
        similarity_dataset['Thaiword2'],
        similarity_dataset['average'],
    )
    for first_word, second_word, human_rating in rated_pairs:
        predicted = get_cosine_similarity(first_word, second_word, model)
        if predicted is None:
            # No similarity available for this pair: leave it out of both lists.
            continue
        model_scores.append(predicted)
        human_scores.append(human_rating)

    # Rank correlation between human ratings and model similarities.
    correlation, _ = spearmanr(human_scores, model_scores)
    print(f"{model_name} Spearman Correlation: {correlation:.4f}")

Pre-trained Model Spearman Correlation: 0.3106
Fine-tuned Model Spearman Correlation: 0.3176


## Use the model

In [29]:
memo = "ค่าข้าว"
categories = ["เดินทาง", "อาหาร", "ขนม", "ช้อปปิ้ง", "บ้าน"]

# Embed the memo and every candidate category with the fine-tuned model.
target_vector = fine_tuned_model.get_word_vector(memo)
word_vectors = list(map(fine_tuned_model.get_word_vector, categories))

# Single row of scores: the memo against each category, in list order.
similarities = cosine_similarity([target_vector], word_vectors)[0]

# Pick the category with the highest score.
max_index = np.argmax(similarities)
most_similar_word, highest_similarity = categories[max_index], similarities[max_index]

print(f"Most similar word: {most_similar_word}")
print(f"Similarity: {highest_similarity}")

Most similar word: อาหาร
Similarity: 0.6384586691856384
