In [None]:
#https://fasttext.cc/docs/en/options.html

In [None]:
!pip install fasttext

In [None]:
#use fasttext to train a linear sentiment classifier on the kaggle sentiment data
# method 1: let fasttext learn the word embeddings from the data, and then train the linear classifier using softmax
# method 2: encode training data with pre-trained word embeddings, and then build linear classifiers using other algorithms like svm or logistic regression regression.
#  

In [30]:
# fasttext can train a cbow or skipgram embedding model from input text data
# for example, the following command trains a cbow model with 100 dimensions from the training data
import fasttext
kaggle_embedding = fasttext.train_unsupervised('data/kaggle-sentiment/train.tsv', model='cbow', dim=300)

Read 1M words
Number of words:  25016
Number of labels: 0
Progress: 100.0% words/sec/thread:  180145 lr:  0.000000 avg.loss:  2.019099 ETA:   0h 0m 0s


In [31]:
# now let's play with this embedding model

# retrieve the vector of a word
word_vector = kaggle_embedding['fantastic']
#print(word_vector)

In [32]:
# get the nearest neighbors of a word (aka most similar words)
import pandas as pd
def most_similar_words(word, model, k=10):
    similar_words = model.get_nearest_neighbors(word, k=k)
    # Convert similar_words to a DataFrame
    similar_words_df = pd.DataFrame(similar_words, columns=['Similarity', 'Word'])
    return similar_words_df

# it seems this embedding model does not capture word sentiment very well
word = "excellent"
similar_words = most_similar_words(word, kaggle_embedding)
print(similar_words)

   Similarity           Word
0    0.942038     ambivalent
1    0.933749         accent
2    0.925762      ebullient
3    0.925723        opulent
4    0.924235       virulent
5    0.920235         fluent
6    0.920110     equivalent
7    0.916053      efficient
8    0.913590  establishment
9    0.911892      repellent


In [None]:
# download the pre-trained fasttext word embeddings model
# the smallest model is wiki-news-300d-1M.vec and wiki-news-300d-1M-subword.vec (2.26GB)
# cc.en.300.bin is 7GB
# https://fasttext.cc/docs/en/english-vectors.html

In [14]:
import fasttext.util
fasttext.util.download_model('en', if_exists='ignore')  # English

'cc.en.300.bin'

In [16]:
# the pre-trained fasttext embedding model is better
# the power of large text corpora
model = fasttext.load_model('cc.en.300.bin')
similar_words = most_similar_words('fantastic', model)
print(similar_words)

   Similarity        Word
0    0.848709    terrific
1    0.835334     amazing
2    0.823144      superb
3    0.820897   wonderful
4    0.813151    fabulous
5    0.803248       great
6    0.768460     awesome
7    0.767226    fantasic
8    0.766214  incredible
9    0.751928   excellent


In [17]:
# Method 1: use fasttext to train embedding and classifier

# Read the data file
data = pd.read_csv('data/kaggle-sentiment/train.tsv', sep='\t')
data = data[['Phrase', 'Sentiment']]
print(data.head())

                                              Phrase  Sentiment
0  A series of escapades demonstrating the adage ...          1
1  A series of escapades demonstrating the adage ...          2
2                                           A series          2
3                                                  A          2
4                                             series          2


In [18]:
# if necessary, sample the data to make training faster
# adjust to a number that your computer can handle
sampled_data = data.sample(n=100000, replace=True)
print(data.head())

                                              Phrase  Sentiment
0  A series of escapades demonstrating the adage ...          1
1  A series of escapades demonstrating the adage ...          2
2                                           A series          2
3                                                  A          2
4                                             series          2


In [19]:
# convert movie review data to fasttext format
def convert_to_fasttext_format(data, output_file):
    with open(output_file, 'w') as f:
        for i in range(len(data)):
            label = data[i][1]
            text = data[i][0]
            f.write('__label__{} {}\n'.format(label, text))
    

In [20]:
convert_to_fasttext_format(sampled_data.values, 'data/kaggle_fasttext.txt')

In [21]:
# Training data should be in the format: __label__<label> <text>
#training_data = 'data/moviereview_fasttext.txt'
training_data = 'data/kaggle_fasttext.txt'
model_path = 'fasttext_model.bin'

# Train the model
model = fasttext.train_supervised(input=training_data, epoch=25, lr=0.1, wordNgrams=2, verbose=2, minCount=1)

# Save the model
model.save_model(model_path)

Read 0M words
Number of words:  17952
Number of labels: 5
Progress: 100.0% words/sec/thread: 2422055 lr:  0.000000 avg.loss:  0.277408 ETA:   0h 0m 0s


In [38]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_data, test_data = train_test_split(sampled_data, test_size=0.3, random_state=42)

# Convert the training and testing data to fastText format
convert_to_fasttext_format(train_data.values, 'data/kaggle_sample_fasttext_train.txt')
convert_to_fasttext_format(test_data.values, 'data/kaggle_sample_fasttext_test.txt')

# Train the model using fastText
model = fasttext.train_supervised(input='data/kaggle_sample_fasttext_train.txt', epoch=25, lr=0.1, wordNgrams=2, verbose=2, minCount=1)

# Test the model
def test_model(test_file, model):
    test_result = model.test(test_file)
    print(f"Number of examples: {test_result[0]}")
    print(f"Precision: {test_result[1]:.4f}")
    print(f"Recall: {test_result[2]:.4f}")

test_model('data/kaggle_sample_fasttext_test.txt', model)

Read 0M words
Number of words:  17596
Number of labels: 5
Progress:  93.9% words/sec/thread: 2566883 lr:  0.006101 avg.loss:  0.282210 ETA:   0h 0m 0s

Number of examples: 30000
Precision: 0.7412
Recall: 0.7412


Progress: 100.0% words/sec/thread: 2434635 lr:  0.000000 avg.loss:  0.269366 ETA:   0h 0m 0s


In [28]:
# Load the model
model = fasttext.load_model(model_path)

# Predict the label for a given text

test_texts = ['hard to tell if this is a good movie', \
              'terrible movie ever!', \
                'best movie ever!', \
                'good movie', \
                'bad movie', \
                'not bad',\
                 'not good']

def test_pred(texts, test_model):
    predictions = test_model.predict(texts)
    results = list (zip(predictions[0], predictions[1]))
    # Create a DataFrame to display the results
    results_df = pd.DataFrame(results, columns=['Label', 'Probability'])
    results_df['Text'] = texts

    # Print the DataFrame
    print(results_df)

test_pred(test_texts, model)

          Label   Probability                                  Text
0  [__label__1]  [0.54284286]  hard to tell if this is a good movie
1  [__label__0]    [0.999006]                  terrible movie ever!
2  [__label__3]   [0.9754409]                      best movie ever!
3  [__label__3]   [0.9469355]                            good movie
4  [__label__1]  [0.90281624]                             bad movie
5  [__label__2]   [0.5109526]                               not bad
6  [__label__1]   [0.9634638]                              not good


In [24]:
# method 2 use pre-trained word embeddings to train a logistic regression classifier

import numpy as np
def text_to_embedding(text, model):
    words = text.split()
    word_vectors = [model.get_word_vector(word) for word in words if word in model]
    return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(300)  # 300-dim embeddings

# Convert all texts into embeddings
texts = sampled_data['Phrase'].values
labels = sampled_data['Sentiment'].values

X = np.array([text_to_embedding(text, model) for text in texts])
y = np.array(labels)

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(y_test.shape)

# Train a simple classifier
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Evaluate
accuracy = clf.score(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

(30000,)
Test Accuracy: 0.62


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
from sklearn.metrics import f1_score, confusion_matrix

# Predict the labels for the test set
y_pred = clf.predict(X_test)

# Calculate the F1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1 Score: {f1:.2f}")

# Generate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

F1 Score: 0.59
Confusion Matrix:
[[  202   401   733    23     5]
 [   89  2012  2911   122     1]
 [   93  1086 12976  1231   116]
 [    8   115  2880  3052   198]
 [    1     4   527   838   376]]


In [None]:
print(clf.predict(text_to_embedding('terrible movie ever!', model).reshape(1, -1)))


[0]


In [None]:
test_texts = ['hard to tell if this is a good movie', \
              'terrible movie ever!', \
                'best movie ever!', \
                'good movie', \
                'bad movie', \
                'not bad',\
                 'not good']

In [39]:
# Convert test_texts to embeddings
test_embeddings = np.array([text_to_embedding(text, model) for text in test_texts])

# Predict the labels using the logistic regression classifier
predictions = clf.predict(test_embeddings)

# Print the predictions
for text, prediction in zip(test_texts, predictions):
    print(f"Text: {text} -> Predicted Sentiment: {prediction}")

Text: hard to tell if this is a good movie -> Predicted Sentiment: 2
Text: terrible movie ever! -> Predicted Sentiment: 0
Text: best movie ever! -> Predicted Sentiment: 3
Text: good movie -> Predicted Sentiment: 3
Text: bad movie -> Predicted Sentiment: 2
Text: not bad -> Predicted Sentiment: 2
Text: not good -> Predicted Sentiment: 2


In [None]:
# the second method takes average of word embeddings, 
# which loses quite some nuances
# the accuracy is usually not so good
# also note that because the input features are dense vectors, 
# the logistic regression model is not interpretable
