In [1]:
!wget http://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip

--2020-11-13 04:01:29--  http://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.75.142, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1523785255 (1.4G) [application/zip]
Saving to: ‘crawl-300d-2M.vec.zip’


2020-11-13 04:03:46 (10.6 MB/s) - ‘crawl-300d-2M.vec.zip’ saved [1523785255/1523785255]



In [2]:
import zipfile
with zipfile.ZipFile('/content/crawl-300d-2M.vec.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/')

In [3]:
!cp /content/drive/My\ Drive/colab/Approaching_Any_ML_Problem/chapter11_text/input/IMDB\ Dataset.csv .

In [1]:
pwd

'/content'

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# fasttext.py

import io
import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer


def load_vectors(fname):
    # taken from
    fin = io.open(
        fname,
        'r',
        encoding='utf-8',
        newline='\n',
        errors='ignore'
    )
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = list(map(float, tokens[1:]))
    return data


def sentence_to_vec(s, embedding_dict, stop_words, tokenizer):
    """
    Given a sentence and other information,
    this function returns embedding for the whole sentence
    :param s: sentence, string
    :param embedding_dict: dictionary word: vector
    :param stop_words: list of step words, 
    :param tokenizer:
    :return:
    """
    # convert sentence to string and lowercase it
    words = str(s).lower()

    # tokenizer the sentence
    words = tokenizer(words)

    # remove stop word tokens
    words = [w for w in words if not stop_words]

    # keep only alpha-numeric tokens
    words = [w for w in words if w.isalpha()]

    # initialize empty list to store embeddings
    M = []

    for w in words:
        # for every word, fetch the embedding from
        # the dictionary and append to list of
        # embeddings
        if w in embedding_dict:
            M.append(embedding_dict[w])

    # if we don't have any vectors, return zeros
    if len(M) == 0:
        return np.zeros(300)

    # convert list of embeddings to array
    M = np.array(M)

    # calculate sum over axis=0
    v = M.sum(axis=0)

    # return the normalized vector
    return v / np.sqrt((v ** 2).sum())


if __name__ == '__main__':
    # read the training data
    df = pd.read_csv("./IMDB Dataset.csv")

    # map positive to 1 and negative to 0
    df.sentiment = df.sentiment.apply(
        lambda x: 1 if x == "positive" else 0
    )

    # the next step is to randomize the rows of the data
    df = df.sample(frac=1).reset_index(drop=True)

    # load embeddings into memory
    print("Loading embeddings")
    # https://www.kaggle.com/yekenot/fasttext-crawl-300d-2m
    embeddings = load_vectors("./crawl-300d-2M.vec")

    # create sentence embeddings
    print("Creating sentence vectors")
    vectors = []

    for review in df.review.values:
        vectors.append(
            sentence_to_vec(
                s=review,
                embedding_dict=embeddings,
                stop_words=[],
                tokenizer=word_tokenize
            )
        )

    vectors = np.array(vectors)

    # fetch labels
    y = df.sentiment.values

    # initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)

    # fill the new kfold column
    for fold_, (t_, v_) in enumerate(kf.split(X=vectors, y=y)):
        print(f"Training fold: {fold_}")
        # temporary dataframes for train and test
        xtrain = vectors[t_, :]
        ytrain = y[t_]

        xtest = vectors[v_, :]
        ytest = y[v_]

        # initialize logistic regression model
        model = linear_model.LogisticRegression()

        # fit the model on training data reviews and sentiment
        model.fit(xtrain, ytrain)

        # make predictions on test data
        # threshold for predictions is 0.5
        preds = model.predict(xtest)

        # calculation accuracy
        accuracy = metrics.accuracy_score(ytest, preds)

        print(f"Accuracy = {accuracy}")
        print("")

Loading embeddings
Creating sentence vectors
Training fold: 0
Accuracy = 0.8646

Training fold: 1
Accuracy = 0.8576

Training fold: 2
Accuracy = 0.8624

Training fold: 3
Accuracy = 0.8585

Training fold: 4
Accuracy = 0.8586



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
