In [3]:
### Text Classification

# The text classification workflow begins by cleaning and preparing the corpus out of the dataset. Then this corpus is represented by any of the different text representation methods which are hen followed by modeling.


In [4]:
# - Example text classification dataset   
# use the data from tweets,the task is to predict which tweets are about real disasters and which ones are not.

In [5]:
import pandas as pd 

tweet = pd.read_csv(r'data/tweets/train.csv')
test = pd.read_csv(r'data/tweets/test.csv')

tweet.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


In [6]:
# We will only consider the tweets to predict the target.
print('There are {} rows and {} columns in train'.format(tweet.shape[0],tweet.shape[1]))
print('There are {} rows and {} columns in test'.format(test.shape[0],test.shape[1]))

There are 7613 rows and 5 columns in train
There are 3263 rows and 4 columns in test


In [8]:
# Text data preparation

# Tokenize: the process by which sentences are converted to a list of tokens or words.
# Remove stopwords: drop words like ‘a’ or ‘the’
# Lemmatize: reduce the inflectional forms of each word into a common base or root (“studies”, “studying” -> “study”).
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords

def preprocess_texts(df, col):
    new_corpus = []

    lem = WordNetLemmatizer()
    for text in df[col]:
        words = [w for w in word_tokenize(text)  if (w not in stopwords)]
        words = [lem.lemmatize(w) for w in words]
        new_corpus.append(words)
    
    return new_corpus


In [9]:
# Text Representation

# Text cannot be used directly as input to a machine learning model but needs to be represented in the numeric format first. This is known as text representation.

In [10]:
# 1. Countvectorizer
# Countvectorizer provides an easy method to vectorize and represent a collection of text documents. It tokenizes the input text and builds a vocabulary of known words and then represents the documents using this vocabulary.

from sklearn.feature_extraction.text import CountVectorizer

# list of text documents
text = ['She sells seashells in the seashore']
vectorizer = CountVectorizer()  # create the transform
vectorizer.fit(text)  # tokenize and build vocab

print(vectorizer.vocabulary_)  

vector = vectorizer.transform(text)  # encode document
print(vector.shape)
print(type(vector))
print(vector.toarray())

{&#39;she&#39;: 4, &#39;sells&#39;: 3, &#39;seashells&#39;: 1, &#39;in&#39;: 0, &#39;the&#39;: 5, &#39;seashore&#39;: 2}
(1, 6)
&lt;class &#39;scipy.sparse.csr.csr_matrix&#39;&gt;
[[1 1 1 1 1 1]]


In [11]:
# the Coutvectorizer has built a vocabulary out of the given text and then represented the words using a numpy sparse matrix. We can try and transfer another text using this vocabulary and observe the output to get a better understanding.
vector = vectorizer.transform(['I sell seashells in the seashore'])
print(vector.toarray())

[[1 1 1 0 0 1]]


In [12]:
# 2. TfidfVectorizer
# One issue with Countvectorizer is that common words like “the” will appear many times (unless you remove them at the preprocessing stage) and these words are not actually important. One popular alternative is Tfidfvectorizer. It is an acronym for Term frequency-inverse document frequency.

# Term Frequency: This summarizes how often a given word appears within a document.
# Inverse Document Frequency: This downscales words that appear a lot across documents.

from sklearn.feature_extraction.text import TfidfVectorizer

# list of text documents
text = ["She sells seashells by the seashore","The sea.","The seashore"]
vectorizer = TfidfVectorizer()  # create the transform
vectorizer.fit(text)  # tokenize and build vocab

print(vectorizer.vocabulary_)
print(vectorizer.idf_)

vector = vectorizer.transform([text[0]])  # encode document
print(vector.shape)
print(vector.toarray())

{&#39;she&#39;: 5, &#39;sells&#39;: 4, &#39;seashells&#39;: 2, &#39;by&#39;: 0, &#39;the&#39;: 6, &#39;seashore&#39;: 3, &#39;sea&#39;: 1}
[1.69314718 1.69314718 1.69314718 1.28768207 1.69314718 1.69314718
 1.        ]
(1, 7)
[[0.45050407 0.         0.45050407 0.34261996 0.45050407 0.45050407
  0.26607496]]


In [13]:
# The vocabulary again consists of 6 words and the inverse document frequency is calculated for each word, assigning the lowest score to “the” which occurred 4 times.

# Then the scores are normalized between 0 and 1 and this text representation can be used as input into any machine learning model.

In [14]:
# 3. word2vec
# The big issue with the above approaches is that the context of the word is lost when representing it. Word embeddings are learned by understanding the context in which the word occurs. Specifically, it looks at co-occurring words.The basic idea of word embedding is words that occur in similar context tend to be closer to each other in vector space. 

# Word2vec is composed of two different models:

# Continuous Bag of Words (CBOW) model can be thought of as learning word embeddings by training a model to predict a word given its context.
# Skip-Gram model is the opposite, learning word embeddings by training a model to predict context given a word.

In [22]:
# use pre-trained word vectors instead of training vectors from our corpus. 

from gensim.models import keyedvectors

def load_word2vec(pretrained_path):
    word2vec_dict = keyedvectors.load(pretrained_path, binary=True, unicode_errors='ignore')
    embeddings_index = dict()
    for word in word2vec_dict.wv.vocab:
        embeddings_index[word] = word2vec_dict.word_vec(word)
    return embeddings_index

In [23]:
# w2v_model = load_word2vec('dataset')
# print(w2v_model['London'].shape)

In [24]:
# the word is represented using a 300-dimensional vector. So every word in your corpus can be represented like this and this embedding matrix is used to train your model.

In [26]:
# 4. FastText
# It supports both Continuous Bag of Words and Skip-Gram models. The main difference between previous models and FastText is that it breaks the word in several n-grams.

# Let’s take the word orange for example.

# The trigrams of word orange are,org,ran,ang,nge(ignoring the starting and ending boundaries of the word).

# The word embedding vector (text representation)for orange will be the sum of these n-grams. Rare words or typos can now be properly represented since it is highly likely that some of their n-grams also appears in other words.

# For example, for a word like stupedofantabulouslyfantastic, which might never have been in any corpus, gensim might return any two of the following solutions: a zero vector or a random vector with low magnitude.

# FastText, however, can produce better vectors by breaking the word into chunks and using the vectors for those chunks to create a final vector for the word. In this particular case, the final vector might be closer to the vectors of fantastic and fantabulous.

In [35]:
import numpy as np 
from tqdm import tqdm
from gensim.models import FastText

def load_fasttext(fpath):
    print('loading fasttext word embeddings...')
    embeddings_index = {}
    with open(fpath, encoding='utf-8') as fr:
        for line in tqdm(fr):
            values = line.strip().rsplit(' ')
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        
        print(f'found {len(embeddings_index)}s word vectors.')
    
    return embeddings_index

In [36]:
w2v_fasttext = load_fasttext(r'/Users/liuzhi/models/w2v/wiki-news-300d-1M-subword.vec')
print(w2v_fasttext['London'].shape)

824it [00:00, 8237.04it/s]loading fasttext word embeddings...
999995it [01:17, 12984.39it/s]found 999995s word vectors.
(300,)



In [29]:
# 5. GloVe (Global vectors for word representation)

# GloVe主要的思想是基于词的共现矩阵，类似于Word2Vec，但略有差别。
# Word2vec relies only on local information of language.
# GloVe captures both global statistics and local statistics of a corpus, in order to come up with word vectors.
# Given a corpus having V words, the co-occurrence matrix X will be a V x V matrix, where the i th row and j th column of X, X_ij denotes how many times word i has co-occurred with word j.
'''
Consider the entity
P_ik/P_jk where P_ik = X_ik/X_i
Here P_ik denotes the probability of seeing word i and k together, which is computed by dividing the number of times i and k appeared together (X_ik) by the total number of times word i appeared in the corpus (X_i).
You can see that given two words, i.e. ice and steam, if the third word k (also called the “probe word”),
is very similar to ice but irrelevant to steam (e.g. k=solid), P_ik/P_jk will be very high (>1),
is very similar to steam but irrelevant to ice (e.g. k=gas), P_ik/P_jk will be very small (<1),
is related or unrelated to either words, then P_ik/P_jk will be close to 1
So, if we can find a way to incorporate P_ik/P_jk to computing word vectors we will be achieving the goal of using global statistics when learning word vectors.
'''

In [30]:
def load_glove(fpath):
    print('loading glove word embeddings...')
    embeddings_dict = {}
    with open(fpath, encoding='utf-8') as fr:
        for line in fr:
            values = line.split()
            word = values[0]
            vectors = np.asarray(values[1:], 'float32')
            embeddings_dict[word] = vectors
    
    print(f'found {len(embeddings_dict)}s word vectors.')
    return embeddings_dict

In [31]:
# w2v_glove = load_glove('dataset')
# print(w2v_glove['London'].shape)

In [32]:
# 6. Universal Sentence Encoding

# Sometimes we need to explore sentence level operations. These encoders are called sentence encoders.
# A good sentence encoder is expected to encode sentences in such a way that the vectors of similar sentences have a minimal distance between them in the vector space.

In [None]:
# import tensorflow as tf
# import tensorflow_hub as hub
# module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
# # Import the Universal Sentence Encoder's TF Hub module
# embed = hub.Module(module_url)

# # Compute a representation for each message, showing various lengths supported.
# messages = ["That band rocks!", "That song is really cool."]

# with tf.Session() as session:
#   session.run([tf.global_variables_initializer(), tf.tables_initializer()])
#   message_embeddings = session.run(embed(messages))