In [1]:
!python --version

Python 3.7.11


In [2]:
# !pip install -U bcolz

In [3]:
# !python -m spacy download en_core_web_sm

# Load pretrained word embeddings

In [35]:
'''
python 3.7
bcolz              1.2.1
numpy              1.21.5
pytorch  1.11.0

'''
import os
import bcolz
import numpy as np
import pickle
import torch

def pretrained_word_embeddings(embed_path:str, over_writte:bool, special_tk:bool=True, freeze:bool=False):
    ''' Build a torch.nn.Embedding initialised from a pre-trained word-vector text file (e.g., Glove).

    The text file is parsed once and cached next to it (a bcolz array plus three pickles);
    later calls reuse the cache unless ``over_writte`` is set or a cache file is missing.

    :param embed_path: path of the pre-trained vector text file (e.g., './glove.6B.300d.txt');
        each line is ``<word> <float> <float> ...``.
    :param over_writte: force re-parsing the text file and rewriting the cache.
    :param special_tk: whether adding special tokens -- 'pad', 'unk', 'bos' and 'eos' at position 0, 1, 2 and 3.
        The pad row is all zeros, the other three are drawn from N(0, 0.6^2); every
        pre-trained word index is shifted by 4.
    :param freeze: if True the embedding weights are not trainable.
    :return: embed -> nn.Embedding, weights_matrix -> np.array, word2idx -> function, idx2word -> function, embed_dim -> int
    :raises ValueError: if the text file contains no vectors.
    '''
    prefix = embed_path.rsplit(".",1)[0]
    root_dir = prefix+".dat"
    out_dir_word = prefix+"_words.pkl"
    out_dir_idx = prefix+"_idx.pkl"
    out_dir_idx2word = prefix + "_idx2word.pkl"
    # Every artefact that is loaded below must exist, otherwise the cache is rebuilt.
    cache_paths = (root_dir, out_dir_word, out_dir_idx, out_dir_idx2word)
    if over_writte or not all(os.path.exists(path) for path in cache_paths):
        ## process and cache glove ===========================================
        words = []
        _word2idx = {}
        _idx2word = {}
        rows = []
        with open(embed_path,"rb") as f:
            for l in f:
                line = l.decode().split()
                if not line:
                    continue  # tolerate blank lines (e.g. a trailing newline)
                idx = len(words)
                word = line[0]
                words.append(word)
                _word2idx[word] = idx
                _idx2word[idx] = word
                rows.append(np.array(line[1:]).astype(float))
        if not rows:
            raise ValueError("no word vectors found in {}".format(embed_path))
        vectors = bcolz.carray(np.vstack(rows), rootdir=root_dir, mode='w')
        vectors.flush()
        # Context managers make sure the pickles are flushed and closed before they are re-read.
        for payload, target in ((words, out_dir_word), (_word2idx, out_dir_idx), (_idx2word, out_dir_idx2word)):
            with open(target, 'wb') as out_file:
                pickle.dump(payload, out_file)
        print("dump word/idx at {}".format(embed_path.rsplit("/",1)[0]))
        ## =======================================================
    ## load glove
    vectors = bcolz.open(root_dir)[:]
    # NOTE: pickle.load executes arbitrary code; only point embed_path at cache files you created yourself.
    with open(out_dir_idx, 'rb') as in_file:
        _word2idx = pickle.load(in_file)
    with open(out_dir_idx2word, 'rb') as in_file:
        _idx2word = pickle.load(in_file)
    print("Successfully load Golve from {}, the shape of cached matrix: {}".format(embed_path.rsplit("/",1)[0],vectors.shape))

    vocab_size, embed_dim = vectors.shape
    offset = 4 if special_tk else 0   ## number of rows reserved for special tokens
    word_num = vocab_size + offset    ## e.g., 400004
    embedding_matrix = np.zeros((word_num, embed_dim))
    pad_idx = None
    if special_tk:
        pad_idx,unk_idx, bos_idx,eos_idx = 0,1,2,3
        # Row 0 (pad) stays all-zero; unk/bos/eos get random vectors.
        for special_idx in (unk_idx, bos_idx, eos_idx):
            embedding_matrix[special_idx] = np.random.normal(scale=0.6, size=(embed_dim,))
        _word2idx = dict([(k,v+offset) for k,v in _word2idx.items()])
        _idx2word = dict([(k+offset,v) for k,v in _idx2word.items()])
    embedding_matrix[offset:,:] = vectors
    assert len(_word2idx) + offset == embedding_matrix.shape[0]

    weights_matrix_tensor = torch.tensor(embedding_matrix, dtype=torch.float32)
    # from_pretrained is a classmethod that RETURNS a new layer; calling it on an existing
    # instance and dropping the result leaves that instance randomly initialised.
    embed_layer = torch.nn.Embedding.from_pretrained(weights_matrix_tensor,freeze=freeze,padding_idx=pad_idx)

    def word2idx(word:str):
        '''Map a word to its row index; unknown words map to 1 (the '<unk>' row when special_tk is on).'''
        if special_tk:
            if word == '<pad>': return 0
            elif word == '<bos>': return 2
            elif word == '<eos>': return 3
        # Caveat: without special tokens there is no dedicated unknown row, index 1 is a real word.
        return _word2idx.get(word,1)
    def idx2word(idx:int):
        '''Map a row index back to its word; indices outside the vocabulary give an empty string.'''
        if special_tk:
            if idx == 0: return '<pad>'
            elif idx == 1: return '<unk>'
            elif idx == 2: return '<bos>'
            elif idx == 3: return '<eos>'
        return _idx2word.get(idx,'')
    return embed_layer, embedding_matrix, word2idx,idx2word, embed_dim

# Load Data

In [5]:
import pandas as pd
# Read 'IMDB Dataset.csv' into a DataFrame; the path is relative to the notebook's working directory.
data = pd.read_csv('../../datasets/IMDB Dataset.csv')

In [6]:
# Pull the raw review texts and their labels out of the DataFrame as arrays.
reviews, sentiment = data['review'].values, data['sentiment'].values

# Quick look at both columns: container type, shape, then the first entry of each.
columns = (reviews, sentiment)
print(*(type(column) for column in columns))
print(*(column.shape for column in columns))
for column in columns:
    print(column[0])

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(50000,) (50000,)
One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br

# Text Normalization

In [18]:
import spacy
# Loaded spaCy pipelines keyed by model name, so the (slow) load happens once per kernel.
_SPACY_PIPELINES = {}

def _get_spacy_pipeline(model_name='en_core_web_sm'):
    '''Return the spaCy pipeline for `model_name`, loading it only on first use.'''
    if model_name not in _SPACY_PIPELINES:
        # prefer_gpu has to be requested before the model is loaded
        spacy.prefer_gpu()
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]

def spacy_process(text,print_result=False):
    '''
    Lower-case a text and run it through the spaCy 'en_core_web_sm' pipeline.

    text: str, the raw text to normalise
    print_result: if True, print the results to console

    return: tokenLists -> List; lemmaList ->List; filteredList -> List
        token_list: the string form of every token
        lemma_list: the lemma of every token
        filtered_sentence: the lemmas with stop words and the punctuation "?:!.,;" removed
    '''
    # lowercasing
    text = text.lower().strip()

    # Reuse the cached pipeline; loading the model on every call dominates the runtime.
    nlp = _get_spacy_pipeline()
    doc = nlp(text)

    #only tokenization
    token_list = [str(token) for token in doc]

    #Tokenization and lemmatization are done with the spacy nlp pipeline commands
    lemma_list = [token.lemma_ for token in doc]

    #Filter the stopword and remove punctuation in a single pass.
    #Building a new list (instead of calling .remove() on the list being iterated)
    #guarantees that consecutive punctuation marks are all dropped.
    punctuations="?:!.,;"
    filtered_sentence = [
        word for word in lemma_list
        if not nlp.vocab[word].is_stop and word not in punctuations
    ]

    if print_result:
        print("Only Tokeinze")
        print(token_list)
        print()
        print("Tokenize+Lemmatize:")
        print(lemma_list)
        print(" ")
        print("Remove stopword & punctuation: ")
        print(filtered_sentence)
    return token_list, lemma_list, filtered_sentence



In [8]:
# normlized_reviews = np.array([spacy_process(x)[0] for x in reviews])

In [20]:
from tqdm import trange
# Tokenise every review, keeping only the plain token list (first item returned by spacy_process).
# Results are appended one by one so that an interrupted run still leaves the finished part in tmp.
tmp = []
for review_idx in trange(len(reviews)):
    tokens, _, _ = spacy_process(reviews[review_idx])
    tmp.append(tokens)

100%|██████████| 50000/50000 [7:22:47<00:00,  1.88it/s]  


In [22]:
# Spot-check one sample: entry 5 of tmp first, then entry 5 of the raw reviews.
print(tmp[5])
print(reviews[5])

['probably', 'my', 'all', '-', 'time', 'favorite', 'movie', ',', 'a', 'story', 'of', 'selflessness', ',', 'sacrifice', 'and', 'dedication', 'to', 'a', 'noble', 'cause', ',', 'but', 'it', "'s", 'not', 'preachy', 'or', 'boring', '.', 'it', 'just', 'never', 'gets', 'old', ',', 'despite', 'my', 'having', 'seen', 'it', 'some', '15', 'or', 'more', 'times', 'in', 'the', 'last', '25', 'years', '.', 'paul', 'lukas', "'", 'performance', 'brings', 'tears', 'to', 'my', 'eyes', ',', 'and', 'bette', 'davis', ',', 'in', 'one', 'of', 'her', 'very', 'few', 'truly', 'sympathetic', 'roles', ',', 'is', 'a', 'delight', '.', 'the', 'kids', 'are', ',', 'as', 'grandma', 'says', ',', 'more', 'like', '"', 'dressed', '-', 'up', 'midgets', '"', 'than', 'children', ',', 'but', 'that', 'only', 'makes', 'them', 'more', 'fun', 'to', 'watch', '.', 'and', 'the', 'mother', "'s", 'slow', 'awakening', 'to', 'what', "'s", 'happening', 'in', 'the', 'world', 'and', 'under', 'her', 'own', 'roof', 'is', 'believable', 'and', 's

In [27]:
# Confirm the container type of tmp and how many entries it holds.
print(type(tmp))
print(len(tmp))

<class 'list'>
50000


# Sequentialization

In [None]:
'''
TODO:
- word to index
- add bos and eos to each sequence
- then padding
- store locally
'''

In [36]:
# Positional flags are over_writte=True (rebuild the on-disk cache), special_tk=True, freeze=True.
embed_layer, embedding_matrix, word2idx,idx2word, embed_dim = pretrained_word_embeddings('../../word embeddings/glove.6B.50d.txt',True,True,True)

dump word/idx at ../../word embeddings
Successfully load Golve from ../../word embeddings, the shape of cached matrix: (400000, 50)


In [32]:
# Sanity check: look up the index that word2idx assigns to the '<bos>' token.
word2idx('<bos>')

2

In [33]:
# word to index
# add bos and eos to each sequence
# Each token list in tmp is replaced in place by [<bos> index, token indices..., <eos> index].
for j in trange(len(tmp)):
    tokens = tmp[j]
    # Sequences that already hold indices are skipped, so re-running this cell is a no-op
    # instead of adding a second <bos>/<eos> pair and mapping the ints to the unknown index.
    if tokens and not isinstance(tokens[0], str):
        continue
    tokens[:] = [word2idx('<bos>'), *(word2idx(token) for token in tokens), word2idx('<eos>')]

100%|██████████| 50000/50000 [00:06<00:00, 7256.38it/s]


In [34]:
# Show the first sequence stored in tmp.
print(tmp[0])

[2, 52, 7, 4, 72, 17965, 35, 3046, 16, 53, 2645, 124, 180, 12965, 1946, 85, 773, 34, 12769, 6, 43, 36, 252, 5, 23, 41, 18, 2800, 106, 1583, 21, 1, 1, 1, 62, 877, 16, 1873, 289, 63, 12965, 19, 51, 13159, 9, 45659, 3472, 7, 718, 5, 46, 212, 10, 252, 29, 4, 1392, 246, 6, 1857, 289, 5, 41, 18, 40, 11, 277, 14, 4, 17177, 21364, 50, 25444, 6, 41, 277, 11114, 88, 14775, 21, 9485, 8, 1607, 5, 1743, 50, 718, 6, 51, 18, 16361, 5, 10, 4, 2396, 238, 7, 4, 1, 1, 1, 18, 179, 12965, 23, 16, 18, 4, 7402, 458, 8, 4, 17868, 2968, 198, 96, 1, 6, 24, 6549, 1678, 17, 19724, 119, 5, 33, 6126, 1523, 7, 4, 1001, 115, 68, 4, 2778, 37, 2851, 11278, 9, 625, 67375, 5, 104, 5314, 18, 40, 156, 17, 4, 2808, 6, 23612, 119, 18, 167, 8, 113, 1, 86437, 5, 2483, 5, 146506, 5, 15547, 5, 4995, 5, 8597, 5, 1842, 9, 60, 6942, 104, 31999, 5, 340, 31661, 5, 59477, 9608, 9, 18399, 3210, 36, 336, 376, 1, 1, 1, 58, 207, 4, 448, 1578, 7, 4, 277, 18, 449, 8, 4, 857, 16, 24, 1436, 115, 72, 974, 58, 74, 12252, 6, 4462, 1926, 2494, 46

In [37]:
def seq2words(seq):
    '''Print the words for a sequence of indices on one line.

    Every index is translated with idx2word and followed by a single space;
    the line is terminated by a newline. Nothing is returned.
    '''
    rendered = ['{} '.format(idx2word(token_id)) for token_id in seq]
    print(''.join(rendered))


In [39]:
# Translate sequence 5 back to words with seq2words to eyeball the result.
seq2words(tmp[5])

<bos> probably my all - time favorite movie , a story of selflessness , sacrifice and dedication to a noble cause , but it 's not preachy or boring . it just never gets old , despite my having seen it some 15 or more times in the last 25 years . paul lukas ' performance brings tears to my eyes , and bette davis , in one of her very few truly sympathetic roles , is a delight . the kids are , as grandma says , more like " dressed - up midgets " than children , but that only makes them more fun to watch . and the mother 's slow awakening to what 's happening in the world and under her own roof is believable and startling . if i had a dozen thumbs , they 'd all be " up " for this movie . <eos> 


In [48]:
def pad(seqs:list,length:int,mode = 'post',padding = 0):
    '''Bring every sequence in `seqs` to exactly `length` items, modifying `seqs` in place.

    seqs: list of lists to adjust
    length: target number of items per sequence
    mode: 'post' appends the filler after the sequence, 'pre' puts it in front;
          any other value raises NotImplementedError
    padding: the filler value

    Sequences longer than `length` are cut down to their first `length` items in both modes.
    Nothing is returned.
    '''
    if mode not in ('post', 'pre'):
        raise NotImplementedError('The padding mode is not implemented.')

    for position, current in enumerate(seqs):
        missing = length - len(current)
        if missing < 0:
            # too long: keep the leading part only
            seqs[position] = current[:length]
        elif missing > 0:
            filler = [padding] * missing
            if mode == 'post':
                # grows the existing inner list
                current.extend(filler)
            else:
                seqs[position] = filler + current



In [49]:
# Pad or truncate every sequence in tmp, in place, to 256 entries (pad defaults: mode='post', padding=0).
pad(tmp,256)

In [52]:
print(tmp[1])
# seq2words prints the decoded text itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
seq2words(tmp[1])

[2, 11, 5209, 337, 622, 6, 19799, 30414, 1, 1, 7278, 5004, 18, 195, 1, 195, 171, 15, 83, 15, 3096, 2539, 9, 1833, 11, 22956, 5, 9, 1075, 119716, 5, 1384, 7, 14630, 8, 4, 1456, 2369, 6, 19799, 30414, 1, 1, 3830, 36, 2712, 147, 1, 789, 19352, 40, 95, 12, 35, 409, 68, 4, 328926, 12, 38, 22, 35, 68, 4, 5864, 139, 4676, 321, 809, 85, 90, 4706, 257, 4, 25096, 9746, 6570, 25, 4, 3081, 8, 1321, 61, 9545, 9595, 5, 40, 95, 18, 24, 147, 1093, 4, 2645, 38, 24, 18, 11, 1, 986, 9, 1793, 2369, 6, 11, 31531, 622, 63, 52, 7, 4, 357, 2338, 13, 7, 2845, 9, 30, 218, 6, 19799, 30414, 1, 1, 14630, 592, 938, 167, 21, 4, 337, 658, 49, 4, 5851, 7, 4, 1394, 46, 5, 875, 77, 238, 4, 1278, 61, 2899, 61, 4280, 952, 3055, 131, 19358, 6, 24, 1385, 17, 166, 2493, 9, 166, 15040, 5, 1117, 21, 4, 3472, 4884, 26378, 9, 54797, 9, 4, 2307, 27, 1117, 7, 48, 2987, 21, 54797, 13, 20376, 22728, 363, 2287, 28, 36, 14911, 147, 755, 6, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [53]:
# Convert the list of sequences in tmp into a NumPy array.
normalized_reviews = np.array(tmp)

In [54]:
# Check the dimensions of the resulting array.
print(normalized_reviews.shape)

(50000, 256)


In [58]:
# Save the array to a .npy file in the current working directory
np.save("normalized_reviews.npy",normalized_reviews)