https://www.kaggle.com/c/word2vec-nlp-tutorial#part-1-for-beginners-bag-of-words

In [47]:
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

In [2]:
data_path = '/media/mike/tera/data/nlp/kaggle_imdb/' # Point this to the path to where IMDB data is stored

In [3]:
train = pd.read_csv(data_path + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)

In [4]:
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [72]:
train['review'][0]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

In [74]:
nltk.download('stopwords') # Download the stopwords
stopwords = nltk.corpus.stopwords.words('english')
# stopwords.append('br') 
stopwords

[nltk_data] Downloading package stopwords to /home/mike/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '

In [75]:
def unstopper(toklist, stoplist=None):
    toklist = [w for w in toklist if not w in stoplist]
    wordstr = ' '.join(toklist)
    return wordstr

In [103]:
# Unleash the power of PANDAS! This is technically one line of code. But since I dislike perl-esque unreadable one-liners, I've split each operation to its own line

train_phrases = train['review']\
                .str.replace(r'<br \/>', ' ')\
                .str.replace(r'[^a-zA-Z]', ' ')\
                .str.lower()\

if False:
    train_phrases = train_phrases.str.split()\
                    .apply(unstopper, stoplist=stopwords)
    
# Note the use of backslash to split to multiple lines for readability
# Remove linebreak <br /> tags
# Replace all non-alphabetic with spaces
# Lowercase only
# Tokenize
# Remove all stopwords


#.str.replace(r' +', ' ')

In [104]:
train.head()['review'].str.split().str.join(' ')

0    "With all this stuff going down at the moment ...
1    "\"The Classic War of the Worlds\" by Timothy ...
2    "The film starts with a manager (Nicholas Bell...
3    "It must be assumed that those who praised thi...
4    "Superbly trashy and wondrously unpretentious ...
Name: review, dtype: object

In [105]:
train_phrases[0]

' with all this stuff going down at the moment with mj i ve started listening to his music  watching the odd documentary here and there  watched the wiz and watched moonwalker again  maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  some of it has subtle messages about mj s feeling towards the press and also the obvious message of drugs are bad m kay   visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring  some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him   the actual feature film bit when it finally starts is only on for 

In [106]:
# Now we need to concatenate all the reviews into a single list so we can apply Bag of Words. 
big_list_train_phrases = []
for sentence in train_phrases.values:
    big_list_train_phrases.append(sentence)
print(len(big_list_train_phrases))
# big_list_train_phrases

25000


In [107]:
# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 


In [108]:
train_data_features = vectorizer.fit_transform(big_list_train_phrases)

In [109]:
type(train_data_features)

scipy.sparse.csr.csr_matrix

In [110]:
train_data_features = train_data_features.toarray()
train_data_features.shape

(25000, 5000)

In [112]:
vocab = vectorizer.get_feature_names()

In [113]:
# Sum up the counts of each vocabulary word
freq = np.sum(train_data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set

In [114]:
df_vocab = pd.DataFrame(list(zip(vocab, freq)), columns=['vocab', 'freq'])

In [115]:
df_vocab

Unnamed: 0,vocab,freq
0,abandoned,187
1,abc,125
2,abilities,108
3,ability,454
4,able,1259
5,about,17375
6,above,819
7,abraham,85
8,absence,116
9,absent,83


In [141]:
df_vocab = df_vocab.sort_values(by='freq', ascending=False)
df_vocab.reset_index(drop=True, inplace=True)
df_vocab.index = df_vocab.index + 1   # We need to increase this to make room for our null character
df_vocab.head(10)

Unnamed: 0,vocab,freq
1,the,336758
2,and,164143
3,of,145867
4,to,135724
5,is,107337
6,it,96472
7,in,93981
8,this,76007
9,that,73287
10,was,48209


In [130]:
# Invert word/int pairs to get our lookup
vocab_idx = {key:value for (key, value) in zip(df_vocab['vocab'], df_vocab.index)}

In [137]:
def words_to_index(wordlist, vocab=None):
    """Minifunction for pandas.apply(). Replaces each word with respective index"""
    return [vocab[word] if word in vocab else 0 for word in wordlist]

In [140]:
train_idx = train_phrases.str.split().apply(words_to_index, vocab=vocab_idx)
train_idx.head()

0    [13, 26, 8, 521, 163, 174, 27, 1, 538, 13, 0, ...
1    [1, 342, 313, 3, 1, 3179, 29, 3694, 0, 5, 0, 5...
2    [1, 16, 498, 13, 0, 2943, 4522, 3869, 719, 230...
3    [6, 200, 24, 4991, 9, 141, 31, 0, 8, 16, 1, 80...
4    [3502, 4262, 2, 0, 0, 0, 2155, 0, 1, 1691, 870...
Name: review, dtype: object

In [163]:
def get_vocab_index(data_phrases, verbose=False):
    """ Proccess an array-like of strings and generate Bag of Words vocab index
    """
    big_list_phrases = []
    for sentence in data_phrases.values:
        big_list_phrases.append(sentence)

    vectorizer = CountVectorizer(analyzer = "word",   \
                         tokenizer = None,    \
                         preprocessor = None, \
                         stop_words = None,   \
                         max_features = 5000) 

    if verbose: print('Vectorizing')
    data_features = vectorizer.fit_transform(big_list_phrases)
    data_features = data_features.toarray()
    freq = np.sum(train_data_features, axis=0)

    vocab = vectorizer.get_feature_names()
    df_vocab = pd.DataFrame(list(zip(vocab, freq)), columns=['vocab', 'freq'])
    df_vocab = df_vocab.sort_values(by='freq', ascending=False)
    df_vocab.reset_index(drop=True, inplace=True)
    df_vocab.index = df_vocab.index + 1   # We need to increase this to make room for our null character
    vocab_idx = {key:value for (key, value) in zip(df_vocab['vocab'], df_vocab.index)}
    return vocab_idx
    

def load_and_process_imdb_csv(file, vocab_idx=None, stopwords=None, verbose=False):
    if verbose: print('Loading')
    data = pd.read_csv(file, header=0, delimiter='\t', quoting=3)
    if verbose: print('Preprocesing')
    data_phrases = data['review'].str.replace(r'<br \/>', ' ')\
                    .str.replace(r'[^a-zA-Z]', ' ').str.lower()

    if stopwords:
        data_phrases = data_phrases.str.split()\
                        .apply(unstopper, stoplist=stopwords)
            
    if vocab_idx is None:
        vocab_idx = get_vocab_index(data_phrases, verbose=verbose)

    if verbose: print('Indexing')
    data_idx = data_phrases.str.split().apply(words_to_index, vocab=vocab_idx)
    data['vectors'] = data_idx
    return data, vocab_idx
    
    

In [170]:
train_data, vocab_idx = load_and_process_imdb_csv(data_path + 'labeledTrainData.tsv', verbose=1)

Loading
Preprocesing
Vectorizing
Indexing


In [168]:
train_data

Unnamed: 0,id,sentiment,review,vectors
0,"""5814_8""",1,"""With all this stuff going down at the moment ...","[13, 26, 8, 521, 163, 174, 27, 1, 538, 13, 0, ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...","[1, 342, 313, 3, 1, 3176, 29, 3694, 0, 5, 0, 5..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...","[1, 16, 498, 13, 0, 2942, 4523, 3867, 719, 230..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi...","[6, 200, 24, 4988, 9, 141, 31, 0, 8, 16, 1, 80..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ...","[3506, 4262, 2, 0, 0, 0, 2155, 0, 1, 1687, 870..."
5,"""8196_8""",1,"""I dont know why people think this is such a b...","[0, 0, 117, 130, 73, 97, 8, 5, 135, 0, 71, 14,..."
6,"""7166_2""",0,"""This movie could have been very good, but com...","[8, 14, 92, 23, 72, 50, 45, 15, 255, 51, 89, 3..."
7,"""10633_1""",0,"""I watched this video at a friend's house. I'm...","[0, 286, 8, 359, 27, 0, 434, 0, 297, 0, 0, 123..."
8,"""319_1""",0,"""A friend of mine bought this film for £1, and...","[0, 434, 3, 1866, 1211, 8, 16, 12, 2, 54, 88, ..."
9,"""8713_10""",1,"""<br /><br />This movie is full of references....","[8, 14, 5, 355, 3, 2021, 34, 1125, 2295, 1464,..."


In [172]:
test_data, _ = load_and_process_imdb_csv(data_path + 'testData.tsv', verbose=1)

Loading
Preprocesing
Vectorizing
Indexing


In [173]:
test_data

Unnamed: 0,id,review,vectors
0,"""12311_10""","""Naturally in a film who's main themes are of ...","[3800, 4716, 0, 1807, 4840, 0, 4619, 147, 1573..."
1,"""8348_2""","""This movie is a disaster within a disaster fi...","[3766, 705, 1073, 0, 1298, 2336, 0, 1298, 1807..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ...","[1622, 4716, 1622, 3766, 1073, 0, 705, 2270, 3..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio...","[96, 1540, 2499, 4909, 2505, 66, 199, 2499, 18..."
4,"""12128_7""","""A very accurate depiction of small time mob l...","[0, 3309, 1813, 2341, 1540, 4223, 4480, 2093, ..."
5,"""2913_8""","""...as valuable as King Tut's tomb! (OK, maybe...","[4656, 133, 4656, 678, 0, 0, 0, 4814, 271, 316..."
6,"""4396_1""","""This has to be one of the biggest misfires ev...","[3766, 760, 3411, 4500, 2991, 1540, 2499, 2885..."
7,"""395_2""","""This is one of those movies I watched, and wo...","[3766, 1073, 2991, 1540, 3895, 431, 0, 2136, 3..."
8,"""10616_1""","""The worst movie i've seen in years (and i've ...","[2499, 281, 705, 0, 2423, 4194, 4716, 1203, 31..."
9,"""9074_9""","""Five medical students (Kevin Bacon, David Lab...","[4553, 1708, 2950, 3360, 0, 3300, 0, 1646, 71,..."


In [154]:
def package_dataset(file, x_train, y_train, x_test, y_test):
    np.savez(file, ((x_train, y_train), (x_test, y_test)))
# np.save(data_path + 'y_train')

In [145]:
package_dataset(data_path + 'imdb_train.npz', np.array(train_idx), train['sentiment']

In [146]:
data =np.load()

In [147]:
type(data)

numpy.lib.npyio.NpzFile

In [152]:
data['arr_0'].shape

(2, 25000)