In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sqlite3    ## SQL Interface
import pickle     ## Used to save your data - Converts objects to byte stream and vice versa

from sklearn.feature_extraction.text import CountVectorizer  ## BOW Model
from sklearn.feature_extraction.text import TfidfVectorizer  ## TFIDF Model

## Modules to perform Text Preprocessing
import re
import nltk
from nltk.corpus import stopwords

import gensim    ## To build Word2Vec model



#### Create 4 dummy reviews

In [2]:
# Four toy reviews that serve as the corpus for every vectorizer in this notebook.
# r4 is kept exactly as typed ("Pasta i tasty ..."): the outputs saved below were produced from it.
r1, r2, r3, r4 = (
    "This pasta is very tasty and affordable",
    "This pasta is not tasty and affordable",
    "This pasta is delicious and cheap",
    "Pasta i tasty and pasta tastes good",
)

#### Add reviews to a dataframe

In [4]:
# Single-column frame: one row per review, default integer index.
d = pd.DataFrame({'review': [r1, r2, r3, r4]})
d

Unnamed: 0,review
0,This pasta is very tasty and affordable
1,This pasta is not tasty and affordable
2,This pasta is delicious and cheap
3,Pasta i tasty and pasta tastes good


#### The `values` attribute returns a NumPy array of the column's values

In [6]:
# .values exposes the column's data as an array (its type is checked in the next cell).
d['review'].values

array(['This pasta is very tasty and affordable',
       'This pasta is not tasty and affordable',
       'This pasta is delicious and cheap',
       'Pasta i tasty and pasta tastes good'], dtype=object)

In [7]:
# Confirm the container type returned by .values.
type(d['review'].values)

numpy.ndarray

## Bag Of Words

In [8]:
# Learn the vocabulary from the reviews and count each term per review in a single step.
bow_vec = CountVectorizer()
bow = bow_vec.fit_transform(d['review'].values)  # document-term matrix: one row per review

In [9]:
# Inspect the container type that fit_transform returned.
type(bow)

scipy.sparse.csr.csr_matrix

**Observe:** The return value is a scipy sparse matrix - We cannot directly view it as an array

The **toarray()** method converts the sparse matrix into a dense NumPy array so that we can view its contents.

In [13]:
# Densify the sparse matrix purely for display; this is cheap for a 4-review toy corpus.
bow.toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1],
       [1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0],
       [0, 1, 0, 0, 1, 0, 0, 2, 1, 1, 0, 0]], dtype=int64)

**get_feature_names** (replaced by **get_feature_names_out** in scikit-learn 1.0 and later) returns the individual words/features, i.e. the columns of the sparse matrix

In [10]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so wrap it in list() to keep a plain Python list of terms.
bow_vec_feat = list(bow_vec.get_feature_names_out())
bow_vec_feat

['affordable',
 'and',
 'cheap',
 'delicious',
 'good',
 'is',
 'not',
 'pasta',
 'tastes',
 'tasty',
 'this',
 'very']

Creating a DataFrame to properly visualize the entire array along with features

In [12]:
# Pair the dense count matrix with the vocabulary so every column is labelled by its term.
dense_counts = bow.toarray()
bow_df = pd.DataFrame(dense_counts, columns=bow_vec_feat)
bow_df

Unnamed: 0,affordable,and,cheap,delicious,good,is,not,pasta,tastes,tasty,this,very
0,1,1,0,0,0,1,0,1,0,1,1,1
1,1,1,0,0,0,1,1,1,0,1,1,0
2,0,1,1,1,0,1,0,1,0,0,1,0
3,0,1,0,0,1,0,0,2,1,1,0,0


#### Binary Bag Of Words

In [16]:
# binary=True records presence/absence only: a term seen several times in a review is stored as 1.
bow_vec_bin = CountVectorizer(binary=True)
bow_bin = bow_vec_bin.fit_transform(d['review'].values)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
# (requires scikit-learn >= 1.0). list() keeps the vocabulary as a plain Python list.
bow_vec_bin_feat = list(bow_vec_bin.get_feature_names_out())
bow_bin_df = pd.DataFrame(bow_bin.toarray(), columns=bow_vec_bin_feat)
bow_bin_df

Unnamed: 0,affordable,and,cheap,delicious,good,is,not,pasta,tastes,tasty,this,very
0,1,1,0,0,0,1,0,1,0,1,1,1
1,1,1,0,0,0,1,1,1,0,1,1,0
2,0,1,1,1,0,1,0,1,0,0,1,0
3,0,1,0,0,1,0,0,1,1,1,0,0


**Observe:** All the values are either 0 or 1

## Creating Bi-grams BOW

In [15]:
## ngram_range is a (min_n, max_n) tuple: (2, 2) keeps bigrams only.
bow_vec_bi = CountVectorizer(ngram_range=(2, 2))
bow_bi = bow_vec_bi.fit_transform(d['review'].values)
## get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
## (requires scikit-learn >= 1.0). list() keeps the vocabulary as a plain Python list.
bow_vec_bi_feat = list(bow_vec_bi.get_feature_names_out())
bow_bi_df = pd.DataFrame(bow_bi.toarray(), columns=bow_vec_bi_feat)
bow_bi_df

Unnamed: 0,and affordable,and cheap,and pasta,delicious and,is delicious,is not,is very,not tasty,pasta is,pasta tastes,pasta tasty,tastes good,tasty and,this pasta,very tasty
0,1,0,0,0,0,0,1,0,1,0,0,0,1,1,1
1,1,0,0,0,0,1,0,1,1,0,0,0,1,1,0
2,0,1,0,1,1,0,0,0,1,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,0,1,1,1,1,0,0


## Creating both Unigrams and Bigrams BOW

In [18]:
## ngram_range is a (min_n, max_n) tuple: (1, 2) keeps both unigrams and bigrams.
bow_vec_ub = CountVectorizer(ngram_range=(1, 2))
bow_ub = bow_vec_ub.fit_transform(d['review'].values)
## get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
## (requires scikit-learn >= 1.0). list() keeps the vocabulary as a plain Python list.
bow_vec_ub_feat = list(bow_vec_ub.get_feature_names_out())
bow_ub_df = pd.DataFrame(bow_ub.toarray(), columns=bow_vec_ub_feat)
bow_ub_df

Unnamed: 0,affordable,and,and affordable,and cheap,and pasta,cheap,delicious,delicious and,good,is,...,pasta tastes,pasta tasty,tastes,tastes good,tasty,tasty and,this,this pasta,very,very tasty
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,1,1,1,1,1,1
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,1,1,1,1,0,0
2,0,1,0,1,0,1,1,1,0,1,...,0,0,0,0,0,0,1,1,0,0
3,0,1,0,0,1,0,0,0,1,0,...,1,1,1,1,1,1,0,0,0,0


In [19]:
# Show the complete feature list: the DataFrame display above elides the middle columns with '...'.
bow_vec_ub_feat

['affordable',
 'and',
 'and affordable',
 'and cheap',
 'and pasta',
 'cheap',
 'delicious',
 'delicious and',
 'good',
 'is',
 'is delicious',
 'is not',
 'is very',
 'not',
 'not tasty',
 'pasta',
 'pasta is',
 'pasta tastes',
 'pasta tasty',
 'tastes',
 'tastes good',
 'tasty',
 'tasty and',
 'this',
 'this pasta',
 'very',
 'very tasty']

## TF-IDF

In [20]:
# Same vocabulary as the unigram bag of words, but each cell holds a TF-IDF weight instead of a raw count.
tf_idf_vec = TfidfVectorizer()
tf_idf = tf_idf_vec.fit_transform(d['review'].values)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
# (requires scikit-learn >= 1.0). list() keeps the vocabulary as a plain Python list.
tf_idf_feat = list(tf_idf_vec.get_feature_names_out())
tf_idf_df = pd.DataFrame(tf_idf.toarray(), columns=tf_idf_feat)
tf_idf_df

Unnamed: 0,affordable,and,cheap,delicious,good,is,not,pasta,tastes,tasty,this,very
0,0.428304,0.28349,0.0,0.0,0.0,0.346749,0.0,0.28349,0.0,0.346749,0.346749,0.543249
1,0.428304,0.28349,0.0,0.0,0.0,0.346749,0.543249,0.28349,0.0,0.346749,0.346749,0.0
2,0.0,0.284711,0.545589,0.545589,0.0,0.348242,0.0,0.284711,0.0,0.0,0.348242,0.0
3,0.0,0.268798,0.0,0.0,0.515094,0.0,0.0,0.537595,0.515094,0.328778,0.0,0.0


## Creating Bi-grams TFIDF

In [21]:
# ngram_range is a (min_n, max_n) tuple: (2, 2) computes TF-IDF weights over bigrams only.
tf_idf_bi_vec = TfidfVectorizer(ngram_range=(2, 2))
tf_idf_bi = tf_idf_bi_vec.fit_transform(d['review'].values)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
# (requires scikit-learn >= 1.0). list() keeps the vocabulary as a plain Python list.
tf_idf_bi_feat = list(tf_idf_bi_vec.get_feature_names_out())
tf_idf_bi_df = pd.DataFrame(tf_idf_bi.toarray(), columns=tf_idf_bi_feat)
tf_idf_bi_df

Unnamed: 0,and affordable,and cheap,and pasta,delicious and,is delicious,is not,is very,not tasty,pasta is,pasta tastes,pasta tasty,tastes good,tasty and,this pasta,very tasty
0,0.402134,0.0,0.0,0.0,0.0,0.0,0.510056,0.0,0.325562,0.0,0.0,0.0,0.325562,0.325562,0.510056
1,0.402134,0.0,0.0,0.0,0.0,0.510056,0.0,0.510056,0.325562,0.0,0.0,0.0,0.325562,0.325562,0.0
2,0.0,0.511992,0.0,0.511992,0.511992,0.0,0.0,0.0,0.326798,0.0,0.0,0.0,0.0,0.326798,0.0
3,0.0,0.0,0.47633,0.0,0.0,0.0,0.0,0.0,0.0,0.47633,0.47633,0.47633,0.304035,0.0,0.0


**Similarly, we can create n-grams in any range.**

## Word2Vec

**Note:** First we need to create a list of lists, where the outer list contains all the reviews and each inner list contains the words of one review

In [22]:
## Tokenise each review on whitespace and lowercase every token,
## producing one inner list of words per review.
list_of_sent = [[w.lower() for w in se.split()] for se in d['review'].values]
list_of_sent

[['this', 'pasta', 'is', 'very', 'tasty', 'and', 'affordable'],
 ['this', 'pasta', 'is', 'not', 'tasty', 'and', 'affordable'],
 ['this', 'pasta', 'is', 'delicious', 'and', 'cheap'],
 ['pasta', 'i', 'tasty', 'and', 'pasta', 'tastes', 'good']]

#### Using Gensim module to build W2V models

In [23]:
## Default min_count = 5; min_count = 1 keeps words that occur only once in this tiny corpus.
## The embedding-size keyword is `vector_size` in gensim >= 4.0 and `size` in older releases,
## so try the current name first and fall back to the legacy one.
try:
    w2v_model = gensim.models.Word2Vec(list_of_sent, min_count=1, vector_size=5)
except TypeError:  # gensim < 4.0 does not accept `vector_size`
    w2v_model = gensim.models.Word2Vec(list_of_sent, min_count=1, size=5)

We can get all the unique words in the model by converting the **wv.vocab** attribute to a list (gensim 4.0 and later removed this attribute; use **wv.key_to_index** there instead)

In [25]:
## gensim >= 4.0 removed `wv.vocab` in favour of `wv.key_to_index`; both map each word to its
## vocabulary entry, so iterating either one yields the unique words. Prefer the new attribute
## and fall back to the legacy one on older releases.
keyed_vectors = w2v_model.wv
if hasattr(keyed_vectors, "key_to_index"):
    words = list(keyed_vectors.key_to_index)
else:
    words = list(keyed_vectors.vocab)
words

['pasta',
 'this',
 'affordable',
 'not',
 'cheap',
 'is',
 'very',
 'and',
 'i',
 'delicious',
 'good',
 'tasty',
 'tastes']

## Pickle Files in Python
* The pickle module implements a fundamental, but powerful algorithm for serializing and de-serializing a Python object structure.
* **Pickling** - is the process whereby a Python object hierarchy is converted into a byte stream
* **Unpickling** - is the inverse operation, whereby a byte stream is converted back into an object hierarchy.

Pickling (and unpickling) is alternatively known as serialization, marshalling, or flattening.

Refer link for detailed explanation - http://www.diveintopython3.net/serializing.html

#### Saving Data to the pickle file
* The pickle module takes a Python data structure and saves it to a file.
* Be sure to open your pickle files in binary mode, or the data will get corrupted during writing.
* Not every Python data structure can be serialized by the pickle module.

In [2]:
# Small sample object to round-trip through pickle.
x = list(range(1, 5))

In [3]:
# Binary write mode ('wb') is required: pickle produces a byte stream, not text.
with open('dummy.pkl', 'wb') as pkl_out:
    pickle.dump(x, pkl_out)

#### Loading data from a pickle file
* Open the relevant pickle file.
* Load the serialized data into a new variable.
* Now you’ve read the serialized data from that file and created a perfect replica of the original data structure.

In [4]:
# Binary read mode ('rb') mirrors the 'wb' mode used when the file was written.
# Only unpickle files you trust: loading a crafted pickle can execute arbitrary code.
with open('dummy.pkl', 'rb') as pkl_in:
    y = pickle.load(pkl_in)

In [5]:
# Display the object restored from dummy.pkl.
y

[1, 2, 3, 4]