In [89]:
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
import numpy as np

In [90]:
train = pd.read_csv("Replaced.csv", encoding='ISO-8859-1')

#### Number of Words

This gives number of words for each reviews

In [92]:
train['word_count'] = train['text'].apply(lambda x: len(str(x).split(" ")))
train[['text','word_count']].head()

Unnamed: 0,text,word_count
0,i love this album. it's very good. more to the...,41
1,Good flavor. This review was collected as part...,11
2,Good flavor.,2
3,I read through the reviews on here before look...,124
4,My husband bought this gel for us. The gel cau...,25


#### Number of characters

This gives number of characters for each reviews

In [94]:
train['char_count'] = train['text'].str.len() ## this also includes spaces
train[['text','char_count']].head()

Unnamed: 0,text,char_count
0,i love this album. it's very good. more to the...,201.0
1,Good flavor. This review was collected as part...,62.0
2,Good flavor.,12.0
3,I read through the reviews on here before look...,696.0
4,My husband bought this gel for us. The gel cau...,132.0


#### Average Word Length

This gives average length of words for each review

#### Eliminating empty cells or cells containing NaN values

It is common to filter out such data, before applying your further operations, using 'notnull' idiom on your dataframe

In [96]:
train = train[train['text'].notnull()]

In [97]:
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

train['avg_word'] = train['text'].apply(lambda x: avg_word(x))
train[['text','avg_word']].head()

Unnamed: 0,text,avg_word
0,i love this album. it's very good. more to the...,3.926829
1,Good flavor. This review was collected as part...,4.727273
2,Good flavor.,5.5
3,I read through the reviews on here before look...,4.620968
4,My husband bought this gel for us. The gel cau...,4.32


#### Number of stopwords

Stop words are words which are filtered out before or after processing of natural language data (text). We remove some of the most common words—including lexical words, such as "want" in order to improve performance.

In [99]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

train['stopwords'] = train['text'].apply(lambda x: len([x for x in x.split() if x in stop]))
train[['text','stopwords']].head()

Unnamed: 0,text,stopwords
0,i love this album. it's very good. more to the...,21
1,Good flavor. This review was collected as part...,4
2,Good flavor.,0
3,I read through the reviews on here before look...,57
4,My husband bought this gel for us. The gel cau...,9


#### Number of special characters

Find the count of special characters such as '#' , '@' etc present in the reviews

In [101]:
train['hastags'] = train['text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
train[['text','hastags']].head()

Unnamed: 0,text,hastags
0,i love this album. it's very good. more to the...,0
1,Good flavor. This review was collected as part...,0
2,Good flavor.,0
3,I read through the reviews on here before look...,0
4,My husband bought this gel for us. The gel cau...,0


In [102]:
train['@ character'] = train['text'].apply(lambda x: len([x for x in x.split() if x.startswith('@')]))
train[['text','@ character']].head()

Unnamed: 0,text,@ character
0,i love this album. it's very good. more to the...,0
1,Good flavor. This review was collected as part...,0
2,Good flavor.,0
3,I read through the reviews on here before look...,0
4,My husband bought this gel for us. The gel cau...,0


#### Number of numerics

Find the count of digits present in the reviews

In [104]:
train['numerics'] = train['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
train[['text','numerics']].head()

Unnamed: 0,text,numerics
0,i love this album. it's very good. more to the...,0
1,Good flavor. This review was collected as part...,0
2,Good flavor.,0
3,I read through the reviews on here before look...,0
4,My husband bought this gel for us. The gel cau...,0


#### Number of Uppercase words

Find the count of upper case words present in the reviews

In [106]:
train['upper'] = train['text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
train[['text','upper']].head()

Unnamed: 0,text,upper
0,i love this album. it's very good. more to the...,2
1,Good flavor. This review was collected as part...,0
2,Good flavor.,0
3,I read through the reviews on here before look...,7
4,My husband bought this gel for us. The gel cau...,1


### Basic Pre-processing

#### Lower case

Convert all the words in the reviews to lowercase for processing

In [109]:
train['text'] = train['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['text'].head()

0    i love this album. it's very good. more to the...
1    good flavor. this review was collected as part...
2                                         good flavor.
3    i read through the reviews on here before look...
4    my husband bought this gel for us. the gel cau...
Name: text, dtype: object

#### Removing Punctuation

Remove any punctuations present in the reviews for processing

In [111]:
train['text'] = train['text'].str.replace('[^\w\s]','')
train['text'].head()

0    i love this album its very good more to the hi...
1    good flavor this review was collected as part ...
2                                          good flavor
3    i read through the reviews on here before look...
4    my husband bought this gel for us the gel caus...
Name: text, dtype: object

#### Removal of Stop Words

Stop words are words which are filtered out before or after processing of natural language data (text). We remove some of the most common words—including lexical words, such as "want" in order to improve performance.

In [113]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
train['text'] = train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['text'].head()

0    love album good hip hop side current pop sound...
1          good flavor review collected part promotion
2                                          good flavor
3    read reviews looking buying one couples lubric...
4    husband bought gel us gel caused irritation fe...
Name: text, dtype: object

#### Common word removal

We also remove commonly occurring words from our text data.

In [115]:
freq = pd.Series(' '.join(train['text']).split()).value_counts()[:10]
freq

great        20936
product      20247
movie        19729
review       18906
part         18665
promotion    17733
collected    17725
love         16885
use          15946
good         12197
dtype: int64

#### Now, let’s remove these words as their presence will not of any use in classification of our text data.

In [117]:
freq = list(freq.index)
train['text'] = train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['text'].head()

0    album hip hop side current pop sound hype list...
1                                               flavor
2                                               flavor
3    read reviews looking buying one couples lubric...
4    husband bought gel us gel caused irritation fe...
Name: text, dtype: object

#### Rare words removal

Remove the words which rarely occur in the product reviews.
Because they’re so rare, the association between them and other words is dominated by noise. 
You can replace rare words with a more general form and then this will have higher counts

In [119]:
freq = pd.Series(' '.join(train['text']).split()).value_counts()[-10:]
freq

moviehighly    1
natureand      1
brownies       1
graduating     1
prodeict       1
minimalized    1
invisioned     1
wellbad        1
muffle         1
mulitple       1
dtype: int64

In [120]:
freq = list(freq.index)
train['text'] = train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['text'].head()

0    album hip hop side current pop sound hype list...
1                                               flavor
2                                               flavor
3    read reviews looking buying one couples lubric...
4    husband bought gel us gel caused irritation fe...
Name: text, dtype: object

#### All these pre-processing steps are essential and help us in reducing our vocabulary clutter so that the features produced in the end are more effective.

#### Spelling correction

Spelling correction is a useful pre-processing step because this also will help us in reducing multiple copies of words. For example, “Analytics” and “analytcs” will be treated as different words even if they are used in the same sense.

To achieve this we will use the textblob library.

In [123]:
## install textBlob to avoid getting error for running below command using
# conda install -c conda-forge textblob 

In [124]:
from textblob import TextBlob
train['text'][:5].apply(lambda x: str(TextBlob(x).correct()))

0    album hip hop side current pop sound hope list...
1                                               flavor
2                                               flavor
3    read reviews looking buying one couples lubric...
4    husband bought get us get caused irritation fe...
Name: text, dtype: object

#### Tokenization

Tokenization refers to dividing the text into a sequence of words or sentences. 
We have used the textblob library to first transform our reviews into a blob and then converted them into a series of words.

In [126]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Vrushali
[nltk_data]     Shah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [127]:
TextBlob(train['text'][0]).words

WordList(['album', 'hip', 'hop', 'side', 'current', 'pop', 'sound', 'hype', 'listen', 'everyday', 'gym', 'give', '5star', 'rating', 'way', 'metaphors', 'crazy'])

#### Stemming

Stemming refers to the removal of suffices, like “ing”, “ly”, “s”, etc. by a simple rule-based approach. For this purpose, we will use PorterStemmer from the NLTK library.

In [129]:
from nltk.stem import PorterStemmer
st = PorterStemmer()
train['text'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0    album hip hop side current pop sound hype list...
1                                               flavor
2                                               flavor
3    read review look buy one coupl lubric ultim di...
4    husband bought gel us gel caus irrit felt like...
Name: text, dtype: object

#### Lemmatization

Lemmatization is a more effective option than stemming because it converts the word into its root word, rather than just stripping the suffices. It makes use of the vocabulary and does a morphological analysis to obtain the root word. Therefore, we usually prefer using lemmatization over stemming.

In [131]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Vrushali
[nltk_data]     Shah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [132]:
from textblob import Word
train['text'] = train['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['text'].head()

0    album hip hop side current pop sound hype list...
1                                               flavor
2                                               flavor
3    read review looking buying one couple lubrican...
4    husband bought gel u gel caused irritation fel...
Name: text, dtype: object

### Advance Text Processing

#### N-grams

N-grams are the combination of multiple words used together. Ngrams with N=1 are called unigrams. Similarly, bigrams (N=2), trigrams (N=3) and so on can also be used.

In [135]:
TextBlob(train['text'][0]).ngrams(2)

[WordList(['album', 'hip']),
 WordList(['hip', 'hop']),
 WordList(['hop', 'side']),
 WordList(['side', 'current']),
 WordList(['current', 'pop']),
 WordList(['pop', 'sound']),
 WordList(['sound', 'hype']),
 WordList(['hype', 'listen']),
 WordList(['listen', 'everyday']),
 WordList(['everyday', 'gym']),
 WordList(['gym', 'give']),
 WordList(['give', '5star']),
 WordList(['5star', 'rating']),
 WordList(['rating', 'way']),
 WordList(['way', 'metaphor']),
 WordList(['metaphor', 'crazy'])]

#### Term frequency

Term frequency is simply the ratio of the count of a word present in a sentence, to the length of the sentence.

Therefore, we can generalize term frequency as:

TF = (Number of times term T appears in the particular row) / (number of terms in that row)

In [137]:
tf1 = (train['text'][1:2]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
tf1

Unnamed: 0,words,tf
0,flavor,1


#### Inverse Document Frequency

The intuition behind inverse document frequency (IDF) is that a word is not of much use to us if it’s appearing in all the documents.

Therefore, the IDF of each word is the log of the ratio of the total number of rows to the number of rows in which that word is present.

IDF = log(N/n), where, N is the total number of rows and n is the number of rows in which the word was present.

In [139]:
for i,word in enumerate(tf1['words']):
  tf1.loc[i, 'idf'] = np.log(train.shape[0]/(len(train[train['text'].str.contains(word)])))

tf1

Unnamed: 0,words,tf,idf
0,flavor,1,5.141692


The more the value of IDF, the more unique is the word.

#### Term Frequency – Inverse Document Frequency (TF-IDF)

TF-IDF is the multiplication of the TF and IDF which we calculated above.

In [142]:
tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1

Unnamed: 0,words,tf,idf,tfidf
0,flavor,1,5.141692,5.141692


We don’t have to calculate TF and IDF every time beforehand and then multiply it to obtain TF-IDF. Instead, sklearn has a separate function to directly obtain it:

***TfidfVectorizer***

The TfidfVectorizer will tokenize documents, learn the vocabulary and inverse document frequency weightings, and allow you to encode new documents.

In [144]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(train['text'])

train_vect

<70967x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 721460 stored elements in Compressed Sparse Row format>

#### Bag of Words

Bag of Words (BoW) refers to the representation of text which describes the presence of words within the text data. The intuition behind this is that two similar text fields will contain similar kind of words, and will therefore have a similar bag of words. Further, that from the text alone we can learn something about the meaning of the document.

***CountVectorizer***

The CountVectorizer provides a simple way to both tokenize a collection of text documents and build a vocabulary of known words, but also to encode new documents using that vocabulary.

In [146]:
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(train['text'])

train_bow

<70967x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 852741 stored elements in Compressed Sparse Row format>

#### Sentiment Analysis

It is a process of computationally identifying and categorizing opinions expressed in a piece of text, especially in order to determine whether the writer's attitude towards a particular topic, product, etc., is positive, negative, or neutral.

In [148]:
train['text'][:5].apply(lambda x: TextBlob(x).sentiment)

0                 (-0.09999999999999999, 0.575)
1                                    (0.0, 0.0)
2                                    (0.0, 0.0)
3    (0.014090909090909083, 0.6594444444444445)
4                                    (0.0, 0.0)
Name: text, dtype: object

Above, you can see that it returns a tuple representing polarity and subjectivity of each tweet. Here, we only extract polarity as it indicates the sentiment as value nearer to 1 means a positive sentiment and values nearer to -1 means a negative sentiment. This can also work as a feature for building a machine learning model.

In [150]:
train['sentiment'] = train['text'].apply(lambda x: TextBlob(x).sentiment[0] )
train[['text','sentiment']].head()

Unnamed: 0,text,sentiment
0,album hip hop side current pop sound hype list...,-0.1
1,flavor,0.0
2,flavor,0.0
3,read review looking buying one couple lubrican...,0.014091
4,husband bought gel u gel caused irritation fel...,0.0


#### Hashing with HashingVectorizer

***HashingVectorizer***

The HashingVectorizer class implements this approach that can be used to consistently hash words, then tokenize and encode documents as needed.

In [162]:
from sklearn.feature_extraction.text import HashingVectorizer
# create the transform
vectorizer = HashingVectorizer(n_features=20)
# encode document
vector = vectorizer.transform(train['text'])

vector

<70967x20 sparse matrix of type '<class 'numpy.float64'>'
	with 648047 stored elements in Compressed Sparse Row format>

In [163]:
train['text'][:5].apply(lambda x: TextBlob(x).sentiment)

0                 (-0.09999999999999999, 0.575)
1                                    (0.0, 0.0)
2                                    (0.0, 0.0)
3    (0.014090909090909083, 0.6594444444444445)
4                                    (0.0, 0.0)
Name: text, dtype: object

In [164]:
train['sentiment'] = train['text'].apply(lambda x: TextBlob(x).sentiment[0] )
train[['text','sentiment']].head()

Unnamed: 0,text,sentiment
0,album hip hop side current pop sound hype list...,-0.1
1,flavor,0.0
2,flavor,0.0
3,read review looking buying one couple lubrican...,0.014091
4,husband bought gel u gel caused irritation fel...,0.0


These methods will help in extracting more information which in return will help you in building better models.