In [1]:
#importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

In [2]:
#importing the dataset
import sqlite3
con = sqlite3.connect('./database.sqlite')
#filtering only positive and negative reviews
data = pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3
""", con)

In [3]:
data = data[['Text', 'Score']]

In [4]:
def map_score(score):
    if score>3:
        return 'Positive'
    else:
        return 'Negative'
data['Score'] = data['Score'].apply(map_score)

In [5]:
data.head()

Unnamed: 0,Text,Score
0,I have bought several of the Vitality canned d...,Positive
1,Product arrived labeled as Jumbo Salted Peanut...,Negative
2,This is a confection that has been around a fe...,Positive
3,If you are looking for the secret ingredient i...,Negative
4,Great taffy at a great price. There was a wid...,Positive


In [None]:
#deduplication 


# TEXT PREPROCESSING

In [6]:
#Getting Stopwords
import string
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
stop = stopwords.words('english')
sno = nltk.stem.SnowballStemmer('english')
print (stop)
print('***************************************')
print(sno.stem('tasty'))

[nltk_data] Downloading package stopwords to C:\Users\TANMAY
[nltk_data]     LATA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', '

In [9]:
#Functions for cleaning HTML tags and punctuation 
def clean_html(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext
def clean_punc(word):
    cleaned = re.sub(r'[?|!|\'|#]', r'', word)
    cleaned = re.sub(r'[.|,|)|(|\|/]', r' ', cleaned)
    return cleaned

In [10]:
#Creating a list of filtered sentences:
final_string = []
s = ''
for sentence in data['Text'].values:
    filtered_sentence = []
    sentence = clean_html(sentence)
    for word in sentence.split():
        for cleaned_word in clean_punc(word).split():
            if (cleaned_word.isalpha() and (len(cleaned_word) > 2) and cleaned_word not in stop):
                s = (sno.stem(cleaned_word.lower())).encode('utf8')
                filtered_sentence.append(s)
            else:
                continue
                 
    strl = b' '.join(filtered_sentence)
    final_string.append(strl)

In [11]:
data['Cleaned Text'] = final_string

In [12]:
data.head()

Unnamed: 0,Text,Score,Cleaned Text
0,I have bought several of the Vitality canned d...,Positive,b'bought sever vital can dog food product foun...
1,Product arrived labeled as Jumbo Salted Peanut...,Negative,b'product arriv label jumbo salt peanut peanut...
2,This is a confection that has been around a fe...,Positive,b'this confect around centuri light pillowi ci...
3,If you are looking for the secret ingredient i...,Negative,b'look secret ingredi robitussin believ found ...
4,Great taffy at a great price. There was a wid...,Positive,b'great taffi great price there wide assort yu...


# Bag of Words

In [14]:
count_vect = CountVectorizer()
bow_data = count_vect.fit_transform(data['Cleaned Text'].values)

In [17]:
bow_data

<525814x70780 sparse matrix of type '<class 'numpy.int64'>'
	with 17378570 stored elements in Compressed Sparse Row format>