In [1]:
import warnings

warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", DeprecationWarning)

In [2]:
import os
import pandas as pd

dataset = "tarun.csv"

if os.path.isfile(dataset):
    df = pd.read_csv("tarun.csv")
else:
    url = r"http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz"
    df = pd.read_json(url, compression='gzip', lines=True)
display(df.head(10))

Unnamed: 0.1,Unnamed: 0,0,1,2
0,2698442,2,An Amalgam,This book is an amalgam of bits and pieces and...
1,2646715,5,Great!!!,"Well, not much to say. If you saw the first se..."
2,2119569,2,Hit&Miss,Babyface in his hey day always had a Cut that ...
3,816322,5,Great Buy,This text is considered The Bible for any poli...
4,1476562,3,What time is it anyway?,I thought the other reviews weren't serious ab...
5,17087,3,PRETTY FUNNY,"GOOD,BUT UNREALISTIC.THE GUY JUST QUIT GOING T..."
6,1838666,5,Lacy J. Dalton,I saw Lacy on Bill Anderson's Country Reunion ...
7,1442704,4,Great,Easy and enjoyable to watch. I would recommend...
8,1114640,3,Be careful!!!!,I am a very advanced exerciser and have used t...
9,1308335,5,the best book in the world!!!,Sahara special is one of the best books I have...


In [3]:
df = df.drop('Unnamed: 0',axis=1)

In [4]:
df.columns=['overall','title','reviewText']

In [5]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   overall     750000 non-null  int64 
 1   title       749983 non-null  object
 2   reviewText  750000 non-null  object
dtypes: int64(1), object(2)
memory usage: 17.2+ MB
None


In [6]:
print(df["reviewText"].iloc[0])

This book is an amalgam of bits and pieces and techniques and strategies from other books about creative thinking on the market. If the reader has read very little in the field, then this book might at least be a start.


# NLP Pre-Processing

In [7]:
sample_review = df["reviewText"].iloc[749999]
print(sample_review)

I have been searching and searching for a good litter box. I Have been through so many and have wasted so much money. When I got this, our smaller and younger cat was all for it. She used it immediately. Our bigger, older kitty was hesitant. I kept an eye on him and now he is using it! There is no mess and it's so much easier to clean! I highly recommend it!!!!


In [8]:
# HTML Entities

In [9]:
import html

decoded_review = html.unescape(sample_review)
print(decoded_review)

I have been searching and searching for a good litter box. I Have been through so many and have wasted so much money. When I got this, our smaller and younger cat was all for it. She used it immediately. Our bigger, older kitty was hesitant. I kept an eye on him and now he is using it! There is no mess and it's so much easier to clean! I highly recommend it!!!!


In [10]:
pattern = r"\&\#[0-9]+\;"
df = df.sample(frac=0.05)
df["preprocessed"] = df["reviewText"].str.replace(pat=pattern, repl="", regex=True)

print(df["preprocessed"].iloc[1])

The acting is okay, but ending leaves you wondering what the point of the movie was. I wouldn't waste my money of this film.


In [11]:
%%time
import re
import nltk

from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet

#import nltk resources
#resources = ["wordnet", "stopwords", "punkt", \
 #            "averaged_perceptron_tagger", "maxent_treebank_pos_tagger"]
resources = ["wordnet", "stopwords"]
for resource in resources:
    try:
        nltk.data.find("tokenizers/" + resource)
    except LookupError:
        nltk.download(resource)

#create Lemmatizer object
lemma = WordNetLemmatizer()

def lemmatize_word(tagged_token):
    """ Returns lemmatized word given its tag"""
    root = []
    for token in tagged_token:
        tag = token[1][0]
        word = token[0]
        if tag.startswith('J'):
            root.append(lemma.lemmatize(word, wordnet.ADJ))
        elif tag.startswith('V'):
            root.append(lemma.lemmatize(word, wordnet.VERB))
        elif tag.startswith('N'):
            root.append(lemma.lemmatize(word, wordnet.NOUN))
        elif tag.startswith('R'):
            root.append(lemma.lemmatize(word, wordnet.ADV))
        else:          
            root.append(word)
    return root

def lemmatize_doc(document):
    """ Tags words then returns sentence with lemmatized words"""
    lemmatized_list = []
    tokenized_sent = sent_tokenize(document)
    for sentence in tokenized_sent:
        no_punctuation = re.sub(r"[`'\",.!?()]", " ", sentence)
        tokenized_word = word_tokenize(no_punctuation)
        tagged_token = pos_tag(tokenized_word)
        lemmatized = lemmatize_word(tagged_token)
        lemmatized_list.extend(lemmatized)
    return " ".join(lemmatized_list)

#apply our functions
df["preprocessed"] = df["preprocessed"].apply(lambda row: lemmatize_doc(row))

print(df["preprocessed"].iloc[1])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


The acting be okay but end leaf you wonder what the point of the movie be I wouldn t waste my money of this film
Wall time: 4min 9s


In [12]:
from unicodedata import normalize

remove_accent = lambda text: normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8", "ignore")

df["preprocessed"] = df["preprocessed"].apply(remove_accent)

print(df["preprocessed"].iloc[1])

The acting be okay but end leaf you wonder what the point of the movie be I wouldn t waste my money of this film


In [13]:
pattern = r"[^\w\s]"

df["preprocessed"] = df["preprocessed"].str.replace(pat=pattern, repl=" ", regex=True)

print(df["preprocessed"].iloc[1])

The acting be okay but end leaf you wonder what the point of the movie be I wouldn t waste my money of this film


In [14]:
df["preprocessed"] = df["preprocessed"].str.lower()

print(df["preprocessed"].iloc[1])

the acting be okay but end leaf you wonder what the point of the movie be i wouldn t waste my money of this film


In [15]:
from nltk.corpus import stopwords

stop_words = stopwords.words("english")

stop_words = [word.replace("\'", "") for word in stop_words]

print(f"sample stop words: {stop_words[:15]} \n")

remove_stop_words = lambda row: " ".join([token for token in row.split(" ") \
                                          if token not in stop_words])
df["preprocessed"] = df["preprocessed"].apply(remove_stop_words)

print(df["preprocessed"].iloc[1])

sample stop words: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre', 'youve', 'youll', 'youd', 'your', 'yours'] 

acting okay end leaf wonder point movie waste money film


In [16]:
pattern = r"[\s]+"

df["preprocessed"] = df["preprocessed"].str.replace(pat=pattern, repl=" ", regex=True)

print(df["preprocessed"].iloc[1])

acting okay end leaf wonder point movie waste money film


In [17]:
corpora = df["preprocessed"].values
tokenized = [corpus.split(" ") for corpus in corpora]

print(tokenized[1])

['acting', 'okay', 'end', 'leaf', 'wonder', 'point', 'movie', 'waste', 'money', 'film']


In [18]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser

bi_gram = Phrases(tokenized, min_count=300, threshold=50)

tri_gram = Phrases(bi_gram[tokenized], min_count=300, threshold=50)

In [19]:
tri_gram

<gensim.models.phrases.Phrases at 0x28ca5777fc8>

In [20]:
uni_gram_tokens = set([token for text in tokenized for token in text])
uni_gram_tokens = set(filter(lambda x: x != "", uni_gram_tokens))

print(list(uni_gram_tokens)[:50])

['flora', 'comprises', 'foresters', 'timbaland', 'bougth', 'provoking', 'celebratory', 'flickering', 'mormon', 'playseats', 'aristocats', 'bess', 'canal', 'disapppointed', 'hiddle', 'killdozer', 'juicing', 'unthematic', 'hess', 'gregg', '00765', 'walters', 'llsee', 'inaccurately', 'goofball', 'liftmaster', 'brainstorming', 'unzips', 'commonality', 'benjamin', 'brands', 'apealling', 'militaristic', 'labrador', 'perspectiva', 'loudmouth', 'gramatical', 'jance', 'deveria', 'ffviii', 'deodorant', 'memorials', 'footbagging', 'cooperate', 'dearth', 'heartening', 'cuss', 'codependency', 'matthew', 'crystles']


In [21]:
bigram_min = bi_gram.min_count

bi_condition = lambda x: x[1] >= bigram_min

bi_gram_tokens = dict(filter(bi_condition, bi_gram.vocab.items()))
bi_gram_tokens = set([token.decode("utf-8") \
                      for token in bi_gram_tokens])

bi_grams_only = bi_gram_tokens.difference(uni_gram_tokens)
print(list(bi_grams_only)[:50])

['', 'feel_like', 'go_back', 'read_review', 'really_good', '1_2', 'even_though', 'one_best', 'good_book', 'really_like', 'sound_like', 'main_character', 'waste_money', 'work_well', 'year_old', 'find_book', 'would_like', 'like_book', 'look_forward', 'pretty_good', 'read_book', 'much_good', 'would_recommend', 'think_would', 'highly_recommend', 'first_time', 'save_money', '5_star', 'year_ago', 'recommend_book', 'waste_time', 'long_time', 'work_great', 'book_read', 'book_good', 'look_like', 'great_book', 'seem_like', 'work_fine', 'buy_book']


In [22]:
trigram_min = tri_gram.min_count

tri_condition = lambda x: x[1] >= trigram_min

tri_gram_tokens = dict(filter(tri_condition, tri_gram.vocab.items()))
tri_gram_tokens = set([token.decode("utf-8") \
                       for token in tri_gram_tokens])

tri_grams_only = tri_gram_tokens.difference(bi_gram_tokens)
print(list(tri_grams_only)[:50])

[]


In [23]:
tokenized = [list(filter(lambda x: len(x) > 1, document)) \
             for document in tokenized]

print(tokenized[1])

['acting', 'okay', 'end', 'leaf', 'wonder', 'point', 'movie', 'waste', 'money', 'film']


In [24]:
len(tokenized)

37500

In [25]:
tokenized

[['well',
  'like',
  'enjoy',
  'actual',
  'music',
  'movie',
  'love',
  'cd',
  'listen',
  'song',
  'much',
  'music',
  'memory',
  'part',
  'movie',
  'relate',
  'wish',
  'actually',
  'part',
  'enjoy',
  'almost',
  'song',
  'remind',
  'seductive',
  'part',
  'movie',
  'think',
  'wonderfully',
  'make'],
 ['acting',
  'okay',
  'end',
  'leaf',
  'wonder',
  'point',
  'movie',
  'waste',
  'money',
  'film'],
 ['many',
  'dvd',
  'even',
  'cut',
  'old',
  'vhs',
  'original',
  'wonder',
  'good',
  'buy',
  'vhs',
  'version',
  'new',
  'dvd',
  'version',
  'anyone',
  'may',
  'see',
  'version',
  'please',
  'reply'],
 ['fuss',
  'disc',
  'match',
  'hype',
  'deeper',
  'deeper',
  'song',
  'worth',
  'remembering',
  'bad',
  'club',
  'music',
  'album',
  'lame'],
 ['book',
  'excellent',
  'overview',
  'lean',
  'six',
  'sigma',
  'relevant',
  'service',
  'organization',
  'fall',
  'short',
  'provide',
  'tool',
  'need',
  'implement',
  'lean'

In [26]:
# Transform each text into a vector of word counts
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words="english",
                             preprocessor=corpora)

In [27]:
corpora[1]

'acting okay end leaf wonder point movie waste money film'

In [28]:
label = df["overall"].reset_index().drop('index',axis=1)

In [29]:
label

Unnamed: 0,overall
0,4
1,1
2,3
3,1
4,3
...,...
37495,1
37496,5
37497,2
37498,5


In [30]:
from textblob import TextBlob

model_df = pd.DataFrame(corpora, columns=['text'])
model_df["label"] = label


In [31]:
list(corpora)[1]

'acting okay end leaf wonder point movie waste money film'

In [32]:
blob = TextBlob((model_df.text[1]))
str(blob.correct())

'acting okay end leaf wonder point movie waste money film'

In [33]:
def polarity_txt(text):
  return TextBlob(text).sentiment[0]

In [34]:

def subj_txt(text):
  return  TextBlob(text).sentiment[1]

In [35]:

def len_text(text):
  if len(text.split())>0:
         return len(corpora[text.index.step])/len(text.split())
  else:
         return 0

In [36]:
model_df['polarity'] = model_df['text'].apply(polarity_txt)
model_df.head(2)

Unnamed: 0,text,label,polarity
0,well like enjoy actual music movie love cd lis...,4,0.357143
1,acting okay end leaf wonder point movie waste ...,1,0.1


In [37]:
model_df['subjectivity'] = model_df['text'].apply(subj_txt)
model_df.head(2)


Unnamed: 0,text,label,polarity,subjectivity
0,well like enjoy actual music movie love cd lis...,4,0.357143,0.428571
1,acting okay end leaf wonder point movie waste ...,1,0.1,0.166667


In [38]:
model_df.index.step

1

In [39]:
#model_df['len'] = model_df['text'].apply(len_text)
#model_df.head()

In [40]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction import DictVectorizer
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]


class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, data):
        return [{'pos':  row['polarity'], 'sub': row['subjectivity'], } for _, row in data.iterrows()]

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer_a = TfidfVectorizer()

training_features = vectorizer_a.fit_transform(train_data["text"])
test_features = vectorizer_a.transform(test_data["text"])


NameError: name 'train_data' is not defined

In [46]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[

            # Pipeline for pulling features from the text
            ('text', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('tfidf', vectorizer_a),
            ])),

            # Pipeline for pulling metadata features
            ('stats', Pipeline([
                ('selector', ItemSelector(key=['polarity', 'subjectivity'])),
                ('stats', TextStats()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ])),

        ],

        # weight components in FeatureUnion
        transformer_weights={
            'text': 0.9,
            'stats': 1.5,
        },
    ))
])

In [43]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
seed = 40
X = model_df[['text', 'polarity', 'subjectivity']]
y =model_df['label']
encoder = LabelEncoder()
y = encoder.fit_transform(y)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, stratify=y)

In [47]:
pipeline.fit?

In [48]:
pipeline.fit(x_train)


Pipeline(memory=None,
         steps=[('union',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('text',
                                                 Pipeline(memory=None,
                                                          steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('tfidf',
                                                                  TfidfVectorizer(analyzer='word',
                                                                                  binary=False,
                                                                                  decode_error='strict',
                                                                                  dtype=<class 'numpy.float64'>,
                                                                                  encoding='utf-8',
                      

In [49]:

%%time
train_vec = pipeline.transform(x_train)
test_vec = pipeline.transform(x_test)
print("Checking that the number of features in train and test correspond: %s - %s" % (train_vec.shape, test_vec.shape))

Checking that the number of features in train and test correspond: (30000, 54661) - (7500, 54661)
Wall time: 4.5 s


In [51]:
from sklearn.linear_model import SGDClassifier
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from datetime import date

clf_sv = LinearSVC(C=1, class_weight='balanced', multi_class='ovr', random_state=40, max_iter=10000) #Support Vector machines
clf_sgd = SGDClassifier(max_iter=200,) # Stochastic Gradient Classifier

In [52]:
%%time
from sklearn.model_selection import cross_val_score

clfs = [clf_sv, clf_sgd]
cv = 3
for clf in clfs:
    scores = cross_val_score(clf,train_vec, y_train, cv=cv, scoring="accuracy" )
    print (scores)
    print (("Mean score: {0:.3f} (+/-{1:.3f})").format(
        np.mean(scores), np.std(scores)))

[0.4118 0.4123 0.4081]
Mean score: 0.411 (+/-0.002)
[0.4291 0.4202 0.4238]
Mean score: 0.424 (+/-0.004)
Wall time: 7.97 s


In [53]:

%%time
from sklearn.metrics import classification_report
clf_sv.fit(train_vec, y_train )
y_pred = clf_sv.predict(test_vec)
list_result =[]
list_result.append(("SVC",accuracy_score(y_test, y_pred)))
clf_sgd.fit(train_vec, y_train )
y_pred = clf_sgd.predict(test_vec)
list_result.append(("SGD",accuracy_score(y_test, y_pred)))

Wall time: 4.42 s


In [54]:
list_result

[('SVC', 0.42173333333333335), ('SGD', 0.4252)]

In [55]:

#vectorizer
cv=CountVectorizer()
msk = np.random.rand(len(model_df)) < 0.8# this steps generates word counts for the words in your docs
train_data = model_df[msk]
test_data = model_df[~msk]

training_features = cv.fit_transform(train_data["text"])    
test_features = cv.transform(test_data["text"])

# Training
model = LinearSVC()
model.fit(training_features, train_data["label"])
y_pred = model.predict(test_features)

# Evaluation
acc = accuracy_score(test_data["label"], y_pred)

print("Accuracy on the IMDB dataset: {:.2f}".format(acc*100))

Accuracy on the IMDB dataset: 36.59


In [57]:
print((train_data["text"]))

0        well like enjoy actual music movie love cd lis...
2        many dvd even cut old vhs original wonder good...
3        fuss disc match hype deeper deeper song worth ...
4        book excellent overview lean six sigma relevan...
5        think soil work great tomato soil really heavy...
                               ...                        
37495    longtime amazon customer first time ever drive...
37496    trick fill base little hot water honey flow fa...
37497    sell 100 length 14 gauge price wire appear 14 ...
37498    really like night cream skin dry hydrate nicel...
37499    ok take word back find solution connection pro...
Name: text, Length: 30068, dtype: object


In [58]:



# Transform each text into a vector of word counts
vectorizer_a = TfidfVectorizer()

training_features = vectorizer_a.fit_transform(train_data["text"])
test_features = vectorizer_a.transform(test_data["text"])

# Training
model = LinearSVC()
model.fit(training_features, train_data["label"])
y_pred = model.predict(test_features)

# Evaluation
acc = accuracy_score(test_data["label"], y_pred)

print("Accuracy on the IMDB dataset: {:.2f}".format(acc*100))

Accuracy on the IMDB dataset: 40.61


In [59]:
from sklearn import tree

model = tree.DecisionTreeClassifier()
model.fit(training_features, train_data["label"])
y_pred = model.predict(test_features)
# Evaluation
acc = accuracy_score(test_data["label"], y_pred)

print("Accuracy on the Amazon dataset: {:.2f}".format(acc*100))

Accuracy on the Amazon dataset: 30.34


In [60]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(training_features, train_data["label"])
y_pred = model.predict(test_features)
# Evaluation
acc = accuracy_score(test_data["label"], y_pred)

print("Accuracy on the Amazon dataset: {:.2f}".format(acc*100))

Accuracy on the Amazon dataset: 41.23
