# Amazon Reviews Data Wrangling

In [1]:
import warnings

warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", DeprecationWarning)

In [2]:
import os
import pandas as pd

dataset = "tarun.csv"

if os.path.isfile(dataset):
    df = pd.read_csv("tarun.csv")
else:
    url = r"http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz"
    df = pd.read_json(url, compression='gzip', lines=True)
display(df.head(10))

Unnamed: 0.1,Unnamed: 0,0,1,2
0,2698442,2,An Amalgam,This book is an amalgam of bits and pieces and...
1,2646715,5,Great!!!,"Well, not much to say. If you saw the first se..."
2,2119569,2,Hit&Miss,Babyface in his hey day always had a Cut that ...
3,816322,5,Great Buy,This text is considered The Bible for any poli...
4,1476562,3,What time is it anyway?,I thought the other reviews weren't serious ab...
5,17087,3,PRETTY FUNNY,"GOOD,BUT UNREALISTIC.THE GUY JUST QUIT GOING T..."
6,1838666,5,Lacy J. Dalton,I saw Lacy on Bill Anderson's Country Reunion ...
7,1442704,4,Great,Easy and enjoyable to watch. I would recommend...
8,1114640,3,Be careful!!!!,I am a very advanced exerciser and have used t...
9,1308335,5,the best book in the world!!!,Sahara special is one of the best books I have...


In [3]:
df = df.drop('Unnamed: 0',axis=1)

In [4]:
df.columns=['overall','title','reviewText']

In [5]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   overall     750000 non-null  int64 
 1   title       749983 non-null  object
 2   reviewText  750000 non-null  object
dtypes: int64(1), object(2)
memory usage: 17.2+ MB
None


The unixReviewTime is converted from Unix time to the more intuitive datetime datatype.

In [6]:
print(df["reviewText"].iloc[0])

This book is an amalgam of bits and pieces and techniques and strategies from other books about creative thinking on the market. If the reader has read very little in the field, then this book might at least be a start.


Each review is associated with a rating stored under the overall field. This serves as the quantified summary of a given review and will thus be used as the ground truth labels for the model.

In [7]:
print(df.overall.unique())

[2 5 3 4 1]


# NLP Pre-Processing

In [8]:
sample_review = df["reviewText"].iloc[749999]
print(sample_review)

I have been searching and searching for a good litter box. I Have been through so many and have wasted so much money. When I got this, our smaller and younger cat was all for it. She used it immediately. Our bigger, older kitty was hesitant. I kept an eye on him and now he is using it! There is no mess and it's so much easier to clean! I highly recommend it!!!!


# HTML Entities

Some special characters like the apostrophe (’) and the en dash (–) are expressed as a set of numbers prefixed by &# and suffixed by ;. This is because the dataset was scraped from an HTML parser, and the dataset itself includes data that predated the universal UTF-8 standard.

These HTML Entities can be decoded by importing the html library.

In [9]:
import html

decoded_review = html.unescape(sample_review)
print(decoded_review)

I have been searching and searching for a good litter box. I Have been through so many and have wasted so much money. When I got this, our smaller and younger cat was all for it. She used it immediately. Our bigger, older kitty was hesitant. I kept an eye on him and now he is using it! There is no mess and it's so much easier to clean! I highly recommend it!!!!


Since punctuation marks do not add value in the way we'll perform NLP, all the HTML entities in the review texts can be dropped. The output series preprocessed is our reviewText but without the special characters.

In [10]:
pattern = r"\&\#[0-9]+\;"
df = df.sample(frac=0.03)
df["preprocessed"] = df["reviewText"].str.replace(pat=pattern, repl="", regex=True)

print(df["preprocessed"].iloc[1])

She is so Amazing! And so hard kore rawk! It's insane!Her new album is gonna be totally hawt!(sarcasm)


In [11]:
df

Unnamed: 0,overall,title,reviewText,preprocessed
406586,1,Outset QS70 Square Stainless Steel Grill Wok,very thin and cheap not what I expected at all...,very thin and cheap not what I expected at all...
128092,1,Awesome album from a Rock Legend,She is so Amazing! And so hard kore rawk! It's...,She is so Amazing! And so hard kore rawk! It's...
36081,1,don't waste your time,The video actually says it teaches your dog ho...,The video actually says it teaches your dog ho...
357321,1,Not so durable blades,"You get what you pay for. In this case, you ge...","You get what you pay for. In this case, you ge..."
605023,5,excellent description of NVLD for kids needing...,this book really helps older elementary and mi...,this book really helps older elementary and mi...
...,...,...,...,...
100302,2,I almost quite reading before I got to the bet...,This book is a slow starter and its beginning ...,This book is a slow starter and its beginning ...
196010,3,good not great,I like this mouse but some buttons seem a litt...,I like this mouse but some buttons seem a litt...
50024,1,This phone...,I've owned this phone for about 2 weeks now an...,I've owned this phone for about 2 weeks now an...
499883,4,HURRIED ENDING,For the most part i really enjoyed this book. ...,For the most part i really enjoyed this book. ...


In [12]:
%%time
import re
import nltk

from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet

#import nltk resources
#resources = ["wordnet", "stopwords", "punkt", \
 #            "averaged_perceptron_tagger", "maxent_treebank_pos_tagger"]
resources = ["wordnet", "stopwords"]
for resource in resources:
    try:
        nltk.data.find("tokenizers/" + resource)
    except LookupError:
        nltk.download(resource)

#create Lemmatizer object
lemma = WordNetLemmatizer()

def lemmatize_word(tagged_token):
    """ Returns lemmatized word given its tag"""
    root = []
    for token in tagged_token:
        tag = token[1][0]
        word = token[0]
        if tag.startswith('J'):
            root.append(lemma.lemmatize(word, wordnet.ADJ))
        elif tag.startswith('V'):
            root.append(lemma.lemmatize(word, wordnet.VERB))
        elif tag.startswith('N'):
            root.append(lemma.lemmatize(word, wordnet.NOUN))
        elif tag.startswith('R'):
            root.append(lemma.lemmatize(word, wordnet.ADV))
        else:          
            root.append(word)
    return root

def lemmatize_doc(document):
    """ Tags words then returns sentence with lemmatized words"""
    lemmatized_list = []
    tokenized_sent = sent_tokenize(document)
    for sentence in tokenized_sent:
        no_punctuation = re.sub(r"[`'\",.!?()]", " ", sentence)
        tokenized_word = word_tokenize(no_punctuation)
        tagged_token = pos_tag(tokenized_word)
        lemmatized = lemmatize_word(tagged_token)
        lemmatized_list.extend(lemmatized)
    return " ".join(lemmatized_list)

#apply our functions
df["preprocessed"] = df["preprocessed"].apply(lambda row: lemmatize_doc(row))

print(df["preprocessed"].iloc[1])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


She be so Amazing And so hard kore rawk It s insane Her new album be gon na be totally hawt sarcasm
Wall time: 2min 37s


In [13]:
from unicodedata import normalize

remove_accent = lambda text: normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8", "ignore")

df["preprocessed"] = df["preprocessed"].apply(remove_accent)

print(df["preprocessed"].iloc[1])

She be so Amazing And so hard kore rawk It s insane Her new album be gon na be totally hawt sarcasm


In [14]:
pattern = r"[^\w\s]"

df["preprocessed"] = df["preprocessed"].str.replace(pat=pattern, repl=" ", regex=True)

print(df["preprocessed"].iloc[1])

She be so Amazing And so hard kore rawk It s insane Her new album be gon na be totally hawt sarcasm


In [15]:
df["preprocessed"] = df["preprocessed"].str.lower()

print(df["preprocessed"].iloc[1])

she be so amazing and so hard kore rawk it s insane her new album be gon na be totally hawt sarcasm


In [16]:
from nltk.corpus import stopwords

stop_words = stopwords.words("english")

stop_words = [word.replace("\'", "") for word in stop_words]

print(f"sample stop words: {stop_words[:15]} \n")

remove_stop_words = lambda row: " ".join([token for token in row.split(" ") \
                                          if token not in stop_words])
df["preprocessed"] = df["preprocessed"].apply(remove_stop_words)

print(df["preprocessed"].iloc[1])

sample stop words: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre', 'youve', 'youll', 'youd', 'your', 'yours'] 

amazing hard kore rawk insane new album gon na totally hawt sarcasm


In [17]:
pattern = r"[\s]+"

df["preprocessed"] = df["preprocessed"].str.replace(pat=pattern, repl=" ", regex=True)

print(df["preprocessed"].iloc[1])

amazing hard kore rawk insane new album gon na totally hawt sarcasm


In [18]:
corpora = df["preprocessed"].values
tokenized = [corpus.split(" ") for corpus in corpora]

print(tokenized[1])

['amazing', 'hard', 'kore', 'rawk', 'insane', 'new', 'album', 'gon', 'na', 'totally', 'hawt', 'sarcasm']


In [19]:
corpora

array(['thin cheap expect could use worth money time return',
       'amazing hard kore rawk insane new album gon na totally hawt sarcasm',
       'video actually say teach dog get tissue instruction last 40 second renee sit chair tell use get command bring command whenever sneeze step step instruction type tell waste hard earn cash blow away video wish could return unsatisfied much detailed free information clicker trick training internet',
       ...,
       'phone 2 week hate live high rise actually dig old cord phone plug wall get many complaint people talk buzz phone get static time button handset poorly design try hold shoulder disconnect half time caller id dial work properly ',
       'part really enjoy book usually read history novel find enjoyable especially personal account give real sense condition time disappointment lie end book find hurried incomplete big end climax afterall win freedom independace hardly mention say would recommend book find insightful',
       'good so

In [20]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser

bi_gram = Phrases(tokenized, min_count=300, threshold=50)

tri_gram = Phrases(bi_gram[tokenized], min_count=300, threshold=50)

In [21]:
tri_gram

<gensim.models.phrases.Phrases at 0x1c89b235fc8>

In [22]:
uni_gram_tokens = set([token for text in tokenized for token in text])
uni_gram_tokens = set(filter(lambda x: x != "", uni_gram_tokens))

print(list(uni_gram_tokens)[:50])

['otep', 'unputdownable', 'hamlin', 'tyrant', 'plotline', 'ultraviolet', 'overlook', 'gospel', 'sightsee', 'gladitorial', 'expand', 'cappuccino', 'diskette', 'anne', 'voltaire', 'manhandle', 'rot', 'solgar', 'tmac', 'seguro', 'unchecked', 'misterio', 'affirmative', 'tiled', 'pets', 'monastic', 'fable', 'getup', 'fists', 'hdcp', 'contraditions', 'bouhgt', 'staats', 'luhrmann', 'gadget', 'silk', 'sprouts', 'dimeolas', 'humility', 'desktop', 'quantine', 'turnoff', 'sheppard', 'danelectro', 'sent', 'journaler', 'swam', 'utts', 'practise', 'tried']


In [23]:
bigram_min = bi_gram.min_count

bi_condition = lambda x: x[1] >= bigram_min

bi_gram_tokens = dict(filter(bi_condition, bi_gram.vocab.items()))
bi_gram_tokens = set([token.decode("utf-8") \
                      for token in bi_gram_tokens])

bi_grams_only = bi_gram_tokens.difference(uni_gram_tokens)
print(list(bi_grams_only)[:50])

['', 'read_book', 'book_read', 'year_old', 'even_though', 'work_well', 'waste_money', 'year_ago', 'much_good', 'buy_book', 'would_recommend']


In [24]:
trigram_min = tri_gram.min_count

tri_condition = lambda x: x[1] >= trigram_min

tri_gram_tokens = dict(filter(tri_condition, tri_gram.vocab.items()))
tri_gram_tokens = set([token.decode("utf-8") \
                       for token in tri_gram_tokens])

tri_grams_only = tri_gram_tokens.difference(bi_gram_tokens)
print(list(tri_grams_only)[:50])

[]


In [25]:
#tokenized = [Phraser(tri_gram)[Phraser(bi_gram)[i]] for i in tokenized]

In [26]:
tokenized = [list(filter(lambda x: len(x) > 1, document)) \
             for document in tokenized]

print(tokenized[1])

['amazing', 'hard', 'kore', 'rawk', 'insane', 'new', 'album', 'gon', 'na', 'totally', 'hawt', 'sarcasm']


In [27]:
len(tokenized)

22500

In [28]:
tokenized

[['thin',
  'cheap',
  'expect',
  'could',
  'use',
  'worth',
  'money',
  'time',
  'return'],
 ['amazing',
  'hard',
  'kore',
  'rawk',
  'insane',
  'new',
  'album',
  'gon',
  'na',
  'totally',
  'hawt',
  'sarcasm'],
 ['video',
  'actually',
  'say',
  'teach',
  'dog',
  'get',
  'tissue',
  'instruction',
  'last',
  '40',
  'second',
  'renee',
  'sit',
  'chair',
  'tell',
  'use',
  'get',
  'command',
  'bring',
  'command',
  'whenever',
  'sneeze',
  'step',
  'step',
  'instruction',
  'type',
  'tell',
  'waste',
  'hard',
  'earn',
  'cash',
  'blow',
  'away',
  'video',
  'wish',
  'could',
  'return',
  'unsatisfied',
  'much',
  'detailed',
  'free',
  'information',
  'clicker',
  'trick',
  'training',
  'internet'],
 ['get',
  'pay',
  'case',
  'get',
  'package',
  'multiple',
  'blade',
  'necessity',
  'nature',
  'coaxial',
  'helicopter',
  'counterrotating',
  'upper',
  'low',
  'blade',
  'design',
  'possibility',
  'upper',
  'low',
  'blade',
  '

In [29]:
# Transform each text into a vector of word counts
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words="english",
                             preprocessor=corpora)

In [30]:
#vectorizer
cv=CountVectorizer()
 
# this steps generates word counts for the words in your docs
word_count_vector=cv.fit_transform(corpora)

In [31]:
word_count_vector

<22500x46169 sparse matrix of type '<class 'numpy.int64'>'
	with 727300 stored elements in Compressed Sparse Row format>

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer 
 
# settings that you use for count vectorizer will go here
tfidf_vectorizer=TfidfVectorizer(use_idf=True)
 
# just send in all your docs here
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(corpora)

In [33]:
features = tfidf_vectorizer_vectors.toarray()

#labels = df["overall"].reset_index().drop('index',axis=1)

In [34]:

# get the first vector out (for the first document)
first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0]
 
# place tf-idf values in a pandas data frame
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
thin,0.473187
cheap,0.388417
return,0.357289
expect,0.336365
worth,0.328890
...,...
fastaccess,0.000000
fastback,0.000000
fasten,0.000000
fastener,0.000000


In [35]:
from gensim.corpora.dictionary import Dictionary

vocabulary = Dictionary(tokenized)

vocabulary_keys = list(vocabulary.token2id)[0:10]

for key in vocabulary_keys:
    print(f"ID: {vocabulary.token2id[key]}, Token: {key}")

ID: 0, Token: cheap
ID: 1, Token: could
ID: 2, Token: expect
ID: 3, Token: money
ID: 4, Token: return
ID: 5, Token: thin
ID: 6, Token: time
ID: 7, Token: use
ID: 8, Token: worth
ID: 9, Token: album


In [36]:
bow = [vocabulary.doc2bow(doc) for doc in tokenized]

for idx, freq in bow[0]:
    print(f"Word: {vocabulary.get(idx)}, Frequency: {freq}")

Word: cheap, Frequency: 1
Word: could, Frequency: 1
Word: expect, Frequency: 1
Word: money, Frequency: 1
Word: return, Frequency: 1
Word: thin, Frequency: 1
Word: time, Frequency: 1
Word: use, Frequency: 1
Word: worth, Frequency: 1


In [37]:
from gensim.models.tfidfmodel import TfidfModel

tfidf = TfidfModel(bow)

for idx, weight in tfidf[bow[0]]:
    print(f"Word: {vocabulary.get(idx)}, Weight: {weight:.3f}")

Word: cheap, Weight: 0.402
Word: could, Weight: 0.258
Word: expect, Weight: 0.334
Word: money, Weight: 0.300
Word: return, Weight: 0.361
Word: thin, Weight: 0.513
Word: time, Weight: 0.183
Word: use, Weight: 0.194
Word: worth, Weight: 0.324


In [38]:
bow

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1)],
 [(1, 1),
  (4, 1),
  (7, 1),
  (12, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 2),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 2),
  (35, 1),
  (36, 2),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 2),
  (46, 1),
  (47, 2),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 2),
  (54, 1),
  (55, 1),
  (56, 1)],
 [(12, 1),
  (17, 1),
  (34, 4),
  (38, 1),
  (57, 1),
  (58, 13),
  (59, 1),
  (60, 2),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 2),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 3),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 3),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  

In [47]:
%%time
import numpy as np

from gensim.models import word2vec

np.set_printoptions(suppress=True)

feature_size = 100
context_size = 20
min_word = 1

word_vec= word2vec.Word2Vec(tokenized, size=feature_size, \
                            window=context_size, min_count=min_word, \
                            iter=50, seed=42)

Wall time: 1min


In [48]:
word_vec_unpack = [(word, idx.index) for word, idx in \
                   word_vec.wv.vocab.items()]

tokens, indexes = zip(*word_vec_unpack)

word_vec_df = pd.DataFrame(word_vec.wv.syn0[indexes, :], index=tokens)

display(word_vec_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
thin,-2.658507,-4.084463,-1.459471,-1.159673,1.743592,0.936565,-0.976168,-0.695074,-1.143526,1.171085,...,2.427705,1.720065,-2.864466,1.389447,-0.975306,0.509940,-2.603029,-1.264015,-2.090482,-4.080546
cheap,-4.028717,-0.487422,-2.720645,-1.154012,-4.195673,-0.775110,3.806038,1.174394,2.763609,-2.312006,...,-1.635463,-2.525317,0.601533,-0.616232,-0.382258,4.259027,-1.020663,-5.860319,-0.063841,-0.977372
expect,-4.659333,-0.300162,-0.318432,-2.759488,-3.238514,0.284600,-0.917823,1.348914,0.942516,2.464395,...,-1.535789,-2.374413,-2.018214,2.910182,-2.886860,1.707934,-1.086707,1.021194,1.292049,1.316051
could,-3.002735,-1.492128,0.235752,-2.371997,-0.682817,-3.520417,-0.235535,-1.854224,-2.062336,-0.333326,...,-1.162829,-2.023323,0.290676,-3.284590,6.793865,-5.374689,-2.368987,-3.419137,4.311099,-3.560594
use,1.121677,-2.426744,-1.297437,-2.757247,-0.922019,-1.507802,0.800179,4.470390,2.550404,2.662603,...,-0.032624,1.741054,-1.181481,1.599653,0.480829,-0.975408,1.135979,-1.132163,-2.615594,-3.842664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
whitch,0.047453,0.000057,0.011607,-0.191402,-0.312069,-0.019523,0.097112,0.024005,-0.056111,-0.040248,...,-0.102037,0.127079,0.092747,0.178266,0.603606,0.076164,-0.105839,0.192329,0.019203,0.030581
critcal,0.338732,0.079276,-0.046350,-0.336302,0.105893,-0.248647,-0.170922,0.436404,0.042087,0.050823,...,-0.147034,0.444166,0.048883,0.060503,0.196462,0.074783,0.061464,0.235806,-0.533111,-0.212527
independace,0.281313,0.232110,-0.206806,0.274250,-0.549115,0.019873,-0.032162,-0.088670,-0.087654,-0.170693,...,0.088467,0.118050,-0.080900,0.158758,-0.425912,0.035622,-0.124071,0.025784,-0.058304,0.409114
handberg,-0.186610,0.699234,0.150196,0.606491,0.039964,0.382936,0.293633,0.314727,-0.312018,-0.112028,...,-0.107345,-0.469463,-0.172449,0.710024,0.518394,0.470835,0.631631,0.379792,-0.582608,0.194769


In [49]:
word_vec_unpack

[('thin', 716),
 ('cheap', 239),
 ('expect', 106),
 ('could', 40),
 ('use', 10),
 ('worth', 99),
 ('money', 74),
 ('time', 9),
 ('return', 136),
 ('amazing', 579),
 ('hard', 92),
 ('kore', 22628),
 ('rawk', 16428),
 ('insane', 3875),
 ('new', 42),
 ('album', 35),
 ('gon', 1635),
 ('na', 1094),
 ('totally', 465),
 ('hawt', 13271),
 ('sarcasm', 8150),
 ('video', 145),
 ('actually', 130),
 ('say', 24),
 ('teach', 557),
 ('dog', 339),
 ('get', 4),
 ('tissue', 7471),
 ('instruction', 522),
 ('last', 94),
 ('40', 919),
 ('second', 149),
 ('renee', 6940),
 ('sit', 391),
 ('chair', 1159),
 ('tell', 110),
 ('command', 2292),
 ('bring', 348),
 ('whenever', 1882),
 ('sneeze', 6941),
 ('step', 531),
 ('type', 268),
 ('waste', 148),
 ('earn', 2620),
 ('cash', 1526),
 ('blow', 835),
 ('away', 154),
 ('wish', 208),
 ('unsatisfied', 6492),
 ('much', 20),
 ('detailed', 1279),
 ('free', 477),
 ('information', 173),
 ('clicker', 8151),
 ('trick', 1636),
 ('training', 1402),
 ('internet', 860),
 ('pay', 1

In [50]:
word_vec_df.count()

0     46169
1     46169
2     46169
3     46169
4     46169
      ...  
95    46169
96    46169
97    46169
98    46169
99    46169
Length: 100, dtype: int64

In [52]:
%%time
tokenized_array = np.array(tokenized)

model_array = np.array([word_vec_df.loc[doc].mean(axis=0) for doc in tokenized_array])

Wall time: 18.8 s


In [53]:
model_array

array([[-1.29535151, -0.94365948, -0.52042919, ..., -1.74994707,
         0.9128651 , -1.04414606],
       [ 0.66336852,  0.37289441, -0.21401395, ...,  0.85375619,
         1.5512284 , -0.93955177],
       [ 1.07415867,  0.27355301,  1.22067666, ..., -0.88834018,
        -1.00538337, -1.07181978],
       ...,
       [ 2.30837917, -1.11106014, -0.16023977, ..., -0.5279091 ,
         0.49521396, -1.47565615],
       [-0.62955135,  1.13929427,  0.27937394, ...,  1.29243398,
         0.64265013,  0.08614144],
       [ 0.39124182,  0.48259464,  0.47732383, ...,  0.6099323 ,
        -0.21049672, -0.8024196 ]])

In [56]:
df

Unnamed: 0,tfidf
00,0.0
000,0.0
0000,0.0
000001,0.0
000lbs,0.0
...,...
zz,0.0
zzzzzzzzzzz,0.0
zzzzzzzzzzzz,0.0
zzzzzzzzzzzzzzzzz,0.0


In [55]:
a=df["overall"].reset_index().drop('index',axis=1)

KeyError: 'overall'

In [None]:
a

In [None]:
model_df = pd.DataFrame(model_array)
model_df["label"] = a

display(model_df.head())

In [None]:
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

#sampling the model_df population
pca_df = model_df.reset_index()
pca_df = model_df.dropna(axis=0).iloc[:,1:]
#pca_df = pca_df.iloc[::50]

#setting up PCA
pca = PCA(n_components=3, random_state=42)
pca = pca.fit_transform(pca_df.iloc[:, :-1])
labels = pca_df["label"]

#setting up plot components
x_axis = pca[:,0]
y_axis = pca[:,1]
color_map = pca_df["label"].map({1:"blue", \
                                 2:"red", \
                                 3:"yellow", \
                                 4:"green", \
                                 5:"orange"})

#plotting PCA
f, axes = plt.subplots(figsize=(20,10))
plt.scatter(x_axis, y_axis, color=color_map, s=1)
plt.show()

In [None]:
pca

In [None]:
model_pca = pd.DataFrame(pca)
model_pca["label"] = a

display(model_pca.head())

In [None]:
model_pca

In [None]:
word_bank = ["nook", "phone", "tv", "good", "price"]

for word in word_bank[:]:
    related_vec = word_vec.wv.most_similar(word, topn=5)
    related_words = np.array(related_vec)[:,0]
    word_bank.extend(related_words)
    print(f"{word}: {related_words}")

In [None]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, perplexity=5, n_iter=1000, random_state=42)

sample_vecs = word_vec.wv[set(word_bank)]
sample_tsne = tsne.fit_transform(sample_vecs)
tsne_x = sample_tsne[:, 0]
tsne_y = sample_tsne[:, 1]

f, axes = plt.subplots(figsize=(20,7))
ax = plt.scatter(x=tsne_x, y=tsne_y)

for label, x, y in zip(word_bank, tsne_x, tsne_y):
    plt.annotate(label, xy=(x+3, y+3))

plt.show()

In [None]:
word_vec.wv.most_similar(positive=["books", "away"], \
                      negative=[], topn=1)

In [None]:
word_vec.wv.most_similar(positive=["cheap"], \
                      negative=["quality"], topn=1)

In [None]:
word_vec.wv.most_similar(positive=["tablet"], \
                      negative=["phone"], topn=1)

In [None]:
df["reviewText"].iloc[1000]

In [None]:
#helpful = df["helpful"].tolist()
#most_helpful = max(helpful, key=lambda x: x[0])

#most_helpful_idx = df["helpful"].astype(str) == str(most_helpful)
#most_helpful_idx = df[most_helpful_idx].index

most_helpful_text = df["reviewText"].iloc[1000]

print(most_helpful_text)

In [None]:
%%time
import spacy

from collections import defaultdict

ner = spacy.load('en_core_web_sm')

ner_helpful = ner(most_helpful_text)

ner_dict = defaultdict(list)
for entity in ner_helpful.ents:
    ner_dict[entity.label_].append(entity)

for NER, name in ner_dict.items():
    print(f"{NER}:\n{name}\n")

In [None]:
from spacy import displacy

displacy.render(ner_helpful, style="ent", jupyter=True)

In [None]:
def ner_displacy(sentence):
    ner_sentence = ner(sentence)
    displacy.render(ner_sentence, jupyter=True, \
                    options={"compact": False, \
                             "distance": 90, \
                             "word_spacing":20, \
                             "arrow_spacing":10, \
                             "arrow_stroke": 2, \
                             "arrow_width": 5})

for sentence in most_helpful_text.split(".")[0:3]:
    ner_displacy(sentence)

In [None]:
%%time
import multiprocessing

from gensim.models.ldamulticore import LdaMulticore

cores = multiprocessing.cpu_count()

num_topics = 10
bow_lda = LdaMulticore(bow, num_topics=num_topics, id2word=vocabulary, \
                       passes=5, workers=cores, random_state=42)

for token, frequency in bow_lda.show_topic(0, topn=5):
    print(token, frequency)

In [None]:
for topic in range(0, num_topics):
    print(f"\nTopic {topic+1}:")
    for token, frequency in bow_lda.show_topic(topic, topn=5):
        print(f" {token}, {frequency}")

In [None]:
import pyLDAvis.gensim

lda_idm = pyLDAvis.gensim.prepare(bow_lda, bow, vocabulary)

pyLDAvis.display(lda_idm)

In [None]:
lda_idm

In [None]:
nan_list = model_df[model_df[0].isna()].index
nan_list = nan_list.tolist()

print(nan_list[0:50])

In [None]:
print(*[tokenized[blank] for blank in nan_list[0:5]])

In [None]:
for blank in nan_list[0:5]:
    display(df["reviewText"].iloc[blank])

In [None]:
print(f"Original 'model_df' count: {len(model_df)}")
print(f"Final 'model_df' count: {len(model_df.dropna(axis=0))}")

model_df = model_df.dropna(axis=0)
display(model_df.head())

In [None]:
model_df.label.value_counts()

In [None]:
import seaborn as sns

f, axes = plt.subplots(figsize=(20,7))
ax = sns.countplot(x=df["overall"], palette="OrRd_r")
ax.set(title="Distribution of Product Ratings", \
       xlabel="Rating", ylabel="Number of Reviews")
plt.show()

In [None]:
majority = df["overall"] == 5
majority_ratio = len(df[majority]) / len(df)

print(f"{majority_ratio*100:.2f}%")

In [None]:
count = len(model_df[model_df["label"] == 2])
print(f"Size of the most underrepresented class: {count}")

In [None]:
 model_df["label"]

In [None]:
condition = model_df["label"] == 5

In [None]:
#trim the majority class
condition = model_df["label"] == 5.0
trimmed_df = model_df[condition].sample(n=count, random_state=42,replace=True)

#trim other class and add on to the trimmed_df
for rating in [1, 2, 3, 4]:
    condition = model_df["label"] == rating
    if len(model_df[condition]) >= count:
        add_df = model_df[condition].sample(n=count, random_state=42)
    else:
        add_df = model_df[condition]
    trimmed_df = pd.concat([trimmed_df, add_df], ignore_index=False)

#display new class sizes of trimmed_df
for rating in [1, 2, 3, 4, 5]:
    class_size = len(trimmed_df[trimmed_df["label"] == rating])
    print(f"Size of Class {rating}: {class_size}")

In [None]:
trimmed_df = trimmed_df.sort_values(by="label")
display(trimmed_df.tail())

In [None]:
f, axes = plt.subplots(figsize=(20,7))
ax = sns.countplot(x=trimmed_df["label"], palette="OrRd_r")
ax.set(title="Distribution of Product Ratings after Underrepresentation", \
       xlabel="Rating", ylabel="Number of Reviews")
plt.show()

In [None]:
f, axes = plt.subplots(figsize=(20,7))
ax = sns.countplot(x=model_pca["label"], palette="OrRd_r")
ax.set(title="Distribution of Product Ratings after Underrepresentation", \
       xlabel="Rating", ylabel="Number of Reviews")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

X = model_pca.iloc[:, :-1]
y = model_pca.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.4, random_state=42)

In [None]:
# Training
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation
acc = accuracy_score(y_test, y_pred)

print("Accuracy on the Amazon dataset: {:.2f}".format(acc*100))

In [None]:
model_df

In [None]:
from sklearn.model_selection import train_test_split

X_new = model_df.iloc[:, :-1]
y_new = model_df.iloc[:, -1]

X_train_inf, X_test_hist, y_train_inf, y_test_hist = train_test_split(X_new, y_new, stratify=y_new, test_size=0.5, random_state=8)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from datetime import date

# Training
model = LinearSVC()
model.fit(X_train_inf, y_train_inf)
y_pred = model.predict(X_test_hist)

# Evaluation
acc = accuracy_score(y_test_hist, y_pred)

print("Accuracy on the Amazon dataset: {:.2f}".format(acc*100))



# train classifier
#clf = SVC(probability=True, kernel='rbf')
#clf.fit(X_train_inf, y_train_inf)

# predict and evaluate predictions
#predictions = clf.predict_proba(X_test)
#print('ROC-AUC yields ' + str(roc_auc_score(y_test, predictions[:,1])))

In [None]:

from sklearn import metrics
# Print the confusion matrix

print(metrics.confusion_matrix(y_test_hist, y_pred))



# Print the precision and recall, among other metrics

print(metrics.classification_report(y_test_hist, y_pred, digits=3))

In [None]:
from sklearn.linear_model import LogisticRegression
# Training
model = LogisticRegression()
model.fit(X_train_inf, y_train_inf)
y_pred = model.predict(X_test_hist)

# Evaluation
acc = accuracy_score(y_test_hist, y_pred)

print("Accuracy on the Amazon dataset: {:.2f}".format(acc*100))

In [None]:
#from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

model = tree.DecisionTreeClassifier()
model.fit(X_train_inf, y_train_inf)
y_pred = model.predict(X_test_hist)

# Evaluation
acc = accuracy_score(y_test_hist, y_pred)

print("Accuracy on the Amazon dataset: {:.2f}".format(acc*100))

In [None]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train_inf, y_train_inf)
y_pred = model.predict(X_test_hist)

# Evaluation
acc = accuracy_score(y_test_hist, y_pred)

print("Accuracy on the Amazon dataset: {:.2f}".format(acc*100))

In [None]:
from sklearn import metrics
# Print the confusion matrix

print(metrics.confusion_matrix(y_test_hist, y_pred))



# Print the precision and recall, among other metrics

print(metrics.classification_report(y_test_hist, y_pred, digits=3))

In [None]:
from sklearn.model_selection import train_test_split

X = model_df.iloc[:, :-1]
y = model_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=2, class_weight={1:10, 2:12, 3:7, 4:9, 5:1}, \
                                criterion="entropy", random_state=42)
forest.fit(X_train, y_train)
y_pred_rf = forest.predict(X_train)

accuracy = metrics.accuracy_score(y_train, y_pred_rf)
f1_score = metrics.f1_score(y_train, y_pred_rf, average="micro")


#print(f"Training Set Accuracy: {accuracy*100:.3f}%")
#print(f"Training Set F1 Score: {f1_score*100:.3f}%")


def eval_predictions(y_test, y_pred):
    print ('accuracy:', metrics.accuracy_score(y_test, y_pred))
    print ('precision:', metrics.precision_score(y_test, y_pred, average='weighted'))
    print ('recall:', metrics.recall_score(y_test, y_pred, average='weighted'))
    print ('F-measure:', metrics.f1_score(y_test, y_pred, average='weighted'))
eval_predictions(y_train, y_pred_rf)


In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=2, class_weight={1:10, 2:12, 3:7, 4:9, 5:10}, \
                                criterion="gini", random_state=42)
forest.fit(X_train, y_train)
y_pred_rf = forest.predict(X_train)

accuracy = metrics.accuracy_score(y_train, y_pred_rf)
f1_score = metrics.f1_score(y_train, y_pred_rf, average="micro")


#print(f"Training Set Accuracy: {accuracy*100:.3f}%")
#print(f"Training Set F1 Score: {f1_score*100:.3f}%")


def eval_predictions(y_test, y_pred):
    print ('accuracy:', metrics.accuracy_score(y_test, y_pred))
    print ('precision:', metrics.precision_score(y_test, y_pred, average='weighted'))
    print ('recall:', metrics.recall_score(y_test, y_pred, average='weighted'))
    print ('F-measure:', metrics.f1_score(y_test, y_pred, average='weighted'))
eval_predictions(y_train, y_pred_rf)


In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=10, class_weight={1:10, 2:12, 3:7, 4:9, 5:1}, \
                                criterion="entropy", random_state=42)
forest.fit(X_train, y_train)
y_pred_rf = forest.predict(X_train)

accuracy = metrics.accuracy_score(y_train, y_pred_rf)
f1_score = metrics.f1_score(y_train, y_pred_rf, average="micro")


#print(f"Training Set Accuracy: {accuracy*100:.3f}%")
#print(f"Training Set F1 Score: {f1_score*100:.3f}%")


def eval_predictions(y_test, y_pred):
    print ('accuracy:', metrics.accuracy_score(y_test, y_pred))
    print ('precision:', metrics.precision_score(y_test, y_pred, average='weighted'))
    print ('recall:', metrics.recall_score(y_test, y_pred, average='weighted'))
    print ('F-measure:', metrics.f1_score(y_test, y_pred, average='weighted'))
eval_predictions(y_train, y_pred_rf)


In [None]:
from sklearn.metrics import confusion_matrix

#create the confusion matrix of the training set
confusion_train = confusion_matrix(y_train, y_pred_rf)
confusion_train = confusion_train.astype("float") / \
                   confusion_train.sum(axis=1)[:, np.newaxis]
confusion_train = np.around(confusion_train, decimals=3)*100

#create confusion matrix heat map
f, axes = plt.subplots(figsize=(20,10))
im = axes.imshow(confusion_train, interpolation="nearest", cmap=plt.cm.Reds)

axes.figure.colorbar(im, ax=axes)
axes.set(title="Confusion Matrix for Training Set", \
         xticks=np.arange(confusion_train.shape[1]), \
         yticks=np.arange(confusion_train.shape[0]), \
         xticklabels=range(1, 6), yticklabels=range(1, 6), \
         xlabel="Predicted", ylabel="Truth")

#add clear annotations to the confusion matrix
threshold = confusion_train.max()/1.5
for i in range(confusion_train.shape[0]):
    for j in range(confusion_train.shape[1]):
        axes.text(j, i, f"{confusion_train[i, j]:.3f}%",
                ha="center", va="center",
                color="white" if confusion_train[i, j] > threshold else "black")
f.tight_layout()
plt.show()

# Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

cross_val_accuracy = cross_val_score(forest, X_train, y_train, \
                               cv=3, scoring="accuracy")
cross_val_f1 = cross_val_score(forest, X_train, y_train, \
                               cv=3, scoring="f1_micro")

cross_val_accuracy = np.mean(cross_val_accuracy)
cross_val_f1 = np.mean(cross_val_f1)

print(f"Training Set Accuracy: {cross_val_accuracy*100:.3f}%")
print(f"Training Set F1 Score: {cross_val_f1:.3f}")

In [None]:
import xgboost as xgb

y_train_shifted = y_train-1
y_test_shifted = y_test-1

train_set = xgb.DMatrix(X_train, label=y_train_shifted)
test_set = xgb.DMatrix(X_test, label=y_test_shifted)

#parameters = {"max_depth": 5, "eta": 0.2, "silent": 1, \
      #        "objective": "multi:softprob", "num_class": 5}

#boost = xgb.train(parameters, train_set, 100)
boost = xgb.train(train_set,10)

In [None]:
import xgboost as xgb

y_train_shifted = y_train-1
y_test_shifted = y_test-1

train_set = xgb.DMatrix(X_train, label=y_train_shifted)
test_set = xgb.DMatrix(X_test, label=y_test_shifted)

parameters = {"max_depth": 5, "eta": 0.2, "silent": 1, \
              "objective": "multi:softprob", "num_class": 5}

boost = xgb.train(parameters, train_set, 100)

In [None]:
y_pred = boost.predict(train_set)
y_pred = y_pred.argmax(axis=1)
y_pred = y_pred+1

accuracy = metrics.accuracy_score(y_train, y_pred)
f1_score = metrics.f1_score(y_train, y_pred, average="micro")

print(f"Training Set Accuracy: {accuracy*100:.3f}%")
print(f"Training Set F1 Score: {f1_score:.3f}")

In [None]:
penalty = ['l1', 'l2']

C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]

class_weight = [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}]

solver = ['liblinear', 'saga']



param_grid = dict(penalty=penalty,

                  C=C,

                  class_weight=class_weight,

                  solver=solver)



grid = GridSearchCV(estimator=logistic,

                    param_grid=param_grid,

                    scoring='roc_auc',

                    verbose=1,

                    n_jobs=-1)

grid_result = grid.fit(X_train, y_train)



print('Best Score: ', grid_result.best_score_)

print('Best Params: ', grid_result.best_params_)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([('classifier' , RandomForestClassifier())])

# pipe = Pipeline([('classifier', RandomForestClassifier())])



# Create param grid.



param_grid = [

    {'classifier' : [LogisticRegression()],

     'classifier__penalty' : ['l1', 'l2'],

    'classifier__C' : np.logspace(-4, 4, 20),

    'classifier__solver' : ['liblinear']},

    {'classifier' : [RandomForestClassifier()],

    'classifier__n_estimators' : list(range(10,101,10)),

    'classifier__max_features' : list(range(6,32,5))}

]



# Create grid search object



clf = GridSearchCV(pipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)



# Fit on data



best_clf = clf.fit(X_train, y_train)

In [None]:
print('Best Score: ', best_clf.best_score_)

print('Best Params: ', best_clf.best_params_)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

clr = LogisticRegression(penalty='l2', C=0.1, random_state=8)
clr.fit(X_train, y_train.values.ravel())
y_pred_lr = clr.predict(X_train)
scores = clr.score(X_train, y_train) # accuracy
print("Logistic Regression : " , scores)


def eval_predictions(y_test, y_pred):
    print ('accuracy:', metrics.accuracy_score(y_test, y_pred))
    print ('precision:', metrics.precision_score(y_test, y_pred, average='weighted'))
    print ('recall:', metrics.recall_score(y_test, y_pred, average='weighted'))
    print ('F-measure:', metrics.f1_score(y_test, y_pred, average='weighted'))
eval_predictions(y_train, y_pred_lr)

In [None]:
best_clf = clf.fit(X_train, y_train)

# XGBoost

In [None]:
#import XGBoost classifier and accuracy

from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score



#instantiate model and train

model = XGBClassifier(learning_rate = 0.05, n_estimators=300, max_depth=9,num_round=20)

model.fit(X_train, y_train)



# make predictions for test set

y_pred = model.predict(X_test)

predictions = [round(value) for value in y_pred]



accuracy = accuracy_score(y_test, predictions)

print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
import matplotlib.pyplot as plt
from scipy import integrate
def capcurve(y_values, y_preds_proba):

    num_pos_obs = np.sum(y_values)

    num_count = len(y_values)

    rate_pos_obs = float(num_pos_obs) / float(num_count)

    ideal = pd.DataFrame({'x':[0,rate_pos_obs,1],'y':[0,1,1]})

    xx = np.arange(num_count) / float(num_count - 1)

    y_cap = np.c_[y_values,y_preds_proba]

    y_cap_df_s = pd.DataFrame(data=y_cap)

    y_cap_df_s = y_cap_df_s.sort_values([1], ascending=False).reset_index(level = y_cap_df_s.index.names, drop=True)

    

    print(y_cap_df_s.head(20))

    

    yy = np.cumsum(y_cap_df_s[0]) / float(num_pos_obs)
    yy = np.append([0], yy[0:num_count-1]) #add the first curve point (0,0) : for xx=0 we have yy=0
    percent = 0.5
    #row_index = np.trunc(num_count * percent)
    row_index = int(np.trunc(num_count * percent))
    val_y1 = yy[row_index]
    val_y2 = yy[row_index+1]
    if val_y1 == val_y2:
        val = val_y1*1.0
    else:
        val_x1 = xx[row_index]
        val_x2 = xx[row_index+1]
    val = val_y1 + ((val_x2 - percent)/(val_x2 - val_x1))*(val_y2 - val_y1) 
    sigma_ideal = 1 * xx[len(xx) - 1 ] / 2 + (xx[num_count - 1] - xx[len(xx)-1]) * 1
   # sigma_ideal=5.95
    sigma_model = integrate.simps(yy,xx)
    sigma_random = integrate.simps(xx,xx)
    print(sigma_random)
    ar_value = (sigma_model - sigma_random) / (sigma_ideal - sigma_random)
    #ar_label = 'ar value = %s' % ar_value
    fig, ax = plt.subplots(nrows = 1, ncols = 1)
    ax.plot(ideal['x'],ideal['y'], color='grey', label='Perfect Model')
    ax.plot(xx,yy, color='red', label='User Model')
    #ax.scatter(xx,yy, color='red')
    ax.plot(xx,xx, color='blue', label='Random Model')
    ax.plot([percent, percent], [0.0, val], color='green', linestyle='--', linewidth=1)
    ax.plot([0, percent], [val, val], color='green', linestyle='--', linewidth=1, label=str(val*100)+'% of positive obs at '+str(percent*100)+'%')
    plt.xlim(0, 1.02)
    plt.ylim(0, 1.25)
    plt.title("CAP Curve - a_r value ="+str(ar_value))
    plt.xlabel('% of the data')
    plt.ylabel('% of positive obs')
    plt.legend()
    plt.show()

In [None]:
y_pred_proba = model.predict_proba(X_test)
capcurve(y_values=y_test, y_preds_proba=y_pred_proba[:,1])

In [None]:
y_pred_proba = forest.predict_proba(X=X_test)
capcurve(y_values=y_test, y_preds_proba=y_pred_proba[:,1])

In [None]:
y_pred_proba = clr.predict_proba(X=X_test)
capcurve(y_values=y_test, y_preds_proba=y_pred_proba[:,1])

# SVM

In [None]:
X_train.shape
y_train.shape

In [None]:
from sklearn import svm
# instantiate and train model, kernel=rbf 
svm_rbf = svm.SVC(random_state=12345,probability=True)
svm_rbf.fit(X_train, y_train)

# evaulate model
y_pred_1 = svm_rbf.predict(X_train)
print(" SVM : ", y_pred_1)


def eval_predictions(y_test, y_pred):
    print ('accuracy:', metrics.accuracy_score(y_test, y_pred))
    print ('precision:', metrics.precision_score(y_test, y_pred, average='weighted'))
    print ('recall:', metrics.recall_score(y_test, y_pred, average='weighted'))
    print ('F-measure:', metrics.f1_score(y_test, y_pred, average='weighted'))
eval_predictions(y_train, y_pred_1)


In [None]:
y_pred_proba = svm_rbf.predict_proba(X=X_test)
capcurve(y_values=y_test, y_preds_proba=y_pred_proba[:,1])

# GBoost model with learning curves

In [None]:
## How to evaluate XGBoost model with learning curves
## DataSet: skleran.datasets.load_breast_cancer()

    
import warnings
warnings.filterwarnings("ignore")

# load libraries
import numpy as np
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

# load the datasets
   

# Create CV training and test scores for various training set sizes
train_sizes, train_scores, test_scores = learning_curve(XGBClassifier(),
                                               X, y, cv=2, scoring='accuracy', n_jobs=-1,
                                               # 50 different sizes of the training set
                                               train_sizes=np.linspace(0.01, 1.0, 5))

In [None]:
# Create means and standard deviations of training set scores
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

In [None]:
    # Create means and standard deviations of test set scores
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

    # Draw lines
plt.subplots(1, figsize=(7,7))
plt.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

    # Draw bands
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD")

    # Create plot
plt.title("Learning Curve")
plt.xlabel("Training Set Size"), plt.ylabel("Accuracy Score"), plt.legend(loc="best")
plt.tight_layout(); plt.show()

# Linear Regression model with learning curves

In [None]:
from sklearn.linear_model import LinearRegression

# Create CV training and test scores for various training set sizes
train_sizes, train_scores, validation_scores = learning_curve(LinearRegression(),
                                               X, y, cv=5, scoring='neg_mean_squared_error')
print('Training scores:\n\n', train_scores)
print('\n', '-' * 70) # separator to make the output easy to read
print('\nValidation scores:\n\n', validation_scores)

#learning_curves(RandomForestRegressor(max_leaf_nodes = 350), df, features, target, train_sizes, 5)

In [None]:
train_scores_mean = -train_scores.mean(axis = 1)
validation_scores_mean = -validation_scores.mean(axis = 1)
print('Mean training scores\n\n', pd.Series(train_scores_mean, index = train_sizes))
print('\n', '-' * 20) # separator
print('\nMean validation scores\n\n',pd.Series(validation_scores_mean, index = train_sizes))

In [None]:
import matplotlib.pyplot as plt

plt.style.use('seaborn')
plt.plot(train_sizes, train_scores_mean, label = 'Training error')
plt.plot(train_sizes, validation_scores_mean, label = 'Validation error')
plt.ylabel('MSE', fontsize = 14)
plt.xlabel('Training set size', fontsize = 14)
plt.title('Learning curves for a linear regression model', fontsize = 18, y = 1.03)
plt.legend()


# SVM Learning Curve

In [None]:
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=3, test_size=0.2, random_state=0)

In [None]:
from sklearn.svm import SVC

# Create CV training and test scores for various training set sizes
train_sizes, train_scores, validation_scores = learning_curve(SVC(),
                                               X, y, cv=cv)
print('Training scores:\n\n', train_scores)
print('\n', '-' * 70) # separator to make the output easy to read
print('\nValidation scores:\n\n', validation_scores)

In [None]:
train_scores_mean = -train_scores.mean(axis = 1)
validation_scores_mean = -validation_scores.mean(axis = 1)
print('Mean training scores\n\n', pd.Series(train_scores_mean, index = train_sizes))
print('\n', '-' * 20) # separator
print('\nMean validation scores\n\n',pd.Series(validation_scores_mean, index = train_sizes))

In [None]:
import matplotlib.pyplot as plt

plt.style.use('seaborn')
plt.plot(train_sizes, train_scores_mean, label = 'Training error')
plt.plot(train_sizes, validation_scores_mean, label = 'Validation error')
plt.ylabel('Score', fontsize = 14)
plt.xlabel('Training set size', fontsize = 14)
plt.title('Learning curves for a SVC model', fontsize = 18, y = 1.03)
plt.legend()

# Naive Bayes Learning Curve

In [None]:
from sklearn.naive_bayes import GaussianNB
# Create CV training and test scores for various training set sizes
#cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
train_sizes, train_scores, validation_scores = learning_curve(GaussianNB(),
                                               X, y, cv=cv)
print('Training scores:\n\n', train_scores)
print('\n', '-' * 70) # separator to make the output easy to read
print('\nValidation scores:\n\n', validation_scores)

In [None]:
train_scores_mean = -train_scores.mean(axis = 1)
validation_scores_mean = -validation_scores.mean(axis = 1)
print('Mean training scores\n\n', pd.Series(train_scores_mean, index = train_sizes))
print('\n', '-' * 20) # separator
print('\nMean validation scores\n\n',pd.Series(validation_scores_mean, index = train_sizes))

In [None]:
## plt.style.use('seaborn')
plt.plot(train_sizes, train_scores_mean, label = 'Training error')
plt.plot(train_sizes, validation_scores_mean, label = 'Validation error')
plt.ylabel('Score', fontsize = 14)
plt.xlabel('Training set size', fontsize = 14)
#plt.xlim(-100000, 110000)
#plt.ylim(-100000, 100000);
plt.title('Learning curves for a Naive Bayes model', fontsize = 18, y = 1.03)
plt.legend()



# Train Error vs Test Error

In [None]:
from sklearn import linear_model
# #############################################################################
# Compute train and test errors
alphas = np.logspace(-5, 1, 60)
enet = linear_model.ElasticNet(l1_ratio=0.7, max_iter=10000)
train_errors = list()
test_errors = list()
for alpha in alphas:
    enet.set_params(alpha=alpha)
    enet.fit(X_train, y_train)
    train_errors.append(enet.score(X_train, y_train))
    test_errors.append(enet.score(X_test, y_test))

i_alpha_optim = np.argmax(test_errors)
alpha_optim = alphas[i_alpha_optim]
print("Optimal regularization parameter : %s" % alpha_optim)

In [None]:
# Estimate the coef_ on full data with optimal regularization parameter
enet.set_params(alpha=alpha_optim)
coef_ = enet.fit(X, y).coef_

In [None]:
# #############################################################################
# Plot results functions

import matplotlib.pyplot as plt
plt.subplot(2, 1, 1)
plt.semilogx(alphas, train_errors, label='Train')
plt.semilogx(alphas, test_errors, label='Test')
plt.vlines(alpha_optim, plt.ylim()[0], np.max(test_errors), color='k',
           linewidth=3, label='Optimum on test')
plt.legend(loc='lower left')

plt.xlabel('Regularization parameter')
plt.ylabel('Performance')

In [None]:
# Show estimated coef_ vs true coef
n_features = 500
coef = np.random.randn(n_features)
coef[50:] = 0.0  # only the top 10 features are impacting the model
plt.subplot(2, 1, 2)
plt.plot(coef, label='True coef')
plt.plot(coef_, label='Estimated coef')
plt.legend()
plt.subplots_adjust(0.09, 0.04, 0.94, 0.94, 0.26, 0.26)
plt.show()

# Cross Validation Classification Accuracy

In [None]:
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
array = df.values
array.size
X = model_df.iloc[:, :-1]
y = model_df.iloc[:, -1]
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = LogisticRegression()
scoring = 'accuracy'
results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
#print("Accuracy: %.3f (%.3f) - ") % (results.mean(), results.std())  
print("Accuracy : results.mean()", results.mean())
print("Accuracy : results.std()", results.std())
      

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# check classification accuracy of KNN with K=5
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
results = metrics.accuracy_score(y_test, y_pred)
print("Accuracy : results.mean()", results.mean())
print("Accuracy : results.std()", results.std())

# Log Loss

In [None]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = LogisticRegression()
scoring = 'neg_log_loss'
results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
print("Accuracy : results.mean()", results.mean())
print("Accuracy : results.std()", results.std())

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X_train, y_train, test_size=test_size, random_state=seed)
model = LogisticRegression()
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
print(matrix)


# Classification Report

In [None]:
from sklearn.metrics import classification_report

test_size = 0.33
seed = 7
model = LogisticRegression()
model.fit(X_train, Y_train)
predicted = model.predict(X_train)
report = classification_report(Y_train, predicted)
print(report)

In [None]:
y_pred = forest.predict(X_train)

accuracy = metrics.accuracy_score(Y_train, y_pred)
f1_score = metrics.f1_score(Y_train, y_pred, average="micro")

print(f"Training Set Accuracy: {accuracy*100:.3f}%")
print(f"Training Set F1 Score: {f1_score:.3f}")





In [None]:
from sklearn.metrics import confusion_matrix

#create the confusion matrix of the training set
confusion_train = confusion_matrix(Y_train, y_pred)
confusion_train = confusion_train.astype("float") / \
                   confusion_train.sum(axis=1)[:, np.newaxis]
confusion_train = np.around(confusion_train, decimals=3)*100

#create confusion matrix heat map
f, axes = plt.subplots(figsize=(20,10))
im = axes.imshow(confusion_train, interpolation="nearest", cmap=plt.cm.Reds)

axes.figure.colorbar(im, ax=axes)
axes.set(title="Confusion Matrix for Training Set", \
         xticks=np.arange(confusion_train.shape[1]), \
         yticks=np.arange(confusion_train.shape[0]), \
         xticklabels=range(1, 6), yticklabels=range(1, 6), \
         xlabel="Predicted", ylabel="Truth")

#add clear annotations to the confusion matrix
threshold = confusion_train.max()/1.5
for i in range(confusion_train.shape[0]):
    for j in range(confusion_train.shape[1]):
        axes.text(j, i, f"{confusion_train[i, j]:.3f}%",
                ha="center", va="center",
                color="white" if confusion_train[i, j] > threshold else "black")
f.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score

cross_val_accuracy = cross_val_score(forest, X_train, Y_train, \
                               cv=3, scoring="accuracy")
cross_val_f1 = cross_val_score(forest, X_train, Y_train, \
                               cv=3, scoring="f1_micro")

cross_val_accuracy = np.mean(cross_val_accuracy)
cross_val_f1 = np.mean(cross_val_f1)

print(f"Training Set Accuracy: {cross_val_accuracy*100:.3f}%")
print(f"Training Set F1 Score: {cross_val_f1:.3f}")

In [None]:
%%time
import xgboost as xgb

y_train_shifted = Y_train-1
y_test_shifted = Y_test-1

train_set = xgb.DMatrix(X_train, label=y_train_shifted)
test_set = xgb.DMatrix(X_test, label=y_test_shifted)

parameters = {"max_depth": 4, "eta": 0.2, "silent": 1, \
              "objective": "multi:softprob", "num_class": 5}

boost = xgb.train(parameters, train_set, 100)

In [None]:
y_pred = boost.predict(train_set)
y_pred = y_pred.argmax(axis=1)
y_pred = y_pred+1

accuracy = metrics.accuracy_score(Y_train, y_pred)
f1_score = metrics.f1_score(Y_train, y_pred, average="micro")

print(f"Training Set Accuracy: {accuracy*100:.3f}%")
print(f"Training Set F1 Score: {f1_score:.3f}")

In [None]:
#create the confusion matrix of the training set
confusion_train = confusion_matrix(Y_train, y_pred)
confusion_train = confusion_train.astype("float") / \
                   confusion_train.sum(axis=1)[:, np.newaxis]
confusion_train = np.around(confusion_train, decimals=3)*100

#create confusion matrix heat map
f, axes = plt.subplots(figsize=(20,10))
im = axes.imshow(confusion_train, interpolation="nearest", cmap=plt.cm.Reds)

axes.figure.colorbar(im, ax=axes)
axes.set(title="Confusion Matrix for Training Set", \
         xticks=np.arange(confusion_train.shape[1]), \
         yticks=np.arange(confusion_train.shape[0]), \
         xticklabels=range(1, 6), yticklabels=range(1, 6), \
         xlabel="Predicted", ylabel="Truth")

#add clear annotations to the confusion matrix
threshold = confusion_train.max()/1.5
for i in range(confusion_train.shape[0]):
    for j in range(confusion_train.shape[1]):
        axes.text(j, i, f"{confusion_train[i, j]:.3f}%",
                ha="center", va="center",
                color="white" if confusion_train[i, j] > threshold else "black")
f.tight_layout()
plt.show()

In [None]:
%%time
boost_cv = xgb.cv(dtrain=train_set, params=parameters, nfold=3, \
                  num_boost_round=50, early_stopping_rounds=10, \
                  metrics="merror", as_pandas=True, seed=42)

display(boost_cv.head())

In [None]:
cross_val_accuracy = boost_cv.iloc[-1,2]
cross_val_accuracy = 1-cross_val_accuracy

print(f"Training Set Accuracy: {cross_val_accuracy*100:.3f}%")

In [None]:
y_pred = boost.predict(test_set)
y_pred = y_pred.argmax(axis=1)
y_pred = y_pred+1

accuracy = metrics.accuracy_score(Y_test, y_pred)
f1_score = metrics.f1_score(Y_test, y_pred, average="micro")

print(f"Test Set Accuracy: {accuracy*100:.3f}%")
print(f"Test Set F1 Score: {f1_score:.3f}")

In [None]:
#create the confusion matrix of the test set
confusion_train = confusion_matrix(Y_test, y_pred)
confusion_train = confusion_train.astype("float") / \
                   confusion_train.sum(axis=1)[:, np.newaxis]
confusion_train = np.around(confusion_train, decimals=3)*100

#create confusion matrix heat map
f, axes = plt.subplots(figsize=(20,10))
im = axes.imshow(confusion_train, interpolation="nearest", cmap=plt.cm.Reds)

axes.figure.colorbar(im, ax=axes)
axes.set(title="Confusion Matrix for Test Set", \
         xticks=np.arange(confusion_train.shape[1]), \
         yticks=np.arange(confusion_train.shape[0]), \
         xticklabels=range(1, 6), yticklabels=range(1, 6), \
         xlabel="Predicted", ylabel="Truth")

#add clear annotations to the confusion matrix
threshold = confusion_train.max()/1.5
for i in range(confusion_train.shape[0]):
    for j in range(confusion_train.shape[1]):
        axes.text(j, i, f"{confusion_train[i, j]:.3f}%",
                ha="center", va="center",
                color="white" if confusion_train[i, j] > threshold else "black")
f.tight_layout()
plt.show()

In [None]:
X = trimmed_df.iloc[:, :-1]
y = trimmed_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.5, random_state=42)

In [None]:
y_train_shifted = y_train-1
y_test_shifted = y_test-1

train_set = xgb.DMatrix(X_train, label=y_train_shifted)
test_set = xgb.DMatrix(X_test, label=y_test_shifted)

y_pred = boost.predict(test_set)
y_pred = y_pred.argmax(axis=1)
y_pred = y_pred+1

accuracy = metrics.accuracy_score(y_test, y_pred)
f1_score = metrics.f1_score(y_test, y_pred, average="micro")

print(f"Balanced Test Set Accuracy: {accuracy*100:.3f}%")
print(f"Balanced Test Set F1 Score: {f1_score:.3f}")

In [None]:
#create the confusion matrix of the balanced test set
confusion_train = confusion_matrix(y_test, y_pred)
confusion_train = confusion_train.astype("float") / \
                   confusion_train.sum(axis=1)[:, np.newaxis]
confusion_train = np.around(confusion_train, decimals=3)*100

#create confusion matrix heat map
f, axes = plt.subplots(figsize=(20,10))
im = axes.imshow(confusion_train, interpolation="nearest", cmap=plt.cm.Reds)

axes.figure.colorbar(im, ax=axes)
axes.set(title="Confusion Matrix for Balanced Test Set", \
         xticks=np.arange(confusion_train.shape[1]), \
         yticks=np.arange(confusion_train.shape[0]), \
         xticklabels=range(1, 6), yticklabels=range(1, 6), \
         xlabel="Predicted", ylabel="Truth")

#add clear annotations to the confusion matrix
threshold = confusion_train.max()/1.5
for i in range(confusion_train.shape[0]):
    for j in range(confusion_train.shape[1]):
        axes.text(j, i, f"{confusion_train[i, j]:.3f}%",
                ha="center", va="center",
                color="white" if confusion_train[i, j] > threshold else "black")
f.tight_layout()
plt.show()

In [None]:
from wordcloud import WordCloud

wordcloud = WordCloud(stopwords = set(stop_words), min_font_size=10, \
                      max_font_size=50, max_words=50, \
                      background_color="white", colormap = "Oranges")

one_star_text = " ".join(df[df["overall"]==1]["reviewText"].values).lower()
two_star_text = " ".join(df[df["overall"]==2]["reviewText"].values).lower()
three_star_text = " ".join(df[df["overall"]==3]["reviewText"].values).lower()
four_star_text = " ".join(df[df["overall"]==4]["reviewText"].values).lower()
five_star_text = " ".join(df[df["overall"]==5]["reviewText"].values).lower()

text_list = [one_star_text, two_star_text, three_star_text, \
             four_star_text, five_star_text]

for index, text in enumerate(text_list):
    f, axes = plt.subplots(figsize=(10,7))
    wordcloud.generate(text)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.title(f"Word Cloud for {index+1}-Star Ratings")
    plt.axis("off")
    plt.show()

In [None]:
df['overall'].unique()

In [None]:
df['overall'].value_counts()

# Ensemble Method

In [None]:
# Get some classifiers to evaluate
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier,VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC

seed = 1075
np.random.seed(seed)

X = model_df.iloc[:, :-1]
y = model_df.iloc[:, -1]

# Create classifiers
rf = RandomForestClassifier()
et = ExtraTreesClassifier()
knn = KNeighborsClassifier()
svc = SVC()
rg = RidgeClassifier()

clf_array = [rf, et, knn, svc, rg]

for clf in clf_array:
    vanilla_scores = cross_val_score(clf, X, y, cv=2, n_jobs=-1)
    bagging_clf = BaggingClassifier(clf, 
       max_samples=0.4, max_features=10, random_state=seed)
    bagging_scores = cross_val_score(bagging_clf, X, y, cv=10, 
       n_jobs=-1)
    
    print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [{0}]".format(clf.__class__.__name__, 
                                                              vanilla_scores.mean(), vanilla_scores.std()))
    print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [Bagging {0}]\n".format(clf.__class__.__name__, 
                                                                        bagging_scores.mean(), bagging_scores.std()))

In [None]:
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier,VotingClassifier
# Set up voting
eclf = VotingClassifier(estimators=[('Random Forests', rf), ('Extra Trees', et), 
                                    ('KNeighbors', knn), ('SVC', svc), ('Ridge Classifier', rg)], voting='hard')

for clf, label in zip([rf, et, knn, svc, rg, eclf], ['Random Forest', 'Extra Trees', 
                                                     'KNeighbors', 'SVC', 'Ridge Classifier', 'Ensemble']):
    scores = cross_val_score(clf, X, y, cv=2, scoring='accuracy')
    print("Mean: {0:.3f}, std: (+/-) {1:.3f} [{2}]".format(scores.mean(), scores.std(), label))

In [None]:
# Set up ensemble voting for bagging
ebclf_array = []

for clf in clf_array:
    ebclf_array.append(BaggingClassifier(clf, max_samples=0.25, 
                                   max_features=10, random_state=seed))

v_eclf = VotingClassifier(estimators=zip(['Bagging Random Forest', 'Bagging Extra Trees', 'Bagging KNeighbors',
                                          'Bagging SVC', 'Bagging Ridge Classifier'],
                                         ebclf_array), 
                          voting='hard')

ebclf_array.append(v_eclf)

for clf, label in zip(ebclf_array, ['Bagging Random Forest', 'Bagging Extra Trees', 'Bagging KNeighbors',
                              'Bagging SVC', 'BaggingRidge Classifier']):
    scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
    print("Mean: {0:.3f}, std: (+/-) {1:.3f} [{2}]".format(scores.mean(), scores.std(), label))

In [None]:
# Now plot the decision regions with only two features
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
import matplotlib.gridspec as gridspec
import itertools

gs = gridspec.GridSpec(3, 3)
fig = plt.figure(figsize=(12, 10))
labels = ['Random Forest', 'Extra Trees', 'KNN', 'Support Vector',
          'Ridge Reg.', 'Ensemble']

for clf, lab, grd in zip([rf, et, knn, svc, rg, eclf], 
                         labels,
                         itertools.product([0, 1, 2], repeat = 2)):

    clf.fit(X[[1,2]], y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=np.array(X[[1,2]]), 
                                y=np.array(y), clf=clf)
    plt.title(lab)

In [None]:
from mlxtend.classifier import EnsembleVoteClassifier
import warnings
from xgboost import XGBClassifier, plot_importance
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

warnings.filterwarnings('ignore')

# Create boosting classifiers
ada_boost = AdaBoostClassifier()
grad_boost = GradientBoostingClassifier()
xgb_boost = XGBClassifier()

boost_array = [ada_boost, grad_boost, xgb_boost]

eclf = EnsembleVoteClassifier(clfs=[ada_boost, grad_boost, xgb_boost], voting='hard')

labels = ['Ada Boost', 'Grad Boost', 'XG Boost', 'Ensemble']

for clf, label in zip([ada_boost, grad_boost, xgb_boost, eclf], labels):
    scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
    print("Mean: {0:.3f}, std: (+/-) {1:.3f} [{2}]".format(scores.mean(), scores.std(), label))

In [None]:
gs = gridspec.GridSpec(2, 2)

fig = plt.figure(figsize=(12, 10))

labels = ['Ada Boost', 'Grad Boost', 'XG Boost', 'Ensemble']

for clf, lab, grd in zip([ada_boost, grad_boost, xgb_boost, eclf],
                         labels,
                         itertools.product([0, 1], repeat = 2)):

    clf.fit(X[[1, 2]].values, y.values)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=np.array(X[[1, 2]].values), 
                                y=np.array(y.values), clf=clf)

In [None]:
from mlens.ensemble import SuperLearner
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

lr = LogisticRegression()

seed = 1075

ensemble = SuperLearner(scorer = accuracy_score, 
                        random_state=seed, 
                        folds=10,
                        verbose = 2)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

# Build the first layer
ensemble.add([rf, et, knn, rg])
# Attach the final meta estimator
ensemble.add_meta(lr)

ensemble.fit(X_train, y_train)
preds = ensemble.predict(X_test)
print("Fit data:\n%r" % ensemble.data)
print("Accuracy score: {:.3f}".format(accuracy_score(preds, y_test)))

In [None]:
from itertools import combinations

names = ['Random Forest', 'Extra Trees', 'KNeighbors', 'SVC', 'Ridge Classifier']

def zip_stacked_classifiers(*args):
    to_zip = []
    for arg in args:
        combined_items = sum([list(map(list, combinations(arg, i))) for i in range(len(arg) + 1)], [])
        combined_items = filter(lambda x: len(x) > 0, combined_items)
        to_zip.append(combined_items)
    
    return zip(to_zip[0], to_zip[1])

stacked_clf_list = zip_stacked_classifiers(clf_array, names)

best_combination = [0.00, ""]

for clf in stacked_clf_list:
    
    ensemble = SuperLearner(scorer = accuracy_score, 
                            random_state = seed, 
                            folds = 10)
    ensemble.add(clf[0])
    ensemble.add_meta(lr)
    ensemble.fit(X_train, y_train)
    preds = ensemble.predict(X_test)
    accuracy = accuracy_score(preds, y_test)
    
    if accuracy > best_combination[0]:
        best_combination[0] = accuracy
        best_combination[1] = clf[1]
    
    print("Accuracy score: {:.3f} {}".format(accuracy, clf[1]))

print("\nBest stacking model is {} with accuracy of: {:.3f}".format(best_combination[1], best_combination[0]))