In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import math
import string
import datetime
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import words
from nltk.corpus import stopwords
from contractions import CONTRACTION_MAP
from stopwords import stop_words

dat = pd.read_csv('review_ver2.csv', encoding = "ISO-8859-1")
# Change the display size
pd.set_option('display.max_columns',20)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

In [2]:
#this function expands words such as I'll to I will
def expand_contractions(word):
    expanded = ' '.join([CONTRACTION_MAP[t] if t in CONTRACTION_MAP else t for t in word.split(" ") ])
    return expanded

In [3]:
#this function gets the wordnet pos tag
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N":wordnet.NOUN, "V":wordnet.VERB, "R":wordnet.ADV}
    
    return tag_dict.get(tag,wordnet.NOUN)
    
#this function preprocesses the review texts
def preprocessing_text(text):
    #contractions
    expanded_text=expand_contractions(text)
    #remove numbers
    numbers_removed = re.sub(r'\d+','',expanded_text)
    #remove punctuation
    punct_removed = re.sub(r'[^\w\s]','',numbers_removed)
    #tokenization
    tokens = nltk.word_tokenize(punct_removed.lower())
    
    #remove stop words and lemmatization
    lem_words = []
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    for word in tokens:
        temp_word = lemmatizer.lemmatize(word,get_wordnet_pos(word))
        if  temp_word not in stop_words:
            lem_words.append(temp_word)

    return lem_words

    return tokens


In [4]:
#this function gets the bigram of the review texts
def get_bigram(lem_words):
    
    if len(lem_words) <= 1: #meaning token is just one, unable to perform bigram
        return lem_words
    
    else:
        #bigram is in the form of [('wordA','wordB'),('wordB,'wordC'),...]
        bigrm = list(nltk.bigrams(lem_words))

        #make the bigram in this format ['wordA wordB','wordB wordC',...]
        bigrm_list = []
        separator = ' '
        for i in range(len(bigrm)):
            bigrm_list.append(separator.join(bigrm[i]))   
        return bigrm_list

In [5]:
#applies the preprocessing_text function on all items in the review column
print(datetime.datetime.now())
lem_tokens = dat['review'].apply(preprocessing_text)
print("done 1")
print(datetime.datetime.now())
#applies the get_bigram function on all the items in the review column
bigram_list = lem_tokens.apply(get_bigram)
print("done 2")
print(datetime.datetime.now())

2019-04-29 14:25:25.387006
done 1
2019-04-29 14:46:08.659269
done 2
2019-04-29 14:46:09.475085


In [6]:
#putting the series of review texts into data frame
df_bigram = bigram_list.to_frame()
#concatenating the new data frame with ratings column
result = pd.concat([df_bigram,dat['rating']],axis=1)
print(result.head(10))

                                              review rating
0  [part magic, magic grow, grow boy, boy buy, bu...      4
1  [amaze detail, detail every, every credit, cre...      5
2  [purchase behalf, behalf dad, dad always, alwa...      5
3  [everything really, really need, need see, see...      5
4  [collect glossy, glossy picture, picture great...      5
5  [great book, book extremely, extremely useful,...      5
6  [useful info, info someonelike, someonelike st...      5
7  [well produce, produce good, good quality, qua...      5
8     [happy communication, communication funkybuys]      4
9                                        [great buy]      5


In [7]:
# tf-idf using built-in function
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
#Version 1: using tf-idf max_features to select features

X_train_1 = result['review'].values
# X_train_1 = [['ho','true','ho'],['meow','fish'],['ho','meow','hey','hi']]
Y_train_1 = result['rating'].values
# Y_train_1 = [3,4,5]

def identity_tokenizer(text):
    return text

tfidf_1 = TfidfVectorizer(tokenizer=identity_tokenizer, analyzer='word',preprocessor = identity_tokenizer, max_features =100,lowercase=True)    
X_1 = tfidf_1.fit_transform(X_train_1)

tfidf_val_1 = []
rating_list_1 = []
feature_names = tfidf_1.get_feature_names()
corpus_index = [n for n in range(len(X_train_1))]
rows, cols = X_1.nonzero()
for row, col in zip(rows, cols):
    #print((feature_names[col], corpus_index[row]), X_1[row, col])
    tfidf_val_1.append(X_1[row,col])
    
    try:
        temp = int(Y_train_1[corpus_index[row]])
        
    except:
        pass
    
    rating_list_1.append(temp)
    

df_tfidf_1 = pd.DataFrame({'tfidf_1':tfidf_val_1})
df_rating_1 = pd.DataFrame({'ratings':rating_list_1})
#concatenating the new data frame
FE_result_1 = pd.concat([df_tfidf_1,df_rating_1],axis=1)
print(FE_result_1.head(10))
    

    tfidf_1  ratings
0  0.603714        5
1  0.573855        5
2  0.553371        5
3  1.000000        5
4  1.000000        5
5  1.000000        5
6  0.604235        5
7  0.573313        5
8  0.553365        5
9  0.792581        1


In [21]:
#Version 2: sorts the idf values for each features and select the top highest idf values

X_train_2 = result['review'].values
# X_train_2 = [['ho','true','ho'],['meow','fish'],['ho','hu','hey','hi']]
# Y_train_2 = result['rating'].values
Y_train_2 = [3,4,5]

tfidf_2 = TfidfVectorizer(tokenizer=identity_tokenizer, analyzer='word',preprocessor = identity_tokenizer,lowercase=True)    
X_2 = tfidf_2.fit_transform(X_train_2)

# sorts according to idf
indices = np.argsort(tfidf_2.idf_)[::-1]
# print(indices)
features = tfidf_2.get_feature_names()

tfidf_val_2 = []
rating_list_2 = []

corpus_index = [n for n in range(len(X_train_2))]
rows, cols = X_2.nonzero()
for i in range(100):   #cannot be longer than length of indices, length of indices are the number of unique bigram
    for row, col in zip(rows,cols):
        if col == indices[i]:
#             print(features[col],corpus_index[row],X_2[row,col])
            tfidf_val_2.append(X_2[row,col])
            
            try:
                temp = int(Y_train_2[corpus_index[row]])
        
            except:
                pass
    
            rating_list_2.append(temp)
            
#             rating_list_2.append(Y_train_2[corpus_index[row]])
            
# print(tfidf_val_2)
# print(rating_list_2)
print(len(tfidf_val_2),len(rating_list_2))
df_tfidf_2 = pd.DataFrame({'tfidf_2':tfidf_val_2})
df_rating_2 = pd.DataFrame({'ratings':rating_list_2})
#concatenating the new data frame
FE_result_2 = pd.concat([df_tfidf_2,df_rating_2],axis=1)
print(FE_result_2.head(10))

    tfidf_2  ratings
0  0.223344        5
1  0.137311        5
2  0.121060        5
3  0.091637        5
4  0.222242        5
5  0.091637        5
6  0.121060        5
7  0.121060        5
8  0.089295        5
9  0.091637        5


In [16]:

# Version 3: mutual information classification
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer

def identity_tokenizer(docs):
    return docs

X_3 = np.array(result['review'].values)
Y_3 = np.array(result['rating'].values)

cv = CountVectorizer(tokenizer=identity_tokenizer,preprocessor = identity_tokenizer, max_features = 10)
X_vec = cv.fit_transform(X_3)


# temp = mutual_info_classif(X_vec, Y_3, discrete_features=True)
# print(temp)
corpus_index = [n for n in range(len(X_3))]
row,column = X_vec.nonzero()
# print(X_vec)
# print(len(row))
# print(column)
# print(len(column))
# print(Y_3)
# print(len(Y_3))
res = dict(zip(X_vec[row],mutual_info_classif(X_vec, Y_3, discrete_features=True)))
print(res)

# mutual_info = []
# rating_list_3 = []
# feature_names = cv.get_feature_names()
# corpus_index = [n for n in range(len(X_3))]
# rows, cols = X_vec.nonzero()
# for row, col in zip(rows, cols):
#     print((feature_names[col], corpus_index[row]))
#     tfidf_val_1.append(X_1[row,col])
    
#     try:
#         temp = int(Y_train_1[corpus_index[row]])
        
#     except:
#         pass

TypeError: unhashable type: 'csr_matrix'

In [25]:
#setting each of the featured words and their ratings
#for Version 4

featured_words = []
rates = []
for i in range(len(result['review'].values)):
    for j in range(len(result['review'].values[i])):
        featured_words.append(result['review'].values[i][j])
        rates.append(result['rating'].values[i])
        
# print(featured_words)
print(len(featured_words))
# print(rates)
print(len(rates))



525240
525240


In [193]:
# Version 4: information gain
def information_gain(X, y):

    def _calIg():
        entropy_x_set = 0
        entropy_x_not_set = 0
        for c in classCnt:
            probs = classCnt[c] / float(featureTot)
            entropy_x_set = entropy_x_set - probs * np.log(probs)
            probs = (classTotCnt[c] - classCnt[c]) / float(tot - featureTot)
            entropy_x_not_set = entropy_x_not_set - probs * np.log(probs)
            
        for c in classTotCnt:
            if c not in classCnt:
                probs = classTotCnt[c] / float(tot - featureTot)
                entropy_x_not_set = entropy_x_not_set - probs * np.log(probs)
        results =  entropy_before - ((featureTot / float(tot)) * entropy_x_set
                             +  ((tot - featureTot) / float(tot)) * entropy_x_not_set)
        return results

    
    tot = X.shape[0]
    classTotCnt = {}
    entropy_before = 0
    for i in y:
        if i not in classTotCnt:
            classTotCnt[i] = 1
        else:
            classTotCnt[i] = classTotCnt[i] + 1
    for c in classTotCnt:
        probs = classTotCnt[c] / float(tot)
        entropy_before = entropy_before - probs * np.log(probs)
        

    nz = X.T.nonzero()
    pre = 0
    classCnt = {}
    featureTot = 0
    information_gain = []
    rating = []
    for i in range(0, len(nz[0])):
        if (i != 0 and nz[0][i] != pre):
            for notappear in range(pre+1, nz[0][i]):
                information_gain.append(0)
            ig = _calIg()
            information_gain.append(ig)
            pre = nz[0][i]
            classCnt = {}
            featureTot = 0
        featureTot = featureTot + 1
        yclass = y[nz[0][i]]
        if yclass not in classCnt:
            classCnt[yclass] = 1
        else:
            classCnt[yclass] = classCnt[yclass] + 1
    ig = _calIg()
    information_gain.append(ig)
    

    return np.asarray(information_gain)


# X = np.array(['ho','meow','ho','meow','fish','ho','meow','hey','hi','put','down','good'])
# Y = np.array([1,1,1,4,4,2,2,2,2,5,5,5])

X = np.array(featured_words)
Y = np.array(np.array(rates))

scores = information_gain(X,Y)

for i in range(len(X)):
#     print(i)
#     print(X[i])
#     print(scores[i])
    print(X[i],scores[i])
    


# threshold= np.percentile(sorted(scores),50)
# print(threshold)

part magic 2.954664819365682e-06
magic grow 2.954664819365682e-06
grow boy 2.954664819365682e-06
boy buy 2.954664819365682e-06
buy give 2.954664819365682e-06
give new 2.954664819365682e-06
new hornby 2.954664819365682e-06
hornby catalogue 2.954664819365682e-06
catalogue every 2.954664819365682e-06
every year 2.954664819365682e-06
year even 2.954664819365682e-06
even include 2.954664819365682e-06
include product 2.954664819365682e-06
product previous 2.954664819365682e-06
previous year 2.954664819365682e-06
year still 2.954664819365682e-06
still get 2.954664819365682e-06
get old 2.954664819365682e-06
old one 2.954664819365682e-06
one date 2.954664819365682e-06
date back 2.954664819365682e-06
back somewhere 2.954664819365682e-06
somewhere day 2.954664819365682e-06
day catalogue 2.954664819365682e-06
catalogue especially 2.954664819365682e-06
especially informative 2.954664819365682e-06
informative tell 2.954664819365682e-06
tell vintage 2.954664819365682e-06
vintage roll 2.95466481936568

wagonsof heavy 7.704243915052089e-07
heavy weight 7.704243915052089e-07
excellent ho 7.704243915052089e-07
ho scale 7.704243915052089e-07
scale diorama 7.704243915052089e-07
diorama piece 7.704243915052089e-07
piece comprise 7.704243915052089e-07
comprise white 7.704243915052089e-07
white enclose 7.704243915052089e-07
enclose coach 7.704243915052089e-07
coach pair 7.704243915052089e-07
pair white 7.704243915052089e-07
white horse 7.704243915052089e-07
horse two 7.704243915052089e-07
two figure 7.704243915052089e-07
figure red 7.704243915052089e-07
red drive 7.704243915052089e-07
drive seat 7.704243915052089e-07
seat dress 7.704243915052089e-07
dress grey 7.704243915052089e-07
grey livery 7.704243915052089e-07
livery one 7.704243915052089e-07
one man 7.704243915052089e-07
man grey 7.704243915052089e-07
grey greatcoat 7.704243915052089e-07
greatcoat top 7.704243915052089e-07
top hat 7.704243915052089e-07
hat female 7.704243915052089e-07
female grey 7.704243915052089e-07
grey coat 7.70424

sturdy well 7.704243915052089e-07
well make 7.704243915052089e-07
make interaction 7.704243915052089e-07
interaction train 7.704243915052089e-07
train good 7.704243915052089e-07
good easy 7.704243915052089e-07
easy break 7.704243915052089e-07
break easy 7.704243915052089e-07
easy couple 7.704243915052089e-07
couple train 7.704243915052089e-07
train together 7.704243915052089e-07
together great 7.704243915052089e-07
great product 7.704243915052089e-07
love tomica 7.704243915052089e-07
seem though 7.704243915052089e-07
though gradually 7.704243915052089e-07
gradually purchasing 7.704243915052089e-07
purchasing die 7.704243915052089e-07
die cast 7.704243915052089e-07
cast train 7.704243915052089e-07
train year 7.704243915052089e-07
year old 7.704243915052089e-07
old play 7.704243915052089e-07
play take 7.704243915052089e-07
take along 7.704243915052089e-07
along take 7.704243915052089e-07
take n 7.704243915052089e-07
n play 7.704243915052089e-07
play track 7.704243915052089e-07
track get 

goodquality need 2.954664819365682e-06
need flat 2.954664819365682e-06
flat solid 2.954664819365682e-06
solid base 2.954664819365682e-06
base youll 2.954664819365682e-06
youll need 2.954664819365682e-06
need spend 2.954664819365682e-06
spend time 2.954664819365682e-06
time get 2.954664819365682e-06
get crease 2.954664819365682e-06
crease supply 2.954664819365682e-06
supply xmm 2.954664819365682e-06
xmm midimat 2.954664819365682e-06
midimat toothe 2.954664819365682e-06
toothe detail 2.954664819365682e-06
detail locomotive 2.954664819365682e-06
locomotive carriage 2.954664819365682e-06
carriage pretty 2.954664819365682e-06
pretty good 2.954664819365682e-06
good conn 2.954664819365682e-06
conn rod 2.954664819365682e-06
rod move 2.954664819365682e-06
move nicely 2.954664819365682e-06
nicely see 2.954664819365682e-06
see inside 2.954664819365682e-06
inside carriage 2.954664819365682e-06
carriage white 2.954664819365682e-06
white table 2.954664819365682e-06
table cloth 2.954664819365682e-06


hard backing 2.954664819365682e-06
backing neededthe 2.954664819365682e-06
neededthe track 2.954664819365682e-06
track layout 2.954664819365682e-06
layout feel 2.954664819365682e-06
feel like 2.954664819365682e-06
like next 2.954664819365682e-06
next size 2.954664819365682e-06
size basic 2.954664819365682e-06
basic layout 2.954664819365682e-06
layout already 2.954664819365682e-06
already contains 2.954664819365682e-06
contains extension 2.954664819365682e-06
extension packthe 2.954664819365682e-06
packthe train 2.954664819365682e-06
train huge 2.954664819365682e-06
huge look 2.954664819365682e-06
look silly 2.954664819365682e-06
silly pull 2.954664819365682e-06
pull long 2.954664819365682e-06
long passenger 2.954664819365682e-06
passenger carriage 2.954664819365682e-06
carriage would 2.954664819365682e-06
would real 2.954664819365682e-06
real trainit 2.954664819365682e-06
trainit quite 2.954664819365682e-06
quite expensive 2.954664819365682e-06
expensive see 2.954664819365682e-06
see r

speechless christmas 7.704243915052089e-07
christmas memory 7.704243915052089e-07
memory never 7.704243915052089e-07
never forget 7.704243915052089e-07
forget accord 7.704243915052089e-07
accord best 7.704243915052089e-07
best present 7.704243915052089e-07
present everjorge 7.704243915052089e-07
arrive late 2.954664819365682e-06
late afternoon 2.954664819365682e-06
afternoon look 2.954664819365682e-06
look gorgeous 2.954664819365682e-06
gorgeous however 2.954664819365682e-06
however fly 2.954664819365682e-06
fly scotsman 2.954664819365682e-06
scotsman name 2.954664819365682e-06
name engine 2.954664819365682e-06
engine accord 2.954664819365682e-06
accord abc 2.954664819365682e-06
abc ian 2.954664819365682e-06
ian allan 2.954664819365682e-06
allan book 2.954664819365682e-06
book british 2.954664819365682e-06
british locomotive 2.954664819365682e-06
locomotive class 2.954664819365682e-06
class hornby 2.954664819365682e-06
hornby know 2.954664819365682e-06
know somethig 2.954664819365682e-

boiler band 2.954664819365682e-06
band handrail 2.954664819365682e-06
handrail detract 2.954664819365682e-06
detract otherwise 2.954664819365682e-06
otherwise accurate 2.954664819365682e-06
accurate kit 2.954664819365682e-06
kit fault 2.954664819365682e-06
fault however 2.954664819365682e-06
however rectify 2.954664819365682e-06
rectify care 2.954664819365682e-06
care despite 2.954664819365682e-06
despite basic 2.954664819365682e-06
basic kit 2.954664819365682e-06
kit basis 2.954664819365682e-06
basis year 2.954664819365682e-06
year countless 2.954664819365682e-06
countless motorise 2.954664819365682e-06
motorise work 2.954664819365682e-06
work model 2.954664819365682e-06
model kit 2.954664819365682e-06
kit use 2.954664819365682e-06
use build 2.954664819365682e-06
build gwr 2.954664819365682e-06
gwr city 2.954664819365682e-06
city class 2.954664819365682e-06
class loco 2.954664819365682e-06
loco determine 2.954664819365682e-06
determine kitbashers 2.954664819365682e-06
kitbashers use 2

order pair 7.704243915052089e-07
pair toy 7.704243915052089e-07
toy use 7.704243915052089e-07
use interactive 7.704243915052089e-07
interactive train 7.704243915052089e-07
train set 7.704243915052089e-07
set get 7.704243915052089e-07
get son 7.704243915052089e-07
son christmasfirst 7.704243915052089e-07
christmasfirst impression 7.704243915052089e-07
impression toy 7.704243915052089e-07
toy wife 7.704243915052089e-07
wife chuckle 7.704243915052089e-07
chuckle away 7.704243915052089e-07
away astonishment 7.704243915052089e-07
astonishment smart 7.704243915052089e-07
smart actually 7.704243915052089e-07
actually recognise 7.704243915052089e-07
recognise talk 7.704243915052089e-07
talk name 7.704243915052089e-07
name interact 7.704243915052089e-07
interact couple 7.704243915052089e-07
couple sentence 7.704243915052089e-07
sentence time 7.704243915052089e-07
time seem 7.704243915052089e-07
seem different 7.704243915052089e-07
different phrase 7.704243915052089e-07
phrase imagine 7.70424391

educational value 7.704243915052089e-07
value left 7.704243915052089e-07
left blank 7.704243915052089e-07
blank really 7.704243915052089e-07
really think 7.704243915052089e-07
think educational 7.704243915052089e-07
educational value 7.704243915052089e-07
value look 7.704243915052089e-07
look use 7.704243915052089e-07
use model 7.704243915052089e-07
model railway 7.704243915052089e-07
railway track 7.704243915052089e-07
track addition 7.704243915052089e-07
addition collection 7.704243915052089e-07
collection model 7.704243915052089e-07
model railwaythe 7.704243915052089e-07
railwaythe durable 7.704243915052089e-07
durable rating 7.704243915052089e-07
rating star 7.704243915052089e-07
star feel 7.704243915052089e-07
feel nicely 7.704243915052089e-07
nicely make 7.704243915052089e-07
make solid 7.704243915052089e-07
solid however 7.704243915052089e-07
however sure 7.704243915052089e-07
sure drop 7.704243915052089e-07
drop height 7.704243915052089e-07
height would 7.704243915052089e-07
wo

correct new 7.704243915052089e-07
new bogy 7.704243915052089e-07
bogy use 7.704243915052089e-07
use help 7.704243915052089e-07
help improve 7.704243915052089e-07
improve ride 7.704243915052089e-07
ride secondthird 7.704243915052089e-07
secondthird class 7.704243915052089e-07
class pullman 7.704243915052089e-07
pullman change 7.704243915052089e-07
change logo 7.704243915052089e-07
logo simply 7.704243915052089e-07
simply car 7.704243915052089e-07
car etci 7.704243915052089e-07
etci bought 7.704243915052089e-07
bought full 7.704243915052089e-07
full car 7.704243915052089e-07
car set 7.704243915052089e-07
set use 7.704243915052089e-07
use match 7.704243915052089e-07
match coach 7.704243915052089e-07
coach also 7.704243915052089e-07
also make 7.704243915052089e-07
make hornby 7.704243915052089e-07
hornby hornby 7.704243915052089e-07
hornby list 7.704243915052089e-07
list price 7.704243915052089e-07
price right 7.704243915052089e-07
right get 7.704243915052089e-07
get budget 7.7042439150520

toy child 7.704243915052089e-07
child amazon 7.704243915052089e-07
amazon category 7.704243915052089e-07
category reflect 7.704243915052089e-07
reflect thathowever 7.704243915052089e-07
thathowever turn 7.704243915052089e-07
turn actual 7.704243915052089e-07
actual product 7.704243915052089e-07
product sir 7.704243915052089e-07
sir nigel 7.704243915052089e-07
nigel gresley 7.704243915052089e-07
gresley pacific 7.704243915052089e-07
pacific class 7.704243915052089e-07
class locomotive 7.704243915052089e-07
locomotive model 7.704243915052089e-07
model supply 7.704243915052089e-07
supply tender 7.704243915052089e-07
tender nicely 7.704243915052089e-07
nicely present 7.704243915052089e-07
present scale 7.704243915052089e-07
scale model 7.704243915052089e-07
model highly 7.704243915052089e-07
highly detailed 7.704243915052089e-07
detailed term 7.704243915052089e-07
term paintwork 7.704243915052089e-07
paintwork wheel 7.704243915052089e-07
wheel piston 7.704243915052089e-07
piston pip 7.7042

KeyboardInterrupt: 

In [194]:
#quick sort
'''
Takes the last element as pivot,
places the pivot element at its correct position in sorted array,
then places all smaller than pivot to left of pivot and larger elements to right of pivot
'''

def partition(arr,ratings,low,high):
    i = (low - 1)  #index of smaller element
    pivot = arr[high]
    
    for j in range(low,high):
        #if current element is smaller than or equal to pivot
        if arr[j] <= pivot:
            i = i + 1   #increment index of smaller element
            arr[i],arr[j] = arr[j],arr[i]
            ratings[i],ratings[j] = ratings[j],ratings[i]
            
    arr[i+1],arr[high] = arr[high],arr[i+1]
    ratings[i+1],ratings[high] = ratings[high],ratings[i+1]
    
    return (i+1)

def quickSort(arr,ratings,low,high):
    if low < high:
        pi = partition(arr,ratings,low,high)
        quickSort(arr,ratings,low,pi-1)
        quickSort(arr,ratings,pi+1,high)
        
quickSort(scores,Y,0,len(scores)-1)
print(scores)
print(Y)
        

RecursionError: maximum recursion depth exceeded in comparison

kf KFold(n_splits=10, random_state=None, shuffle=False)
TRAIN:  [ 379  380  381 ... 3783 3784 3785] TEST:  [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 

In [None]:
#OVERSAMPLING

from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
# from sklearn.pipeline import make_pipeline
# from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import BaggingClassifier

#Support Vector Machine
from sklearn.svm import SVC
# from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
# import matplotlib.pyplot as plt
# %matplotlib inline


def SVM_oversampling(X,Y):

    #normalize data
    scaler1 = MinMaxScaler(feature_range=(0,1))
    # tfidf_val_1 = np.array(tfidf_val_1)
    # X = X.reshape(1,-1)
    X = scaler1.fit_transform(X)

    scaler2 = MinMaxScaler(feature_range=(1,5))
    Y = scaler2.fit_transform(Y)
    Y = Y.ravel()  #prevent DataConversionError
    print(len(X),len(Y))

    accuracy = []
    precision = []
    recall = []
    f1 =[]

    cv = KFold(n_splits=10, random_state=42, shuffle = False)

    print(datetime.datetime.now())

    for train_index, test_index in cv.split(X):
        
        print("num: ",datetime.datetime.now())

        X_train,X_test = X[train_index],X[test_index]
        y_train,y_test = Y[train_index],Y[test_index]
        
        

        #Oversampling
        smt = SMOTE()
        X_train_smt,y_train_smt = smt.fit_sample(X_train,y_train)
        X_test_smt,y_test_smt = smt.fit_sample(X_test,y_test)
        
    print("2",datetime.datetime.now())

        #Undersampling
#         nr = NearMiss()
#         X_train_nr,y_train_nr = nr.fit_sample(X_train,y_train)


    svclassifier = OneVsRestClassifier(SVC(kernel='rbf',gamma='auto',cache_size=7000))
    
#         svclassifier = SVC(kernel='rbf',gamma='auto',cache_size=7000)
        
#         svclassifier = OneVsRestClassifier(BaggingClassifier(SVC(kernel='rbf',gamma='scale'),bootstrap='False'))
    
    
    
    print("between",datetime.datetime.now())
        
    svclassifier.fit(X_train_smt,y_train_smt)
        
        
    #     svclassifier.fit(X_train_nr,y_train_nr)
    y_pred = svclassifier.predict(X_test_smt)



    accuracy.append(svclassifier.score(X_test_smt,y_test_smt))


    precision.append(precision_score(y_test_smt,y_pred,average='weighted',labels=np.unique(y_pred)))



    recall.append(recall_score(y_test_smt,y_pred,average='weighted'))



    f1.append(f1_score(y_test_smt,y_pred,average='weighted',labels=np.unique(y_pred)))

    print("done loop")
    
    print("accuracy: {}".format(np.mean(accuracy)))
    print("precision: {}".format(np.mean(precision)))
    print("recall: {}".format(np.mean(recall)))
    print("f1: {}".format(np.mean(f1)))

    print("end",datetime.datetime.now())
                    
    
#     print(confusion_matrix(y_test,y_pred))  
#     print(classification_report(y_test,y_pred))  
#     score.append(y_pred)
#     score.append(best_svr.score(X_test,y_test))
    
# print(np.mean(scores))


X = FE_result_1.iloc[:,[0]]
Y = FE_result_1.iloc[:,[1]]

# X = FE_result_2.iloc[:,[0]]
# Y = FE_result_2.iloc[:,[1]]

print("start",datetime.datetime.now())

SVM_oversampling(X,Y)


start 2019-04-29 14:09:59.374929
164847 164847
2019-04-29 14:09:59.394923
num:  2019-04-29 14:09:59.394923


  return self.partial_fit(X, y)


num:  2019-04-29 14:10:02.194441
num:  2019-04-29 14:10:04.605835
num:  2019-04-29 14:10:08.312827
num:  2019-04-29 14:10:11.218533
num:  2019-04-29 14:10:14.345750
num:  2019-04-29 14:10:17.538130
num:  2019-04-29 14:10:21.360945
num:  2019-04-29 14:10:24.226423
num:  2019-04-29 14:10:27.277654
2 2019-04-29 14:10:30.268905
between 2019-04-29 14:10:30.268905


In [38]:
#UNDERSTAMPLING

from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
# from sklearn.pipeline import make_pipeline
# from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import BaggingClassifier

#Support Vector Machine
from sklearn.svm import SVC
# from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report,confusion_matrix
# import matplotlib.pyplot as plt
# %matplotlib inline


def SVM_undersampling(X,Y):

    #normalize data
    scaler1 = MinMaxScaler(feature_range=(0,1))
    # tfidf_val_1 = np.array(tfidf_val_1)
    # X = X.reshape(1,-1)
    X = scaler1.fit_transform(X)

    scaler2 = MinMaxScaler(feature_range=(1,5))
    Y = scaler2.fit_transform(Y)
    Y = Y.ravel()  #prevent DataConversionError
    print(len(X),len(Y))

    accuracy = []
    precision = []
    recall = []
    f1 =[]

    cv = KFold(n_splits=10, random_state=42, shuffle = False)

    print(datetime.datetime.now())

    for train_index, test_index in cv.split(X):

        X_train,X_test = X[train_index],X[test_index]
        y_train,y_test = Y[train_index],Y[test_index]

        #Undersampling
        nr = NearMiss()
        X_train_nr,y_train_nr = nr.fit_sample(X_train,y_train)
#         X_test_nr,y_test_nr = nr.fit_sample(X_test,y_test)


#         svclassifier = OneVsRestClassifier(BaggingClassifier(SVC(kernel='rbf',gamma='scale'),bootstrap='False'))
#         svclassifier.fit(X_train_smt,y_train_smt)
#     print(X_train
    svclassifier = OneVsRestClassifier(SVC(kernel='rbf',gamma='auto',cache_size=7000))
#     svclassifier = SVC(kernel='rbf',gamma='auto',cache_size=7000)


    svclassifier.fit(X_train_nr,y_train_nr)
    y_pred = svclassifier.predict(X_test)

    accuracy.append(svclassifier.score(X_test,y_test))
    precision.append(precision_score(y_test,y_pred,average='macro',labels=np.unique(y_pred)))
    recall.append(recall_score(y_test,y_pred,average='macro'))
    f1.append(f1_score(y_test,y_pred,average='macro',labels=np.unique(y_pred)))
        
#         print("done loop")
        
    print("accuracy: {}".format(np.mean(accuracy)))
    print("precision: {}".format(np.mean(precision)))
    print("recall: {}".format(np.mean(recall)))
    print("f1: {}".format(np.mean(f1)))

    print(datetime.datetime.now())
                    
    
    print(confusion_matrix(y_test,y_pred))  
#     print(classification_report(y_test,y_pred))  
#     score.append(y_pred)
#     score.append(best_svr.score(X_test,y_test))
    
# print(np.mean(scores))


X = FE_result_1.iloc[:,[0]]
Y = FE_result_1.iloc[:,[1]]

# X = ['0.1','0.2','0.3','0.4','0.5','0.6','0.7','0.8','0.9','1.0']
# Y = [1,2,3,4,5,5,4,3,2,1]

SVM_undersampling(X,Y)


  return self.partial_fit(X, y)


29172 29172
2019-04-29 16:52:30.257464
accuracy: 0.01576962632841961
precision: 0.01576962632841961
recall: 0.2
f1: 0.0310496118798515
2019-04-29 16:52:31.970482
[[  46    0    0    0    0]
 [  30    0    0    0    0]
 [ 133    0    0    0    0]
 [ 585    0    0    0    0]
 [2123    0    0    0    0]]


In [41]:
#without resAMPLING

from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
# from sklearn.pipeline import make_pipeline
# from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import BaggingClassifier

#Support Vector Machine
from sklearn.svm import SVC
# from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report,confusion_matrix
# import matplotlib.pyplot as plt
# %matplotlib inline


def SVM_undersampling(X,Y):

    #normalize data
    scaler1 = MinMaxScaler(feature_range=(0,1))
    # tfidf_val_1 = np.array(tfidf_val_1)
    # X = X.reshape(1,-1)
    X = scaler1.fit_transform(X)

    scaler2 = MinMaxScaler(feature_range=(1,5))
    Y = scaler2.fit_transform(Y)
    Y = Y.ravel()  #prevent DataConversionError
    print(len(X),len(Y))

    accuracy = []
    precision = []
    recall = []
    f1 =[]

    cv = KFold(n_splits=10, random_state=42, shuffle = False)

    print(datetime.datetime.now())

    for train_index, test_index in cv.split(X):

        X_train,X_test = X[train_index],X[test_index]
        y_train,y_test = Y[train_index],Y[test_index]

        #Undersampling
#         nr = NearMiss()
#         X_train_nr,y_train_nr = nr.fit_sample(X_train,y_train)
#         X_test_nr,y_test_nr = nr.fit_sample(X_test,y_test)


#         svclassifier = OneVsRestClassifier(BaggingClassifier(SVC(kernel='rbf',gamma='scale'),bootstrap='False'))
#         svclassifier.fit(X_train_smt,y_train_smt)
    print(X_train)
#     svclassifier = OneVsRestClassifier(SVC(kernel='rbf',gamma='auto',cache_size=7000))
    svclassifier = SVC(kernel='rbf',gamma='auto',cache_size=7000)


    svclassifier.fit(X_train,y_train)
    y_pred = svclassifier.predict(X_test)

    accuracy.append(svclassifier.score(X_test,y_test))
    precision.append(precision_score(y_test,y_pred,average='macro',labels=np.unique(y_pred)))
    recall.append(recall_score(y_test,y_pred,average='macro'))
    f1.append(f1_score(y_test,y_pred,average='macro',labels=np.unique(y_pred)))
        
#         print("done loop")
        
    print("accuracy: {}".format(np.mean(accuracy)))
    print("precision: {}".format(np.mean(precision)))
    print("recall: {}".format(np.mean(recall)))
    print("f1: {}".format(np.mean(f1)))

    print(datetime.datetime.now())
                    
    
    print(confusion_matrix(y_test,y_pred))  
#     print(classification_report(y_test,y_pred))  
#     score.append(y_pred)
#     score.append(best_svr.score(X_test,y_test))
    
# print(np.mean(scores))


X = FE_result_1.iloc[:,[0]]
Y = FE_result_1.iloc[:,[1]]

# X = ['0.1','0.2','0.3','0.4','0.5','0.6','0.7','0.8','0.9','1.0']
# Y = [1,2,3,4,5,5,4,3,2,1]

SVM_undersampling(X,Y)


  return self.partial_fit(X, y)


29172 29172
2019-04-29 17:08:25.306048
[[0.52571293]
 [0.48997685]
 [0.46546155]
 ...
 [0.44552097]
 [0.81301517]
 [1.        ]]
accuracy: 0.7278025368529311
precision: 0.7278025368529311
recall: 0.2
f1: 0.8424603174603175
2019-04-29 17:08:36.043899
[[   0    0    0    0   46]
 [   0    0    0    0   30]
 [   0    0    0    0  133]
 [   0    0    0    0  585]
 [   0    0    0    0 2123]]


525240


In [18]:
from info_gain import info_gain

# Example of color to indicate whether something is fruit or vegatable
produce = ['apple', 'apple', 'apple', 'strawberry', 'eggplant']
fruit   = [ True  ,  True  ,  True  ,  True       ,  False    ]
colour  = ['green', 'green', 'red'  , 'red'       , 'purple'  ]

ig  = info_gain.info_gain(fruit, colour)
iv  = info_gain.intrinsic_value(fruit, colour)
igr = info_gain.info_gain_ratio(fruit, colour)

print(ig, iv, igr)

TypeError: object of type 'bool' has no len()