# CLASSIFYING AMAZON FOOD REVIEWS USING GBDT Classifier

In [1]:
# Importing the libraries required for our task

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

In [2]:
# Importing the required sqlite database which contains the reviews

con = sqlite3.connect('database.sqlite')

'''
Our objective is to check whether the review is positive or negative.

The dataset originally consists of reviews from 1 to 5. We will consider reviews which are rated as 4 and 5 to be positive, reviews 
which are rated 1 and 2 to be negative. As, we cannot draw any conclusions from review which is rated 3 star we will eliminate
all the reviews rated 3 star.

'''

# Filtering out the data w/o the 3 star reviews

filtered_data = pd.read_sql_query("""SELECT * FROM Reviews WHERE Score != 3""", con) 

In [3]:
# Checking whether the data has been filtered properly

filtered_data['Score'].unique()

# We can see that there is no 3 star review in the data

array([5, 1, 4, 2], dtype=int64)

In [4]:
# As we have eliminated the data with 3 start reviews, we will label the remaining data(4 and 5 scores) as positive and negative(1 and 2 scores).

# Creating a function to label the data
def partition(x):
    if x < 3:
        return '0'
    return '1'


# Applying the labels to the data

actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition) 
filtered_data['Score'] = positiveNegative

In [5]:
# Checking the first few rows of the data

filtered_data.head(10)

# We can see that the score has been changed to positive and negative instead of 5,4,1,2

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,0,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,1,1350777600,Great taffy,Great taffy at a great price. There was a wid...
5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,1,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...
6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,1,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,1,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,1,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...
9,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,1,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...


In [6]:
# Let us look at the shape of the data

filtered_data.shape

(525814, 10)

In [7]:
# Dropping any duplicates if they are present in the data

duplicates_dropped=filtered_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
duplicates_dropped.shape

(364173, 10)

In [8]:
# Eliminating the rows where helpfulness numerator is greator than the helfulness denominator

final=duplicates_dropped[duplicates_dropped.HelpfulnessNumerator<=duplicates_dropped.HelpfulnessDenominator]

In [9]:
# Taking a look at the shape of the data

final.shape

(364171, 10)

In [10]:
# We will select only the required columns of which we will filter some reviews and assign them their labels and sort them wrt time

final_data = final[['ProductId','Time','Text','Score']]

In [11]:
# Checking the dimension of the data
final_data.shape


(364171, 4)

In [12]:
# Randomly selecting some reviews from the 'final_data' data

# First we will extract the values from the given dataframe

X=final_data.iloc[:,:].values

# randomly extracting 15000 reviews from the dataset

import random

n = 364171
m = 15000
p = m/n

sampled_data = [];

for i in range(0,n):
    if random.random() <= p:
        sampled_data.append(X[i,:])

In [13]:
# Assigning the extracted data to a dataframe

names = ['ProductId','Time','Text','Score']

sample = pd.DataFrame(sampled_data,columns= names)

In [14]:
# Checking the dimensions of the sampled data

sample.shape

(15109, 4)

In [15]:
# Checking the first few rows of the data

sample.head()

Unnamed: 0,ProductId,Time,Text,Score
0,B006K2ZZ7K,1340150400,This saltwater taffy had great flavors and was...,1
1,B001EO5QW8,1166313600,This is a good instant oatmeal from the best o...,1
2,B001EO5QW8,1191715200,I really like the Maple and Brown Sugar flavor...,1
3,B0019CW0HE,1333670400,My dog has a ton of allergies both environment...,1
4,B0019CW0HE,1330041600,My golden retriever is one of the most picky d...,1


In [16]:
# Now we will sort the data according to timestamp

sorted_data=sample.sort_values('Time', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')

In [17]:
# Checking if the data has been sorted or not

sorted_data.head(15)

Unnamed: 0,ProductId,Time,Text,Score
6986,B00004RYGX,1052265600,Embarrassing comedy that comes over like a chi...,0
10971,B0000TL5HI,1073606400,I purchased the organic fruit basket as a gift...,1
7040,B0000DG58Q,1074470400,For many years I tried various meat sauces: Am...,1
5004,B0001AGLV6,1074643200,I got some of this blend thinking that it woul...,1
6490,B0000DGFAC,1075420800,The plant is very healthy. It arrived quickly ...,0
144,B00016UX0K,1081555200,Mae Ploy Sweet Chili Sauce is becoming a stand...,1
10157,B0000VABYE,1082937600,Took along time getting here about 2 1/2 weeks...,0
5387,B0001ES9F8,1083110400,The Senseo machine shipped with two 18-pod sam...,0
9589,B0000CEQ6H,1084492800,"""We use and believe in stone milling because n...",1
10632,B0000TVUE6,1085702400,This cheese is wonderful. It arrived well chi...,1


In [18]:
# We can see that the data has been sorted wrt time, now we will only consider the 'Text' and 'Score' columns henceforth

sorted_final = sorted_data[['Text','Score']]


In [19]:
# Checking the first few rows of the data to ensure we have the right data

sorted_final.head()

Unnamed: 0,Text,Score
6986,Embarrassing comedy that comes over like a chi...,0
10971,I purchased the organic fruit basket as a gift...,1
7040,For many years I tried various meat sauces: Am...,1
5004,I got some of this blend thinking that it woul...,1
6490,The plant is very healthy. It arrived quickly ...,0


In [20]:
# The next task is to clean the text data so that it can be fed to the model

# Checking if there are unknown elements in the data

# find sentences containing HTML tags

import re

i=0;
for sent in sorted_final['Text'].values:
    if (len(re.findall('<.*?>', sent))):
        print(i)
        print(sent)
        break;
    i += 1;   
    
# We can see that the data contains html tags, we will need to remove those

3
I got some of this blend thinking that it would be a little different on steamed vegetables or greens, but when I tried it on eggs, I was astounded!  Used sparingly, it really makes plain old scrambled eggs into a taste treat.<p>The key  with all of these spice blends is to USE SPARINGLY!  I've used most of the other blends as well - a little goes a long way towards creating subtle overtones in your dishes.  Another good one is &quot;Are You Game?&quot;  Mostly I use that for meat dishes, but it's really good in lasagne and spaghetti sauce too.


In [21]:
'''
We will perform the data cleaning steps on the text data.
For that we will import some packages for stopwords removal, word stemmatization and cleaning html and punctuation marks.
'''
import re
from nltk.corpus import stopwords

nltk.download('stopwords')

stop = set(stopwords.words('english')) #set of stopwords
sno = nltk.stem.SnowballStemmer('english') #initialising the snowball stemmer

def cleanhtml(sentence): #function to clean the word of any html-tags
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext
def cleanpunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return  cleaned

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kulkarni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
#Code for implementing step-by-step the checks mentioned in the pre-processing phase

i=0
str1=' '
final_string=[]
all_positive_words=[] # store words from +ve reviews here
all_negative_words=[] # store words from -ve reviews here.
s=''
for sent in sorted_final['Text'].values:
    filtered_sentence=[]
    #print(sent);
    sent=cleanhtml(sent) # remove HTMl tags
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if((cleaned_words.isalpha()) & (len(cleaned_words)>2)):    
                if(cleaned_words.lower() not in stop):
                    s=(sno.stem(cleaned_words.lower())).encode('utf8')
                    filtered_sentence.append(s)
                    if (final['Score'].values)[i] == 'positive': 
                        all_positive_words.append(s) #list of all words used to describe positive reviews
                    if(final['Score'].values)[i] == 'negative':
                        all_negative_words.append(s) #list of all words used to describe negative reviews reviews
                else:
                    continue
            else:
                continue 
    #print(filtered_sentence)
    str1 = b" ".join(filtered_sentence) #final string of cleaned words
    #print("***********************************************************************")
    
    final_string.append(str1)
    i+=1

In [23]:
sorted_final['Cleaned_Text'] = final_string

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


# We will do the classification based on various representations of the text data i.e BoW, Tf-Idf

# Using the Bag of Words text representation to create our first RandomForest models

In [24]:
# Creating the BoW representation of the text

count_vect = CountVectorizer() #in scikit-learn
final_counts = count_vect.fit_transform(sorted_final['Cleaned_Text'].values).todense()

In [25]:
# Assigning the final_tf_idf data to 'X' variable

X = final_counts

In [26]:
# Assinging the score to 'y' variable

y = np.array(sorted_final['Score'])

# We will first split the data into train and test sets, then split the train data in to train and cross-validation sets

In [27]:
# Splitting the data into train test sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # Splitting train test with 70:30 ratio




In [28]:
# Splitting the train data into cross validation and test datasets

X_tr, X_CV, y_tr, y_CV = train_test_split(X_train, y_train, test_size=0.3) # Splitting train cross-val with 70:30 ratio

# We will be using the GradientBoostingClassifier algorithm to classify the data as positive and negative. Here, we will use  'n_estimator' as the hyperparameter which will take various values to determine the best estimator depending upon the missclassification error.

In [30]:

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# We have taken the following values for 'alpha'
estimators = [10,50,100,200]
max_depth = [2,3,5,10]
accuracy_scores = []

for i in estimators:
    for j in max_depth:
        gbdt = GradientBoostingClassifier(n_estimators = i, max_depth = j)
        gbdt.fit(X_tr, y_tr)
        pred_bow = gbdt.predict(X_CV)
        # evaluate CV accuracy
        acc_bow = accuracy_score(y_CV, pred_bow, normalize=True) # * float(100)
        accuracy_scores.append(acc_bow)
        MissClErrBow = [1-x for x in accuracy_scores]
        print("for n_estimators =", i,"and max depth = ", j)
        print('Miss Classification Error:',min(MissClErrBow))

for n_estimators = 10 and max depth =  2
Miss Classification Error: 0.15663410022061142
for n_estimators = 10 and max depth =  3
Miss Classification Error: 0.15316734951150335
for n_estimators = 10 and max depth =  5
Miss Classification Error: 0.14339741569492592
for n_estimators = 10 and max depth =  10
Miss Classification Error: 0.13803971005357707
for n_estimators = 50 and max depth =  2
Miss Classification Error: 0.13803971005357707
for n_estimators = 50 and max depth =  3
Miss Classification Error: 0.12826977623699964
for n_estimators = 50 and max depth =  5
Miss Classification Error: 0.12763945792625275
for n_estimators = 50 and max depth =  10
Miss Classification Error: 0.12669398046013236
for n_estimators = 100 and max depth =  2
Miss Classification Error: 0.12669398046013236
for n_estimators = 100 and max depth =  3
Miss Classification Error: 0.12322722975102429
for n_estimators = 100 and max depth =  5
Miss Classification Error: 0.11944531988654272
for n_estimators = 100 and 

# We can see that for the estimator value of 200 and max depth of 10 we get the minimum cross validation error.

# Creating a Gradient Boosting Classifier model with BoW text representation with estimator as 200 and depth as 10

In [33]:
gbdt_bow_final = GradientBoostingClassifier(n_estimators=200, max_depth = 10)
gbdt_bow_final.fit(X_tr, y_tr)
pred_bow_final = gbdt_bow_final.predict(X_test)

# evaluate CV accuracy
acc_bow = accuracy_score(y_test, pred_bow_final, normalize=True) * float(100)

print('\nThe accuracy of the GBDT Classifier for depth = 10 and n_estimators = 200 is %f%%' % (acc_bow))



The accuracy of the GBDT Classifier for depth = 10 and n_estimators = 200 is 88.550629%


# Using the TF-IDF text representation to create our Naive Bayes models

In [32]:
# In this stage we have initialized the tf-idf vectorizer and applied it to the text data which has been stored in the final_tf_idf vriable

tf_idf_vect = TfidfVectorizer()
final_tf_idf = tf_idf_vect.fit_transform(sorted_final['Cleaned_Text'].values).todense()

In [33]:
# Assigning the final_tf_idf data to 'X1' variable

X1 = final_tf_idf

In [34]:
# Assinging the score to 'y1' variable

y1 = np.array(sorted_final['Score'])

In [None]:
# Splitting the data into train test sets

from sklearn.model_selection import train_test_split

X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.3) # Splitting train test with 70:30 ratio

In [38]:
# Splitting the train data into train and cross validation sets

X_tr1, X_CV1, y_tr1, y_CV1 = train_test_split(X_train1, y_train1, test_size=0.3) # Splitting train cross-val with 70:30 ratio

In [38]:

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# We have taken the following values for 'alpha'
estimators = [10,50,100,200]
max_depth = [2,3,5,10]
accuracy_scores_tfidf = []

for i in estimators:
    for j in max_depth:
        gbdt_tfidf = GradientBoostingClassifier(n_estimators = i, max_depth = j)
        gbdt_tfidf.fit(X_tr1, y_tr1)
        pred_tfidf = gbdt_tfidf.predict(X_CV1)
        # evaluate CV accuracy
        acc_tfidf = accuracy_score(y_CV1, pred_tfidf, normalize=True) # * float(100)
        accuracy_scores_tfidf.append(acc_tfidf)
        MissClErrtfidf = [1-x for x in accuracy_scores]
        print("for n_estimators =", i,"and max depth = ", j)
        print('Miss Classification Error:',MissClErrtfidf)





CV accuracy for alpha = 0.00001 is 88%

CV accuracy for alpha = 0.00010 is 89%

CV accuracy for alpha = 0.00100 is 83%

CV accuracy for alpha = 0.10000 is 83%

CV accuracy for alpha = 1.00000 is 83%

CV accuracy for alpha = 10.00000 is 83%

CV accuracy for alpha = 100.00000 is 83%

CV accuracy for alpha = 1000.00000 is 83%


# We can see that for the alpha value of 0.001 we get the maximum cross validation accuracy.

# Creating a SGDClassifier model with TF-IDF text representation with an alpha value of 0.001

In [40]:
sgd_tfidf = linear_model.SGDClassifier(alpha = 0.0001)
sgd_tfidf.fit(X_tr1, y_tr1)
pred_acc1 = sgd_tfidf.predict(X_test1)
acc_tfidf = accuracy_score(y_test1, pred_acc1, normalize=True) * float(100)
print('\nTest accuracy for alpha = 0.001 is %d%%' % (acc_tfidf))




Test accuracy for alpha = 0.001 is 89%


# Using the W2Vec representation of the text data to apply the SGD Classifier

In [34]:
# Importing the required models for the project

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle



In [35]:
import gensim
i=0
list_of_sent=[]
for sent in sorted_final['Text'].values:
    filtered_sentence=[]
    sent=cleanhtml(sent)
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if(cleaned_words.isalpha()):    
                filtered_sentence.append(cleaned_words.lower())
            else:
                continue 
    list_of_sent.append(filtered_sentence)
    

In [36]:
w2v_model=gensim.models.Word2Vec(list_of_sent,min_count=5,size=50, workers=3)



In [37]:
words = list(w2v_model.wv.vocab)
print(len(words))

7834


# Creating an Avg W2Vec representation of each review

In [38]:
# Computing the Avg W2Vec representation of each review and storing it in 'sent_vectors' list

sent_vectors = []; # the avg-w2v for each sentence/review is stored in this list
for sent in list_of_sent: # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
        except:
            pass
    sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

15109
50


In [39]:
# Storing the list in a variable 'X2'

X2 = sent_vectors

In [40]:
# Storing the review score in a 'y2' variable

y2 = sorted_final['Score']

In [41]:
# Splitting the data into train test sets

#from sklearn.model_selection import train_test_split

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3) # Splitting train test with 70:30 ratio

In [42]:
# Splitting the train data into train and cross validation sets

X_tr2, X_CV2, y_tr2, y_CV2 = train_test_split(X_train2, y_train2, test_size=0.3) # Splitting train cross-val with 70:30 ratio

In [44]:

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# We have taken the following values for 'alpha'
estimators = [10,50,100,200]
max_depth = [2,3,5,10]
accuracy_scores_avgw2vec = []

for i in estimators:
    for j in max_depth:
        gbdt_avgw2vec = GradientBoostingClassifier(n_estimators = i, max_depth = j)
        gbdt_avgw2vec.fit(X_tr2, y_tr2)
        pred_avgw2vec = gbdt_avgw2vec.predict(X_CV2)
        # evaluate CV accuracy
        acc_avgw2vec = accuracy_score(y_CV2, pred_avgw2vec, normalize=True) # * float(100)
        accuracy_scores_avgw2vec.append(acc_avgw2vec)
        MissClErravgw2vec = [1-x for x in accuracy_scores_avgw2vec]
        print("for n_estimators =", i,"and max depth = ", j)
        print('Miss Classification Error:',min(MissClErravgw2vec))
    

for n_estimators = 10 and max depth =  2
Miss Classification Error: 0.15001575795776867
for n_estimators = 10 and max depth =  3
Miss Classification Error: 0.15001575795776867
for n_estimators = 10 and max depth =  5
Miss Classification Error: 0.1465490072486606
for n_estimators = 10 and max depth =  10
Miss Classification Error: 0.1465490072486606
for n_estimators = 50 and max depth =  2
Miss Classification Error: 0.14119130160731175
for n_estimators = 50 and max depth =  3
Miss Classification Error: 0.13803971005357707
for n_estimators = 50 and max depth =  5
Miss Classification Error: 0.13772455089820357
for n_estimators = 50 and max depth =  10
Miss Classification Error: 0.13772455089820357
for n_estimators = 100 and max depth =  2
Miss Classification Error: 0.13772455089820357
for n_estimators = 100 and max depth =  3
Miss Classification Error: 0.13614875512133628
for n_estimators = 100 and max depth =  5
Miss Classification Error: 0.13236684525685471
for n_estimators = 100 and ma

# We can see that for the estimator value of 200 and depth value of 10 we get the minimum miss classification error.

# Creating a GBDT Classifier model with avgW2Vec text representation with an estimator value of 200 and max_depth value of 10.

In [46]:
gbdt_avgw2vec_final = GradientBoostingClassifier(n_estimators=200, max_depth = 10)
gbdt_avgw2vec_final.fit(X_tr2, y_tr2)
pred_avgw2vec_final = gbdt_avgw2vec_final.predict(X_test2)

# evaluate CV accuracy
acc_avgw2vec = accuracy_score(y_test2, pred_avgw2vec_final, normalize=True) * float(100)

print('\nThe accuracy of the GBDT Classifier for depth = 10 and n_estimators = 200 is %f%%' % (acc_avgw2vec))


The accuracy of the GBDT Classifier for depth = 10 and n_estimators = 200 is 86.234282%


# Creating a Tf-Idf weighted W2Vec representation of each review

In [48]:
tf_idf_vect = TfidfVectorizer()
final_tf_idf = tf_idf_vect.fit_transform(sorted_final['Cleaned_Text'].values).todense()

# TF-IDF weighted Word2Vec
tfidf_feat = tf_idf_vect.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf

tfidf_sent_vectors = []; # the tfidf-w2v for each sentence/review is stored in this list
row=0;
for sent in list_of_sent: # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try:
            vec = w2v_model.wv[word]
            # obtain the tf_idfidf of a word in a sentence/review
            tf_idf = final_tf_idf[row, tfidf_feat.index(word)]
            #tfidf = final_tf_idf[row, tfidf_feat.index(word)]
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
        except:
            pass
    sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)
    row += 1



In [49]:
# Checking for any 'NaN' values in the data

np.isnan(tfidf_sent_vectors).any()

# We can see that there are na values in the data

True

In [50]:
# We will replace all the 'Nan' values w/ mean of the respective columns


from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
data_tfidf = imp.fit_transform(tfidf_sent_vectors) # Assigning the imputed matrix w/o Nan values to data variable

In [51]:
# Checking if our imputation has succeded

np.isnan(data_tfidf).any()

# We can see that there are no na values any more.

False

In [52]:
# Assigning the 'data_tfidf' matrix to 'X3' variable

X3 = data_tfidf

In [53]:
# Assigning the scores to the 'y3' variable

y3 = sorted_final['Score']

In [54]:
# Splitting the data into train test sets

#from sklearn.model_selection import train_test_split

X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=0.3) # Splitting train test with 70:30 ratio

In [55]:
# Splitting the train data into train and cross validation sets

X_tr3, X_CV3, y_tr3, y_CV3 = train_test_split(X_train3, y_train3, test_size=0.3) # Splitting train cross-val with 70:30 ratio

In [57]:

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# We have taken the following values for 'alpha'
estimators = [10,50,100,200]
max_depth = [2,3,5,10]
accuracy_scores_tfidfw2vec = []

for i in estimators:
    for j in max_depth:
        gbdt_tfidfw2vec = GradientBoostingClassifier(n_estimators = i, max_depth = j)
        gbdt_tfidfw2vec.fit(X_tr3, y_tr3)
        pred_tfidfw2vec = gbdt_tfidfw2vec.predict(X_CV3)
        # evaluate CV accuracy
        acc_tfidfw2vec = accuracy_score(y_CV3, pred_tfidfw2vec, normalize=True) # * float(100)
        accuracy_scores_tfidfw2vec.append(acc_tfidfw2vec)
        MissClErrtfidfw2vec = [1-x for x in accuracy_scores_tfidfw2vec]
        print("for n_estimators =", i,"and max depth = ", j)
        print('Miss Classification Error:',min(MissClErrtfidfw2vec))

for n_estimators = 10 and max depth =  2
Miss Classification Error: 0.1594705326189726
for n_estimators = 10 and max depth =  3
Miss Classification Error: 0.1594705326189726
for n_estimators = 10 and max depth =  5
Miss Classification Error: 0.1588402143082257
for n_estimators = 10 and max depth =  10
Miss Classification Error: 0.1588402143082257
for n_estimators = 50 and max depth =  2
Miss Classification Error: 0.1588402143082257
for n_estimators = 50 and max depth =  3
Miss Classification Error: 0.1588402143082257
for n_estimators = 50 and max depth =  5
Miss Classification Error: 0.1578947368421053
for n_estimators = 50 and max depth =  10
Miss Classification Error: 0.1578947368421053
for n_estimators = 100 and max depth =  2
Miss Classification Error: 0.1578947368421053
for n_estimators = 100 and max depth =  3
Miss Classification Error: 0.1575795776867318
for n_estimators = 100 and max depth =  5
Miss Classification Error: 0.15694925937598492
for n_estimators = 100 and max depth 

# We can see that for the estimator value of 200 and depth value of 5 we get the minimum miss classification error.

# Creating a GBDT Classifier model with tfidfW2Vec text representation with an estimator value of 200 and max_depth value of 5.

In [59]:
gbdt_tfidfw2vec_final = GradientBoostingClassifier(n_estimators=200, max_depth = 5)
gbdt_tfidfw2vec_final.fit(X_tr3, y_tr3)
pred_tfidfw2vec_final = gbdt_tfidfw2vec_final.predict(X_test3)

# evaluate CV accuracy
acc_tfidfw2vec = accuracy_score(y_test3, pred_tfidfw2vec_final, normalize=True) * float(100)

print('\nThe accuracy of the GBDT Classifier for depth = 5 and n_estimators = 200 is %f%%' % (acc_tfidfw2vec))


The accuracy of the GBDT Classifier for depth = 5 and n_estimators = 200 is 84.888595%


# In this problem we have applied the Random Forest Classifier to the data. We have only considered data in the form of BoW, AvgW2Vec and TF-IDF W2Vec.

# We have not considered the TF-IDF representation because the time-complexity was very large as observed while running the operations on the data.

# Following results have been observed for the data using the available text representations.

# We can observe that we get maximum accuracy for BoW representation of the text data after applying the GBDT algorithm.