In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazonreviews/train.ft.txt.bz2
/kaggle/input/amazonreviews/test.ft.txt.bz2


In [2]:
import bz2

In [3]:
def labels_text(x):
  label=[]
  text=[]
  for line in bz2.BZ2File(x):
    decode = line.decode("utf-8")
    label.append(int(decode[9]) - 1)
    text.append(decode[10:].strip())
  return np.array(label),text

train_label, train_text = labels_text('/kaggle/input/amazonreviews/train.ft.txt.bz2')
test_label, test_text = labels_text('/kaggle/input/amazonreviews/test.ft.txt.bz2')

In [5]:
from sklearn.utils import shuffle
train_text, train_label = shuffle(train_text, train_label)
test_text, test_label = shuffle(test_text, test_label)

In [6]:
train_text[0]

"Worth getting a degree to understand it: Ok, I admit it. This book is very tough to understand. However, I will say this much: I just finished an undergraduate degree in philosophy, and I consider that entire degree worth every penny, worth every moment spent reading or writing if the only thing I gain from it is to be able to understand this book to a degree. I mean that sincerely. I recommend smoking cigars and drinking espresso and fine wine while plowing your way through this thick masterpiece. After spending a couple of weeks getting used to the book, I got up to 10 pages per hour. I'm sure that sounds like I'm exaggerating, but I'm not. This book certainly ranks in the top 5 favorite books of all time for me, and likely always will. It's truly excellent, and well worth the effort."

In [7]:
train_label[0]

1

In [8]:
len(train_text)

3600000

In [9]:
len(test_text)

400000

In [10]:
train_text=train_text[0:10000]
train_label=train_label[0:10000]

In [11]:
test_text=test_text[0:2500]
test_label=test_label[0:2500]

In [12]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus_train=[]

for i in range(10000):
    review = re.sub('".*?"', '', train_text[i]) #removing any word within quotation marks
    review = re.sub('[^a-zA-Z]', ' ', review) #keeping only letters and removing anything else
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)
    corpus_train.append(review)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus_train).toarray()
y = train_label[0:10000]

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [15]:
from sklearn.svm import SVC
classifier = SVC(C=3,kernel = 'rbf', random_state = 0)#2.0=84.52,1.75=84.72(83.16),1.5=84.84(83.13),3.25=84.6
classifier.fit(X_train, y_train)

SVC(C=3, random_state=0)

In [16]:
y_pred = classifier.predict(X_val)

In [17]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_val, y_pred)
print(cm)
acc = accuracy_score(y_val, y_pred)
print(acc)

[[1060  179]
 [ 233 1028]]
0.8352


In [18]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 83.01 %
Standard Deviation: 0.89 %


In [19]:
corpus_test=[]
for i in range(2500):
    review = re.sub('".*?"', '', test_text[i]) #removing any word within quotation marks
    review = re.sub('[^a-zA-Z]', ' ', review) #keeping only letters and removing anything else
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)
    corpus_test.append(review)

In [20]:
X_test = cv.transform(corpus_test).toarray()
y_test = test_label[0:2500]

In [21]:
y_pred_new = classifier.predict(X_test)

In [22]:
cm_final = confusion_matrix(y_test, y_pred_new)
print(cm_final)
acc_final = accuracy_score(y_test, y_pred_new)
print(acc_final)

[[1071  165]
 [ 210 1054]]
0.85


In [25]:
new_review = input('Enter review: ')
new_review = re.sub('".*?"', '', new_review)
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
new_review = [ps.stem(word) for word in new_review if not word in set(all_stopwords)]
new_review = ' '.join(new_review)
new_corpus = [new_review]
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

Enter review: decent book, hated it
[0]


In [None]:
"""from sklearn.model_selection import GridSearchCV
parameters = [{'C': [0.25, 0.5, 0.75, 2, 3], 'kernel': ['rbf'], 'gamma': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]}]
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 5)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy.mean()*100))
print("Best Parameters:", best_parameters)"""