In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import csv

import matplotlib.pyplot as plt
%matplotlib inline

nltk.download(['stopwords', 'punkt', 'wordnet'])
# nltk.download('punkt')
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
from nltk.stem.porter import *
from nltk.stem.wordnet import WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
df = pd.read_csv('musical1.tsv', sep = '\t')
df.head()
#df.info()

Unnamed: 0,Review,Score
0,This the second set of strap locks that I've o...,1
1,First of all I want to say I love a tube amp d...,1
2,"i only bought with the idea that a ""FULL"" vers...",0
3,"If you're like me, you probably bought this to...",1
4,"Didn't know what to expect for under $10, but ...",1


In [3]:
#extract words from review column in data
from keras.preprocessing.text import Tokenizer

corpus = []
tokenizer = Tokenizer()

for word in df['Review']:
    words = word.lower()
    words = "".join([ch for ch in word if ch.isalnum() or ch==' '])   #gets rid of punctuation
    words = words.split() #converts into a list
    tokenizer.fit_on_texts(words)
    words = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
    words = [lemmatizer.lemmatize(word.lower()) for word in words]
    words = ' '.join(words)
    corpus.append(words)
#print(corpus)

In [4]:
vectors = tokenizer.texts_to_matrix(words, mode = 'count')
print(vectors)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [5]:
#creating the bag of words model
from sklearn.feature_extraction.text import CountVectorizer

countVect = CountVectorizer(max_features = 5000)

x = countVect.fit_transform(corpus).toarray()
y = df.iloc[:,1].values

In [6]:
#splitting the dataset into the test and train sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size = 0.2, random_state = 1)

In [7]:
#random forest classifier

#fitting the classifier to the training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 400, criterion = 'entropy', random_state = 1)
classifier.fit(x_train, y_train)

#predicting test set results
y_pred = classifier.predict(x_test)

#making the confusion matrix
from sklearn.metrics import confusion_matrix
cm2 = confusion_matrix(y_test, y_pred)

In [11]:
from sklearn.metrics import accuracy_score, confusion_matrix

cm = confusion_matrix(y_test, np.round(y_pred), labels=[1,0])
tp = cm[0][0]
fp = cm[0][1]
fn = cm[1][0]
tn = cm[1][1]

print("Confusion matrix:")
print(cm)

print()
print("Test metrics:")
acc = ((tp+fn)/(tp+fp+fn+tn)) * 100
precision = tp / (tp+fp)*100
recall = tp/(tp+fn)*100
f1_score = 2*precision*recall/(precision+recall)
print('Accuracy: {}%'.format(acc))
print('Precision: {}%'.format(precision))
print('Recall: {}%'.format(recall))
print('F1-score: {}'.format(f1_score))

Confusion matrix:
[[91 16]
 [32 61]]

Test metrics:
Accuracy: 61.5%
Precision: 85.04672897196261%
Recall: 73.98373983739837%
F1-score: 79.1304347826087
